Guorui Zhong - Normalization example for SomaLogic proteomic data

Intial setup

library(here)
## here() starts at /Users/zhonggr/Library/CloudStorage/OneDrive-Personal/quarto
library(fs)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

### Project dir
dir  <-  "projects/2024_Plasma7k_SomaScan"

Step0: Load data

### Normalization methods
norm <- c(
    "raw", "hyb", "hyb.msnCal", "hyb.msnCal.ps", "hyb.msnCal.ps.cal", 
    "hyb.msnCal.ps.cal.msnAll"
    )
n_norm <- length(norm)

### Container to save normalized data
RFU <- vector("list", n_norm)

### Load sample meta as matrix
samples <- read.table(
    here(dir, "data/sample.txt"), header = T, quote = "", 
    comment.char = "", sep = "\t"
) |>
    as.matrix()

### 1806 human plasma samples,  2050 sample and controls across 22 plates
dim(samples)
## [1] 2050   32
samples[1:5, 1:5]
##      PlateId    PlateRunDate ScannerID    PlatePosition SlideId       
## [1,] "P0031168" "2021-12-03" "SG12344217" "A1"          "258633829464"
## [2,] "P0031168" "2021-12-03" "SG12344217" "A10"         "258633829473"
## [3,] "P0031168" "2021-12-03" "SG12344217" "A11"         "258633829518"
## [4,] "P0031168" "2021-12-03" "SG12344217" "A12"         "258633829519"
## [5,] "P0031168" "2021-12-03" "SG12344217" "A2"          "258633829465"
n_sample <- nrow(samples)
plates <- unique(samples[, "PlateId"])
n_plate <- length(plates)

### Load feature meta
somamers <- read.table(
    here(dir, "data/somamer.txt"), header = T, quote = "", 
    comment.char = "", sep = "\t"
) |>
    as.matrix()

### Total 7596 SOMAmers (feature); 7288 human protein, others are control
dim(somamers)
## [1] 7596   13
somamers[1:5, 1:13]
##      SeqId      SeqIdVersion SomaId     Target  UniProt  EntrezGeneID
## [1,] "10000-28" " 3"         "SL019233" "CRBB2" "P43320" "1415"      
## [2,] "10001-7"  " 3"         "SL002564" "c-Raf" "P04049" "5894"      
## [3,] "10003-15" " 3"         "SL019245" "ZNF41" "P51814" "7592"      
## [4,] "10006-25" " 3"         "SL019228" "ELK1"  "P19419" "2002"      
## [5,] "10008-43" " 3"         "SL019234" "GUC1A" "P43080" "2978"      
##      EntrezGeneSymbol Organism Units Type      Dilution
## [1,] "CRYBB2"         "Human"  "RFU" "Protein" "2e+01" 
## [2,] "RAF1"           "Human"  "RFU" "Protein" "2e+01" 
## [3,] "ZNF41"          "Human"  "RFU" "Protein" "5e-01" 
## [4,] "ELK1"           "Human"  "RFU" "Protein" "2e+01" 
## [5,] "GUCA1A"         "Human"  "RFU" "Protein" "2e+01" 
##      Cal_Interplate_Ref_Pass1 Cal_Interplate_Ref_Pass2
## [1,] "8.879371e+02"           "8.942875e+02"          
## [2,] "3.963486e+02"           "3.980628e+02"          
## [3,] "2.260907e+02"           "2.241294e+02"          
## [4,] "8.352066e+02"           "8.344351e+02"          
## [5,] "7.299779e+02"           "7.297289e+02"
n_somamer <- nrow(somamers)

### Load SomaScan raw assay data matrix (2050 x 7596)
RFU[[1]] <- read.table(here(dir, "data/RFU_raw.txt"), header = F, sep = "\t") |> 
    as.matrix()

### 2050x7596
dim(RFU[[1]])
## [1] 2050 7596
RFU[[1]][1:5, 1:5]
##         V1    V2    V3    V4     V5
## [1,] 748.8 325.4 217.1 572.0  720.7
## [2,] 577.5 385.4 325.7 634.8  731.0
## [3,] 526.8 318.6 203.0 581.3  618.0
## [4,] 508.4 422.7 213.1 641.2  688.2
## [5,] 450.5 290.5 202.2 580.2 2294.6

### Dilution groups
dilution <- c(0.005, 0.5, 20)
dilution_lab <- c("0_005", "0_5", "20")
n_dilution <- length(dilution)

Step1: Hybridization control normalization

Hybridization control normalization is designed to adjust for nui- sance variance on the basis of individual wells. Each well contains nHCE = 12 HCE (Hybridization Control Elu‑ tion) SOMAmers at different concentrations spanning more than 3 orders of magnitude. By comparing each observed HCE probe to its corresponding reference value, and then calculating the median over all HCE probes, we obtain the scale factor for the i-th well, i.e.

Notice that this normalization step is performed independently for each well; once the scale factor is determined, all SOMAmer RFUs in the well are multiplied by the same scale factor.

### well = sample
### Get idx for Hybridization feature 
somamer_HCE <- somamers[, "Type"] == "Hybridization Control Elution"
table(somamer_HCE)
## somamer_HCE
## FALSE  TRUE 
##  7584    12

### 12 control SOMAmer reagents addded to the eluate just prior to readout
n_HCE <- sum(somamer_HCE)

### Get hybridization control data: 1) Subset only the HYBs for all samples
RFU_HCE <- RFU[[1]][, somamer_HCE]

### Use another container to save hybridization normalized data
RFU[[2]] <- RFU[[1]]

### Empty container to save scale factor for hyb 
hyb_scale_factor <- rep(NA, n_sample)

### Empty container to save ratio
HCE_ratio <- matrix(rep(NA, n_sample * n_HCE), ncol = n_HCE)

### Loop through plates
for (i_plate in 1:n_plate) {

    ### Get current sample index from current plate
    sample_idx <- samples[, "PlateId"] == plates[i_plate]

    ### Get current sample data from current plate
    RFU_plate <- RFU[[2]][sample_idx, ]

    ### Current plate ratio
    HCE_ratio_plate <- HCE_ratio[sample_idx, ]

    ### Current plate hyb scale factor
    hyb_scale_factor_plate <- hyb_scale_factor[sample_idx]

    ### Calculate the median of each HYB for calibrators
    hyb_intra_ref <- apply(RFU_HCE[sample_idx, ], 2, median)
    
    ### Loop through sample within the plate for HCE
    for (i_sample in 1:sum(sample_idx)) {
        
        ### Calculate the ratio
        HCE_ratio_plate[i_sample, ] <- hyb_intra_ref / RFU_plate[i_sample, somamer_HCE]
        
        ### Use the median of HCE somamers as scale factor for each sample
        hyb_scale_factor_plate[i_sample] <- median(HCE_ratio_plate[i_sample, ])
       
        ### Apply the same scale factor for all features by each sample
        RFU_plate[i_sample, ] <- hyb_scale_factor_plate[i_sample] * RFU_plate[i_sample, ]
    }

    HCE_ratio[sample_idx, ] <- HCE_ratio_plate
    hyb_scale_factor[sample_idx] <- hyb_scale_factor_plate
    RFU[[2]][sample_idx, ] <- RFU_plate
}

HCE_ratio[1:5, 1:12]
##           [,1]      [,2]      [,3]     [,4]      [,5]      [,6]      [,7]
## [1,] 0.9028951 0.8465086 0.8636644 0.840359 0.8238109 0.8347273 0.8409453
## [2,] 1.0688565 1.0757213 1.0718344 1.074034 1.0761508 1.0865276 1.0898142
## [3,] 1.1755157 1.0443237 1.0618485 1.027817 1.0278632 1.0458846 1.0242561
## [4,] 1.2409731 1.0985134 1.1283522 1.093489 1.0620677 1.0887500 1.0404074
## [5,] 0.9606874 0.8826018 0.8870467 0.870571 0.8706513 0.8889404 0.8772853
##           [,8]      [,9]     [,10]     [,11]     [,12]
## [1,] 0.8245079 0.8392952 0.8265836 0.8554753 0.8322683
## [2,] 1.0839455 1.1161029 1.0670241 1.0616821 1.1057077
## [3,] 1.0360929 1.0523256 1.0140127 1.0951655 1.0508598
## [4,] 1.0976469 1.0806892 1.0949106 1.1302752 1.1075787
## [5,] 0.8698039 0.8517074 0.8504274 0.9175154 0.8744776

Step2: Median signal normalization on calibrators

Median signal normalization is an intra-plate normalization procedure performed within wells of the same sample class (i.e. separately for buffer, QC, calibrator, and experimental samples) and within SOMAmers of the same dilution grouping (i.e. 20%, 0.5%, and 0.005%).

It is intended to remove sample-to-sample differences in total RFU brightness that may be due to differences in overall protein concentration, pipetting variation, variation in reagent concentrations, assay timing, and other sources of variability within a group of otherwise comparable samples. Since RFU brightness differs significantly across SOMAmers, median signal normalization proceeds in two steps.

First, the median RFU of each SOMAmer is determined (across all samples of the same sample type) and sample RFUs are divided by it. The ratio corresponding to the i-th sample and α-th SOMAmer is thus given by Performing median signal normalization on experimental samples before inter-plate calibration presents the risk of enhancing plate-to-plate differences. Thus, in this step, we restrict median signal normalization to calibrators only.

RFU[[3]] = RFU[[2]]
SF_medNormInt <- matrix(rep(1, n_sample * n_dilution), ncol = n_dilution)

### Only do the normalization on calibrator samples
sample_type  <- c("Calibrator")

### Loop throuh plate
for (i_plate in 1:length(plates)) {
    
    sel1  <-  samples[, "PlateId"] == plates[i_plate]
    
    ### Loop through sample type
    for (i_sample_type in 1:length(sample_type)) {
        
        ### Get the Calibrator sample idx
        sel2  <-  samples[, "SampleType"] == sample_type[i_sample_type]
        
        sample_idx  <-  which(sel1 & sel2)
        
        ### Loop through dilution
        for (i_dilution in 1:n_dilution) {
            
            sel_somamer <- as.numeric(somamers[, "Dilution"]) == dilution[i_dilution]
            
            dat  <-  RFU[[3]][sample_idx, sel_somamer, drop = F]
            
            dat2 <- dat
            
            ### Loop throuhg dilution somamers
            for (i in 1:ncol(dat2)) {

                ### Get the median of dilution somamer  
                dat2[, i]  <-  dat2[, i] / median(dat2[, i])
            }

            for (i in 1:nrow(dat)) {
                
                SF_medNormInt[sample_idx[i], i_dilution] = 1 / median(dat2[i, ])
                
                dat[i, ] = dat[i, ] * SF_medNormInt[sample_idx[i], i_dilution]
            }

            RFU[[3]][sample_idx, sel_somamer]  <-  dat
        }
    }
}

RFU[[3]][1:5, 1:10]
##            V1       V2       V3       V4        V5       V6       V7       V8
## [1,] 628.8625 273.2797 182.3265 480.3811  605.2634 300.3222 1951.674 1160.725
## [2,] 621.3531 414.6658 350.4324 683.0042  786.5093 445.3299 3063.620 2156.176
## [3,] 550.5609 332.9702 212.1561 607.5190  645.8744 331.9251 2623.734 1432.524
## [4,] 557.3481 463.3970 233.6170 702.9339  754.4590 373.1733 2593.796 1444.457
## [5,] 394.5846 254.4436 177.1032 508.1864 2009.7976 243.4075 1846.358 1060.342
##            V9       V10
## [1,] 354.1551  721.9994
## [2,] 521.5062 1008.0445
## [3,] 431.5235  832.8435
## [4,] 431.6050  932.3851
## [5,] 312.3393  608.3873

Step3: Plate-scale normalization

Plate-scale normalization aims to control for variance in total signal intensity from plate to plate. No protein spikes are added to the calibrator; the procedure solely relies on the endogenous levels of each protein within the set of calibrator replicates

We utilize an internal reference determined by the median across all calibrators on all plates, i.e.

\[ \begin{aligned} RFU_{\alpha }^{Cal,ref}\equiv \text {median}\left\{ RFU_{i\alpha }^{Cal,obs} \right\} _{i=1,\ldots ,n_{Cal}}^{p=1,\ldots ,n_p}\ . \end{aligned} \]

For the - SOMAmer on the p- plate, \[ \begin{aligned} SF_\alpha ^p = RFU_\alpha ^{Cal,ref}/\text {median}\left\{ RFU_{i\alpha }^{Cal,obs} \right\} _{i=1,\ldots ,n_{Cal}}^p\ . \end{aligned} \]

RFU[[4]] = RFU[[3]]

SF <- matrix(rep(NA, n_plate * n_somamer), ncol = n_somamer)

### Get calibrator sample idx
sel_cal <- samples[, "SampleType"] == "Calibrator"

cal_interplate_ref <- rep(NA, n_somamer)

### Loop through features
for (i_somamer in 1:n_somamer) {
    ### Get the median of control
    cal_interplate_ref[i_somamer]  <-  median(RFU[[4]][sel_cal, i_somamer])

    for (i_plate in 1:n_plate) {

        sel_plate = samples[, "PlateId"] == plates[i_plate]
        
        cal_intraplate_ref = median(RFU[[4]][sel_cal & sel_plate, i_somamer])

        SF[i_plate, i_somamer] = cal_interplate_ref[i_somamer] / cal_intraplate_ref
    }
}

SF_plateScale  <-  apply(SF, 1, median)

for (i_plate in 1:n_plate) {

    sel_plate = samples[, "PlateId"] == plates[i_plate]

    RFU[[4]][sel_plate, ] = RFU[[4]][sel_plate, ] * SF_plateScale[i_plate]
}

n order to correct the overall brightness level of the p- plate, we calculate the plate-scale scale factor as the median of across all SOMAmers, i.e.

Step4: Inter-plate calibration

RFU[[5]] <- RFU[[4]]

SF_cal <- matrix(rep(NA, n_plate * n_somamer), ncol = n_somamer)
sel_cal <- samples[, "SampleType"] == "Calibrator"

### New interplate reference
cal_interplate_ref_N = rep(NA, n_somamer)

for (i_somamer in 1:n_somamer) {
    
    cal_interplate_ref_N[i_somamer] <- median(RFU[[5]][sel_cal, i_somamer])
    
    for (i_plate in 1:n_plate) {
    
        sel_plate <- samples[, "PlateId"] == plates[i_plate]
    
        cal_intraplate_ref <- median(RFU[[5]][sel_cal & sel_plate, i_somamer])

        SF_cal[i_plate, i_somamer] <- cal_interplate_ref_N[i_somamer] / cal_intraplate_ref
        
        RFU[[5]][sel_plate, i_somamer] <- RFU[[5]][sel_plate, i_somamer] * SF_cal[i_plate, i_somamer]
    }
}

Step6: Median signal normalization on all sample types

RFU[[6]] <- RFU[[5]]

SF_medNormFull  <- matrix(rep(1, n_sample * n_dilution), ncol = n_dilution)

sample_type <- c("QC", "Sample", "Buffer", "Calibrator")
n_sampl_type = length(sample_type)

for (i_sample_type in 1:length(sample_type)) {
    
    sample_idx <- which(samples[, "SampleType"] == sample_type[i_sample_type])
    
    for (i_dilution in 1:n_dilution) {

        sel_somamer <- as.numeric(somamers[, "Dilution"]) == dilution[i_dilution]
        
        dat <- RFU[[6]][sample_idx, sel_somamer, drop = F]
        
        dat2 <- dat
        
        for (i in 1:ncol(dat2)) {
            
            dat2[, i] <- dat2[, i] / median(dat2[, i])
        }

        for (i in 1:nrow(dat)) {
        
            SF_medNormFull[sample_idx[i], i_dilution] <- 1 / median(dat2[i, ])
        
            dat[i, ] = dat[i, ] * SF_medNormFull[sample_idx[i], i_dilution]
        }

        RFU[[6]][sample_idx, sel_somamer]  <-  dat
    }
}

Save results

### Save normalized results
dir_create(here(dir, "results"))
for (i_norm in 2:n_norm) {
    outfile <- path(here(dir, "results", paste0("RFU_", norm[i_norm], ".txt")))
    write(t(RFU[[i_norm]]), ncol = n_somamer, file = outfile, sep = "\t")
}

### Save sample scale factors
outfile <- path(here(dir, "results", "samples_SF.txt"))
header <- c(
    colnames(samples), "hyb_scale_factor", paste0("SF_msnCal_d", dilution_lab), 
    paste0("SF_msnAll_d", dilution_lab)
)
output <- rbind(header, cbind(samples, hyb_scale_factor, SF_medNormInt, SF_medNormFull))
write(t(output), ncol = ncol(output), file = outfile, sep = "\t")

### Save feature scale factor
outfile <- path(here(dir, "results", "somamers_SF.txt"))
header <- c(
    colnames(somamers), "Cal_Interplate_Ref_Pass1", "Cal_Interplate_Ref_Pass2"
)
output <- rbind(
    header, 
    cbind(somamers, cal_interplate_ref, cal_interplate_ref_N)
)
write(t(output), ncol = ncol(output), file = outfile, sep = "\t")

### Save plate scale factor
outfile <- path(here(dir, "results", "plates_SF.txt"))
header <- c("Plate", "SF_plateScale", paste0("SF_cal_", somamers[, "SeqId"]))
output <- rbind(header, cbind(plate, SF_plateScale, SF_cal))
write(t(output), ncol = ncol(output), file = outfile, sep = "\t")

Reference

Assessment of Variability in the Plasma 7k SomaScan Proteomics Assay