library(here)
## here() starts at /Users/zhonggr/Library/CloudStorage/OneDrive-Personal/quarto
library(fs)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
### Project dir
dir <- "projects/2024_Plasma7k_SomaScan"
Intial setup
Step0: Load data
### Normalization methods
norm <- c(
"raw", "hyb", "hyb.msnCal", "hyb.msnCal.ps", "hyb.msnCal.ps.cal",
"hyb.msnCal.ps.cal.msnAll"
)
n_norm <- length(norm)
### Container to save normalized data
RFU <- vector("list", n_norm)
### Load sample meta as matrix
samples <- read.table(
here(dir, "data/sample.txt"), header = T, quote = "",
comment.char = "", sep = "\t"
) |>
as.matrix()
### 1806 human plasma samples, 2050 sample and controls across 22 plates
dim(samples)
## [1] 2050 32
samples[1:5, 1:5]
## PlateId PlateRunDate ScannerID PlatePosition SlideId
## [1,] "P0031168" "2021-12-03" "SG12344217" "A1" "258633829464"
## [2,] "P0031168" "2021-12-03" "SG12344217" "A10" "258633829473"
## [3,] "P0031168" "2021-12-03" "SG12344217" "A11" "258633829518"
## [4,] "P0031168" "2021-12-03" "SG12344217" "A12" "258633829519"
## [5,] "P0031168" "2021-12-03" "SG12344217" "A2" "258633829465"
n_sample <- nrow(samples)
plates <- unique(samples[, "PlateId"])
n_plate <- length(plates)
### Load feature meta
somamers <- read.table(
here(dir, "data/somamer.txt"), header = T, quote = "",
comment.char = "", sep = "\t"
) |>
as.matrix()
### Total 7596 SOMAmers (feature); 7288 human protein, others are control
dim(somamers)
## [1] 7596 13
somamers[1:5, 1:13]
## SeqId SeqIdVersion SomaId Target UniProt EntrezGeneID
## [1,] "10000-28" " 3" "SL019233" "CRBB2" "P43320" "1415"
## [2,] "10001-7" " 3" "SL002564" "c-Raf" "P04049" "5894"
## [3,] "10003-15" " 3" "SL019245" "ZNF41" "P51814" "7592"
## [4,] "10006-25" " 3" "SL019228" "ELK1" "P19419" "2002"
## [5,] "10008-43" " 3" "SL019234" "GUC1A" "P43080" "2978"
## EntrezGeneSymbol Organism Units Type Dilution
## [1,] "CRYBB2" "Human" "RFU" "Protein" "2e+01"
## [2,] "RAF1" "Human" "RFU" "Protein" "2e+01"
## [3,] "ZNF41" "Human" "RFU" "Protein" "5e-01"
## [4,] "ELK1" "Human" "RFU" "Protein" "2e+01"
## [5,] "GUCA1A" "Human" "RFU" "Protein" "2e+01"
## Cal_Interplate_Ref_Pass1 Cal_Interplate_Ref_Pass2
## [1,] "8.879371e+02" "8.942875e+02"
## [2,] "3.963486e+02" "3.980628e+02"
## [3,] "2.260907e+02" "2.241294e+02"
## [4,] "8.352066e+02" "8.344351e+02"
## [5,] "7.299779e+02" "7.297289e+02"
n_somamer <- nrow(somamers)
### Load SomaScan raw assay data matrix (2050 x 7596)
RFU[[1]] <- read.table(here(dir, "data/RFU_raw.txt"), header = F, sep = "\t") |>
as.matrix()
### 2050x7596
dim(RFU[[1]])
## [1] 2050 7596
RFU[[1]][1:5, 1:5]
## V1 V2 V3 V4 V5
## [1,] 748.8 325.4 217.1 572.0 720.7
## [2,] 577.5 385.4 325.7 634.8 731.0
## [3,] 526.8 318.6 203.0 581.3 618.0
## [4,] 508.4 422.7 213.1 641.2 688.2
## [5,] 450.5 290.5 202.2 580.2 2294.6
### Dilution groups
dilution <- c(0.005, 0.5, 20)
dilution_lab <- c("0_005", "0_5", "20")
n_dilution <- length(dilution)
Step1: Hybridization control normalization
Hybridization control normalization is designed to adjust for nui- sance variance on the basis of individual wells. Each well contains nHCE = 12 HCE (Hybridization Control Elu‑ tion) SOMAmers at different concentrations spanning more than 3 orders of magnitude. By comparing each observed HCE probe to its corresponding reference value, and then calculating the median over all HCE probes, we obtain the scale factor for the i-th well, i.e.
Notice that this normalization step is performed independently for each well; once the scale factor is determined, all SOMAmer RFUs in the well are multiplied by the same scale factor.
### well = sample
### Get idx for Hybridization feature
somamer_HCE <- somamers[, "Type"] == "Hybridization Control Elution"
table(somamer_HCE)
## somamer_HCE
## FALSE TRUE
## 7584 12
### 12 control SOMAmer reagents addded to the eluate just prior to readout
n_HCE <- sum(somamer_HCE)
### Get hybridization control data: 1) Subset only the HYBs for all samples
RFU_HCE <- RFU[[1]][, somamer_HCE]
### Use another container to save hybridization normalized data
RFU[[2]] <- RFU[[1]]
### Empty container to save scale factor for hyb
hyb_scale_factor <- rep(NA, n_sample)
### Empty container to save ratio
HCE_ratio <- matrix(rep(NA, n_sample * n_HCE), ncol = n_HCE)
### Loop through plates
for (i_plate in 1:n_plate) {
### Get current sample index from current plate
sample_idx <- samples[, "PlateId"] == plates[i_plate]
### Get current sample data from current plate
RFU_plate <- RFU[[2]][sample_idx, ]
### Current plate ratio
HCE_ratio_plate <- HCE_ratio[sample_idx, ]
### Current plate hyb scale factor
hyb_scale_factor_plate <- hyb_scale_factor[sample_idx]
### Calculate the median of each HYB for calibrators
hyb_intra_ref <- apply(RFU_HCE[sample_idx, ], 2, median)
### Loop through sample within the plate for HCE
for (i_sample in 1:sum(sample_idx)) {
### Calculate the ratio
HCE_ratio_plate[i_sample, ] <- hyb_intra_ref / RFU_plate[i_sample, somamer_HCE]
### Use the median of HCE somamers as scale factor for each sample
hyb_scale_factor_plate[i_sample] <- median(HCE_ratio_plate[i_sample, ])
### Apply the same scale factor for all features by each sample
RFU_plate[i_sample, ] <- hyb_scale_factor_plate[i_sample] * RFU_plate[i_sample, ]
}
HCE_ratio[sample_idx, ] <- HCE_ratio_plate
hyb_scale_factor[sample_idx] <- hyb_scale_factor_plate
RFU[[2]][sample_idx, ] <- RFU_plate
}
HCE_ratio[1:5, 1:12]
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 0.9028951 0.8465086 0.8636644 0.840359 0.8238109 0.8347273 0.8409453
## [2,] 1.0688565 1.0757213 1.0718344 1.074034 1.0761508 1.0865276 1.0898142
## [3,] 1.1755157 1.0443237 1.0618485 1.027817 1.0278632 1.0458846 1.0242561
## [4,] 1.2409731 1.0985134 1.1283522 1.093489 1.0620677 1.0887500 1.0404074
## [5,] 0.9606874 0.8826018 0.8870467 0.870571 0.8706513 0.8889404 0.8772853
## [,8] [,9] [,10] [,11] [,12]
## [1,] 0.8245079 0.8392952 0.8265836 0.8554753 0.8322683
## [2,] 1.0839455 1.1161029 1.0670241 1.0616821 1.1057077
## [3,] 1.0360929 1.0523256 1.0140127 1.0951655 1.0508598
## [4,] 1.0976469 1.0806892 1.0949106 1.1302752 1.1075787
## [5,] 0.8698039 0.8517074 0.8504274 0.9175154 0.8744776
Step2: Median signal normalization on calibrators
Median signal normalization is an intra-plate normalization procedure performed within wells of the same sample class (i.e. separately for buffer, QC, calibrator, and experimental samples) and within SOMAmers of the same dilution grouping (i.e. 20%, 0.5%, and 0.005%).
It is intended to remove sample-to-sample differences in total RFU brightness that may be due to differences in overall protein concentration, pipetting variation, variation in reagent concentrations, assay timing, and other sources of variability within a group of otherwise comparable samples. Since RFU brightness differs significantly across SOMAmers, median signal normalization proceeds in two steps.
- First, the median RFU of each SOMAmer is determined (across all samples of the same sample type) and sample RFUs are divided by it. The ratio corresponding to the i-th sample and α-th SOMAmer is thus given by Performing median signal normalization on experimental samples before inter-plate calibration presents the risk of enhancing plate-to-plate differences. Thus, in this step, we restrict median signal normalization to calibrators only.
RFU[[3]] = RFU[[2]]
SF_medNormInt <- matrix(rep(1, n_sample * n_dilution), ncol = n_dilution)
### Only do the normalization on calibrator samples
sample_type <- c("Calibrator")
### Loop throuh plate
for (i_plate in 1:length(plates)) {
sel1 <- samples[, "PlateId"] == plates[i_plate]
### Loop through sample type
for (i_sample_type in 1:length(sample_type)) {
### Get the Calibrator sample idx
sel2 <- samples[, "SampleType"] == sample_type[i_sample_type]
sample_idx <- which(sel1 & sel2)
### Loop through dilution
for (i_dilution in 1:n_dilution) {
sel_somamer <- as.numeric(somamers[, "Dilution"]) == dilution[i_dilution]
dat <- RFU[[3]][sample_idx, sel_somamer, drop = F]
dat2 <- dat
### Loop throuhg dilution somamers
for (i in 1:ncol(dat2)) {
### Get the median of dilution somamer
dat2[, i] <- dat2[, i] / median(dat2[, i])
}
for (i in 1:nrow(dat)) {
SF_medNormInt[sample_idx[i], i_dilution] = 1 / median(dat2[i, ])
dat[i, ] = dat[i, ] * SF_medNormInt[sample_idx[i], i_dilution]
}
RFU[[3]][sample_idx, sel_somamer] <- dat
}
}
}
RFU[[3]][1:5, 1:10]
## V1 V2 V3 V4 V5 V6 V7 V8
## [1,] 628.8625 273.2797 182.3265 480.3811 605.2634 300.3222 1951.674 1160.725
## [2,] 621.3531 414.6658 350.4324 683.0042 786.5093 445.3299 3063.620 2156.176
## [3,] 550.5609 332.9702 212.1561 607.5190 645.8744 331.9251 2623.734 1432.524
## [4,] 557.3481 463.3970 233.6170 702.9339 754.4590 373.1733 2593.796 1444.457
## [5,] 394.5846 254.4436 177.1032 508.1864 2009.7976 243.4075 1846.358 1060.342
## V9 V10
## [1,] 354.1551 721.9994
## [2,] 521.5062 1008.0445
## [3,] 431.5235 832.8435
## [4,] 431.6050 932.3851
## [5,] 312.3393 608.3873
Step3: Plate-scale normalization
Plate-scale normalization aims to control for variance in total signal intensity from plate to plate. No protein spikes are added to the calibrator; the procedure solely relies on the endogenous levels of each protein within the set of calibrator replicates
We utilize an internal reference determined by the median across all calibrators on all plates, i.e.
\[ \begin{aligned} RFU_{\alpha }^{Cal,ref}\equiv \text {median}\left\{ RFU_{i\alpha }^{Cal,obs} \right\} _{i=1,\ldots ,n_{Cal}}^{p=1,\ldots ,n_p}\ . \end{aligned} \]
For the - SOMAmer on the p- plate, \[ \begin{aligned} SF_\alpha ^p = RFU_\alpha ^{Cal,ref}/\text {median}\left\{ RFU_{i\alpha }^{Cal,obs} \right\} _{i=1,\ldots ,n_{Cal}}^p\ . \end{aligned} \]
RFU[[4]] = RFU[[3]]
SF <- matrix(rep(NA, n_plate * n_somamer), ncol = n_somamer)
### Get calibrator sample idx
sel_cal <- samples[, "SampleType"] == "Calibrator"
cal_interplate_ref <- rep(NA, n_somamer)
### Loop through features
for (i_somamer in 1:n_somamer) {
### Get the median of control
cal_interplate_ref[i_somamer] <- median(RFU[[4]][sel_cal, i_somamer])
for (i_plate in 1:n_plate) {
sel_plate = samples[, "PlateId"] == plates[i_plate]
cal_intraplate_ref = median(RFU[[4]][sel_cal & sel_plate, i_somamer])
SF[i_plate, i_somamer] = cal_interplate_ref[i_somamer] / cal_intraplate_ref
}
}
SF_plateScale <- apply(SF, 1, median)
for (i_plate in 1:n_plate) {
sel_plate = samples[, "PlateId"] == plates[i_plate]
RFU[[4]][sel_plate, ] = RFU[[4]][sel_plate, ] * SF_plateScale[i_plate]
}
n order to correct the overall brightness level of the p- plate, we calculate the plate-scale scale factor as the median of across all SOMAmers, i.e.
Step4: Inter-plate calibration
RFU[[5]] <- RFU[[4]]
SF_cal <- matrix(rep(NA, n_plate * n_somamer), ncol = n_somamer)
sel_cal <- samples[, "SampleType"] == "Calibrator"
### New interplate reference
cal_interplate_ref_N = rep(NA, n_somamer)
for (i_somamer in 1:n_somamer) {
cal_interplate_ref_N[i_somamer] <- median(RFU[[5]][sel_cal, i_somamer])
for (i_plate in 1:n_plate) {
sel_plate <- samples[, "PlateId"] == plates[i_plate]
cal_intraplate_ref <- median(RFU[[5]][sel_cal & sel_plate, i_somamer])
SF_cal[i_plate, i_somamer] <- cal_interplate_ref_N[i_somamer] / cal_intraplate_ref
RFU[[5]][sel_plate, i_somamer] <- RFU[[5]][sel_plate, i_somamer] * SF_cal[i_plate, i_somamer]
}
}
Step6: Median signal normalization on all sample types
RFU[[6]] <- RFU[[5]]
SF_medNormFull <- matrix(rep(1, n_sample * n_dilution), ncol = n_dilution)
sample_type <- c("QC", "Sample", "Buffer", "Calibrator")
n_sampl_type = length(sample_type)
for (i_sample_type in 1:length(sample_type)) {
sample_idx <- which(samples[, "SampleType"] == sample_type[i_sample_type])
for (i_dilution in 1:n_dilution) {
sel_somamer <- as.numeric(somamers[, "Dilution"]) == dilution[i_dilution]
dat <- RFU[[6]][sample_idx, sel_somamer, drop = F]
dat2 <- dat
for (i in 1:ncol(dat2)) {
dat2[, i] <- dat2[, i] / median(dat2[, i])
}
for (i in 1:nrow(dat)) {
SF_medNormFull[sample_idx[i], i_dilution] <- 1 / median(dat2[i, ])
dat[i, ] = dat[i, ] * SF_medNormFull[sample_idx[i], i_dilution]
}
RFU[[6]][sample_idx, sel_somamer] <- dat
}
}
Save results
### Save normalized results
dir_create(here(dir, "results"))
for (i_norm in 2:n_norm) {
outfile <- path(here(dir, "results", paste0("RFU_", norm[i_norm], ".txt")))
write(t(RFU[[i_norm]]), ncol = n_somamer, file = outfile, sep = "\t")
}
### Save sample scale factors
outfile <- path(here(dir, "results", "samples_SF.txt"))
header <- c(
colnames(samples), "hyb_scale_factor", paste0("SF_msnCal_d", dilution_lab),
paste0("SF_msnAll_d", dilution_lab)
)
output <- rbind(header, cbind(samples, hyb_scale_factor, SF_medNormInt, SF_medNormFull))
write(t(output), ncol = ncol(output), file = outfile, sep = "\t")
### Save feature scale factor
outfile <- path(here(dir, "results", "somamers_SF.txt"))
header <- c(
colnames(somamers), "Cal_Interplate_Ref_Pass1", "Cal_Interplate_Ref_Pass2"
)
output <- rbind(
header,
cbind(somamers, cal_interplate_ref, cal_interplate_ref_N)
)
write(t(output), ncol = ncol(output), file = outfile, sep = "\t")
### Save plate scale factor
outfile <- path(here(dir, "results", "plates_SF.txt"))
header <- c("Plate", "SF_plateScale", paste0("SF_cal_", somamers[, "SeqId"]))
output <- rbind(header, cbind(plate, SF_plateScale, SF_cal))
write(t(output), ncol = ncol(output), file = outfile, sep = "\t")