Analysis of single cell RNA-seq data with {SingleCellExperiment}

Using SingleCellExperiment object to analysis single cell data
scrna
Author
Published

Friday, March 15, 2024

Modified

Thursday, May 9, 2024

Note

All the contents are credited or adapted from Analysis of single cell RNA-seq data for leaning purpose.

Initial general setup

### Install and load packatges 
# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# BiocManager::install(version = "3.18")
# BiocManager::install(
#     c(
#         "scater", "SingleCellExperiment", "AnnotationDbi", "org.Hs.eg.db",
#         "EnsDb.Hsapiens.v86"
#     ),
#     force = TRUE
# )
library(here)
library(tidyverse)
library(scater)
library(SingleCellExperiment)
library(AnnotationDbi)
library(org.Hs.eg.db)
library(EnsDb.Hsapiens.v86)
library(scales)

### Project dir
dir <- here("projects/2024_SCE_Course")

Qualtity control

### Creating a SingleCellExperiment object 
molecules <- read.delim(here(dir, "data/tung/molecules.txt"), row.names = 1)
annotation <- read.delim(
    here(dir, "data/tung/annotation.txt"), stringsAsFactors = TRUE
)

head(molecules[, 1:3])
head(annotation)

umi <- SingleCellExperiment(
    assays = list(counts = as.matrix(molecules)), 
    colData = annotation
)
umi
## Remove ERCC spike-ins features
altExp(umi,"ERCC") <- umi[grep("^ERCC-",rownames(umi)), ]
umi <- umi[grep("^ERCC-",rownames(umi), invert = TRUE), ]
umi

## Map Ensembl IDs to gene symbols
gene_names <- mapIds(
    org.Hs.eg.db,
    keys = rownames(umi),
    keytype = "ENSEMBL",
    columns = "SYMBOL",
    column = "SYMBOL"
)

head(gene_names)
class(gene_names)

## 903 returned NA
rowData(umi)$SYMBOL <- gene_names
table(is.na(gene_names))
## Remove all genes for which no symbols were found
umi <- umi[!is.na(rowData(umi)$SYMBOL), ]

##  Returns no mitrochandrial proteins in the newly annotated symbols
grep("^MT-", rowData(umi)$SYMBOL, value = TRUE)

## Find rebosomal proteins
grep("^RP[LS]", rowData(umi)$SYMBOL, value = TRUE)

## Annotation problems: mitochondrial protein ATP8 can be found
grep("ATP8", rowData(umi)$SYMBOL, value = TRUE)

columns(org.Hs.eg.db)

## Use a more detailed database
ensdb_genes <- genes(EnsDb.Hsapiens.v86)

## Find 13 protein-coding genes located in the mitochondrion
MT_names <- ensdb_genes[seqnames(ensdb_genes) == "MT"]$gene_id

# MT_names <- ensdb_genes[seqnames(ensdb_genes) == "MT"]$gene_name
is_mito <- rownames(umi) %in% MT_names
table(is_mito)

Baseic QC

umi_cell <- scater::perCellQCMetrics(umi, subsets = list(Mito = is_mito))
head(umi_cell)

umi_feature <- scater::perFeatureQCMetrics(umi)
head(umi_feature)

## Add the metrics
umi <- addPerCellQC(umi, subsets = list(Mito = is_mito))
umi <- addPerFeatureQC(umi)

## Manual filtering
hist(umi$total, breaks = 100)
abline(v = 25000, col = "red")

hist(umi_cell$detected, breaks = 100)
abline(v = 7000, col = "red")

Low number of detected genes, but high MT gene percentage, are hallmarks of a low quality cell

## adaptive threshold can help us identify points that are more than
## 3 median absolute deviations (MADs)
qc.lib2 <- isOutlier(umi_cell$sum, log = TRUE, type = "lower")
attr(qc.lib2, "thresholds")

qc.nexprs2 <- isOutlier(umi_cell$detected, log = TRUE, type = "lower")
attr(qc.nexprs2, "thresholds")

qc.spike2 <- isOutlier(umi_cell$altexps_ERCC_percent, type = "higher")
attr(qc.spike2, "thresholds")

qc.mito2 <- isOutlier(umi_cell$subsets_Mito_percent, type = "higher")
attr(qc.mito2, "thresholds")

discard2 <- qc.lib2 | qc.nexprs2 | qc.spike2 | qc.mito2

DataFrame(
    LibSize = sum(qc.lib2), NExprs = sum(qc.nexprs2),
    SpikeProp = sum(qc.spike2), MitoProp = sum(qc.mito2), Total = sum(discard2)
)

reasons <- quickPerCellQC(
    umi_cell,
    sub.fields = c("subsets_Mito_percent", "altexps_ERCC_percent")
)
colSums(as.matrix(reasons))

umi$discard <- reasons$discard
## Plotting various coldata (cell-level medadata) assays against each other
plotColData(umi, x = "sum", y = "subsets_Mito_percent", colour_by = "discard")
plotColData(umi, x = "sum", y = "detected", colour_by = "discard")
plotColData(
    umi, x = "altexps_ERCC_percent", 
    y = "subsets_Mito_percent", colour_by = "discard"
)
## splitting by batches to see if there are substantial batch-specific differences
plotColData(
    umi, x = "sum", y = "detected", colour_by = "discard", 
    other_fields = "individual"
) +
    facet_wrap(~individual) + 
    scale_x_continuous(labels = unit_format(unit = "k", scale = 1e-3))
plotColData(
    umi, x = "sum", y = "detected", colour_by = "discard", other_fields = "replicate"
) +
    facet_wrap(~replicate) +
    scale_x_continuous(labels = unit_format(unit = "k", scale = 1e-3))

Highly expressed genes

## Most of the genes we see are mitochondrial or ribosomal proteins, which is pretty typical for most scRNA-seq datasets.
plotHighestExprs(
    umi, exprs_values = "counts",
    feature_names_to_plot = "SYMBOL", colour_cells_by = "detected"
)

keep the genes which were detected (expression value > 1) in 2 or more cells. We’ll discard approximately 4,000 weakly expressed genes.

keep_feature <- nexprs(umi, byrow = TRUE, detection_limit = 1) >= 2
rowData(umi)$discard <- !keep_feature
table(rowData(umi)$discard)
assay(umi, "logcounts_raw") <- log2(counts(umi) + 1)
# saveRDS(umi, file = here(dir, "data/tung/umi.rds"))

Dimensionality reduction

## Remove unnecessary poorly expressed genes and low quality cells
umi.qc <- umi[!rowData(umi)$discard, !colData(umi)$discard]
umi.qc

Without log-transformation or normalization, PCA plot fails to separate the datasets by replicate or individual. We mostly see the effects of sequencing depth - samples (cells) with lots of expression, and particularly highly expressed genes, dominate the PCs

Before QC

## Before QC, Without log-transformation or normalization
umi <- runPCA(umi, exprs_values = "counts")
dim(reducedDim(umi, "PCA"))
p1 <- plotPCA(
    umi, 
    colour_by = "batch", 
    size_by = "detected", 
    shape_by = "individual"
)

With log-transformation, we equalize the large difference between strongly and weakly expressed genes, and immediately see cells form groups by replicate, individual, and sequencing depth.

umi <- runPCA(umi, exprs_values = "logcounts_raw")
dim(reducedDim(umi, "PCA"))
p2 <- plotPCA(
    umi,
    colour_by = "batch",
    size_by = "detected",
    shape_by = "individual"
)   

patchwork::wrap_plots(p1, p2, ncol = 1)
set.seed(123456)
umi <- runTSNE(umi, exprs_values = "logcounts_raw", perplexity = 130)
plotTSNE(
    umi,
    colour_by = "batch",
    size_by = "detected",
    shape_by = "individual"
)

After QC

umi.qc <- runPCA(umi.qc, exprs_values = "logcounts_raw")
dim(reducedDim(umi.qc, "PCA"))
plotPCA(
    umi.qc,
    colour_by = "batch",
    size_by = "detected",
    shape_by = "individual"
)

set.seed(123456)
umi.qc <- runTSNE(umi.qc, exprs_values = "logcounts_raw", perplexity = 130)
plotTSNE(
    umi.qc, 
    colour_by = "batch",
    size_by = "detected",
    shape_by = "individual"
)

Identifying confounding factors

### Detected genes
logcounts(umi.qc) <- assay(umi.qc, "logcounts_raw")
getExplanatoryPCs(umi.qc,variables = "sum")

## PC1 can be almost completely (86%) explained by the total UMI counts (sequencing depth)
plotExplanatoryPCs(umi.qc,variables = "sum") 
logcounts(umi.qc) <- NULL
plotExplanatoryVariables(
    umi.qc,
    exprs_values = "logcounts_raw",
    variables = c(
        "detected", "sum", "batch",
        "individual", "altexps_ERCC_percent", "subsets_Mito_percent"
    )
)