Guorui Zhong - Seurat V5 | 10X course example

Load packages and data

library(here)
library(tidyverse)
library(Seurat)
library(SCINA)
theme_set(theme_bw(base_size = 14))

dir <- here("projects/babraham_training_bioinformatics/10X_scRNAseq_Course")
data <- Read10X_h5(here(dir, "data/filtered_feature_bc_matrix.h5"))
data <- CreateSeuratObject(
    counts = data,
    project = "course",
    min.cells = 3,
    min.features = 200
)
data

head(rownames(data))

Explore QC metrics

### Amount of MT genes
grep("^MT-", rownames(data), value = TRUE)
data$percent_MT <- PercentageFeatureSet(data, pattern = "^MT-")
head(data$percent_MT)

### Amount of Ribosomal genes
grep("^RP[LS]", rownames(data), value = TRUE)
data$percent_Ribosomal <- PercentageFeatureSet(data, pattern = "^RP[LS]")
head(data$percent_Ribosomal)

### Percentage of Largest Gene
data_nomalat <- data[rownames(data) != "MALAT1", ]

data_nomalat$largest_count <- apply(
    data_nomalat@assays$RNA@layers$counts,
    2,
    max
)

data_nomalat$largest_index <- apply(
    data_nomalat@assays$RNA@layers$counts,
    2,
    which.max
)

data_nomalat$largest_gene <- rownames(data_nomalat)[data_nomalat$largest_index]

data_nomalat$percent.Largest.Gene <- 100 * data_nomalat$largest_count /
    data_nomalat$nCount_RNA

data_nomalat[[]][1:10, ]

data$largest_gene <- data_nomalat$largest_gene
data$percent.Largest.Gene <- data_nomalat$percent.Largest.Gene

data[[]][1:10, ]
rm(data_nomalat)

### For some metrics it’s better to view on a log scale.
VlnPlot(
    data,
    layer = "counts",
    features = c(
        "nCount_RNA", "percent_MT", "percent_Ribosomal", "percent.Largest.Gene"
    ),
    ncol = 2,
    log = TRUE
)

FeatureScatter(
    data,
    feature1 = "nCount_RNA", 
    feature2 = "percent.Largest.Gene"
)

## ggplot
qc_metrics <- dplyr::as_tibble(
    data[[]],
    rownames = "Cell.Barcode"
)

head(qc_metrics)

qc_metrics |>
    arrange(percent_MT) |>
    ggplot(aes(nCount_RNA, nFeature_RNA, colour = percent_MT)) +
    geom_point() +
    scale_color_gradientn(colors = c("black", "blue", "green2", "red", "yellow")) +
    ggtitle("Example of plotting QC metrics") +
    geom_hline(yintercept = 750) +
    geom_hline(yintercept = 2000)

## Log scale
qc_metrics |>
    arrange(percent_MT) |>
    ggplot(aes(nCount_RNA, nFeature_RNA, colour = percent_MT)) +
    geom_point(size = 0.7) +
    scale_color_gradientn(colors = c("black", "blue", "green2", "red", "yellow")) +
    ggtitle("Example of plotting QC metrics") +
    geom_hline(yintercept = 750) +
    geom_hline(yintercept = 2000) +
    scale_x_log10() +
    scale_y_log10()

### Complexity 
qc_metrics <- qc_metrics |>
  mutate(complexity=log10(nFeature_RNA) / log10(nCount_RNA))

complexity.lm <- lm(log10(qc_metrics$nFeature_RNA)~log10(qc_metrics$nCount_RNA)) 

complexity.lm

qc_metrics <- qc_metrics |>
  mutate(
    complexity_diff = log10(nFeature_RNA) - ((log10(qc_metrics$nCount_RNA)*complexity.lm$coefficients[2])+complexity.lm$coefficients[1])
  )

qc_metrics |>
  ggplot(aes(x=complexity_diff)) +
  geom_density(fill="yellow") 

complexity_scale <- min(c(max(qc_metrics$complexity_diff),0-min(qc_metrics$complexity_diff)))

qc_metrics |>
  mutate(complexity_diff=replace(complexity_diff,complexity_diff< -0.1,-0.1)) |>
  ggplot(aes(x=log10(nCount_RNA), y=log10(nFeature_RNA), colour=complexity_diff)) +
  geom_point(size=0.5) +
  geom_abline(slope=complexity.lm$coefficients[2], intercept = complexity.lm$coefficients[1]) +
  scale_colour_gradient2(low="blue2",mid="grey",high="red2")

qc_metrics |>
  ggplot(aes(x=complexity_diff, y=percent.Largest.Gene)) +
  geom_point()

### Largest gene
largest_gene_list <- qc_metrics |>
    group_by(largest_gene) |>
    count() |>
    arrange(desc(n))

largest_genes_to_plot <- largest_gene_list |>
    filter(n > 140) |>
    pull(largest_gene)

qc_metrics |>
    filter(largest_gene %in% largest_genes_to_plot) |>
    mutate(
      largest_gene = factor(largest_gene, levels = largest_genes_to_plot)
    ) |>
    arrange(largest_gene) |>
    ggplot(
      aes(x = log10(nCount_RNA), y = log10(nFeature_RNA), colour = largest_gene)
    ) +
    geom_point(size = 1) +
    scale_colour_manual(
      values = c("grey", RColorBrewer::brewer.pal(9, "Set1"))
    )

qc_metrics |>
    filter(largest_gene %in% largest_genes_to_plot) |>
    mutate(
      largest_gene = factor(largest_gene, levels = largest_genes_to_plot)
    ) |>
    arrange(largest_gene) |>
    ggplot(
      aes(x = complexity_diff, y = percent.Largest.Gene, colour = largest_gene)
    ) +
    geom_point() +
    scale_colour_manual(values = c("grey", RColorBrewer::brewer.pal(9, "Set1")))

qc_metrics |>
    arrange(percent_MT) |>
    ggplot(
      aes(x = complexity_diff, y = percent.Largest.Gene, colour = percent_MT)
    ) +
    geom_point() +
    scale_colour_gradient(low = "grey", high = "red2")

qc_metrics |>
    arrange(percent_Ribosomal) |>
    ggplot(
      aes(x = complexity_diff, y = percent.Largest.Gene, colour = percent_Ribosomal)
    ) +
    geom_point() +
    scale_colour_gradient(low = "grey", high = "red2")

Setting QC Cutoff and Filtering

qc_metrics |>
    ggplot(aes(percent_MT)) +
    geom_histogram(binwidth = 0.5, fill = "yellow", colour = "black") +
    ggtitle("Distribution of Percentage Mitochondrion") +
    geom_vline(xintercept = 10)

qc_metrics |>
    ggplot(aes(percent.Largest.Gene)) +
    geom_histogram(binwidth = 0.7, fill = "yellow", colour = "black") +
    ggtitle("Distribution of Percentage Largest Gene") +
    geom_vline(xintercept = 10) ### Filtering

data <- subset(
    data,
    nFeature_RNA > 750 &
        nFeature_RNA < 2000 &
        percent_MT < 10 &
        percent.Largest.Gene < 10
)

data

Normalisation, Selection and Scaling

### Normalization
data <- NormalizeData(data, normalization.method = "LogNormalize")

gene_expression <- apply(data@assays$RNA@layers$data, 1, mean)
gene_expression <- names(gene_expression) <- rownames(data)
gene_expression <- sort(gene_expression, decreasing = TRUE)
head(gene_expression, n = 10)

ggplot(mapping = aes(data["GAPDH", ]@assays$RNA@layers$data)) +
    geom_histogram(binwidth = 0.05, fill = "yellow", colour = "black") +
    ggtitle("GAPDH expression")


as_tibble(
    data@assays$RNA@layers$data[, 1:100]
) |>
    pivot_longer(
        cols = everything(),
        names_to = "cell",
        values_to = "expression"
    ) |>
    ggplot(aes(x = cell, y = expression)) +
    stat_summary(geom = "crossbar", fun.data = mean_sdl)

### Cell Cycle Scoring
cc.genes.updated.2019

data <- CellCycleScoring(
    data,
    s.features = cc.genes.updated.2019$s.genes,
    g2m.features = cc.genes.updated.2019$g2m.genes,
    set.ident = TRUE
)

as_tibble(data[[]])

as_tibble(data[[]]) |>
    ggplot(aes(Phase)) + geom_bar()

as_tibble(data[[]]) |>
    ggplot(aes(x = S.Score, y = G2M.Score, color = Phase)) +
    geom_point() +
    coord_cartesian(xlim = c(-0.15, 0.15), ylim = c(-0.15, 0.15))

### Gene selection
data <- FindVariableFeatures(
    data,
    selection.method = "vst",
    nfeatures = 500
)

variance_data <- as_tibble(
    HVFInfo(data), rownames = "Gene"
)

variance_data <- variance_data |>
    mutate(
        hypervariable = Gene %in% VariableFeatures(data)
    )

head(variance_data, n = 10)

variance_data |>
    ggplot(aes(log(mean), log(variance), color = hypervariable)) +
    geom_point() +
    scale_color_manual(values = c("black", "red"))

### Scaling
data <- ScaleData(data, features = rownames(data))

Dimensionality reduction

PCA

### Calculate all PCs
### list of genes which are most highly and lowly weighted in the different PCs
data <- RunPCA(data, features = VariableFeatures(data))

### See if the cell cycle is having a big effect on the clusters we’re picking out
DimPlot(data, reduction = "pca")

DimPlot(
    data, reduction = "pca",
    group.by = "largest_gene",
    label = TRUE,
    label.size = 3
) + NoLegend()

### This nicely shows us the power, but also the limitations of PCA in that we 
### can see that not all of the useful information is captured in the first two 
### principal components.
DimPlot(data, reduction = "pca", dims = c(3, 4))

### How far down the set of PCs do we need to go to capture all of the 
### biologically relevant information.
ElbowPlot(data)

### Plots of PCA weightings for the most highly and lowly weighted genes, 
### shown against the set of cells which are most highly influenced by the PC. 
### The idea is that as long as we’re seeing clear structure in one of these 
### plots then we’re still adding potentially useful information to the analysis.
DimHeatmap(data, dims = 1:15, cells = 500)

tSNE

### Setting this to a low value will help resolve small clusters, 
### but at the expense of large clusters becoming more diffuse
saved_seed <- 8482
set.seed(saved_seed)

data <- RunTSNE(data, dims = 1:15, seed.use = saved_seed, perplexity = 10)
DimPlot(data, reduction = "tsne", pt.size = 1) +
    ggtitle("tSNE with Perplexity 10")

data <- RunTSNE(data, dims = 1:15, seed.use = saved_seed, perplexity = 200)
DimPlot(data, reduction = "tsne", pt.size = 1) +
    ggtitle("tSNE with Perplexity 200")

data <- RunTSNE(data, dims = 1:15, seed.use = saved_seed)
DimPlot(data, reduction = "tsne", pt.size = 1) +
    ggtitle("tSNE with default Perplexity (30)")

Defining Cell Clusters

### Finds the ‘k’ nearest neighbours to each cell and makes this into a graph
data <- FindNeighbors(data, dims = 1:15)

### Get another sparse matrix of distances.
data@graphs$RNA_snn[1:10, 1:10]

### Segment the graph, 
### Larger resolution give larger clusters, smaller values gives smaller clusters.
data <- FindClusters(data, resolution = 0.5)

head(data$seurat_clusters, n = 5)

### Some of the clusters don't resolve very well in PC1 vs PC2
DimPlot(data, reduction = "pca", label = TRUE) +
    ggtitle("PC1 vs PC2 with Clusters")

### Some of the clusters which are overlaid in PC1 start to separate in other PCs
### These differences represent a small proportion of the overall variance
### but can be important in resolving changes.
DimPlot(data, reduction = "pca", dims = c(4, 9), label = TRUE) +
    ggtitle("PC4 vs PC9 with Clusters")

### tSNE plot showing all of the information across the PCs used is preserved
### and we see the overall similarity of the cells
DimPlot(data, reduction = "tsne", pt.size = 1, label = TRUE, label.size = 7)

### Check the clusters to see if they are influenced by any of the QC metrics
### We can see that some of the clusters are skewed in one or more of the
### metrics we’ve calculated so we will want to take note of this.
### Some of these skews could be biological in nature,
### but they could be noise coming from the data.
VlnPlot(data, features = "nCount_RNA")

### Cluster 8, 9, 11, and 12 could be GEMs where two or more cells were captured
### Since they all usually high coverage and diversity
### Theya are also small and tightly clustered away from the main groups of points
VlnPlot(data, features = "nFeature_RNA")

VlnPlot(data, features = "percent_MT")

VlnPlot(data, features = "MALAT1")

VlnPlot(data,features="percent.Largest.Gene")

### Which largest gene
data[[]] |>
    group_by(seurat_clusters, largest_gene) |>
    count() |>
    arrange(desc(n)) |>
    group_by(seurat_clusters) |>
    slice(1:2) |>
    ungroup() |>
    arrange(seurat_clusters, desc(n))

### Cell cycle
data@meta.data |>
    group_by(seurat_clusters, Phase) |>
    count() |>
    group_by(seurat_clusters) |>
    mutate(percent = 100 * n / sum(n)) |>
    ungroup() |>
    ggplot(aes(x = seurat_clusters, y = percent, fill = Phase)) +
    geom_col() +
    ggtitle("Percentage of cell cycle phases per cluster")

### That’s already quite nice for explaining some of the functionality of the 
### clusters, but there’s more in there than just the behaviour of the most 
### expressed gene
data@reductions$tsne@cell.embeddings |>
    as_tibble() |>
    add_column(
      seurat_clusters = data$seurat_clusters, largest_gene = data$largest_gene
    ) |>
    filter(largest_gene %in% largest_genes_to_plot) |>
    ggplot(aes(x = tSNE_1, y = tSNE_2, colour = seurat_clusters)) +
    geom_point() +
    facet_wrap(vars(largest_gene))

Find Markers

### Evaluate the cluster by identifying genes whose expression definies each
### cluster which has been identified

### Find genes appears to be upregulated in a specific cluster compared to all
### cells not in that cluster
FindMarkers(data, ident.1 = 0, min.pct = 0.25)[1:5, ]

### Show the expression levels of these genes in the cells in each cluster
### VCAN gene is more highly expressed in cluster 0 than any of the other 
### clusters, but we can also see that it is also reasonably highly expressed 
### in clusters 9 and 12. These were both clusters we suspected of being 
### multiple cells though
VlnPlot(data, features = "VCAN")

### FindMarkers function on all of the clusters
cluster_markers <- lapply(
    levels(data[["seurat_clusters"]][[1]]),
    function(x) FindMarkers(data, ident.1 = x, min.pct = 0.25)
)

### Adds the cluster number to the results of FindMarkers
sapply(
    0:(length(cluster_markers) - 1),
    function(x) {
        cluster_markers[[x + 1]]$gene <<- rownames(cluster_markers[[x + 1]])
        cluster_markers[[x + 1]]$cluster <<- x
    }
)

### Generate tibble and sort by FDR to put the most significant ones first
cluster_markers <- as_tibble(do.call(rbind, cluster_markers)) |>
    arrange(p_val_adj)

cluster_markers

### Extract the most upregulated gene from each cluster
best_wilcox_gene_per_cluster <- cluster_markers |>
    group_by(cluster) |>
    slice(1) |>
    pull(gene)

### We can see that for some clusters (eg Cluster 8 - CDKN1C) We really do have
### a gene which can uniquely predict, but for many others (eg cluster 7 IL7R)
### we have a hit which also picks up other clusters (clusters 1, 3 and 4 in this case).
VlnPlot(data, features = best_wilcox_gene_per_cluster)

### Clean this up for any individual cluster by using the roc analysis.
FindMarkers(
    data, ident.1 = 7, ident.2 = 4, test.use = "roc", only.pos = TRUE
)[1:5, ]

### Slightly better job at separating cluster 5 from cluster 4, but it also
### comes up all over the place in other clusters
VlnPlot(data, features = "LTB")

### This could actually be a better option to use as a marker for this cluster.
VlnPlot(data, features = "TPT1")

Automated Cell Type Annotation

scina_data <- as.data.frame(GetAssayData(data, layer = "data"))

signatures <- get(
    load(system.file("extdata", "example_signatures.RData", package = "SCINA"))
)

signatures

scina_results <- SCINA(
    scina_data,
    signatures,
    max_iter = 100,
    convergence_n = 10,
    convergence_rate = 0.999,
    sensitivity_cutoff = 0.9,
    rm_overlap = TRUE,
    allow_unknown = TRUE
)

data$scina_labels <- scina_results$cell_labels

### plot out the tsne spread coloured by the automatic annotation
DimPlot(
    data, reduction = "tsne", pt.size = 1, label = TRUE,
    group.by = "scina_labels", label.size = 5
)

### Relate this to the clusters which we automatically detected.
tibble(
    cluster = data$seurat_clusters,
    cell_type = data$scina_labels
) |>
    group_by(cluster, cell_type) |>
    count() |>
    group_by(cluster) |>
    mutate(
        percent = (100 * n) / sum(n)
    ) |>
    ungroup() |>
    mutate(
        cluster = paste("Cluster", cluster)
    ) |>
    ggplot(aes(x = "", y = percent, fill = cell_type)) +
    geom_col(width = 1) +
    coord_polar("y", start = 0) +
    facet_wrap(vars(cluster)) +
    theme(axis.text.x = element_blank()) +
    xlab(NULL) +
    ylab(NULL)

### Colouring by genes
### Some of these genes very specifically isolate to their own cluster, but for
### others we see expression which is more widely spread over a number of clusters
FeaturePlot(data, features = best_wilcox_gene_per_cluster, ncol = 3)

data@reductions$tsne@cell.embeddings[1:10, ]

# data@reductions$tsne@cell.embeddings[, ] |>
#     as_tibble(rownames = "barcode") |>
#     mutate(barcode = paste0(barcode, "-1")) |>
#     write_csv("for_loupe_import.csv")

Reference

Session info

Code

sessionInfo()