Visualization

There are many ways in which scRNAseq data can be visualized, ranging from how you demonstrate you want to show your QC to how you want to show gene expression. Here we are taking differentially expressed genes discovered by using logistic regression single cell models or DESeq2 pseduobulk models and looking at genes that overlap both results, using a ven diagram. Venn diagrams can be useful to observe overlapping information.

# Lets look at genes in a venn diagram'
LR.genes <- rownames(b.interferon.response)
DESeq2.genes <- rownames(b.interferon.response.aggr)

# Make a list
x <- list("LR.genes" = LR.genes, "DESeq2.genes" = DESeq2.genes)

# Make a Venn object
venn <- Venn(x)
data <- process_data(venn)
ggplot() +
  # 1. region count layer
  geom_sf(aes(fill = count), data = venn_region(data)) +
  # 2. set edge layer
  geom_sf(aes(color = name), data = venn_setedge(data), show.legend = TRUE, size = 2) +
  # 3. set label layer
  geom_sf_text(aes(label = name), data = venn_setlabel(data)) +
  # 4. region label layer
  geom_sf_label(aes(label = paste0(count, "(", scales::percent(count/sum(count), accuracy = 2), ")")), 
                data = venn_region(data),
                size = 3) +
  scale_fill_gradient(low = "white", high = "dodgerblue3") + # change color based on celltype
  # scale_color_manual(values = c("bmvsbf" = "black",
  #                               "mvsf" ="black", 
  #                               "wmvswf" = 'black'),
  # scale_color_manual(values = c("beta_m" = "black",
  #                               "beta_f" ="black",
  #                               "alpha_m" = 'black',
  #                               "alpha_f" = 'black')) +
  scale_color_manual(values = c("LR" = "black",
                                "DESeq2" ="black"),
                     labels = c('D' = 'D = bdiv_human')) +
  theme_void()
#OUTPUT

Note how 70% of genes overlap, but who are they? We can use information stored within the Venn object to identify overlaps. I would always default to pseudobulk gene testing, as in this manner differential count numbers across multiple datasets don’t affect your analysis, albeit this analysis can be very conservative. I personally prefer being conservative in the DE analysis.

# Look at all sets of genes forming overlaps
# https://github.com/yanlinlin82/ggvenn/issues/21
mylist <- data@region[["item"]]
names(mylist)
#OUTPUT
NULL

Mapping names onto data, and viewing lists. Note how 134 genes are overlapping both tests, whereas 52 are unique to LR and 4 are unique to DESeq2

names(mylist) <- data@region[["name"]]
mylist
#OUTPUT
$LR.genes
  [1] "RSAD2"    "OASL"     "PML"      "ETV7"     "DHX58"    "STAT2"    "DDX60"    "DEK"     
  [9] "ZBP1"     "ZNFX1"    "GSDMD"    "MTHFD2"   "HSP90AA1" "ELF1"     "B3GNT2"   "NDUFA9"  
 [17] "LACTB"    "AP3B1"    "DTX3L"    "TMEM140"  "GBP7"     "GLRX"     "SP140"    "TMX1"    
 [25] "BAG1"     "TAP2.1"   "ZNF267"   "CNDP2"    "STAP1"    "CD53"     "HIF1A"    "MYCBP2"  
 [33] "CHI3L2"   "GNB4"     "SYNE2"    "RARRES3"  "GPBP1"    "SNX6"     "CTSS"     "TANK"    
 [41] "IDH3A"    "SOD2"     "MCL1"     "CAST"     "HLA-F"    "EVL"      "RNASEH2B" "UBE2F"   
 [49] "ANXA2R"   "IRF2"     "SQRDL"    "TSPAN13"
$DESeq2.genes
[1] "HLA-B" "H3F3B" "HLA-C" "LAMP3"
$LR.genes..DESeq2.genes
  [1] "ISG15"     "ISG20"     "IFIT3"     "IFI6"      "IFIT1"     "MX1"       "TNFSF10"   "LY6E"     
  [9] "IFIT2"     "B2M"       "CXCL10"    "PLSCR1"    "IRF7"      "HERC5"     "UBE2L6"    "IFI44L"   
 [17] "EPSTI1"    "OAS1"      "GBP1"      "IFITM2"    "SAMD9L"    "NT5C3A"    "IFI35"     "PSMB9"    
 [25] "MX2"       "DYNLT1"    "BST2"      "IFITM3"    "CMPK2"     "SAT1"      "EIF2AK2"   "PPM1K"    
 [33] "GBP4"      "DDX58"     "PSMA2.1"   "LAP3"      "SAMD9"     "XAF1"      "IFI16"     "COX5A"    
 [41] "SOCS1"     "MYL12A"    "SP110"     "PARP14"    "PSME2"     "TMSB10"    "CHST12"    "FBXO6"    
 [49] "MT2A"      "PLAC8"     "TRIM22"    "DRAP1"     "SUB1"      "TNFSF13B"  "NMI"       "XRN1"     
 [57] "NEXN"      "RBCK1"     "CLEC2D"    "MNDA"      "RNF213"    "IFI44"     "GBP5"      "NPC2"     
 [65] "STAT1"     "WARS"      "OAS2"      "SELL"      "TAP1"      "DDX60L"    "IRF8"      "OAS3"     
 [73] "RTCB"      "IFITM1"    "KIAA0040"  "CXCL11"    "CARD16"    "PSMA4"     "DNAJA1"    "IFIH1"    
 [81] "TYMP"      "HLA-E"     "LGALS9"    "NUB1"      "C19orf66"  "GBP2"      "PSMB8"     "GNG5"     
 [89] "HAPLN3"    "PMAIP1"    "IFIT5"     "PARP9"     "CD38"      "GMPR"      "C5orf56"   "EAF2"     
 [97] "HERC6"     "CD48"      "RTP4"      "RABGAP1L"  "USP30-AS1" "TREX1"     "IGFBP4"    "INPP1"    
[105] "CCL8"      "CREM"      "CD164"     "CLIC1"     "APOL6"     "CCL2"      "SMCHD1"    "ODF2L"    
[113] "EHD4"      "NAPA"      "SP100"     "PHF11"     "FAM46A"    "PNPT1"     "ADAR"      "POMP"     
[121] "UNC93B1"   "DCK"       "IRF1"      "CCR7"      "TMEM123"   "CASP4"     "HSH2D"     "SLFN5"    
[129] "CFLAR"     "CASP1"     "VAMP5"     "PSME1"     "XBP1"      "MRPL44" 

Another intuitive way in which we can visualize data is via a dotplot. Dotplots are great because you can visualize information across multiple samples for some select genes.

# Visualization via dotplot
Idents(pbmc) <- factor(Idents(pbmc), levels = c("Mono/Mk Doublets", "pDC", "Eryth", 
                                                "Mk", "DC", "CD14 Mono", "CD16 Mono", 
                                                "B Activated", "B", "CD8 T", "NK", "T activated",
                                                "CD4 Naive T", "CD4 Memory T"))
markers.to.plot <- c("CD3D", "CREM", "HSPH1", "SELL", "GIMAP5", "CACYBP", "GNLY", "NKG7", "CCL5",
                     "CD8A", "MS4A1", "CD79A", "MIR155HG", "NME1", "FCGR3A", "VMO1", "CCL2", "S100A9", "HLA-DQA1",
                     "GPR183", "PPBP", "GNG11", "HBA2", "HBB", "TSPAN13", "IL3RA", "IGJ")
Idents(pbmc) <- "celltype"
DotPlot(pbmc, features = markers.to.plot, cols = c("blue", "red"), dot.scale = 8, split.by = "stim") +
  RotatedAxis()
#OUTPUT

Some other cool tricks, involve changing the color organization in the UMAP plot, here just change the color sequence in the “cols” command to adjust the colors you want to visualize. Note how the colors match the sequence of clusters. If you want to learn more about colors in R look at this site for more info. Another way to select colors is using hexadecimal codes which can be picked by using this website, just replace the color names with your hexadecimal code of choice.

# Change colors on UMAP
DimPlot(pbmc, #switch here to plot
        #split.by = "Diabetes Status", 
        group.by = "celltype", 
        label = FALSE, 
        ncol = 1, 
        raster = FALSE,
        pt.size = 0.05,
        cols = c("dodgerblue3",      
                 "turquoise2",       
                 "lightseagreen",    
                 "darkseagreen2",    
                 "khaki2",            
                 "springgreen4",     
                 "chartreuse3",      
                 "burlywood3",       
                 "darkorange2",      
                 "salmon3",          
                 "orange",           
                 "salmon",           
                 "red",              
                 "magenta3",         
                 "orchid1",          
                 "red4",             
                 "grey30"            
        )
)
#OUTPUT

We can also look at cellular numbers across various cell types. These plots can be useful to add to QC, or if you are interested in looking at the diversity of your dataset.

# Look at cell proportion
pbmc$donor.stim <- paste(pbmc$donor, pbmc$stim, sep = "_")
table(pbmc$donor.stim)
#OUTPUT

     B Activated_CTRL      B Activated_STIM                B_CTRL                B_STIM        CD14 Mono_CTRL 
                  183                   207                   405                   565                  2209 
       CD14 Mono_STIM        CD16 Mono_CTRL        CD16 Mono_STIM     CD4 Memory T_CTRL     CD4 Memory T_STIM 
                 2114                   518                   546                   813                   903 
     CD4 Naive T_CTRL      CD4 Naive T_STIM            CD8 T_CTRL            CD8 T_STIM               DC_CTRL 
                 1003                  1475                   320                   466                   226 
              DC_STIM            Eryth_CTRL            Eryth_STIM               Mk_CTRL               Mk_STIM 
                  194                    22                    33                    98                   122 
Mono/Mk Doublets_CTRL Mono/Mk Doublets_STIM               NK_CTRL               NK_STIM              pDC_CTRL 
                   42                    28                   312                   333                    51 
             pDC_STIM      T activated_CTRL      T activated_STIM 
                   77                   315                   343
# Let's make a bar plot of cellular proportion
dittoBarPlot(pbmc, "celltype", 
             retain.factor.levels = TRUE,
             scale = "percent",
             color.panel = c("dodgerblue3",      
                             "turquoise2",       
                             "lightseagreen",   
                             "darkseagreen2",   
                             "khaki2",           
                             "springgreen4",     
                             "chartreuse3",      
                             "burlywood3",      
                             "darkorange2",      
                             "salmon3",         
                             "orange",           
                             "salmon",           
                             "red",              
                             "magenta3",         
                             "orchid1",          
                             "red4",             
                             "grey30"),          
             group.by = "donor.stim") + coord_flip()
#OUTPUT

Similarly, we can look at the total cell number.

# Or even cellular number
dittoBarPlot(pbmc, "celltype", 
             retain.factor.levels = TRUE,
             scale = "count",
             color.panel = c("dodgerblue3",      
                             "turquoise2",       
                             "lightseagreen",   
                             "darkseagreen2",   
                             "khaki2",           
                             "springgreen4",     
                             "chartreuse3",      
                             "burlywood3",      
                             "darkorange2",      
                             "salmon3",         
                             "orange",           
                             "salmon",           
                             "red",              
                             "magenta3",         
                             "orchid1",          
                             "red4",             
                             "grey30"),          
             group.by = "donor.stim") + coord_flip()
#OUTPUT

Heatmaps are really intuitive methods to look at a variety of genes, this customized heatmap is based on the dittoHeatmap package. Note how we are looking at the pseudobulked data to generate a heatmap.

# Heatmap
label_genes <- c("CD3D", "CREM", "HSPH1", "SELL", "GIMAP5", "CACYBP", "GNLY", "NKG7", "CCL5",
                 "CD8A", "MS4A1", "CD79A", "MIR155HG", "NME1", "FCGR3A", "VMO1", "CCL2", "S100A9", "HLA-DQA1",
                 "GPR183", "PPBP", "GNG11", "HBA2", "HBB", "TSPAN13", "IL3RA", "IGJ")
genes.to.plot <- pbmc@assays[["RNA"]]@var.features
dittoHeatmap(
  combined_pbmc,
  genes = genes.to.plot,
  # metas = NULL,
  # cells.use = NULL,
  annot.by = c("celltype", "donor", "stim"),
  #annot.by = c("lib", "sex", "source"),
  order.by = c("celltype"),
  # main = NA,
  # cell.names.meta = NULL,
  # assay = .default_assay(object),
  # slot = .default_slot(object),
  # swap.rownames = NULL,
  heatmap.colors = colorRampPalette(c("dodgerblue", "white", "red3"))(50),
  breaks=seq(-2, 2, length.out=50),
  scaled.to.max = FALSE,
  # heatmap.colors.max.scaled = colorRampPalette(c("white", "red"))(25),
  # annot.colors = c(dittoColors(), dittoColors(1)[seq_len(7)]),
  # annotation_col = NULL,
  annotation_colors = list(celltype = c("CD14 Mono" = "salmon3",
                                        "CD4 Naive T" = "orange",
                                        "CD4 Memory T"= "lightseagreen",
                                        "CD16 Mono" = "dodgerblue3",
                                        "B" = "turquoise2",
                                        "CD8 T" = "burlywood3",
                                        "T activated" = "darkseagreen2",
                                        "NK" = "chartreuse3",
                                        "DC" = "darkorange2",
                                        "B Activated" = "red",
                                        "Mk" = "khaki2",
                                        "pDC" = "springgreen4",
                                        "Mono/Mk Doublets" = "orchid1",
                                        "Eryth" = "magenta3"),
                           donor = c("d1" = "dodgerblue",
                                     "d2" = "red2",
                                     "d3" = "green3",
                                     "d4" = "purple2"),
                           stim = c("CTRL" = "red4",
                                    "STIM" = "deepskyblue3")),
                           # ancestry = c("white" = "deepskyblue3",
                           #              "black" = "black",
                           #              "hispanic" = "darkorange"),
                           # source = c("nPOD" = "dodgerblue",
                           #            "Tulane" = "springgreen4",         
                           #            "UPENN" = "red4")),
  # # data.out = FALSE,
  # highlight.features = NULL,
  # highlight.genes = NULL,
  # show_colnames = isBulk(object),
  # show_rownames = TRUE,
  # scale = "row",
  #cluster_cols = TRUE,
  # border_color = NA,
  # legend_breaks = NA,
  # drop_levels = FALSE,
  # breaks = NA,
  # complex = FALSE
  #gaps_col = c(460),
  complex = TRUE,
  use_raster = TRUE,
  raster_quality = 5
) + rowAnnotation(mark = anno_mark(at = match(label_genes, 
                                              rownames(pbmc[genes.to.plot,])), 
                                   labels = label_genes, 
                                   which = "row",
                                   labels_gp = list(cex=1),
                                   #link_width = unit(4, "mm"), link_height = unit(4, "mm"),
                                   padding = 0.1))
#OUTPUT
It seems you are using RStudio IDE. `anno_mark()` needs to work with the physical size of the graphics
device. It only generates correct plot in the figure panel, while in the zoomed plot (by clicking the icon
'Zoom') or in the exported plot (by clicking the icon 'Export'), the connection to heatmap rows/columns might
be wrong. You can directly use e.g. pdf() to save the plot into a file.

Use `ht_opt$message = FALSE` to turn off this message.
Warning message:
It not suggested to both set `scale` and `breaks`. It makes the function confused.