## ----options, include=FALSE, echo=FALSE--------------------------------------- library(BiocStyle) knitr::opts_chunk$set(eval=TRUE, warning=FALSE, error=FALSE, message=FALSE) ## ----eval=FALSE--------------------------------------------------------------- # if (!require("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # # BiocManager::install("ClusterFoldSimilarity") ## ----setup-------------------------------------------------------------------- library(ClusterFoldSimilarity) ## ----construct---------------------------------------------------------------- library(Seurat) library(scRNAseq) library(dplyr) # Human pancreatic single cell data 1 pancreasMuraro <- scRNAseq::MuraroPancreasData(ensembl=FALSE) pancreasMuraro <- pancreasMuraro[,rownames(colData(pancreasMuraro)[!is.na(colData(pancreasMuraro)$label),])] colData(pancreasMuraro)$cell.type <- colData(pancreasMuraro)$label rownames(pancreasMuraro) <- make.names(unlist(lapply(strsplit(rownames(pancreasMuraro), split="__"), function(x)x[[1]])), unique = TRUE) singlecell1Seurat <- CreateSeuratObject(counts=counts(pancreasMuraro), meta.data=as.data.frame(colData(pancreasMuraro))) ## ----echo=FALSE--------------------------------------------------------------- knitr::kable( table(colData(pancreasMuraro)$label), caption = 'Cell-types on pancreas dataset from Muraro et al.', align = "c" ) ## ----------------------------------------------------------------------------- # Human pancreatic single cell data 2 pancreasBaron <- scRNAseq::BaronPancreasData(which="human", ensembl=FALSE) colData(pancreasBaron)$cell.type <- colData(pancreasBaron)$label rownames(pancreasBaron) <- make.names(rownames(pancreasBaron), unique = TRUE) singlecell2Seurat <- CreateSeuratObject(counts=counts(pancreasBaron), meta.data=as.data.frame(colData(pancreasBaron))) ## ----echo=FALSE--------------------------------------------------------------- knitr::kable( table(colData(pancreasBaron)$label), caption = 'Cell-types on pancreas dataset from Baron et al.', align = "c" ) ## ----results='hide'----------------------------------------------------------- # Create a list with the unprocessed single-cell datasets singlecellObjectList <- list(singlecell1Seurat, singlecell2Seurat) # Apply the same processing to each dataset and return a list of single-cell analysis singlecellObjectList <- lapply(X=singlecellObjectList, FUN=function(scObject){ scObject <- NormalizeData(scObject) scObject <- FindVariableFeatures(scObject, selection.method="vst", nfeatures=2000) scObject <- ScaleData(scObject, features=VariableFeatures(scObject)) scObject <- RunPCA(scObject, features=VariableFeatures(object=scObject)) scObject <- FindNeighbors(scObject, dims=seq(16)) scObject <- FindClusters(scObject, resolution=0.4) }) ## ----fig.wide = TRUE---------------------------------------------------------- # Assign cell-type annotated from the original study to the cell labels: Idents(singlecellObjectList[[1]]) <- factor(singlecellObjectList[[1]][[]][, "cell.type"]) library(ClusterFoldSimilarity) similarityTable <- clusterFoldSimilarity(scList=singlecellObjectList, sampleNames=c("human", "humanNA"), topN=1, nSubsampling=24) ## ----echo=FALSE--------------------------------------------------------------- knitr::kable( head(similarityTable), caption = 'A table of the first 10 rows of the similarity results.', align = "c", format = "html", table.attr = "style='width:70%;'") %>% kableExtra::kable_styling(font_size=10) ## ----------------------------------------------------------------------------- typeCount <- singlecellObjectList[[2]][[]] %>% group_by(seurat_clusters) %>% count(cell.type) %>% arrange(desc(n), .by_group = TRUE) %>% filter(row_number()==1) ## ----echo=FALSE--------------------------------------------------------------- knitr::kable( cbind.data.frame(typeCount, matched.type=rep(table(typeCount$seurat_clusters), x=similarityTable[similarityTable$datasetL == "humanNA",]$clusterR)), caption = 'Cell-type label matching on pancreas single-cell data.', align = "c" ) ## ----fig.wide = TRUE---------------------------------------------------------- # Retrieve the top 3 similar cluster for each of the clusters: similarityTable3Top <- clusterFoldSimilarity(scList=singlecellObjectList, topN=3, sampleNames=c("human", "humanNA"), nSubsampling=24) ## ----echo=FALSE--------------------------------------------------------------- knitr::kable( head(similarityTable3Top), caption = 'Similarity results showing the top 3 most similar clusters.', align = "c", format = "html", table.attr = "style='width:70%;'") %>% kableExtra::kable_styling(font_size=10) ## ----fig.wide = TRUE---------------------------------------------------------- # Retrieve the top 5 features that contribute the most to the similarity between each pair of clusters: similarityTable5TopFeatures <- clusterFoldSimilarity(scList=singlecellObjectList, topNFeatures=5, nSubsampling=24) ## ----echo=FALSE--------------------------------------------------------------- knitr::kable( head(similarityTable5TopFeatures, n=10), caption = 'Similarity results showing the top 5 features that most contributed to the similarity.', align = "c", format = "html", table.attr = "style='width:70%;'") %>% kableExtra::kable_styling(font_size=10) ## ----fig.wide = TRUE, out.height="400px"------------------------------------- similarityTableAllValues <- clusterFoldSimilarity(scList=singlecellObjectList, sampleNames=c("human", "humanNA"), topN=Inf) dim(similarityTableAllValues) ## ----------------------------------------------------------------------------- library(dplyr) dataset1 <- "human" dataset2 <- "humanNA" similarityTable2 <- similarityTableAllValues %>% filter(datasetL == dataset1 & datasetR == dataset2) %>% arrange(desc(as.numeric(clusterL)), as.numeric(clusterR)) cls <- unique(similarityTable2$clusterL) cls2 <- unique(similarityTable2$clusterR) similarityMatrixAll <- t(matrix(similarityTable2$similarityValue, ncol=length(unique(similarityTable2$clusterL)))) rownames(similarityMatrixAll) <- cls colnames(similarityMatrixAll) <- cls2 ## ----echo=FALSE--------------------------------------------------------------- knitr::kable( similarityMatrixAll, caption = 'A 2 datasets Similarity matrix.', digits = 2, align = "c", format = "html", table.attr = "style='width:60%;'") %>% kableExtra::kable_styling(font_size=8) ## ----------------------------------------------------------------------------- # Mouse pancreatic single cell data pancreasBaronMM <- scRNAseq::BaronPancreasData(which="mouse", ensembl=FALSE) ## ----echo=FALSE--------------------------------------------------------------- knitr::kable( table(colData(pancreasBaronMM)$label), caption = 'Cell-types on pancreas dataset from Baron et al.', align = "c" ) ## ----------------------------------------------------------------------------- colData(pancreasBaronMM)$cell.type <- colData(pancreasBaronMM)$label # Translate mouse gene ids to human ids # *for the sake of simplicity we are going to transform to uppercase all mouse gene names rownames(pancreasBaronMM) <- make.names(toupper(rownames(pancreasBaronMM)), unique=TRUE) # Create seurat object singlecell3Seurat <- CreateSeuratObject(counts=counts(pancreasBaronMM), meta.data=as.data.frame(colData(pancreasBaronMM))) # We append the single-cell object to our list singlecellObjectList[[3]] <- singlecell3Seurat ## ----results='hide'----------------------------------------------------------- scObject <- singlecellObjectList[[3]] scObject <- NormalizeData(scObject) scObject <- FindVariableFeatures(scObject, selection.method="vst", nfeatures=2000) scObject <- ScaleData(scObject, features=VariableFeatures(scObject)) scObject <- RunPCA(scObject, features=VariableFeatures(object=scObject)) scObject <- FindNeighbors(scObject, dims=seq(16)) scObject <- FindClusters(scObject, resolution=0.4) singlecellObjectList[[3]] <- scObject ## ----fig.wide = TRUE---------------------------------------------------------- # We use the cell labels as a second reference, but we can also use the cluster labels if our interest is to match clusters Idents(singlecellObjectList[[3]]) <- factor(singlecellObjectList[[3]][[]][,"cell.type"]) # We subset the most variable genes in each experiment singlecellObjectListVariable <- lapply(singlecellObjectList, function(x){x[VariableFeatures(x),]}) # Setting the number of CPUs with BiocParallel: BiocParallel::register(BPPARAM = BiocParallel::MulticoreParam(workers = 6)) similarityTableHumanMouse <- clusterFoldSimilarity(scList=singlecellObjectListVariable, sampleNames=c("human", "humanNA", "mouse"), topN=1, nSubsampling=24, parallel=TRUE) ## ----fig.wide = TRUE, out.height="400px"------------------------------------- similarityTableHumanMouseAll <- clusterFoldSimilarity(scList=singlecellObjectListVariable, sampleNames=c("human", "humanNA", "mouse"), topN=Inf, nSubsampling=24, parallel=TRUE) ## ----fig.wide = TRUE, out.height="400px"------------------------------------- ClusterFoldSimilarity::similarityHeatmap(similarityTable=similarityTableHumanMouseAll, mainDataset="humanNA") ## ----fig.wide = TRUE, out.height="400px"------------------------------------- # Turn-off the highlighting: ClusterFoldSimilarity::similarityHeatmap(similarityTable=similarityTableHumanMouseAll, mainDataset="humanNA", highlightTop=FALSE) ## ----------------------------------------------------------------------------- sessionInfo()