---
title: "Guided tutorial to COTAN V.2"
author:
- name: "Silvia Giulia Galfrè"
affiliation: "Department of Computer Science, University of Pisa"
package: COTAN
output:
BiocStyle::html_document
vignette: >
%\VignetteIndexEntry{Guided tutorial to COTAN V.2}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
## Preamble
```{r, include = FALSE}
knitr::opts_chunk[["set"]](
collapse = TRUE,
comment = "#>",
dev = "png",
fig.width = 6L,
fig.height = 4L,
dpi = 72L
)
```
```{r message=FALSE, warning=FALSE}
library(COTAN)
library(zeallot)
library(rlang)
library(data.table)
library(Rtsne)
library(qpdf)
library(GEOquery)
library(ComplexHeatmap)
options(parallelly.fork.enable = TRUE)
```
## Introduction
This tutorial contains the same functionalities as the first release of the
COTAN tutorial but done using the new and updated functions.
## Get the data-set
Download the data-set for `"mouse cortex E17.5"`.
```{r eval=TRUE, include=TRUE}
dataDir <- tempdir()
GEO <- "GSM2861514"
fName <- "GSM2861514_E175_Only_Cortical_Cells_DGE.txt.gz"
dataSetFile <- file.path(dataDir, GEO, fName)
dir.create(file.path(dataDir, GEO), showWarnings = FALSE)
if (!file.exists(dataSetFile)) {
getGEOSuppFiles(GEO, makeDirectory = TRUE,
baseDir = dataDir, fetch_files = TRUE,
filter_regex = fName)
}
sample.dataset <- read.csv(dataSetFile, sep = "\t", row.names = 1L)
```
Define a directory where the output will be stored.
```{r}
outDir <- dataDir
# Log-level 2 was chosen to showcase better how the package works
# In normal usage a level of 0 or 1 is more appropriate
setLoggingLevel(2L)
# This file will contain all the logs produced by the package
# as if at the highest logging level
setLoggingFile(file.path(outDir, "vignette_v2.log"))
message("COTAN uses the `torch` library when asked to `optimizeForSpeed`")
message("Run the command 'options(COTAN.UseTorch = FALSE)'",
" in your session to disable `torch` completely!")
# this command does check whether the torch library is properly installed
c(useTorch, deviceStr) %<-% COTAN:::canUseTorch(TRUE, "cuda")
if (useTorch) {
message("The `torch` library is available and ready to use")
if (deviceStr == "cuda") {
message("The `torch` library can use the `CUDA` GPU")
} else {
message("The `torch` library can only use the CPU")
message("Please ensure you have the `OpenBLAS` libraries",
" installed on the system")
}
}
rm(useTorch, deviceStr)
```
# Analytical pipeline
Initialize the `COTAN` object with the row count table and
the metadata for the experiment.
```{r}
cond <- "mouse_cortex_E17.5"
obj <- COTAN(raw = sample.dataset)
obj <- initializeMetaDataset(obj,
GEO = GEO,
sequencingMethod = "Drop_seq",
sampleCondition = cond)
logThis(paste0("Condition ", getMetadataElement(obj, datasetTags()[["cond"]])),
logLevel = 1L)
```
Before we proceed to the analysis, we need to clean the data.
The analysis will use a matrix of raw `UMI` counts as the input.
To obtain this matrix, we have to remove any potential cell doublets or
multiplets, as well as any low quality or dying cells.
## Data cleaning
We can check the library size (`UMI` number) with an *Empirical Cumulative
Distribution* function
```{r}
plot(ECDPlot(obj))
```
```{r}
plot(cellSizePlot(obj))
```
```{r}
plot(genesSizePlot(obj))
```
```{r}
plot(scatterPlot(obj))
```
During the cleaning, every time we want to remove cells or genes
we can use the `dropGenesCells()`function.
Drop cells with too many reads as they are probably doublets or multiplets
```{r}
cellsSizeThr <- 6000L
obj <- addElementToMetaDataset(obj, "Cells size threshold", cellsSizeThr)
cells_to_rem <- getCells(obj)[getCellsSize(obj) > cellsSizeThr]
obj <- dropGenesCells(obj, cells = cells_to_rem)
plot(cellSizePlot(obj))
```
To drop cells by gene number: high genes count might also indicate doublets...
```{r}
genesSizeHighThr <- 3000L
obj <- addElementToMetaDataset(obj, "Num genes high threshold",
genesSizeHighThr)
cells_to_rem <- getCells(obj)[getNumExpressedGenes(obj) > genesSizeHighThr]
obj <- dropGenesCells(obj, cells = cells_to_rem)
plot(genesSizePlot(obj))
```
Drop cells with too low genes expression as they are probably dead
```{r}
genesSizeLowThr <- 500L
obj <- addElementToMetaDataset(obj, "Num genes low threshold", genesSizeLowThr)
cells_to_rem <- getCells(obj)[getNumExpressedGenes(obj) < genesSizeLowThr]
obj <- dropGenesCells(obj, cells = cells_to_rem)
plot(genesSizePlot(obj))
```
Cells with a too high percentage of mitochondrial genes are
likely dead (or at the last problematic) cells. So we drop them!
```{r}
c(mitPlot, mitSizes) %<-% mitochondrialPercentagePlot(obj, genePrefix = "^Mt")
plot(mitPlot)
```
```{r}
mitPercThr <- 1.0
obj <- addElementToMetaDataset(obj, "Mitoc. perc. threshold", mitPercThr)
cells_to_rem <- rownames(mitSizes)[mitSizes[["mit.percentage"]] > mitPercThr]
obj <- dropGenesCells(obj, cells = cells_to_rem)
c(mitPlot, mitSizes) %<-% mitochondrialPercentagePlot(obj, genePrefix = "^Mt")
plot(mitPlot)
```
If we do not want to consider the mitochondrial genes we can remove them
before starting the analysis.
```{r}
genes_to_rem <- getGenes(obj)[grep("^Mt", getGenes(obj))]
cells_to_rem <- getCells(obj)[which(getCellsSize(obj) == 0L)]
obj <- dropGenesCells(obj, genes_to_rem, cells_to_rem)
```
We want also to log the current status.
```{r}
logThis(paste("n cells", getNumCells(obj)), logLevel = 1L)
```
The `clean()` function estimates all the parameters for the data. Therefore, we have to run it again every time we remove any genes or cells from the data.
```{r}
obj <- addElementToMetaDataset(obj, "Num drop B group", 0L)
obj <- clean(obj)
c(pcaCellsPlot, pcaCellsData, genesPlot,
UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj)
plot(pcaCellsPlot)
```
```{r}
plot(genesPlot)
```
We can observe here that the red cells are really enriched in hemoglobin genes so we prefer to remove them. They can be extracted from the `pcaCellsData` object and removed.
```{r eval=TRUE, include=TRUE}
cells_to_rem <- rownames(pcaCellsData)[pcaCellsData[["groups"]] == "B"]
obj <- dropGenesCells(obj, cells = cells_to_rem)
obj <- addElementToMetaDataset(obj, "Num drop B group", 1L)
obj <- clean(obj)
c(pcaCellsPlot, pcaCellsData, genesPlot,
UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj)
plot(pcaCellsPlot)
```
To color the PCA based on `nu` (so the cells' efficiency)
```{r}
plot(UDEPlot)
```
`UDE` (color) should not correlate with principal components! This is very important.
The next part is used to remove the cells with efficiency too low.
```{r}
plot(nuPlot)
```
We can zoom on the smallest values and, if COTAN detects a clear elbow,
we can decide to remove the cells.
```{r}
plot(zoomedNuPlot)
```
We also save the defined threshold in the metadata and re-run the estimation
```{r}
UDELowThr <- 0.30
obj <- addElementToMetaDataset(obj, "Low UDE cells' threshold", UDELowThr)
obj <- addElementToMetaDataset(obj, "Num drop B group", 2L)
obj <- estimateNuLinear(obj)
cells_to_rem <- getCells(obj)[getNu(obj) < UDELowThr]
obj <- dropGenesCells(obj, cells = cells_to_rem)
```
Repeat the estimation after the cells are removed
```{r}
obj <- clean(obj)
c(pcaCellsPlot, pcaCellsData, genesPlot,
UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj)
plot(pcaCellsPlot)
```
```{r}
plot(scatterPlot(obj))
```
```{r}
logThis(paste("n cells", getNumCells(obj)), logLevel = 1L)
```
## COTAN analysis
In this part, all the contingency tables are computed
and used to get the statistics necessary to `COEX` evaluation and storing
```{r}
obj <- proceedToCoex(obj, calcCoex = TRUE,
optimizeForSpeed = TRUE, cores = 6L, deviceStr = "cuda",
saveObj = FALSE, outDir = outDir)
```
When `saveObj == TRUE`, in the previous step, this step can be skipped
as the `COTAN` object has already been saved in the `outDir`.
```{r eval=FALSE, include=TRUE}
# saving the structure
saveRDS(obj, file = file.path(outDir, paste0(cond, ".cotan.RDS")))
```
## Automatic run
It is also possible to run directly a single function
if we don't want to clean anything.
```{r eval=FALSE, include=TRUE}
obj2 <- automaticCOTANObjectCreation(
raw = sample.dataset,
GEO = GEO,
sequencingMethod = "Drop_seq",
sampleCondition = cond,
calcCoex = TRUE, cores = 6L, optimizeForSpeed = TRUE,
saveObj = TRUE, outDir = outDir)
```
# Analysis of the elaborated data
## `GDI`
To calculate and store the Global Differentiation Index (`GDI`) we can run:
```{r}
gdiDF <- calculateGDI(obj)
head(gdiDF)
# This will store only the $GDI column
obj <- storeGDI(obj, genesGDI = gdiDF)
```
The next function can either plot the `GDI` for the dataset directly or
use the pre-computed dataframe.
It marks the given threshold 1.43 (in red) and
the 50% and 75% quantiles (in blue).
We can also specify some gene sets (three in this case) that
we want to label explicitly in the `GDI` plot.
```{r}
genesList <- list(
"NPGs" = c("Nes", "Vim", "Sox2", "Sox1", "Notch1", "Hes1", "Hes5", "Pax6"),
"PNGs" = c("Map2", "Tubb3", "Neurod1", "Nefm", "Nefl", "Dcx", "Tbr1"),
"hk" = c("Calm1", "Cox6b1", "Ppia", "Rpl18", "Cox7c", "Erh", "H3f3a",
"Taf1", "Taf2", "Gapdh", "Actb", "Golph3", "Zfr", "Sub1",
"Tars", "Amacr")
)
GDIPlot(obj, cond = cond, genes = genesList, GDIThreshold = 1.40)
```
The percentage of cells expressing the gene in the third column of this
data-frame is reported.
## Heatmaps
To perform the Gene Pair Analysis, we can plot a heatmap of the `COEX` values
between two gene sets.
We have to define the different gene sets (`list.genes`) in a list.
Then we can choose which sets to use in the function parameter sets
(for example, from 1 to 3).
We also have to provide an array of the file name prefixes for each condition
(for example, "mouse_cortex_E17.5").
In fact, this function can plot genes relationships across many different
conditions to get a complete overview.
```{r}
plot(heatmapPlot(obj, genesLists = genesList))
```
We can also plot a general heatmap of `COEX` values based on some markers like
the following one.
```{r, eval=TRUE, include=TRUE}
invisible(
genesHeatmapPlot(obj, primaryMarkers = c("Satb2", "Bcl11b", "Vim", "Hes1"),
pValueThreshold = 0.001, symmetric = TRUE))
```
```{r, eval=FALSE, include=TRUE}
invisible(
genesHeatmapPlot(obj, primaryMarkers = c("Satb2", "Bcl11b", "Fezf2"),
secondaryMarkers = c("Gabra3", "Meg3", "Cux1", "Neurod6"),
pValueThreshold = 0.001, symmetric = FALSE))
```
## Get data tables
Sometimes we can also be interested in the numbers present directly in the
contingency tables for two specific genes. To get them we can use two functions:
`contingencyTables()` to produce the observed and expected data
```{r}
print("Contingency Tables:")
contingencyTables(obj, g1 = "Satb2", g2 = "Bcl11b")
print("Corresponding Coex")
getGenesCoex(obj)["Satb2", "Bcl11b"]
```
Another useful function is `getGenesCoex()`. This can be used to extract
the whole or a partial `COEX` matrix from a `COTAN` object.
```{r}
# For the whole matrix
coex <- getGenesCoex(obj, zeroDiagonal = FALSE)
coex[1000L:1005L, 1000L:1005L]
```
```{r}
# For a partial matrix
coex <- getGenesCoex(obj, genes = c("Satb2", "Bcl11b", "Fezf2"))
coex[1000L:1005L, ]
```
## Establishing genes' clusters
`COTAN` provides a way to establish genes' clusters given some lists of markers
```{r eval=TRUE, include=TRUE}
layersGenes <- list(
"L1" = c("Reln", "Lhx5"),
"L2/3" = c("Satb2", "Cux1"),
"L4" = c("Rorb", "Sox5"),
"L5/6" = c("Bcl11b", "Fezf2"),
"Prog" = c("Vim", "Hes1", "Dummy")
)
c(gSpace, eigPlot, pcaGenesClDF, treePlot) %<-%
establishGenesClusters(obj, groupMarkers = layersGenes,
numGenesPerMarker = 25L, kCuts = 5L)
plot(eigPlot)
```
```{r eval=TRUE, include=TRUE}
plot(treePlot)
```
```{r eval=TRUE, include=TRUE}
colSelection <- vapply(pcaGenesClDF, is.numeric, logical(1L))
genesUmapPl <- UMAPPlot(pcaGenesClDF[, colSelection, drop = FALSE],
clusters = getColumnFromDF(pcaGenesClDF, "hclust"),
elements = layersGenes,
title = "Genes' clusters UMAP Plot",
numNeighbors = 32L, minPointsDist = 0.25)
plot(genesUmapPl)
```
## Uniform Clustering
It is possible to obtain a cell clusterization based on the concept of
*uniformity* of expression of the genes across the cells.
That is the cluster satisfies the null hypothesis of the `COTAN` model:
the genes expression is not dependent on the cell in consideration.
There are two functions involved into obtaining a proper clusterization:
the first is `cellsUniformClustering` that uses standard tools clusterization
methods, but then discards and re-clusters any *non-uniform* cluster.
Please note that the most important parameters for the users are the
`GDIThreshold`s inside the **Uniform Transcript** checkers: they define how
strict is the check. Default constructed advance check gives a pretty strong
guarantee of uniformity for the *cluster*.
```{r eval=FALSE, include=TRUE}
# This code is a little too computationally heavy to be used in an example
# So we stored the result and we can load it in the next section
# default constructed checker is OK
advChecker <- new("AdvancedGDIUniformityCheck")
c(splitClusters, splitCoexDF) %<-%
cellsUniformClustering(obj, initialResolution = 0.8, checker = advChecker,
optimizeForSpeed = TRUE, deviceStr = "cuda",
cores = 6L, genesSel = "HGDI",
saveObj = TRUE, outDir = outDir)
obj <- addClusterization(obj, clName = "split",
clusters = splitClusters, coexDF = splitCoexDF)
table(splitClusters)
```
In the case one already has an existing *clusterization*, it is possible to
calculate the *DEA* `data.frame` and add it to the `COTAN` object.
```{r eval=TRUE, include=TRUE}
data("vignette.split.clusters", package = "COTAN")
splitClusters <- vignette.split.clusters[getCells(obj)]
splitCoexDF <- DEAOnClusters(obj, clusters = splitClusters)
obj <- addClusterization(obj, clName = "split", clusters = splitClusters,
coexDF = splitCoexDF, override = FALSE)
```
It is possible to have some statistics about the *clusterization*
```{r eval=TRUE, include=TRUE}
c(summaryData, summaryPlot) %<-%
clustersSummaryPlot(obj, clName = "split", plotTitle = "split summary")
summaryData
```
The `ExpGenes` column contains the number of genes that are expressed in any
cell of the relevant *cluster*, while the `ExpGenes25` column contains the number of genes expressed in at the least 25% of the cells of the relevant *cluster*
```{r eval=TRUE, include=TRUE}
plot(summaryPlot)
```
It is possible to visualize how relevant are the *marker genes'* `lists` with
respect to the given *clusterization*
```{r eval=TRUE, include=TRUE}
c(splitHeatmap, scoreDF, pValueDF) %<-%
clustersMarkersHeatmapPlot(obj, groupMarkers = layersGenes,
clName = "split", kCuts = 5L,
adjustmentMethod = "holm")
draw(splitHeatmap)
```
Since the algorithm that creates the *clusters* is not directly geared to
achieve cluster uniformity, there might be some *clusters* that can be merged
back together and still be **uniform**.
This is the purpose of the function `mergeUniformCellsClusters` that takes a
*clusterization* and tries to merge cluster pairs after checking that together
the pair forms a uniform cluster.
In order to avoid running the totality of the possible checks (as it can explode
quickly with the number of *clusters*), the function relies on a *related*
distance the find the cluster pairs that have the highest chance to be merged.
```{r eval=FALSE, include=TRUE}
c(mergedClusters, mergedCoexDF) %<-%
mergeUniformCellsClusters(obj, clusters = splitClusters,
checkers = advChecker,
optimizeForSpeed = TRUE, deviceStr = "cuda",
cores = 6L, saveObj = TRUE, outDir = outDir)
# merges are:
# 1 <- 06 + 07
# 2 <- '-1' + 08 + 11
# 3 <- 09
# 4 <- 01
# 5 <- 02
# 6 <- 12
# 7 <- 03 + 04 + 10 + 13
# 8 <- 05
obj <- addClusterization(obj, clName = "merge", override = TRUE,
clusters = mergedClusters, coexDF = mergedCoexDF)
table(mergedClusters)
```
Again, here, the most important parameters for the users are the `GDIThreshold`s
inside the **Uniform Transcript** checkers: they define how strict is the
check. Default constructed advance check gives a pretty strong guarantee of
uniformity for the *cluster*. If one wants to achieve a more *relaxed* merge,
it is possible to define a sequence of checkers, each less stringent than the
previous, that will be applied sequentially: First all the merges with the
stricter checker are performed, than those that satisfy the second, and so on.
```{r eval=FALSE, include=TRUE}
checkersList <- list(advChecker,
shiftCheckerThresholds(advChecker, 0.01),
shiftCheckerThresholds(advChecker, 0.03))
prevCheckRes <- data.frame()
# In this case we want to re-use the already calculated merge checks
# Se we reload them from the output files. This, along a similar facility for
# the split method, is also useful in the cases the execution is interrupted
# prematurely...
#
if (TRUE) {
# read from the last file among those named all_check_results_XX.csv
mergeDir <- file.path(outDir, cond, "leafs_merge")
resFiles <- list.files(path = mergeDir, pattern = "all_check_results_.*csv",
full.names = TRUE)
prevCheckRes <- read.csv(resFiles[length(resFiles)],
header = TRUE, row.names = 1L)
}
c(mergedClusters2, mergedCoexDF2) %<-%
mergeUniformCellsClusters(obj, clusters = splitClusters,
checkers = checkersList,
allCheckResults = prevCheckRes,
optimizeForSpeed = TRUE, deviceStr = "cuda",
cores = 6L, saveObj = TRUE, outDir = outDir)
# merges are:
# 1 <- '-1' + 06 + 09
# 2 <- 07
# 3 <- 03 + 04 + 10 + 13
# 4 <- 12
# 5 <- 01 + 08 + 11
# 6 <- 02 + 05
obj <- addClusterization(obj, clName = "merge2", override = TRUE,
clusters = mergedClusters2, coexDF = mergedCoexDF2)
table(mergedClusters2)
```
In the case one already has existing *clusterizations*, it is possible to
calculate their *DEA* `data.frame` and add them to the `COTAN` object.
```{r eval=TRUE, include=TRUE}
data("vignette.merge.clusters", package = "COTAN")
mergedClusters <- vignette.merge.clusters[getCells(obj)]
mergedCoexDF <- DEAOnClusters(obj, clusters = mergedClusters)
obj <- addClusterization(obj, clName = "merge", clusters = mergedClusters,
coexDF = mergedCoexDF, override = FALSE)
data("vignette.merge2.clusters", package = "COTAN")
mergedClusters2 <- vignette.merge2.clusters[getCells(obj)]
mergedCoexDF2 <- DEAOnClusters(obj, clusters = mergedClusters2)
obj <- addClusterization(obj, clName = "merge2", clusters = mergedClusters2,
coexDF = mergedCoexDF2, override = FALSE)
table(mergedClusters2, mergedClusters)
```
Here is possible to visualize how the `split` clusters are merged in to
`merge` and `merge2`
```{r eval=TRUE, include=TRUE}
c(mergeHeatmap, mergeScoresDF, mergePValuesDF) %<-%
clustersMarkersHeatmapPlot(obj, clName = "merge", condNameList = "split",
conditionsList = list(splitClusters))
draw(mergeHeatmap)
c(merge2Heatmap, merge2ScoresDF, merge2PValuesDF) %<-%
clustersMarkersHeatmapPlot(obj, clName = "merge2", condNameList = "split",
conditionsList = list(splitClusters))
draw(merge2Heatmap)
```
In the following graph, it is possible to check that the found clusters align
very well to the expression of the layers' genes defined above...
It is possible to plot the *clusterization* on a `UMAP` plot
```{r}
c(umapPlot, cellsRDM) %<-% cellsUMAPPlot(obj, clName = "merge2",
clusters = NULL,
useCoexEigen = TRUE,
dataMethod = "LogLikelihood",
numComp = 50L,
genesSel = "HGDI",
numGenes = 200L,
colors = NULL,
numNeighbors = 30L,
minPointsDist = 0.3)
plot(umapPlot)
```
## Vignette clean-up stage
The next few lines are just to clean.
```{r}
if (file.exists(file.path(outDir, paste0(cond, ".cotan.RDS")))) {
#Delete file if it exists
file.remove(file.path(outDir, paste0(cond, ".cotan.RDS")))
}
if (file.exists(file.path(outDir, paste0(cond, "_times.csv")))) {
#Delete file if it exists
file.remove(file.path(outDir, paste0(cond, "_times.csv")))
}
if (dir.exists(file.path(outDir, cond))) {
unlink(file.path(outDir, cond), recursive = TRUE)
}
if (dir.exists(file.path(outDir, GEO))) {
unlink(file.path(outDir, GEO), recursive = TRUE)
}
# stop logging to file
setLoggingFile("")
file.remove(file.path(outDir, "vignette_v2.log"))
options(parallelly.fork.enable = FALSE)
```
```{r}
Sys.time()
```
```{r}
sessionInfo()
```