---
title: "Guided tutorial to COTAN V.2"
author: 
- name: "Silvia Giulia Galfrè"
  affiliation: "Department of Computer Science, University of Pisa"

package: COTAN
output: 
    BiocStyle::html_document
vignette: >
  %\VignetteIndexEntry{Guided tutorial to COTAN V.2}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8} 
---

## Preamble

```{r, include = FALSE}
knitr::opts_chunk[["set"]](
  collapse = TRUE,
  comment = "#>",
  dev = "png",
  fig.width = 6L,
  fig.height = 4L,
  dpi = 72L
)

```

```{r message=FALSE, warning=FALSE}
library(COTAN)
library(zeallot)
library(rlang)
library(data.table)
library(Rtsne)
library(qpdf)
library(GEOquery)
library(ComplexHeatmap)

options(parallelly.fork.enable = TRUE)
```

## Introduction

This tutorial covers the same functionality as the first release of the
COTAN tutorial, but uses the new and updated functions.

## Get the data-set

Download the data-set for `"mouse cortex E17.5"`.

```{r eval=TRUE, include=TRUE}
dataDir <- tempdir()

GEO <- "GSM2861514"
fName <- "GSM2861514_E175_Only_Cortical_Cells_DGE.txt.gz"

dataSetFile <- file.path(dataDir, GEO, fName)

dir.create(file.path(dataDir, GEO), showWarnings = FALSE)

if (!file.exists(dataSetFile)) {
  getGEOSuppFiles(GEO, makeDirectory = TRUE,
                  baseDir = dataDir, fetch_files = TRUE,
                  filter_regex = fName)
}

sample.dataset <- read.csv(dataSetFile, sep = "\t", row.names = 1L)
```

Define a directory where the output will be stored.

```{r}
outDir <- dataDir

# Log-level 2 was chosen to better showcase how the package works
# In normal usage a level of 0 or 1 is more appropriate
setLoggingLevel(2L)

# This file will contain all the logs produced by the package
# as if at the highest logging level
setLoggingFile(file.path(outDir, "vignette_v2.log"))

message("COTAN uses the `torch` library when asked to `optimizeForSpeed`")
message("Run the command 'options(COTAN.UseTorch = FALSE)'",
        " in your session to disable `torch` completely!")

# this command checks whether the torch library is properly installed
c(useTorch, deviceStr) %<-% COTAN:::canUseTorch(TRUE, "cuda")
if (useTorch) {
  message("The `torch` library is available and ready to use")
  if (deviceStr == "cuda") {
    message("The `torch` library can use the `CUDA` GPU")
  } else {
    message("The `torch` library can only use the CPU")
    message("Please ensure you have the `OpenBLAS` libraries",
            " installed on the system")
  }
}

rm(useTorch, deviceStr)
```

# Analytical pipeline

Initialize the `COTAN` object with the row count table and
the metadata for the experiment.

```{r}
cond <- "mouse_cortex_E17.5"

obj <- COTAN(raw = sample.dataset)
obj <- initializeMetaDataset(obj,
                             GEO = GEO,
                             sequencingMethod = "Drop_seq",
                             sampleCondition = cond)

logThis(paste0("Condition ", getMetadataElement(obj, datasetTags()[["cond"]])),
        logLevel = 1L)
```

Before we proceed to the analysis, we need to clean the data.
The analysis will use a matrix of raw `UMI` counts as the input.
To obtain this matrix, we have to remove any potential cell doublets or
multiplets, as well as any low quality or dying cells.

## Data cleaning

We can check the library size (`UMI` number) with an *Empirical Cumulative
Distribution* function

```{r}
plot(ECDPlot(obj))
```

```{r}
plot(cellSizePlot(obj))
```

```{r}
plot(genesSizePlot(obj))
```

```{r}
plot(scatterPlot(obj))
```

During the cleaning, every time we want to remove cells or genes
we can use the `dropGenesCells()` function.

Drop cells with too many reads, as they are probably doublets or multiplets.
```{r}
cellsSizeThr <- 6000L
obj <- addElementToMetaDataset(obj, "Cells size threshold", cellsSizeThr)

cells_to_rem <- getCells(obj)[getCellsSize(obj) > cellsSizeThr]
obj <- dropGenesCells(obj, cells = cells_to_rem)

plot(cellSizePlot(obj))
```

To drop cells by the number of expressed genes: a high gene count might also
indicate doublets...

```{r}
genesSizeHighThr <- 3000L
obj <- addElementToMetaDataset(obj, "Num genes high threshold",
                               genesSizeHighThr)

cells_to_rem <- getCells(obj)[getNumExpressedGenes(obj) > genesSizeHighThr]
obj <- dropGenesCells(obj, cells = cells_to_rem)

plot(genesSizePlot(obj))
```

Drop cells expressing too few genes, as they are probably dead.

```{r}
genesSizeLowThr <- 500L
obj <- addElementToMetaDataset(obj, "Num genes low threshold", genesSizeLowThr)

cells_to_rem <- getCells(obj)[getNumExpressedGenes(obj) < genesSizeLowThr]
obj <- dropGenesCells(obj, cells = cells_to_rem)

plot(genesSizePlot(obj))
```


Cells with a too high percentage of mitochondrial genes are likely dead
(or at least problematic) cells, so we drop them!

```{r}
c(mitPlot, mitSizes) %<-% mitochondrialPercentagePlot(obj, genePrefix = "^Mt")

plot(mitPlot)
```

```{r}
mitPercThr <- 1.0
obj <- addElementToMetaDataset(obj, "Mitoc. perc. threshold", mitPercThr)

cells_to_rem <- rownames(mitSizes)[mitSizes[["mit.percentage"]] > mitPercThr]
obj <- dropGenesCells(obj, cells = cells_to_rem)

c(mitPlot, mitSizes) %<-% mitochondrialPercentagePlot(obj, genePrefix = "^Mt")

plot(mitPlot)
```


If we do not want to consider the mitochondrial genes we can remove them
before starting the analysis.

```{r}
genes_to_rem <- getGenes(obj)[grep("^Mt", getGenes(obj))]
cells_to_rem <- getCells(obj)[which(getCellsSize(obj) == 0L)]

obj <- dropGenesCells(obj, genes_to_rem, cells_to_rem)
```

We want also to log the current status.

```{r}
logThis(paste("n cells", getNumCells(obj)), logLevel = 1L)
```

The `clean()` function estimates all the parameters for the data. Therefore,
we have to run it again every time we remove any genes or cells from the data.

```{r}
obj <- addElementToMetaDataset(obj, "Num drop B group", 0L)

obj <- clean(obj)

c(pcaCellsPlot, pcaCellsData, genesPlot,
  UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj)

plot(pcaCellsPlot)
```

```{r}
plot(genesPlot)
```

We can observe here that the red cells are really enriched in hemoglobin
genes, so we prefer to remove them. They can be extracted from the
`pcaCellsData` object and then removed.

```{r eval=TRUE, include=TRUE}
cells_to_rem <- rownames(pcaCellsData)[pcaCellsData[["groups"]] == "B"]
obj <- dropGenesCells(obj, cells = cells_to_rem)

obj <- addElementToMetaDataset(obj, "Num drop B group", 1L)

obj <- clean(obj)

c(pcaCellsPlot, pcaCellsData, genesPlot,
  UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj)

plot(pcaCellsPlot)
```

To color the PCA based on `nu` (i.e. the cells' efficiency):

```{r}
plot(UDEPlot)
```

The `UDE` (the color) should not correlate with the principal components:
this is very important!

The next part is used to remove the cells with a too low efficiency.

```{r}
plot(nuPlot)
```

We can zoom in on the smallest values and, if a clear elbow is visible,
we can decide to remove the corresponding cells.

```{r}
plot(zoomedNuPlot)
```

We also save the defined threshold in the metadata and re-run the estimation

```{r}
UDELowThr <- 0.30
obj <- addElementToMetaDataset(obj, "Low UDE cells' threshold", UDELowThr)

obj <- addElementToMetaDataset(obj, "Num drop B group", 2L)

obj <- estimateNuLinear(obj)

cells_to_rem <- getCells(obj)[getNu(obj) < UDELowThr]
obj <- dropGenesCells(obj, cells = cells_to_rem)
```

Repeat the estimation after the cells are removed

```{r}
obj <- clean(obj)

c(pcaCellsPlot, pcaCellsData, genesPlot,
  UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj)

plot(pcaCellsPlot)
```

```{r}
plot(scatterPlot(obj))
```

```{r}
logThis(paste("n cells", getNumCells(obj)), logLevel = 1L)
```

## COTAN analysis

In this part, all the contingency tables are computed
and used to obtain the statistics needed to evaluate and store the `COEX`

```{r}
obj <- proceedToCoex(obj, calcCoex = TRUE,
                     optimizeForSpeed = TRUE, cores = 6L, deviceStr = "cuda",
                     saveObj = FALSE, outDir = outDir)
```

If `saveObj = TRUE` was used in the previous step, this next step can be
skipped, as the `COTAN` object has already been saved in the `outDir`.

```{r eval=FALSE, include=TRUE}
# saving the structure
saveRDS(obj, file = file.path(outDir, paste0(cond, ".cotan.RDS")))
```
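
Should the R session be restarted later, the saved object can be reloaded with
base R's `readRDS()` (a minimal sketch, assuming the `RDS` file above was
actually written):

```{r eval=FALSE, include=TRUE}
# reload a previously saved COTAN object (assumes the RDS file exists)
obj <- readRDS(file.path(outDir, paste0(cond, ".cotan.RDS")))
```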

## Automatic run

It is also possible to run a single function directly,
if we do not need any custom cleaning.

```{r eval=FALSE, include=TRUE}
obj2 <- automaticCOTANObjectCreation(
  raw = sample.dataset,
  GEO = GEO,
  sequencingMethod = "Drop_seq",
  sampleCondition = cond,
  calcCoex = TRUE, cores = 6L, optimizeForSpeed = TRUE,
  saveObj = TRUE, outDir = outDir)
```

# Analysis of the elaborated data

## `GDI`

To calculate and store the Global Differentiation Index (`GDI`) we can run:

```{r}
gdiDF <- calculateGDI(obj)

head(gdiDF)

# This will store only the $GDI column
obj <- storeGDI(obj, genesGDI = gdiDF)
```

The next function can either plot the `GDI` for the dataset directly or
use the pre-computed dataframe.

It marks the given threshold, here 1.40 (in red), and
the 50% and 75% quantiles (in blue).
We can also specify some gene sets (three in this case) that
we want to label explicitly in the `GDI` plot.

```{r}
genesList <- list(
  "NPGs" = c("Nes", "Vim", "Sox2", "Sox1", "Notch1", "Hes1", "Hes5", "Pax6"),
  "PNGs" = c("Map2", "Tubb3", "Neurod1", "Nefm", "Nefl", "Dcx", "Tbr1"),
  "hk"   = c("Calm1", "Cox6b1", "Ppia", "Rpl18", "Cox7c", "Erh", "H3f3a",
             "Taf1", "Taf2", "Gapdh", "Actb", "Golph3", "Zfr", "Sub1",
             "Tars", "Amacr")
)

GDIPlot(obj, cond = cond, genes = genesList, GDIThreshold = 1.40)
```

The third column of the `GDI` data-frame above reports the percentage of
cells expressing the corresponding gene.
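
As a quick check, one can sort this data-frame by the `GDI` column (the only
column name relied upon here; see the `storeGDI()` call above) and count how
many genes lie above the plotted threshold. This is just an illustrative
sketch:

```{r eval=FALSE, include=TRUE}
# genes sorted by decreasing GDI value
topGDI <- gdiDF[order(gdiDF[["GDI"]], decreasing = TRUE), , drop = FALSE]
head(topGDI, 10L)

# number of genes above the threshold marked in the plot
sum(gdiDF[["GDI"]] >= 1.40)
```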

## Heatmaps

To perform the Gene Pair Analysis, we can plot a heatmap of the `COEX` values
between two gene sets.
We have to define the different gene sets in a list (here we reuse
`genesList`) and pass it to the `genesLists` parameter.
Then we can choose which sets to use through the function's parameters
(for example, sets 1 to 3), and we can also provide the file-name prefixes
for each condition (for example, "mouse_cortex_E17.5"):
in fact, this function can plot genes' relationships across many different
conditions to get a complete overview.

```{r}
plot(heatmapPlot(obj, genesLists = genesList))
```

We can also plot a general heatmap of `COEX` values based on some markers like
the following one.

```{r, eval=TRUE, include=TRUE}
invisible(
  genesHeatmapPlot(obj, primaryMarkers = c("Satb2", "Bcl11b", "Vim", "Hes1"),
                   pValueThreshold = 0.001, symmetric = TRUE))
```

```{r, eval=FALSE, include=TRUE}
invisible(
  genesHeatmapPlot(obj, primaryMarkers = c("Satb2", "Bcl11b", "Fezf2"),
                   secondaryMarkers = c("Gabra3", "Meg3", "Cux1", "Neurod6"),
                   pValueThreshold = 0.001, symmetric = FALSE))
```

## Get data tables

Sometimes we can also be interested in the numbers present directly in the
contingency tables for two specific genes. To get them we can use two functions:

`contingencyTables()` to produce the observed and expected data

```{r}
print("Contingency Tables:")
contingencyTables(obj, g1 = "Satb2", g2 = "Bcl11b")

print("Corresponding Coex")
getGenesCoex(obj)["Satb2", "Bcl11b"]
```
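
If one prefers to keep the two tables in variables rather than printing them,
they can be captured with `zeallot` as done elsewhere in this vignette. This
sketch assumes the function returns the observed and the expected table as a
two-element list, in that order:

```{r eval=FALSE, include=TRUE}
# capture the contingency tables instead of printing them
# (assumed order: observed first, then expected)
c(observedCT, expectedCT) %<-% contingencyTables(obj, g1 = "Satb2",
                                                 g2 = "Bcl11b")
observedCT
```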

Another useful function is `getGenesCoex()`. This can be used to extract
the whole or a partial `COEX` matrix from a `COTAN` object.

```{r}
# For the whole matrix
coex <- getGenesCoex(obj, zeroDiagonal = FALSE)
coex[1000L:1005L, 1000L:1005L]
```

```{r}
# For a partial matrix
coex <- getGenesCoex(obj, genes = c("Satb2", "Bcl11b", "Fezf2"))
coex[1000L:1005L, ]
```

## Establishing genes' clusters

`COTAN` provides a way to establish genes' clusters given some lists of markers

```{r eval=TRUE, include=TRUE}
layersGenes <- list(
  "L1"   = c("Reln",   "Lhx5"),
  "L2/3" = c("Satb2",  "Cux1"),
  "L4"   = c("Rorb",   "Sox5"),
  "L5/6" = c("Bcl11b", "Fezf2"),
  "Prog" = c("Vim",    "Hes1", "Dummy")
)
c(gSpace, eigPlot, pcaGenesClDF, treePlot) %<-%
  establishGenesClusters(obj, groupMarkers = layersGenes,
                         numGenesPerMarker = 25L, kCuts = 5L)

plot(eigPlot)
```

```{r eval=TRUE, include=TRUE}
plot(treePlot)
```

```{r eval=TRUE, include=TRUE}
colSelection <- vapply(pcaGenesClDF, is.numeric, logical(1L))
genesUmapPl <- UMAPPlot(pcaGenesClDF[, colSelection, drop = FALSE],
                        clusters = getColumnFromDF(pcaGenesClDF, "hclust"),
                        elements = layersGenes,
                        title = "Genes' clusters UMAP Plot",
                        numNeighbors = 32L, minPointsDist = 0.25)

plot(genesUmapPl)
```

## Uniform Clustering

It is possible to obtain a cell clusterization based on the concept of
*uniformity* of expression of the genes across the cells.
That is, each cluster satisfies the null hypothesis of the `COTAN` model:
the genes' expression does not depend on the cell in consideration.

There are two functions involved in obtaining a proper clusterization:
the first is `cellsUniformClustering()`, which uses standard clustering
methods, but then discards and re-clusters any *non-uniform* cluster.

Please note that the most important parameters for the users are the
`GDIThreshold`s inside the **Uniform Transcript** checkers: they define how
strict the check is. A default-constructed advanced check gives a fairly
strong guarantee of uniformity for the *cluster*.

```{r eval=FALSE, include=TRUE}
# This code is a little too computationally heavy to be used in an example
# So we stored the result and we can load it in the next section

# default constructed checker is OK
advChecker <- new("AdvancedGDIUniformityCheck")

c(splitClusters, splitCoexDF) %<-%
  cellsUniformClustering(obj, initialResolution = 0.8, checker = advChecker,
                         optimizeForSpeed = TRUE, deviceStr = "cuda",
                         cores = 6L, genesSel = "HGDI",
                         saveObj = TRUE, outDir = outDir)

obj <- addClusterization(obj, clName = "split",
                         clusters = splitClusters, coexDF = splitCoexDF)

table(splitClusters)
```

If one already has an existing *clusterization*, it is possible to
calculate its *DEA* `data.frame` and add it to the `COTAN` object.
```{r eval=TRUE, include=TRUE}
data("vignette.split.clusters", package = "COTAN")
splitClusters <- vignette.split.clusters[getCells(obj)]

splitCoexDF <- DEAOnClusters(obj, clusters = splitClusters)

obj <- addClusterization(obj, clName = "split", clusters = splitClusters,
                         coexDF = splitCoexDF, override = FALSE)
```

It is possible to have some statistics about the *clusterization*
```{r eval=TRUE, include=TRUE}
c(summaryData, summaryPlot) %<-%
  clustersSummaryPlot(obj, clName = "split", plotTitle = "split summary")

summaryData
```

The `ExpGenes` column contains the number of genes that are expressed in any
cell of the relevant *cluster*, while the `ExpGenes25` column contains the
number of genes expressed in at least 25% of the cells of the relevant
*cluster*.
```{r eval=TRUE, include=TRUE}
plot(summaryPlot)
```


It is possible to visualize how relevant the *marker genes'* lists are with
respect to the given *clusterization*
```{r eval=TRUE, include=TRUE}
c(splitHeatmap, scoreDF, pValueDF) %<-%
  clustersMarkersHeatmapPlot(obj, groupMarkers = layersGenes,
                             clName = "split", kCuts = 5L,
                             adjustmentMethod = "holm")

draw(splitHeatmap)
```


Since the algorithm that creates the *clusters* is not directly geared to
achieve cluster uniformity, there might be some *clusters* that can be merged
back together and still be **uniform**.

This is the purpose of the function `mergeUniformCellsClusters()`, which takes
a *clusterization* and tries to merge cluster pairs after checking that,
taken together, the pair forms a uniform cluster.

In order to avoid running all the possible checks (their number can explode
quickly with the number of *clusters*), the function relies on a distance
between *clusters* to find the pairs that have the highest chance of being
merged.

```{r eval=FALSE, include=TRUE}
c(mergedClusters, mergedCoexDF) %<-%
  mergeUniformCellsClusters(obj, clusters = splitClusters,
                            checkers = advChecker,
                            optimizeForSpeed = TRUE, deviceStr = "cuda",
                            cores = 6L, saveObj = TRUE, outDir = outDir)

# merges are:
#  1 <- 06 + 07
#  2 <- '-1' + 08 + 11
#  3 <- 09
#  4 <- 01
#  5 <- 02
#  6 <- 12
#  7 <- 03 + 04 + 10 + 13
#  8 <- 05

obj <- addClusterization(obj, clName = "merge", override = TRUE,
                         clusters = mergedClusters, coexDF = mergedCoexDF)

table(mergedClusters)
```

Again, the most important parameters for the users are the `GDIThreshold`s
inside the **Uniform Transcript** checkers: they define how strict the check
is. A default-constructed advanced check gives a fairly strong guarantee of
uniformity for the *cluster*. If one wants to achieve a more *relaxed* merge,
it is possible to define a sequence of checkers, each less stringent than the
previous one, that will be applied sequentially: first all the merges passing
the strictest checker are performed, then those satisfying the second, and
so on.

```{r eval=FALSE, include=TRUE}

checkersList <- list(advChecker,
                     shiftCheckerThresholds(advChecker, 0.01),
                     shiftCheckerThresholds(advChecker, 0.03))

prevCheckRes <- data.frame()

# In this case we want to re-use the already calculated merge checks,
# so we reload them from the output files. This, along with a similar facility
# for the split method, is also useful in case the execution is interrupted
# prematurely...
#
if (TRUE) {
  # read from the last file among those named all_check_results_XX.csv
  mergeDir <- file.path(outDir, cond, "leafs_merge")
  resFiles <- list.files(path = mergeDir, pattern = "all_check_results_.*csv",
                         full.names = TRUE)

  prevCheckRes <- read.csv(resFiles[length(resFiles)],
                           header = TRUE, row.names = 1L)
}

c(mergedClusters2, mergedCoexDF2) %<-%
  mergeUniformCellsClusters(obj, clusters = splitClusters,
                            checkers = checkersList,
                            allCheckResults = prevCheckRes,
                            optimizeForSpeed = TRUE, deviceStr = "cuda",
                            cores = 6L, saveObj = TRUE, outDir = outDir)

# merges are:
#  1 <- '-1' + 06 + 09
#  2 <- 07
#  3 <- 03  + 04 + 10 + 13
#  4 <- 12
#  5 <- 01 + 08 + 11
#  6 <- 02 + 05

obj <- addClusterization(obj, clName = "merge2", override = TRUE,
                         clusters = mergedClusters2, coexDF = mergedCoexDF2)

table(mergedClusters2)
```


If one already has existing *clusterizations*, it is possible to
calculate their *DEA* `data.frame`s and add them to the `COTAN` object.
```{r eval=TRUE, include=TRUE}
data("vignette.merge.clusters", package = "COTAN")
mergedClusters <- vignette.merge.clusters[getCells(obj)]

mergedCoexDF <- DEAOnClusters(obj, clusters = mergedClusters)

obj <- addClusterization(obj, clName = "merge", clusters = mergedClusters,
                         coexDF = mergedCoexDF, override = FALSE)

data("vignette.merge2.clusters", package = "COTAN")
mergedClusters2 <- vignette.merge2.clusters[getCells(obj)]

mergedCoexDF2 <- DEAOnClusters(obj, clusters = mergedClusters2)

obj <- addClusterization(obj, clName = "merge2", clusters = mergedClusters2,
                         coexDF = mergedCoexDF2, override = FALSE)

table(mergedClusters2, mergedClusters)
```


Here it is possible to visualize how the `split` clusters are merged into
`merge` and `merge2`

```{r eval=TRUE, include=TRUE}
c(mergeHeatmap, mergeScoresDF, mergePValuesDF) %<-%
  clustersMarkersHeatmapPlot(obj, clName = "merge", condNameList = "split",
                             conditionsList = list(splitClusters))
draw(mergeHeatmap)

c(merge2Heatmap, merge2ScoresDF, merge2PValuesDF) %<-%
  clustersMarkersHeatmapPlot(obj, clName = "merge2", condNameList = "split",
                             conditionsList = list(splitClusters))
draw(merge2Heatmap)
```

In the following graph, it is possible to check that the found clusters align
very well with the expression of the layers' genes defined above...

It is possible to plot the *clusterization* on a `UMAP` plot

```{r}
c(umapPlot, cellsRDM) %<-% cellsUMAPPlot(obj, clName = "merge2",
                                         clusters = NULL,
                                         useCoexEigen = TRUE,
                                         dataMethod = "LogLikelihood",
                                         numComp = 50L,
                                         genesSel = "HGDI",
                                         numGenes = 200L,
                                         colors = NULL,
                                         numNeighbors = 30L,
                                         minPointsDist = 0.3)

plot(umapPlot)
```


## Vignette clean-up stage

The next few lines are just for clean-up.

```{r}
if (file.exists(file.path(outDir, paste0(cond, ".cotan.RDS")))) {
  #Delete file if it exists
  file.remove(file.path(outDir, paste0(cond, ".cotan.RDS")))
}
if (file.exists(file.path(outDir, paste0(cond, "_times.csv")))) {
  #Delete file if it exists
  file.remove(file.path(outDir, paste0(cond, "_times.csv")))
}
if (dir.exists(file.path(outDir, cond))) {
  unlink(file.path(outDir, cond), recursive = TRUE)
}
if (dir.exists(file.path(outDir, GEO))) {
  unlink(file.path(outDir, GEO), recursive = TRUE)
}

# stop logging to file
setLoggingFile("")
file.remove(file.path(outDir, "vignette_v2.log"))

options(parallelly.fork.enable = FALSE)
```

```{r}
Sys.time()
```


```{r}
sessionInfo()
```