---
title: "Comparison with base R"
author: "Stefano Mangiola"
date: "`r Sys.Date()`"
package: tidybulk
output:
  BiocStyle::html_document:
    toc_float: true
vignette: >
  %\VignetteEngine{knitr::knitr}
  %\VignetteIndexEntry{Comparison with base R}
  %\usepackage[UTF-8]{inputenc}
---



<style type="text/css">
.r {
  max-width: 2800px;!important
  margin-left: auto;
  margin-right: auto;
  padding=0px;!important 
  margin-right=10px;!important 
}
pre{
	padding-left:20px;
}
.column-left{
  float: left;
  width: 50%;
  text-align: left;
}
.column-right{
  float: left;
  width: 50%;
  text-align: left;
}
</style>



 <!-- badges: start -->
  [![Lifecycle:maturing](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://www.tidyverse.org/lifecycle/#maturing)
  <!-- badges: end -->

<!---

[![Build Status](https://travis-ci.org/stemangiola/tidybulk.svg?branch=master)](https://travis-ci.org/stemangiola/tidybulk) [![Coverage Status](https://coveralls.io/repos/github/stemangiola/tidybulk/badge.svg?branch=master)](https://coveralls.io/github/stemangiola/tidybulk?branch=master)

-->


```{r, echo=FALSE, include=FALSE, }
library(knitr)
knitr::opts_chunk$set(cache = TRUE, warning = FALSE,
                      message = FALSE, cache.lazy = FALSE)

library(dplyr)
library(tidyr)
library(tibble)
library(magrittr)
library(ggplot2)
library(ggrepel)
library(tidybulk)
library(tidySummarizedExperiment)


my_theme = 	
	theme_bw() +
	theme(
		panel.border = element_blank(),
		axis.line = element_line(),
		panel.grid.major = element_line(size = 0.2),
		panel.grid.minor = element_line(size = 0.1),
		text = element_text(size=12),
		legend.position="bottom",
		aspect.ratio=1,
		strip.background = element_blank(),
		axis.title.x  = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)),
		axis.title.y  = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10))
	)

tibble_counts = tidybulk::counts_SE %>% tidybulk() %>% as_tibble()

```

In this article we show some examples of the differences in coding between tidybulk/tidyverse and base R. We noted a decrease > 10x of assignments and a decrease of > 2x of line numbers.

## Create `tidybulk` tibble. 

```{r}
tt = se_mini
```

## Aggregate duplicated `transcripts`

<div class="column-left">

Tidy transcriptomics
```{r aggregate, cache=TRUE, message=FALSE, warning=FALSE, results='hide', class.source='yellow'}
rowData(tt)$gene_name = rownames(tt)
tt.aggr = tt %>% aggregate_duplicates(.transcript = gene_name)
```
</div>
<div class="column-right">
Base R
```{r aggregate long, eval=FALSE}
temp = data.frame(
	symbol = dge_list$genes$symbol,
	dge_list$counts
)
dge_list.nr <- by(temp,	temp$symbol,
	function(df)
		if(length(df[1,1])>0)
			matrixStats:::colSums(as.matrix(df[,-1]))
)
dge_list.nr <- do.call("rbind", dge_list.nr)
colnames(dge_list.nr) <- colnames(dge_list)
```
</div>
<div style="clear:both;"></div>

## Scale `counts`

<div class="column-left">
Tidy transcriptomics
```{r normalise, cache=TRUE}
tt.norm = tt.aggr %>% identify_abundant(factor_of_interest = condition) %>% scale_abundance()
```

</div>
<div class="column-right">
Base R
```{r normalise long, eval=FALSE}
library(edgeR)

dgList <- DGEList(count_m=x,group=group)
keep <- filterByExpr(dgList)
dgList <- dgList[keep,,keep.lib.sizes=FALSE]
[...]
dgList <- calcNormFactors(dgList, method="TMM")
norm_counts.table <- cpm(dgList)
```
</div>
<div style="clear:both;"></div>

## Filter `variable transcripts`

We may want to identify and filter variable transcripts.

<div class="column-left">
Tidy transcriptomics
```{r filter variable, cache=TRUE}
tt.norm.variable = tt.norm %>% keep_variable()
```
</div>
<div class="column-right">
Base R
```{r filter variable long, eval=FALSE}
library(edgeR)

x = norm_counts.table

s <- rowMeans((x-rowMeans(x))^2)
o <- order(s,decreasing=TRUE)
x <- x[o[1L:top],,drop=FALSE]

norm_counts.table = norm_counts.table[rownames(x)]

norm_counts.table$cell_type = tibble_counts[
	match(
		tibble_counts$sample,
		rownames(norm_counts.table)
	),
	"Cell type"
]
```

</div>
<div style="clear:both;"></div>


## Reduce `dimensions`

<div class="column-left">
Tidy transcriptomics
```{r mds, cache=TRUE}
tt.norm.MDS =
  tt.norm %>%
  reduce_dimensions(method="MDS", .dims = 2)

```
</div>
<div class="column-right">
Base R
```{r, eval = FALSE}
library(limma)

count_m_log = log(count_m + 1)
cmds = limma::plotMDS(ndim = .dims, plot = FALSE)

cmds = cmds %$%	
	cmdscale.out %>%
	setNames(sprintf("Dim%s", 1:6))

cmds$cell_type = tibble_counts[
	match(tibble_counts$sample, rownames(cmds)),
	"Cell type"
]
```
</div>
<div style="clear:both;"></div>


### PCA
<div class="column-left">
Tidy transcriptomics
```{r pca, cache=TRUE, message=FALSE, warning=FALSE, results='hide'}
tt.norm.PCA =
  tt.norm %>%
  reduce_dimensions(method="PCA", .dims = 2)
```
</div>
<div class="column-right">
Base R
```{r,eval=FALSE}
count_m_log = log(count_m + 1)
pc = count_m_log %>% prcomp(scale = TRUE)
variance = pc$sdev^2
variance = (variance / sum(variance))[1:6]
pc$cell_type = counts[
	match(counts$sample, rownames(pc)),
	"Cell type"
]
```
</div>
<div style="clear:both;"></div>

### tSNE
<div class="column-left">
Tidy transcriptomics
```{r tsne, cache=TRUE, message=FALSE, warning=FALSE, results='hide'}
tt.norm.tSNE =
	breast_tcga_mini_SE %>%
	tidybulk(		sample, ens, count_scaled) %>%
	identify_abundant() %>%
	reduce_dimensions(
		method = "tSNE",
		perplexity=10,
		pca_scale =TRUE
	)
```


</div>
<div class="column-right">
Base R
```{r, eval=FALSE}
count_m_log = log(count_m + 1)

tsne = Rtsne::Rtsne(
	t(count_m_log),
	perplexity=10,
		pca_scale =TRUE
)$Y
tsne$cell_type = tibble_counts[
	match(tibble_counts$sample, rownames(tsne)),
	"Cell type"
]
```
</div>
<div style="clear:both;"></div>


## Rotate `dimensions`

<div class="column-left">
Tidy transcriptomics
```{r rotate, cache=TRUE}
tt.norm.MDS.rotated =
  tt.norm.MDS %>%
	rotate_dimensions(`Dim1`, `Dim2`, rotation_degrees = 45, action="get")
```
</div>
<div class="column-right">
Base R
```{r, eval=FALSE}
rotation = function(m, d) {
	r = d * pi / 180
	((bind_rows(
		c(`1` = cos(r), `2` = -sin(r)),
		c(`1` = sin(r), `2` = cos(r))
	) %>% as_matrix) %*% m)
}
mds_r = pca %>% rotation(rotation_degrees)
mds_r$cell_type = counts[
	match(counts$sample, rownames(mds_r)),
	"Cell type"
]
```
</div>
<div style="clear:both;"></div>

## Test `differential abundance`

<div class="column-left">
Tidy transcriptomics
```{r de, cache=TRUE, message=FALSE, warning=FALSE, results='hide'}
tt.de =
	tt %>%
	test_differential_abundance( ~ condition, action="get")
tt.de
```
</div>
<div class="column-right">
Base R
```{r, eval=FALSE}
library(edgeR)

dgList <- DGEList(counts=counts_m,group=group)
keep <- filterByExpr(dgList)
dgList <- dgList[keep,,keep.lib.sizes=FALSE]
dgList <- calcNormFactors(dgList)
design <- model.matrix(~group)
dgList <- estimateDisp(dgList,design)
fit <- glmQLFit(dgList,design)
qlf <- glmQLFTest(fit,coef=2)
topTags(qlf, n=Inf)
```
</div>
<div style="clear:both;"></div>

## Adjust `counts`

<div class="column-left">
Tidy transcriptomics
```{r adjust, cache=TRUE, message=FALSE, warning=FALSE, results='hide'}
tt.norm.adj =
	tt.norm %>% adjust_abundance(	~ condition + time)

```
</div>
<div class="column-right">
Base R
```{r, eval=FALSE}
library(sva)

count_m_log = log(count_m + 1)

design =
		model.matrix(
			object = ~ condition + time,
			data = annotation
		)

count_m_log.sva =
	ComBat(
			batch =	design[,2],
			mod = design,
			...
		)

count_m_log.sva = ceiling(exp(count_m_log.sva) -1)
count_m_log.sva$cell_type = counts[
	match(counts$sample, rownames(count_m_log.sva)),
	"Cell type"
]

```
</div>
<div style="clear:both;"></div>

## Deconvolve `Cell type composition`


<div class="column-left">
Tidy transcriptomics
```{r cibersort, cache=TRUE, eval=FALSE}
tt.cibersort =
	tt %>%
	deconvolve_cellularity(action="get", cores=1)

```
</div>
<div class="column-right">
Base R
```{r, eval=FALSE}

source(‘CIBERSORT.R’)
count_m %>% write.table("mixture_file.txt")
results <- CIBERSORT(
	"sig_matrix_file.txt",
	"mixture_file.txt",
	perm=100, QN=TRUE
)
results$cell_type = tibble_counts[
	match(tibble_counts$sample, rownames(results)),
	"Cell type"
]

```
</div>
<div style="clear:both;"></div>

## Cluster `samples`

### k-means

<div class="column-left">
Tidy transcriptomics
```{r cluster, cache=TRUE}
tt.norm.cluster = tt.norm.MDS %>%
  cluster_elements(method="kmeans",	centers = 2, action="get" )
```
</div>
<div class="column-right">
Base R
```{r, eval=FALSE}
count_m_log = log(count_m + 1)

k = kmeans(count_m_log, iter.max = 1000, ...)
cluster = k$cluster

cluster$cell_type = tibble_counts[
	match(tibble_counts$sample, rownames(cluster)),
	c("Cell type", "Dim1", "Dim2")
]

```
</div>
<div style="clear:both;"></div>

### SNN

Matrix package (v1.3-3) causes an error with Seurat::FindNeighbors used in this method. We are trying to solve this issue. At the moment this option in unaviable.

<div class="column-left">
Tidy transcriptomics
```{r SNN, eval=FALSE, message=FALSE, warning=FALSE, results='hide'}
tt.norm.SNN =
	tt.norm.tSNE %>%
	cluster_elements(method = "SNN")
```
</div>
<div class="column-right">
Base R
```{r, eval=FALSE}
library(Seurat)

snn = CreateSeuratObject(count_m)
snn = ScaleData(
	snn, display.progress = TRUE,
	num.cores=4, do.par = TRUE
)
snn = FindVariableFeatures(snn, selection.method = "vst")
snn = FindVariableFeatures(snn, selection.method = "vst")
snn = RunPCA(snn, npcs = 30)
snn = FindNeighbors(snn)
snn = FindClusters(snn, method = "igraph", ...)
snn = snn[["seurat_clusters"]]

snn$cell_type = tibble_counts[
	match(tibble_counts$sample, rownames(snn)),
	c("Cell type", "Dim1", "Dim2")
]

```
</div>
<div style="clear:both;"></div>

## Drop `redundant` transcripts

<div class="column-left">
Tidy transcriptomics

```{r drop, cache=TRUE}
tt.norm.non_redundant =
	tt.norm.MDS %>%
  remove_redundancy(	method = "correlation" )
```
</div>
<div class="column-right">
Base R
```{r, eval=FALSE}
library(widyr)

.data.correlated =
	pairwise_cor(
		counts,
		sample,
		transcript,
		rc,
		sort = TRUE,
		diag = FALSE,
		upper = FALSE
	) %>%
	filter(correlation > correlation_threshold) %>%
	distinct(item1) %>%
	rename(!!.element := item1)

# Return non redundant data frame
counts %>% anti_join(.data.correlated) %>%
	spread(sample, rc, - transcript) %>%
	left_join(annotation)



```
</div>
<div style="clear:both;"></div>

## Draw `heatmap`

<div class="column-left">
tidytranscriptomics
```{r heat, eval=FALSE}
tt.norm.MDS %>%

  # filter lowly abundant
  keep_abundant() %>%

  # extract 500 most variable genes
  keep_variable( .abundance = count_scaled, top = 500) %>%

  # create heatmap
  heatmap(sample, transcript, count_scaled, transform = log1p) %>%
	add_tile(`Cell type`) 
```
</div>
<div class="column-right">
Base R
```{r, eval=FALSE}
# Example taken from airway dataset from BioC2020 workshop. 
dgList <- SE2DGEList(airway)
group <- factor(dgList$samples$`Cell type`)
keep.exprs <- filterByExpr(dgList, group=group)
dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE]
dgList <- calcNormFactors(dgList)
logcounts <- cpm(dgList, log=TRUE)
var_genes <- apply(logcounts, 1, var)
select_var <- names(sort(var_genes, decreasing=TRUE))[1:500]
highly_variable_lcpm <- logcounts[select_var,]
colours <- c("#440154FF", "#21908CFF", "#fefada" )
col.group <- c("red","grey")[group]
gplots::heatmap.2(highly_variable_lcpm, col=colours, trace="none", ColSideColors=col.group, scale="row")
```
</div>
<div style="clear:both;"></div>

## Draw `density plot`

<div class="column-left">
tidytranscriptomics
```{r density, eval=FALSE}
# Example taken from airway dataset from BioC2020 workshop. 
airway %>%
    tidybulk() %>%
	  identify_abundant() %>%
    scale_abundance() %>%
    pivot_longer(cols = starts_with("counts"), names_to = "source", values_to = "abundance") %>%
    filter(!lowly_abundant) %>%
    ggplot(aes(x=abundance + 1, color=sample)) +
    geom_density() +
    facet_wrap(~source) +
    scale_x_log10() 
```
</div>
<div class="column-right">
Base R
```{r, eval=FALSE}
# Example taken from airway dataset from BioC2020 workshop. 
dgList <- SE2DGEList(airway)
group <- factor(dgList$samples$dex)
keep.exprs <- filterByExpr(dgList, group=group)
dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE]
dgList <- calcNormFactors(dgList)
logcounts <- cpm(dgList, log=TRUE)
var_genes <- apply(logcounts, 1, var)
select_var <- names(sort(var_genes, decreasing=TRUE))[1:500]
highly_variable_lcpm <- logcounts[select_var,]
colours <- c("#440154FF", "#21908CFF", "#fefada" )
col.group <- c("red","grey")[group]
gplots::heatmap.2(highly_variable_lcpm, col=colours, trace="none", ColSideColors=col.group, scale="row")
```
</div>
<div style="clear:both;"></div>

## Appendix

```{r}
sessionInfo()
```