--- title: "Comparison with base R" author: "Stefano Mangiola" date: "`r Sys.Date()`" package: tidybulk output: BiocStyle::html_document: toc_float: true vignette: > %\VignetteEngine{knitr::knitr} %\VignetteIndexEntry{Comparison with base R} %\usepackage[UTF-8]{inputenc} --- [![Lifecycle:maturing](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://www.tidyverse.org/lifecycle/#maturing) ```{r, echo=FALSE, include=FALSE, } library(knitr) knitr::opts_chunk$set(cache = TRUE, warning = FALSE, message = FALSE, cache.lazy = FALSE) library(dplyr) library(tidyr) library(tibble) library(magrittr) library(ggplot2) library(ggrepel) library(tidybulk) library(tidySummarizedExperiment) my_theme = theme_bw() + theme( panel.border = element_blank(), axis.line = element_line(), panel.grid.major = element_line(size = 0.2), panel.grid.minor = element_line(size = 0.1), text = element_text(size=12), legend.position="bottom", aspect.ratio=1, strip.background = element_blank(), axis.title.x = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)), axis.title.y = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)) ) tibble_counts = tidybulk::counts_SE %>% tidybulk() %>% as_tibble() ``` In this article we show some examples of the differences in coding between tidybulk/tidyverse and base R. We noted a decrease > 10x of assignments and a decrease of > 2x of line numbers. ## Create `tidybulk` tibble. ```{r} tt = se_mini ``` ## Aggregate duplicated `transcripts`

Tidy transcriptomics ```{r aggregate, cache=TRUE, message=FALSE, warning=FALSE, results='hide', class.source='yellow'} rowData(tt)$gene_name = rownames(tt) tt.aggr = tt %>% aggregate_duplicates(.transcript = gene_name) ```

Base R ```{r aggregate long, eval=FALSE} temp = data.frame( symbol = dge_list$genes$symbol, dge_list$counts ) dge_list.nr <- by(temp, temp$symbol, function(df) if(length(df[1,1])>0) matrixStats:::colSums(as.matrix(df[,-1])) ) dge_list.nr <- do.call("rbind", dge_list.nr) colnames(dge_list.nr) <- colnames(dge_list) ```

## Scale `counts`

Tidy transcriptomics ```{r normalise, cache=TRUE} tt.norm = tt.aggr %>% identify_abundant(factor_of_interest = condition) %>% scale_abundance() ```

Base R ```{r normalise long, eval=FALSE} library(edgeR) dgList <- DGEList(count_m=x,group=group) keep <- filterByExpr(dgList) dgList <- dgList[keep,,keep.lib.sizes=FALSE] [...] dgList <- calcNormFactors(dgList, method="TMM") norm_counts.table <- cpm(dgList) ```

## Filter `variable transcripts` We may want to identify and filter variable transcripts.

Tidy transcriptomics ```{r filter variable, cache=TRUE} tt.norm.variable = tt.norm %>% keep_variable() ```

Base R ```{r filter variable long, eval=FALSE} library(edgeR) x = norm_counts.table s <- rowMeans((x-rowMeans(x))^2) o <- order(s,decreasing=TRUE) x <- x[o[1L:top],,drop=FALSE] norm_counts.table = norm_counts.table[rownames(x)] norm_counts.table$cell_type = tibble_counts[ match( tibble_counts$sample, rownames(norm_counts.table) ), "Cell type" ] ```

## Reduce `dimensions`

Tidy transcriptomics ```{r mds, cache=TRUE} tt.norm.MDS = tt.norm %>% reduce_dimensions(method="MDS", .dims = 2) ```

Base R ```{r, eval = FALSE} library(limma) count_m_log = log(count_m + 1) cmds = limma::plotMDS(ndim = .dims, plot = FALSE) cmds = cmds %$% cmdscale.out %>% setNames(sprintf("Dim%s", 1:6)) cmds$cell_type = tibble_counts[ match(tibble_counts$sample, rownames(cmds)), "Cell type" ] ```

### PCA

Tidy transcriptomics ```{r pca, cache=TRUE, message=FALSE, warning=FALSE, results='hide'} tt.norm.PCA = tt.norm %>% reduce_dimensions(method="PCA", .dims = 2) ```

Base R ```{r,eval=FALSE} count_m_log = log(count_m + 1) pc = count_m_log %>% prcomp(scale = TRUE) variance = pc$sdev^2 variance = (variance / sum(variance))[1:6] pc$cell_type = counts[ match(counts$sample, rownames(pc)), "Cell type" ] ```

### tSNE

Tidy transcriptomics ```{r tsne, cache=TRUE, message=FALSE, warning=FALSE, results='hide'} tt.norm.tSNE = breast_tcga_mini_SE %>% tidybulk( sample, ens, count_scaled) %>% identify_abundant() %>% reduce_dimensions( method = "tSNE", perplexity=10, pca_scale =TRUE ) ```

Base R ```{r, eval=FALSE} count_m_log = log(count_m + 1) tsne = Rtsne::Rtsne( t(count_m_log), perplexity=10, pca_scale =TRUE )$Y tsne$cell_type = tibble_counts[ match(tibble_counts$sample, rownames(tsne)), "Cell type" ] ```

## Rotate `dimensions`

Tidy transcriptomics ```{r rotate, cache=TRUE} tt.norm.MDS.rotated = tt.norm.MDS %>% rotate_dimensions(`Dim1`, `Dim2`, rotation_degrees = 45, action="get") ```

Base R ```{r, eval=FALSE} rotation = function(m, d) { r = d * pi / 180 ((bind_rows( c(`1` = cos(r), `2` = -sin(r)), c(`1` = sin(r), `2` = cos(r)) ) %>% as_matrix) %*% m) } mds_r = pca %>% rotation(rotation_degrees) mds_r$cell_type = counts[ match(counts$sample, rownames(mds_r)), "Cell type" ] ```

## Test `differential abundance`

Tidy transcriptomics ```{r de, cache=TRUE, message=FALSE, warning=FALSE, results='hide'} tt.de = tt %>% test_differential_abundance( ~ condition, action="get") tt.de ```

Base R ```{r, eval=FALSE} library(edgeR) dgList <- DGEList(counts=counts_m,group=group) keep <- filterByExpr(dgList) dgList <- dgList[keep,,keep.lib.sizes=FALSE] dgList <- calcNormFactors(dgList) design <- model.matrix(~group) dgList <- estimateDisp(dgList,design) fit <- glmQLFit(dgList,design) qlf <- glmQLFTest(fit,coef=2) topTags(qlf, n=Inf) ```

## Adjust `counts`

Tidy transcriptomics ```{r adjust, cache=TRUE, message=FALSE, warning=FALSE, results='hide'} tt.norm.adj = tt.norm %>% adjust_abundance( ~ condition + time) ```

Base R ```{r, eval=FALSE} library(sva) count_m_log = log(count_m + 1) design = model.matrix( object = ~ condition + time, data = annotation ) count_m_log.sva = ComBat( batch = design[,2], mod = design, ... ) count_m_log.sva = ceiling(exp(count_m_log.sva) -1) count_m_log.sva$cell_type = counts[ match(counts$sample, rownames(count_m_log.sva)), "Cell type" ] ```

## Deconvolve `Cell type composition`

Tidy transcriptomics ```{r cibersort, cache=TRUE, eval=FALSE} tt.cibersort = tt %>% deconvolve_cellularity(action="get", cores=1) ```

Base R ```{r, eval=FALSE} source(‘CIBERSORT.R’) count_m %>% write.table("mixture_file.txt") results <- CIBERSORT( "sig_matrix_file.txt", "mixture_file.txt", perm=100, QN=TRUE ) results$cell_type = tibble_counts[ match(tibble_counts$sample, rownames(results)), "Cell type" ] ```

## Cluster `samples` ### k-means

Tidy transcriptomics ```{r cluster, cache=TRUE} tt.norm.cluster = tt.norm.MDS %>% cluster_elements(method="kmeans", centers = 2, action="get" ) ```

Base R ```{r, eval=FALSE} count_m_log = log(count_m + 1) k = kmeans(count_m_log, iter.max = 1000, ...) cluster = k$cluster cluster$cell_type = tibble_counts[ match(tibble_counts$sample, rownames(cluster)), c("Cell type", "Dim1", "Dim2") ] ```

### SNN Matrix package (v1.3-3) causes an error with Seurat::FindNeighbors used in this method. We are trying to solve this issue. At the moment this option in unaviable.

Tidy transcriptomics ```{r SNN, eval=FALSE, message=FALSE, warning=FALSE, results='hide'} tt.norm.SNN = tt.norm.tSNE %>% cluster_elements(method = "SNN") ```

Base R ```{r, eval=FALSE} library(Seurat) snn = CreateSeuratObject(count_m) snn = ScaleData( snn, display.progress = TRUE, num.cores=4, do.par = TRUE ) snn = FindVariableFeatures(snn, selection.method = "vst") snn = FindVariableFeatures(snn, selection.method = "vst") snn = RunPCA(snn, npcs = 30) snn = FindNeighbors(snn) snn = FindClusters(snn, method = "igraph", ...) snn = snn[["seurat_clusters"]] snn$cell_type = tibble_counts[ match(tibble_counts$sample, rownames(snn)), c("Cell type", "Dim1", "Dim2") ] ```

## Drop `redundant` transcripts

Tidy transcriptomics ```{r drop, cache=TRUE} tt.norm.non_redundant = tt.norm.MDS %>% remove_redundancy( method = "correlation" ) ```

Base R ```{r, eval=FALSE} library(widyr) .data.correlated = pairwise_cor( counts, sample, transcript, rc, sort = TRUE, diag = FALSE, upper = FALSE ) %>% filter(correlation > correlation_threshold) %>% distinct(item1) %>% rename(!!.element := item1) # Return non redundant data frame counts %>% anti_join(.data.correlated) %>% spread(sample, rc, - transcript) %>% left_join(annotation) ```

## Draw `heatmap`

tidytranscriptomics ```{r heat, eval=FALSE} tt.norm.MDS %>% # filter lowly abundant keep_abundant() %>% # extract 500 most variable genes keep_variable( .abundance = count_scaled, top = 500) %>% # create heatmap heatmap(sample, transcript, count_scaled, transform = log1p) %>% add_tile(`Cell type`) ```

Base R ```{r, eval=FALSE} # Example taken from airway dataset from BioC2020 workshop. dgList <- SE2DGEList(airway) group <- factor(dgList$samples$`Cell type`) keep.exprs <- filterByExpr(dgList, group=group) dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE] dgList <- calcNormFactors(dgList) logcounts <- cpm(dgList, log=TRUE) var_genes <- apply(logcounts, 1, var) select_var <- names(sort(var_genes, decreasing=TRUE))[1:500] highly_variable_lcpm <- logcounts[select_var,] colours <- c("#440154FF", "#21908CFF", "#fefada" ) col.group <- c("red","grey")[group] gplots::heatmap.2(highly_variable_lcpm, col=colours, trace="none", ColSideColors=col.group, scale="row") ```

## Draw `density plot`

tidytranscriptomics ```{r density, eval=FALSE} # Example taken from airway dataset from BioC2020 workshop. airway %>% tidybulk() %>% identify_abundant() %>% scale_abundance() %>% pivot_longer(cols = starts_with("counts"), names_to = "source", values_to = "abundance") %>% filter(!lowly_abundant) %>% ggplot(aes(x=abundance + 1, color=sample)) + geom_density() + facet_wrap(~source) + scale_x_log10() ```

Base R ```{r, eval=FALSE} # Example taken from airway dataset from BioC2020 workshop. dgList <- SE2DGEList(airway) group <- factor(dgList$samples$dex) keep.exprs <- filterByExpr(dgList, group=group) dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE] dgList <- calcNormFactors(dgList) logcounts <- cpm(dgList, log=TRUE) var_genes <- apply(logcounts, 1, var) select_var <- names(sort(var_genes, decreasing=TRUE))[1:500] highly_variable_lcpm <- logcounts[select_var,] colours <- c("#440154FF", "#21908CFF", "#fefada" ) col.group <- c("red","grey")[group] gplots::heatmap.2(highly_variable_lcpm, col=colours, trace="none", ColSideColors=col.group, scale="row") ```

## Appendix ```{r} sessionInfo() ```