## ----v1, include = FALSE------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = TRUE
)

## ----install, eval=FALSE------------------------------------------------------
#  if (!requireNamespace("BiocManager"))
#      install.packages("BiocManager")
#  BiocManager::install("CellMixS")

## ----load, message=FALSE------------------------------------------------------
library(CellMixS)

## ---- warning=FALSE-----------------------------------------------------------
# Load required packages
suppressPackageStartupMessages({
    library(SingleCellExperiment)
    library(cowplot)
    library(limma)
    library(magrittr)
    library(dplyr)
    library(purrr)
    library(ggplot2)
    library(scater)
})

## ----data---------------------------------------------------------------------
# Load sim_list example data
sim_list <- readRDS(system.file(file.path("extdata", "sim50.rds"), 
                                package = "CellMixS"))
names(sim_list)

sce50 <- sim_list[["batch50"]]
class(sce50)

table(sce50[["batch"]])

## ----visBatch50---------------------------------------------------------------
# Visualize batch distribution in sce50
visGroup(sce50, group = "batch")

## ----visBatchAll, fig.wide=TRUE-----------------------------------------------
# Visualize batch distribution in other elements of sim_list 
batch_names <- c("batch0", "batch20")
  
vis_batch <- lapply(batch_names, function(name){
    sce <- sim_list[[name]]
    visGroup(sce, "batch") + ggtitle(paste0("sim_", name))
})

plot_grid(plotlist = vis_batch, ncol = 2)

## ----cms----------------------------------------------------------------------
# Call cell-specific mixing score for sce50
# Note that cell_min is set to 4 due to the low number of cells and small k.
# Usually default parameters are recommended!! 
sce50 <- cms(sce50, k = 30, group = "batch", res_name = "unaligned", 
             n_dim = 3, cell_min = 4)
head(colData(sce50))

# Call cell-specific mixing score for all datasets
sim_list <- lapply(batch_names, function(name){
    sce <- sim_list[[name]]
    sce <- cms(sce, k = 30, group = "batch", res_name = "unaligned", 
               n_dim = 3, cell_min = 4)
}) %>% set_names(batch_names)

# Append cms50
sim_list[["batch50"]] <- sce50

## ----hist, fig.wide=TRUE------------------------------------------------------
# p-value histogram of cms50
visHist(sce50)

# p-value histogram sim30
# Combine cms results in one matrix
batch_names <- names(sim_list)
cms_mat <- batch_names %>% 
  map(function(name) sim_list[[name]]$cms.unaligned) %>% 
  bind_cols() %>% set_colnames(batch_names)

visHist(cms_mat, n_col = 3)

## ----singlePlots, fig.wide= TRUE----------------------------------------------
# cms only cms20
sce20 <- sim_list[["batch20"]]
metric_plot <- visMetric(sce20, metric_var = "cms_smooth.unaligned")

# group only cms20
group_plot <- visGroup(sce20, group = "batch")

plot_grid(metric_plot, group_plot, ncol = 2)

## ----overview-----------------------------------------------------------------
# Add random celltype assignments as new metadata
sce20[["celltype"]] <- rep(c("CD4+", "CD8+", "CD3"), length.out = ncol(sce20)) %>% 
    as.factor 

visOverview(sce20, "batch", other_var = "celltype")

## ----compareCluster, fig.small=TRUE-------------------------------------------
visCluster(sce20, metric_var = "cms.unaligned", cluster_var = "celltype")

## ----batchCorrectionMethods---------------------------------------------------
# MNN - embeddings are stored in the reducedDims slot of sce
reducedDimNames(sce20)
sce20 <- cms(sce20, k = 30, group = "batch", 
             dim_red = "MNN", res_name = "MNN", n_dim = 3, cell_min = 4)

# Run limma
sce20 <- scater::logNormCounts(sce20)
limma_corrected <- removeBatchEffect(logcounts(sce20), batch = sce20$batch)
# Add corrected counts to sce
assay(sce20, "lim_corrected") <- limma_corrected 

# Run cms
sce20 <- cms(sce20, k = 30, group = "batch", 
             assay_name = "lim_corrected", res_name = "limma", n_dim = 3, 
             cell_min = 4)

names(colData(sce20))

## ----batch correction methods vis---------------------------------------------
# As pvalue histograms
visHist(sce20, metric = "cms.",  n_col = 3)

## ----ldfDiff, warning=FALSE---------------------------------------------------
# Prepare input 
# List with single SingleCellExperiment objects 
# (Important: List names need to correspond to batch levels! See ?ldfDiff)
sce_pre_list <- list("1" = sce20[,sce20$batch == "1"], 
                     "2" = sce20[,sce20$batch == "2"], 
                     "3" = sce20[,sce20$batch == "3"])

sce20 <- ldfDiff(sce_pre_list, sce_combined = sce20, 
                 group = "batch", k = 70, dim_red = "PCA", 
                 dim_combined = "MNN", assay_pre = "counts", 
                 n_dim = 3, res_name = "MNN")

sce20 <- ldfDiff(sce_pre_list, sce_combined = sce20, 
                 group = "batch", k = 70, dim_red = "PCA", 
                 dim_combined = "PCA", assay_pre = "counts", 
                 assay_combined = "lim_corrected",  
                 n_dim = 3, res_name = "limma")

names(colData(sce20))

## ----visldfDiff---------------------------------------------------------------
# ldfDiff score summarized
visIntegration(sce20, metric = "diff_ldf", metric_name = "ldfDiff") 

## ----evalIntegration----------------------------------------------------------
sce50 <- evalIntegration(metrics = c("isi", "entropy"), sce50, 
                         group = "batch", k = 30, n_dim = 2, cell_min = 4,
                         res_name = c("weighted_isi", "entropy"))

visOverview(sce50, "batch", 
            metric = c("cms_smooth.unaligned", "weighted_isi", "entropy"), 
            prefix = FALSE)

## ----sessionInfo--------------------------------------------------------------
sessionInfo()