## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>", 
    eval = TRUE,
    crop = NULL
)

## ----install-pkg--------------------------------------------------------------
# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# 
# BiocManager::install("treeclimbR")

## ----setup--------------------------------------------------------------------
suppressPackageStartupMessages({
    library(TreeSummarizedExperiment)
    library(treeclimbR)
    library(ggtree)
    library(dplyr)
    library(ggplot2)
})

## ----da-load-and-visualize-data-----------------------------------------------
## Read data
da_lse <- readRDS(system.file("extdata", "da_sim_100_30_18de.rds", 
                              package = "treeclimbR"))
da_lse

## Generate tree visualization where true signal leaves are colored orange
## ...Find internal nodes in the subtrees where all leaves are differentially 
##    abundant. These will be colored orange.
nds <- joinNode(tree = rowTree(da_lse), 
                node = rownames(da_lse)[rowData(da_lse)$Signal])
br <- unlist(findDescendant(tree = rowTree(da_lse), node = nds,
                            only.leaf = FALSE, self.include = TRUE))
df_color <- data.frame(node = showNode(tree = rowTree(da_lse), 
                                       only.leaf = FALSE)) |>
    mutate(signal = ifelse(node %in% br, "yes", "no"))
## ...Generate tree
da_fig_tree <- ggtree(tr = rowTree(da_lse), layout = "rectangular", 
                      branch.length = "none", 
                      aes(color = signal)) %<+% df_color +
    scale_color_manual(values = c(no = "grey", yes = "orange"))
## ...Zoom into the subtree defined by a particular node. In this case, we 
##    know that all true signal leaves were sampled from the subtree defined 
##    by a particular node (stored in metadata(da_lse)$parentNodeForSignal).
da_fig_tree <- scaleClade(da_fig_tree, 
                          node = metadata(da_lse)$parentNodeForSignal, 
                          scale = 4)

## Extract count matrix and scale each row to [0, 1]
count <- assay(da_lse, "counts")
scale_count <- t(apply(count, 1, FUN = function(x) {
    xx <- x
    rx <- (max(xx) - min(xx))
    (xx - min(xx))/max(rx, 1)
}))
rownames(scale_count) <- rownames(count)
colnames(scale_count) <- colnames(count)

## Plot tree and heatmap of scaled counts
## ...Generate sample annotation
vv <- gsub(pattern = "_.*", "", colnames(count))
names(vv) <- colnames(scale_count)
anno_c <- structure(vv, names = vv)
TreeHeatmap(tree = rowTree(da_lse), tree_fig = da_fig_tree, 
            hm_data = scale_count, legend_title_hm = "Scaled\ncount",
            column_split = vv, rel_width = 0.6,
            tree_hm_gap = 0.3,
            column_split_label = anno_c) +
    scale_fill_viridis_c(option = "B") +
    scale_y_continuous(expand = c(0, 10))

## ----da-aggregate-------------------------------------------------------------
## Get a list of all node IDs
all_node <- showNode(tree = rowTree(da_lse), only.leaf = FALSE)

## Calculate counts for internal nodes
da_tse <- aggTSE(x = da_lse, rowLevel = all_node, rowFun = sum)
da_tse

## ----da-run-edger-------------------------------------------------------------
## Run differential analysis
da_res <- runDA(da_tse, assay = "counts", option = "glmQL", 
                design = model.matrix(~ group, data = colData(da_tse)), 
                contrast = c(0, 1), filter_min_count = 0, 
                filter_min_prop = 0, filter_min_total_count = 15)

## ----da-runda-res-------------------------------------------------------------
names(da_res)
class(da_res$edgeR_results)

## Nodes with too low total count
da_res$nodes_drop

## ----da-node-results----------------------------------------------------------
da_tbl <- nodeResult(da_res, n = Inf, type = "DA")
dim(da_tbl)
head(da_tbl)

## ----da-get-cand--------------------------------------------------------------
## Get candidates
da_cand <- getCand(tree = rowTree(da_tse), score_data = da_tbl, 
                   node_column = "node", p_column = "PValue",
                   threshold = 0.05, sign_column = "logFC", message = FALSE)

## ----da-plot-cand-------------------------------------------------------------
## All candidates
names(da_cand$candidate_list)

## Nodes contained in the candidate corresponding to t = 0.03
## This is a mix of leaves and internal nodes
(da_cand_0.03 <- da_cand$candidate_list[["0.03"]])

## Visualize candidate
da_fig_tree +
    geom_point2(aes(subset = (node %in% da_cand_0.03)), 
                color = "navy", size = 0.5) +
    labs(title = "t = 0.03") +
    theme(legend.position = "none",
          plot.title = element_text(color = "navy", size = 7, 
                                    hjust = 0.5, vjust = -0.08))

## ----da-eval-cand-------------------------------------------------------------
## Evaluate candidates
da_best <- evalCand(tree = rowTree(da_tse), levels = da_cand$candidate_list, 
                    score_data = da_tbl, node_column = "node",
                    p_column = "PValue", sign_column = "logFC")

## ----da-info-cand-------------------------------------------------------------
infoCand(object = da_best)

## ----da-topnodes--------------------------------------------------------------
da_out <- topNodes(object = da_best, n = Inf, p_value = 0.05)

## ----da-plot-significant------------------------------------------------------
da_fig_tree +
    geom_point2(aes(subset = node %in% da_out$node),
                color = "red")

## ----da-fdr-tpr---------------------------------------------------------------
fdr(rowTree(da_tse), truth = rownames(da_lse)[rowData(da_lse)$Signal], 
    found = da_out$node, only.leaf = TRUE)

tpr(rowTree(da_tse), truth = rownames(da_lse)[rowData(da_lse)$Signal], 
    found = da_out$node, only.leaf = TRUE)

## ----ds-load-and-visualize-data-----------------------------------------------
ds_tse <- readRDS(system.file("extdata", "ds_sim_20_500_8de.rds", 
                              package = "treeclimbR"))
ds_tse

## Assignment of cells to high-resolution clusters, samples and conditions
head(colData(ds_tse))

## Tree providing the successive aggregation of the high-resolution clusters
## into more coarse-grained ones
## Node numbers are indicated in blue, node labels in orange
ggtree(colTree(ds_tse)) +
    geom_text2(aes(label = node), color = "darkblue",
               hjust = -0.5, vjust = 0.7) +
    geom_text2(aes(label = label), color = "darkorange",
               hjust = -0.1, vjust = -0.7)

## ----ds-list-truth------------------------------------------------------------
rowData(ds_tse)[rowData(ds_tse)$Signal != "no", , drop = FALSE] 

## ----ds-aggregate-------------------------------------------------------------
ds_se <- aggDS(TSE = ds_tse, assay = "counts", sample_id = "sample_id", 
               group_id = "group", cluster_id = "cluster_id", FUN = sum)
ds_se

## ----ds-ncells----------------------------------------------------------------
metadata(ds_se)$n_cells

## ----ds-run-edger-------------------------------------------------------------
ds_res <- runDS(SE = ds_se, tree = colTree(ds_tse), option = "glmQL", 
                design = model.matrix(~ group, data = colData(ds_se)), 
                contrast = c(0, 1), filter_min_count = 0, 
                filter_min_total_count = 1, filter_min_prop = 0, min_cells = 5, 
                group_column = "group", message = FALSE)

## ----ds-runds-res-------------------------------------------------------------
names(ds_res)
names(ds_res$edgeR_results)

## ----ds-node-results----------------------------------------------------------
ds_tbl <- nodeResult(ds_res, type = "DS", n = Inf)
dim(ds_tbl)
head(ds_tbl)

## ----ds-get-cand--------------------------------------------------------------
## Split result table by feature
ds_tbl_list <- split(ds_tbl, f = ds_tbl$feature)

## Find candidates for each gene separately
ds_cand_list <- lapply(seq_along(ds_tbl_list), 
                       FUN = function(x) {
                           getCand(
                               tree = colTree(ds_tse),
                               t = seq(from = 0.05, to = 1, by = 0.05),
                               score_data = ds_tbl_list[[x]], 
                               node_column = "node", 
                               p_column = "PValue", 
                               sign_column = "logFC",
                               message = FALSE)$candidate_list
                       })
names(ds_cand_list) <- names(ds_tbl_list)

## ----ds-eval-cand-------------------------------------------------------------
ds_best <- evalCand(tree = colTree(ds_tse), type = "multiple", 
                    levels = ds_cand_list, score_data = ds_tbl_list, 
                    node_column = "node", 
                    p_column = "PValue", 
                    sign_column = "logFC", 
                    feature_column = "feature",
                    limit_rej = 0.05,
                    message = FALSE,
                    use_pseudo_leaf = FALSE)

ds_out <- topNodes(object = ds_best, n = Inf, p_value = 0.05) 
ds_out

## ----ds-check-cand------------------------------------------------------------
lapply(findDescendant(colTree(ds_tse), node = c(11, 13, 5, 9), 
                      self.include = TRUE, 
                      use.alias = FALSE, only.leaf = TRUE), 
       function(x) convertNode(colTree(ds_tse), node = x))

## ----ds-list-truth-again------------------------------------------------------
rowData(ds_tse)[rowData(ds_tse)$Signal != "no", , drop = FALSE] 

## ----session-info-------------------------------------------------------------
sessionInfo()