## ----eval = FALSE-------------------------------------------------------------
# if (!requireNamespace("BiocManager", quietly = TRUE)) {
#  install.packages("BiocManager")
# }
# BiocManager::install("xCell2")

## ----eval = FALSE-------------------------------------------------------------
# if (!requireNamespace("devtools", quietly = TRUE)) {
#  install.packages("devtools")
# }
# devtools::install_github("AlmogAngel/xCell2")

## ----eval = TRUE--------------------------------------------------------------
library(xCell2)

## ----eval = TRUE--------------------------------------------------------------
library(xCell2)

# Load the demo data
data(dice_demo_ref, package = "xCell2")

# Extract reference gene expression matrix
# (Note that values are in TPM but log transformed)
dice_ref <- SummarizedExperiment::assay(dice_demo_ref, "logcounts")
colnames(dice_ref) <- make.unique(colnames(dice_ref)) # Ensure unique sample names

# Extract reference metadata
dice_labels <- SummarizedExperiment::colData(dice_demo_ref)
dice_labels <- as.data.frame(dice_labels) # "label" column already exists

# Prepare labels data frame (the "label" column already exist)
dice_labels$ont <- NA # Here we assign the cell ontology (next section)
dice_labels$sample <- colnames(dice_ref)
dice_labels$dataset <- "DICE"

## ----eval = TRUE--------------------------------------------------------------
# Assign ontologies to cell types
dice_labels[dice_labels$label == "B cells", ]$ont <- "CL:0000236"
dice_labels[dice_labels$label == "Monocytes", ]$ont <- "CL:0000576"
dice_labels[dice_labels$label == "NK cells", ]$ont <- "CL:0000623"
dice_labels[dice_labels$label == "T cells, CD8+", ]$ont <- "CL:0000625"
dice_labels[dice_labels$label == "T cells, CD4+", ]$ont <- "CL:0000624"
dice_labels[dice_labels$label == "T cells, CD4+, memory", ]$ont <- "CL:0000897"

## ----eval=FALSE, include=FALSE------------------------------------------------
# xCell2GetLineage(labels = dice_labels, outFile = "demo_dice_dep.tsv")

## ----eval = TRUE--------------------------------------------------------------
lineage_list <- xCell2GetLineage(labels = dice_labels)

## ----eval = TRUE--------------------------------------------------------------
library(BiocParallel)
parallel_param <- MulticoreParam(workers = 2) # Adjust workers as needed

set.seed(123) # (optional) For reproducibility

DICE_demo.xCell2Ref <- xCell2Train(
  ref = dice_ref,
  labels = dice_labels,
  refType = "rnaseq",
  BPPARAM = parallel_param
)

## ----eval = FALSE-------------------------------------------------------------
# saveRDS(DICE_demo.xCell2Ref, file = "DICE_demo.xCell2Ref.rds")

## ----eval = FALSE-------------------------------------------------------------
# # Load the BlueprintEncode reference
# data(BlueprintEncode.xCell2Ref, package = "xCell2")

## ----eval = FALSE-------------------------------------------------------------
# # Set the URL of the pre-trained reference
# ref_url <- "https://dviraran.github.io/xCell2refs/references/BlueprintEncode.xCell2Ref.rds"
# # Set the local filename to save the reference
# local_filename <- "BlueprintEncode.xCell2Ref.rds"
# # Download the file
# download.file(ref_url, local_filename, mode = "wb")
# # Load the downloaded reference
# BlueprintEncode.xCell2Ref <- readRDS(local_filename)

## ----eval = TRUE--------------------------------------------------------------
library(xCell2)
data("DICE_demo.xCell2Ref", package = "xCell2")
genes_ref <- getGenesUsed(DICE_demo.xCell2Ref)
genes_ref[1:10]

## ----eval = TRUE--------------------------------------------------------------
# Load demo bulk gene expression mixture
data("mix_demo", package = "xCell2")
genes_mix <- rownames(mix_demo)

# Calculate the percentage of overlapping genes
overlap_percentage <- round(length(intersect(genes_mix, genes_ref)) / length(genes_ref) * 100, 2)
print(paste0("Overlap between mixture and reference genes is: ", overlap_percentage, "%"))

## ----eval = TRUE--------------------------------------------------------------
xcell2_results <- xCell2Analysis(
  mix = mix_demo,
  xcell2object = DICE_demo.xCell2Ref,
  minSharedGenes = 0.8 # Allow for a lower overlap threshold
)

## ----eval = TRUE--------------------------------------------------------------
xcell2_results <- xCell2Analysis(
 mix = mix_demo,
 xcell2object = DICE_demo.xCell2Ref
)

xcell2_results

## ----eval=TRUE----------------------------------------------------------------
library(ggplot2, quietly = TRUE)

# Generate pseudo enrichment scores for demonstration
set.seed(123)
conditions <- c(rep("Healthy", 5), rep("Disease", 5))
enrichment_scores <- data.frame(
  Condition = conditions,
  T_cells_CD8 = c(runif(5, 0.4, 0.6), runif(5, 0.6, 0.8)),
  T_cells_CD4 = c(runif(5, 0.3, 0.5), runif(5, 0.5, 0.7))
)

# Wilcoxon rank-sum test for CD8+ T cells
wilcox_test <- wilcox.test(T_cells_CD8 ~ Condition, data = enrichment_scores)
print(wilcox_test)

# Boxplot with p-value
ggplot(enrichment_scores, aes(x = Condition, y = T_cells_CD8)) +
  geom_boxplot() +
  geom_jitter(width = 0.2) +
  labs(
    title = "CD8+ T Cells Enrichment Across Conditions",
    x = "Condition", y = "Enrichment Score"
  ) +
  annotate("text", x = 1.5, y = max(enrichment_scores$T_cells_CD8), 
           label = paste("p =", signif(wilcox_test$p.value, 3)))

## ----eval=TRUE----------------------------------------------------------------
library(EnhancedVolcano, quietly = TRUE)
library(dplyr, quietly = TRUE, verbose = FALSE)

# Simulate enrichment scores for 50 cell types
set.seed(123) 
cell_types <- paste0("CellType_", 1:50)

enrichment_scores_many <- data.frame(
  Sample = paste0("Sample", 1:10),
  Condition = factor(rep(c("Healthy", "Disease"), each = 5))
)

for (cell_type in cell_types) {
  # Healthy group: random around baseline
  healthy <- rnorm(5, mean = 0.5, sd = 0.1)
  # Disease group: add variability and directional shifts
  disease <- rnorm(5, mean = sample(c(0.4, 0.5, 0.6), 1), sd = 0.1)
  enrichment_scores_many[[cell_type]] <- c(healthy, disease)
}

special_cells <- sample(cell_types, 2)
for (cell_type in special_cells) {
  enrichment_scores_many[1:5, cell_type] <- rnorm(5, mean = 0.4, sd = 0.05) # Lower in healthy
}

# Calculate log fold change (LFC) and p-values for each cell type using Wilcoxon test
differential_results <- enrichment_scores_many %>%
  tidyr::pivot_longer(cols = starts_with("CellType"), names_to = "CellType", values_to = "Enrichment") %>%
  group_by(CellType) %>%
  summarise(
    LFC = log2(median(Enrichment[Condition == "Disease"]) / median(Enrichment[Condition == "Healthy"])),
    p_value = wilcox.test(Enrichment ~ Condition)$p.value
  ) %>%
  ungroup()

# Prepare data for EnhancedVolcano
volcano_data <- differential_results %>%
  mutate(
    neg_log10_p = -log10(p_value),
    Significance = ifelse(p_value < 0.05, "Significant", "Not Significant")
  ) %>%
  rename(
    log2FC = LFC,
    pval = p_value
  )

# Plot using EnhancedVolcano
EnhancedVolcano(
  volcano_data,
  lab = volcano_data$CellType,
  x = "log2FC",
  y = "pval",
  pCutoff = 0.05,
  FCcutoff = 0.5,
  title = "Volcano Plot of Cell Type Enrichment",
  subtitle = "Simulated Data for 50 Cell Types",
  xlab = "Log2 Fold Change (LFC)",
  ylab = "-Log10(p-value)",
  pointSize = 2.0,
  labSize = 3.0,
  xlim = c(-1, 1),
  ylim = c(0, 3)
)

## ----eval=TRUE----------------------------------------------------------------
# Simulated clinical feature (e.g., mutation burden)
set.seed(123)

mutation_burden <- runif(10, min = 0, max = 100)

# Calculate Pearson and Spearman correlation
pearson_corr <- cor.test(enrichment_scores$T_cells_CD8, mutation_burden, method = "pearson")
spearman_corr <- cor.test(enrichment_scores$T_cells_CD8, mutation_burden, method = "spearman")

# Scatterplot of the correlation with annotations for Pearson and Spearman coefficients
ggplot(enrichment_scores, aes(x = mutation_burden, y = T_cells_CD8)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(
    title = "Correlation Between Mutation Burden and CD8+ T Cells Enrichment",
    x = "Mutation Burden",
    y = "CD8+ T Cells Enrichment Score"
  ) +
  annotate("text", x = 15, y = 0.7, 
           label = paste0("Pearson: ", signif(pearson_corr$estimate, 3), 
                          "\nP-value: ", signif(pearson_corr$p.value, 3)),
           hjust = 0, color = "darkred") +
  annotate("text", x = 15, y = 0.65, 
           label = paste0("Spearman: ", signif(spearman_corr$estimate, 3), 
                          "\nP-value: ", signif(spearman_corr$p.value, 3)),
           hjust = 0, color = "darkgreen")

## ----eval=TRUE----------------------------------------------------------------
# Simulated enrichment scores for demonstration
set.seed(123) 
enrichment_scores <- data.frame(
  Sample = paste0("Sample", 1:10),
  Condition = rep(c("Healthy", "Disease"), each = 5),
  CD8_T_cells = runif(10, min = 0.1, max = 0.8),
  B_cells = runif(10, min = 0.2, max = 0.6),
  NK_cells = runif(10, min = 0.05, max = 0.5)
)

# Perform PCA on the simulated enrichment scores
pca <- prcomp(enrichment_scores[, 3:5], scale. = TRUE)

# Create a data frame for PCA results
pca_df <- data.frame(
  PC1 = pca$x[, 1],
  PC2 = pca$x[, 2],
  Condition = enrichment_scores$Condition
)

# Visualize the first two principal components
ggplot(pca_df, aes(x = PC1, y = PC2, color = Condition)) +
  geom_point(size = 3) +
  labs(
    title = "PCA of Enrichment Scores",
    x = "PC1 (Principal Component 1)",
    y = "PC2 (Principal Component 2)"
  ) +
  theme_minimal()

## ----eval = TRUE--------------------------------------------------------------
kmeans_result <- kmeans(enrichment_scores[, 3:5], centers = 2, nstart = 25)

# Add cluster assignment to the PCA results
pca_df <- data.frame(
  PC1 = pca$x[, 1],
  PC2 = pca$x[, 2],
  Condition = enrichment_scores$Condition,
  Cluster = as.factor(kmeans_result$cluster)
)

# Visualize the PCA results with k-means clusters
ggplot(pca_df, aes(x = PC1, y = PC2, color = Cluster, shape = Condition)) +
  geom_point(size = 3) +
  labs(
    title = "PCA of Enrichment Scores with K-Means Clusters",
    x = "PC1 (Principal Component 1)",
    y = "PC2 (Principal Component 2)"
  ) +
  scale_color_manual(values = c("1" = "blue", "2" = "red")) +
  theme_minimal()

## ----eval=TRUE, dpi=32--------------------------------------------------------
library(randomForest, quietly = TRUE)

# Simulated enrichment scores for demonstration
set.seed(123)

enrichment_scores <- data.frame(
  Sample = paste0("Sample", 1:10),
  Condition = factor(rep(c("Healthy", "Disease"), each = 5)),
  CD8_T_cells = runif(10, min = 0.1, max = 0.8),
  B_cells = runif(10, min = 0.2, max = 0.6),
  NK_cells = runif(10, min = 0.05, max = 0.5)
)

# Train a random forest model
rf_model <- randomForest(Condition ~ CD8_T_cells + B_cells + NK_cells, 
                         data = enrichment_scores, 
                         ntree = 500, importance = TRUE)


# Assess variable importance
var_importance <- data.frame(
  Variable = rownames(importance(rf_model)),
  Importance = importance(rf_model)[, 1]
)

# Plot variable importance
ggplot(var_importance, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Feature Importance from Random Forest Model",
    x = "Features",
    y = "Importance Score"
  ) +
  theme_minimal()

## ----eval = FALSE-------------------------------------------------------------
# library(BiocParallel)
# 
# # Example: Setting up parallel processing with 2 workers
# BPPARAM <- MulticoreParam(workers = 2) # Adjust the number of workers as needed

## ----eval = FALSE-------------------------------------------------------------
# BPPARAM <- SnowParam(workers = 2)

## ----eval = FALSE-------------------------------------------------------------
# # Generate the xCell2 reference object with parallel processing
# DICE_demo.xCell2Ref <- xCell2Train(
#   ref = dice_ref,
#   labels = dice_labels,
#   refType = "rnaseq",
#   BPPARAM = BPPARAM
# )

## ----eval = FALSE-------------------------------------------------------------
# # Perform cell type enrichment analysis with parallel processing
# xcell2_results <- xCell2Analysis(
#   mix = mix_demo,
#   xcell2object = DICE_demo.xCell2Ref,
#   BPPARAM = BPPARAM
# )

## -----------------------------------------------------------------------------
sessionInfo()