## ----message=FALSE, warning=FALSE---------------------------------------------
set.seed(123)
library(DepInfeR)
library(tibble)
library(tidyr)
library(ggplot2)
library(BiocParallel)
library(dplyr)

## ----installation, eval = FALSE-----------------------------------------------
# if (!requireNamespace("BiocManager", quietly = TRUE)) {
#   install.packages("BiocManager")
# }
# BiocManager::install("DepInfeR")

## -----------------------------------------------------------------------------
data(targetsGDSC, drug_response_GDSC)

## -----------------------------------------------------------------------------
head(targetsGDSC)

## -----------------------------------------------------------------------------
head(drug_response_GDSC)

## -----------------------------------------------------------------------------
targetsGDSC <- mutate(targetsGDSC,
  targetName = ifelse(targetName %in% "BCR", "BCR/ABL", targetName)
)

## -----------------------------------------------------------------------------
targetMatrix <- filter(
  targetsGDSC,
  targetClassification == "High confidence"
) %>%
  select(drugID, targetName, Kd) %>%
  pivot_wider(names_from = "targetName", values_from = "Kd") %>%
  remove_rownames() %>%
  column_to_rownames("drugID") %>%
  as.matrix()

## -----------------------------------------------------------------------------
ProcessTargetResults <- processTarget(targetMatrix,
  KdAsInput = TRUE,
  removeCorrelated = TRUE
)

## -----------------------------------------------------------------------------
responseMatrix <- filter(drug_response_GDSC, `Drug Id` %in% targetsGDSC$drugID) %>%
  select(`Drug Id`, `Cell line name`, AUC) %>%
  pivot_wider(names_from = `Cell line name`, values_from = AUC) %>%
  remove_rownames() %>%
  column_to_rownames("Drug Id") %>%
  as.matrix()

## ----fig.cap="This figure shows the number of cell lines included in the analysis as a function of the percentage of missing values allowed per cell line."----

missTab <- data.frame(
  NA_cutoff = 0,
  remain_celllines = 0, stringsAsFactors = FALSE
)

for (i in 0:138) {
  a <- dim(responseMatrix[, colSums(is.na(responseMatrix)) <= i])[2]
  missTab[i, ] <- c(i, a)
}

ggplot(missTab, aes(x=NA_cutoff, y=remain_celllines)) +
    geom_line() + theme_bw() +
    xlab("Allowed NA values") +
    ylab("Number of remaining cell lines") +
    geom_vline(xintercept = 24, color = "red", linetype = "dashed")

## -----------------------------------------------------------------------------
responseMatrix <- responseMatrix[, colSums(is.na(responseMatrix)) <= 24]

## -----------------------------------------------------------------------------
library(missForest)

impRes <- missForest(t(responseMatrix))
imp_missforest <- impRes$ximp

responseMatrix_imputed <- t(imp_missforest)

## -----------------------------------------------------------------------------
responseMatrix_scaled <- t(scale(t(responseMatrix_imputed)))

## -----------------------------------------------------------------------------
targetInput <- ProcessTargetResults$targetMatrix

overlappedDrugs <- intersect(
  rownames(responseMatrix_scaled),
  rownames(targetInput)
)

targetInput <- targetInput[overlappedDrugs, ]
responseInput <- responseMatrix_scaled[overlappedDrugs, ]

## -----------------------------------------------------------------------------
#set up BiocParallel back-end
param <- MulticoreParam(workers = 2, RNGseed = 333)

result <- runLASSORegression(
  TargetMatrix = targetInput,
  ResponseMatrix = responseInput, repeats = 3,
  BPPARAM = param
)

## -----------------------------------------------------------------------------
useTar <- rowSums(result$coefMat != 0) > 0
result$coefMat <- result$coefMat[useTar, ]

## -----------------------------------------------------------------------------
nrow(result$coefMat)

## ----fig.cap="Heatmap visualization of the protein dependence values for selected kinases, inferred from the GDSC drug screen dataset. Red indicates a high dependence value and blue indicates a low dependence value. In the column annotations, cell lines with certain genomic variations are colored black. Columns are ordered by hierarchical clustering based on euclidean distance.", fig.height=13, fig.width=9----

library(RColorBrewer)
library(pheatmap)

# firstly, we need to load the processed mutation annotation of the cell lines
data("mutation_GDSC")

#setup column annotation and color scheme
annoColor <- list(
  H2O2 = c(`-1` = "red", `0` = "black", `1` = "green"),
  IL.1 = c(`-1` = "red", `0` = "black", `1` = "green"),
  JAK.STAT = c(`-1` = "red", `0` = "black", `1` = "green"),
  MAPK.only = c(`-1` = "red", `0` = "black", `1` = "green"),
  MAPK.PI3K = c(`-1` = "red", `0` = "black"),
  TLR = c(`-1` = "red", `0` = "black", `1` = "green"),
  Wnt = c(`-1` = "red", `0` = "black", `1` = "green"),
  VEGF = c(`-1` = "red", `0` = "black", `1` = "green"),
  PI3K.only = c(`-1` = "red", `0` = "black", `1` = "green"),
  TCGA.classification =
    c(
      ALL = "#BC3C29FF", AML = "#E18727FF",
      DLBC = "#20854EFF", "BRCAHer-" = "#0072B5FF",
      "BRCAHer+" = "#7876B1FF"
    ),
  ARID1A_mut = c(`1` = "black", `0` = "grey80"),
  EP300_mut = c(`1` = "black", `0` = "grey80"),
  PTEN_mut = c(`1` = "black", `0` = "grey80"),
  TP53_mut = c(`1` = "black", `0` = "grey80"),
  PIK3CA_mut = c(`1` = "black", `0` = "grey80"),
  BRCA2_mut = c(`1` = "black", `0` = "grey80"),
  BRCA1_mut = c(`1` = "black", `0` = "grey80"),
  CDH1_mut = c(`1` = "black", `0` = "grey80"),
  FBXW7_mut = c(`1` = "black", `0` = "grey80"),
  NRAS_mut = c(`1` = "black", `0` = "grey80"),
  ASXL1_mut = c(`1` = "black", `0` = "grey80"),
  MLL2_mut = c(`1` = "black", `0` = "grey80"),
  ABL1_trans = c(`1` = "black", `0` = "grey80"),
  missing_value_perc = c(`0` = "white", `25` = "red")
)
importanceTab <- result$coefMat
#change "-" to "." in the column name to match cell line annotation
colnames(importanceTab) <- gsub("-",".", colnames(importanceTab),)
plotTab <- importanceTab
# Row normalization while keeping sign
plotTab_scaled <- scale(t(plotTab), center = FALSE, scale = TRUE)
plotTab <- plotTab_scaled
mutation_GDSC$TCGA.classification[mutation_GDSC$TCGA.classification == "BRCA"] <- "BRCAHer-"
mutation_GDSC$TCGA.classification <-
  factor(mutation_GDSC$TCGA.classification,
    levels = c("ALL", "AML", "DLBC", "BRCAHer-", "BRCAHer+")
  )
pheatmap(plotTab,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")),
    bias = 1.8
  )(100),
  annotation_row = mutation_GDSC,
  annotation_colors = annoColor,
  clustering_method = "ward.D2", scale = "none",
  main = "", fontsize = 9,
  fontsize_row = 7, fontsize_col = 10
)

## ----gdsc genomic-------------------------------------------------------------
cell_anno_final <- mutation_GDSC %>%
  select(-missing_value_perc) %>%
  rename(cancer_type = TCGA.classification) %>%
  filter(rownames(mutation_GDSC) %in% colnames(importanceTab))
colnames(cell_anno_final) <- gsub("_mut", "", colnames(cell_anno_final))
colnames(cell_anno_final) <- gsub("_trans", "_translocation",
                                  colnames(cell_anno_final))

## -----------------------------------------------------------------------------
diffImportance <- function(coefMat, Annotation) {
  # process genetic background table
  geneBack <- Annotation
  geneBack <- geneBack[colnames(coefMat), ]
  keepCols <- apply(geneBack, 2, function(x) {
    length(unique(na.omit(x))) >= 2 & all(table(x) > 6)
  })
  geneBack <- geneBack[, keepCols]

  pTab <- lapply(rownames(coefMat), function(targetName) {
    lapply(colnames(geneBack), function(mutName) {
      impVec <- coefMat[targetName, ]
      genoVec <- geneBack[, mutName]
      resTab <- data.frame(
        targetName = targetName, mutName = mutName,
        stringsAsFactors = FALSE
      )
      if (length(unique(na.omit(genoVec))) == 2) {
        res <- t.test(impVec ~ genoVec, var.equal = TRUE, na.action = na.omit)
        resTab$p <- res$p.value
        resTab$FC <- (res$estimate[[2]] - res$estimate[[1]]) / abs(res$estimate[[1]])
      } else if (length(unique(na.omit(genoVec))) >= 3) {
        res <- anova(lm(impVec ~ genoVec, na.action = na.omit))
        diffTab <- data.frame(val = impVec, gr = genoVec) %>%
          filter(!is.na(gr)) %>%
          group_by(gr) %>%
          summarise(meanVal = mean(val))
        resTab$p <- res$`Pr(>F)`[1]
        resTab$FC <- max(diffTab$meanVal) - min(diffTab$meanVal)
      }
      resTab
    }) %>% bind_rows()
  }) %>%
    bind_rows() %>%
    arrange(p) %>%
    mutate(p.adj = p.adjust(p, method = "BH"))
  pTab
}

## ----gdsc t-test--------------------------------------------------------------
testRes <- diffImportance(importanceTab, cell_anno_final)

## -----------------------------------------------------------------------------
CancerType <- testRes %>%
  filter(mutName == "cancer_type") %>%
  filter(p.adj < 0.05, FC > 0.1)

plotTab <- t(scale(t(importanceTab))) %>%
  data.frame() %>%
  rownames_to_column("target") %>%
  gather(key = "CellLine", value = "coef", -target) %>%
  mutate(Cancer_Type = mutation_GDSC[CellLine, ]$TCGA.classification) %>%
  group_by(target, Cancer_Type) %>%
  mutate(meanCoef = mean(coef)) %>%
  arrange(meanCoef) %>%
  ungroup() %>%
  mutate(target = factor(target, levels = unique(target)))

plotTab <- plotTab %>% filter(target %in% CancerType$targetName)

plotTab$Cancer_Type <- factor(plotTab$Cancer_Type,
  levels = c(
    "ALL", "AML", "DLBC",
    "BRCAHer-", "BRCAHer+"
  )
)

## ----fig.height=4, fig.width=10, fig.cap="Distributions of the protein dependence coefficients for kinases with significant association with the cancer types. Within each panel, each point corresponds to a cell line. The coefficients were centered and scaled to obtain a per-protein z-score, and the points are grouped and colored by cancer type (ALL, red; AML, orange; DLBC, green, BRCAHer-, blue; BRCAHer+, purple). Only the associations past the criteria of BH adjusted p-value < 0.05, Fold Change > 0.1 from ANOVA tests are shown."----

ggplot(plotTab, aes(x = target, y = coef, group = Cancer_Type)) +
  geom_jitter(aes(color = Cancer_Type),
    position = position_jitterdodge(
      jitter.width = 0.2,
      dodge.width = 0.8
    ),
    size = 1.2
  ) +
  stat_summary(
    fun = mean, fun.min = mean, fun.max = mean, colour = "grey25",
    geom = "crossbar", size = 0.8,
    position = position_dodge(0.8)
  ) +
  scale_color_manual(
    values = c(
      "#BC3C29FF", "#E18727FF",
      "#20854EFF", "#0072B5FF", "#7876B1FF"
    ),
    guide = guide_legend(override.aes = list(size = 3))
  ) +
  ggtitle("Protein dependence associated with cancer type") +
  ylab("Protein dependence coefficient") +
  xlab("Protein") +
  theme_bw() +
  geom_vline(xintercept = seq(from = 1.5, to = 8.5, by = 1), color = "darkgrey") +
  labs(color = "Cancer Type")

## -----------------------------------------------------------------------------
colList2 <- c(
  `not significant` = "grey80", mutated = "#BC3C29FF",
  unmutated = "#0072B5FF"
)
pos <- position_jitter(width = 0.15, seed = 10)
plotTab <- testRes %>%
  filter(mutName != "cancer_type") %>%
  mutate(type = ifelse(p.adj > 0.1, "not significant",
    ifelse(FC > 0, "mutated", "unmutated")
  )) %>%
  mutate(varName = ifelse(type == "not significant", "", targetName)) %>%
  mutate(p.adj = ifelse(p.adj < 1e-5, 1e-5, p.adj))

# subset for mutation with at least one significant association
plotMut <- unique(filter(testRes, p.adj <= 0.1)$mutName)
plotTab <- plotTab %>% filter(mutName %in% plotMut)
plotTab$type <- factor(plotTab$type,
  levels = c("mutated", "unmutated", "not significant")
)

## ----fig.height=6, fig.width=10, fig.cap = "Overview of the significant protein dependence versus gene mutation associations. This figure shows the -log10(adjusted P values) for each protein-mutation association. The color indicates the direction of association. Red indicates a higher dependence in the mutated samples and blue indicated a lower dependence in mutated samples. P values are from Student's t-test. The dashed line indicates 10% FDR."----

library(ggrepel)
p <- ggplot(data = plotTab, aes(
  x = mutName, y = -log10(p.adj),
  col = type, label = varName
)) +
  geom_text_repel(position = pos, color = "black", size = 6, force = 3) +
  geom_hline(yintercept = -log10(0.1), linetype = "dotted", color = "grey20") +
  geom_point(size = 3, position = pos) +
  ylab(expression(-log[10] * "(" * adjusted ~ italic("P") ~ value * ")")) +
  xlab("Mutation") +
  scale_color_manual(values = colList2) +
  scale_y_continuous(trans = "exp", limits = c(0, 2.5), breaks = c(0, 1, 1.5, 2)) +
  coord_flip() +
  labs(col = "Higher dependence in") +
  theme_bw() +
  theme(
    legend.position = c(0.80, 0.6),
    legend.background = element_rect(fill = NA),
    legend.text = element_text(size = 14),
    legend.title = element_text(size = 16),
    axis.title = element_text(size = 18),
    axis.text = element_text(size = 18)
  )
plot(p)

## -----------------------------------------------------------------------------
library(ggbeeswarm)

# Function to format floats
formatNum <- function(i, limit = 0.01, digits = 1, format = "e") {
  r <- sapply(i, function(n) {
    if (n < limit) {
      formatC(n, digits = digits, format = format)
    } else {
      format(n, digits = digits)
    }
  })
  return(r)
}

#function for plot boxplot
plotDiffBox <- function(pTab, coefMat, cellAnno, fdrCut = 0.05) {
  pTab.sig <- filter(pTab, p.adj <= fdrCut)
  geneBack <- cellAnno
  geneBack <- geneBack[colnames(coefMat), ]
  pList <- lapply(seq(nrow(pTab.sig)), function(i) {
    geno <- pTab.sig[i, ]$mutName
    target <- pTab.sig[i, ]$targetName
    pval <- pTab.sig[i, ]$p
    plotTab <- tibble(
      id = colnames(coefMat),
      mut = geneBack[, geno], val = coefMat[target, ]
    ) %>% filter(!is.na(mut))
    
    plotTab <- mutate(plotTab, mut = ifelse(mut %in% c("M", "1", 1),
        "Mutated", "Unmutated"
      )) %>% mutate(mut = factor(mut, levels = c("Unmutated", "Mutated")))
    
    numTab <- group_by(plotTab, mut) %>% summarise(n = length(id))
    plotTab <- left_join(plotTab, numTab, by = "mut") %>%
      mutate(mutNum = sprintf("%s\n(n=%s)", mut, n)) %>%
      arrange(mut) %>%
      mutate(mutNum = factor(mutNum, levels = unique(mutNum)))
    titleText <- sprintf("%s ~ %s %s", target, geno, "mutations")
    pval <- formatNum(pval, digits = 1, format = "e")
    titleText <- bquote(atop(.(titleText), "(" ~ italic("P") ~ "=" ~ .(pval) ~ ")"))
    ggplot(plotTab, aes(x = mutNum, y = val)) +
      stat_boxplot(geom = "errorbar", width = 0.3) +
      geom_boxplot(outlier.shape = NA, col = "black", width = 0.4) +
      geom_beeswarm(cex = 2, size = 2, aes(col = mutNum)) +
      theme_classic() +
      xlab("") +
      ylab("Protein dependence") +
      ggtitle(titleText) +
      xlab("") +
      scale_color_manual(values = c("#0072B5FF", "#BC3C29FF")) +
      theme(
        axis.line.x = element_blank(), axis.ticks.x = element_blank(),
        axis.title = element_text(size = 18), axis.text = element_text(size = 18),
        plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
        legend.position = "none"
      )
  })
  names(pList) <- paste0(pTab.sig$targetName, "_", pTab.sig$mutName)
  return(pList)
}

## ----fig.height=4.5, fig.width=5, fig.cap="The beeswarm plot shows the protein dependence coefficient of MAP2K2 protein in NRAS mutated and wildtype cell lines. P-value was from Student's t-test."----
pList <- plotDiffBox(testRes, importanceTab, cell_anno_final, 0.05)
pList$MAP2K2_NRAS

## -----------------------------------------------------------------------------
sessionInfo()