## ----message=FALSE, warning=FALSE---------------------------------------------
set.seed(123)
library(DepInfeR)
library(tibble)
library(tidyr)
library(ggplot2)
library(BiocParallel)
library(dplyr)
## ----installation, eval = FALSE-----------------------------------------------
# if (!requireNamespace("BiocManager", quietly = TRUE)) {
# install.packages("BiocManager")
# }
# BiocManager::install("DepInfeR")
## -----------------------------------------------------------------------------
data(targetsGDSC, drug_response_GDSC)
## -----------------------------------------------------------------------------
head(targetsGDSC)
## -----------------------------------------------------------------------------
head(drug_response_GDSC)
## -----------------------------------------------------------------------------
targetsGDSC <- mutate(targetsGDSC,
targetName = ifelse(targetName %in% "BCR", "BCR/ABL", targetName)
)
## -----------------------------------------------------------------------------
targetMatrix <- filter(
targetsGDSC,
targetClassification == "High confidence"
) %>%
select(drugID, targetName, Kd) %>%
pivot_wider(names_from = "targetName", values_from = "Kd") %>%
remove_rownames() %>%
column_to_rownames("drugID") %>%
as.matrix()
## -----------------------------------------------------------------------------
ProcessTargetResults <- processTarget(targetMatrix,
KdAsInput = TRUE,
removeCorrelated = TRUE
)
## -----------------------------------------------------------------------------
responseMatrix <- filter(drug_response_GDSC, `Drug Id` %in% targetsGDSC$drugID) %>%
select(`Drug Id`, `Cell line name`, AUC) %>%
pivot_wider(names_from = `Cell line name`, values_from = AUC) %>%
remove_rownames() %>%
column_to_rownames("Drug Id") %>%
as.matrix()
## ----fig.cap="This figure shows the number of cell lines included in the analysis as a function of the percentage of missing values allowed per cell line."----
missTab <- data.frame(
NA_cutoff = 0,
remain_celllines = 0, stringsAsFactors = FALSE
)
for (i in 0:138) {
a <- dim(responseMatrix[, colSums(is.na(responseMatrix)) <= i])[2]
missTab[i, ] <- c(i, a)
}
ggplot(missTab, aes(x=NA_cutoff, y=remain_celllines)) +
geom_line() + theme_bw() +
xlab("Allowed NA values") +
ylab("Number of remaining cell lines") +
geom_vline(xintercept = 24, color = "red", linetype = "dashed")
## -----------------------------------------------------------------------------
responseMatrix <- responseMatrix[, colSums(is.na(responseMatrix)) <= 24]
## -----------------------------------------------------------------------------
library(missForest)
impRes <- missForest(t(responseMatrix))
imp_missforest <- impRes$ximp
responseMatrix_imputed <- t(imp_missforest)
## -----------------------------------------------------------------------------
responseMatrix_scaled <- t(scale(t(responseMatrix_imputed)))
## -----------------------------------------------------------------------------
targetInput <- ProcessTargetResults$targetMatrix
overlappedDrugs <- intersect(
rownames(responseMatrix_scaled),
rownames(targetInput)
)
targetInput <- targetInput[overlappedDrugs, ]
responseInput <- responseMatrix_scaled[overlappedDrugs, ]
## -----------------------------------------------------------------------------
#set up BiocParallel back-end
param <- MulticoreParam(workers = 2, RNGseed = 333)
result <- runLASSORegression(
TargetMatrix = targetInput,
ResponseMatrix = responseInput, repeats = 3,
BPPARAM = param
)
## -----------------------------------------------------------------------------
useTar <- rowSums(result$coefMat != 0) > 0
result$coefMat <- result$coefMat[useTar, ]
## -----------------------------------------------------------------------------
nrow(result$coefMat)
## ----fig.cap="Heatmap visualization of the protein dependence values for selected kinases, inferred from the GDSC drug screen dataset. Red indicates a high dependence value and blue indicates a low dependence value. In the column annotations, cell lines with certain genomic variations are colored black. Columns are ordered by hierarchical clustering based on euclidean distance.", fig.height=13, fig.width=9----
library(RColorBrewer)
library(pheatmap)
# firstly, we need to load the processed mutation annotation of the cell lines
data("mutation_GDSC")
#setup column annotation and color scheme
annoColor <- list(
H2O2 = c(`-1` = "red", `0` = "black", `1` = "green"),
IL.1 = c(`-1` = "red", `0` = "black", `1` = "green"),
JAK.STAT = c(`-1` = "red", `0` = "black", `1` = "green"),
MAPK.only = c(`-1` = "red", `0` = "black", `1` = "green"),
MAPK.PI3K = c(`-1` = "red", `0` = "black"),
TLR = c(`-1` = "red", `0` = "black", `1` = "green"),
Wnt = c(`-1` = "red", `0` = "black", `1` = "green"),
VEGF = c(`-1` = "red", `0` = "black", `1` = "green"),
PI3K.only = c(`-1` = "red", `0` = "black", `1` = "green"),
TCGA.classification =
c(
ALL = "#BC3C29FF", AML = "#E18727FF",
DLBC = "#20854EFF", "BRCAHer-" = "#0072B5FF",
"BRCAHer+" = "#7876B1FF"
),
ARID1A_mut = c(`1` = "black", `0` = "grey80"),
EP300_mut = c(`1` = "black", `0` = "grey80"),
PTEN_mut = c(`1` = "black", `0` = "grey80"),
TP53_mut = c(`1` = "black", `0` = "grey80"),
PIK3CA_mut = c(`1` = "black", `0` = "grey80"),
BRCA2_mut = c(`1` = "black", `0` = "grey80"),
BRCA1_mut = c(`1` = "black", `0` = "grey80"),
CDH1_mut = c(`1` = "black", `0` = "grey80"),
FBXW7_mut = c(`1` = "black", `0` = "grey80"),
NRAS_mut = c(`1` = "black", `0` = "grey80"),
ASXL1_mut = c(`1` = "black", `0` = "grey80"),
MLL2_mut = c(`1` = "black", `0` = "grey80"),
ABL1_trans = c(`1` = "black", `0` = "grey80"),
missing_value_perc = c(`0` = "white", `25` = "red")
)
importanceTab <- result$coefMat
#change "-" to "." in the column name to match cell line annotation
colnames(importanceTab) <- gsub("-",".", colnames(importanceTab),)
plotTab <- importanceTab
# Row normalization while keeping sign
plotTab_scaled <- scale(t(plotTab), center = FALSE, scale = TRUE)
plotTab <- plotTab_scaled
mutation_GDSC$TCGA.classification[mutation_GDSC$TCGA.classification == "BRCA"] <- "BRCAHer-"
mutation_GDSC$TCGA.classification <-
factor(mutation_GDSC$TCGA.classification,
levels = c("ALL", "AML", "DLBC", "BRCAHer-", "BRCAHer+")
)
pheatmap(plotTab,
color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")),
bias = 1.8
)(100),
annotation_row = mutation_GDSC,
annotation_colors = annoColor,
clustering_method = "ward.D2", scale = "none",
main = "", fontsize = 9,
fontsize_row = 7, fontsize_col = 10
)
## ----gdsc genomic-------------------------------------------------------------
cell_anno_final <- mutation_GDSC %>%
select(-missing_value_perc) %>%
rename(cancer_type = TCGA.classification) %>%
filter(rownames(mutation_GDSC) %in% colnames(importanceTab))
colnames(cell_anno_final) <- gsub("_mut", "", colnames(cell_anno_final))
colnames(cell_anno_final) <- gsub("_trans", "_translocation",
colnames(cell_anno_final))
## -----------------------------------------------------------------------------
diffImportance <- function(coefMat, Annotation) {
# process genetic background table
geneBack <- Annotation
geneBack <- geneBack[colnames(coefMat), ]
keepCols <- apply(geneBack, 2, function(x) {
length(unique(na.omit(x))) >= 2 & all(table(x) > 6)
})
geneBack <- geneBack[, keepCols]
pTab <- lapply(rownames(coefMat), function(targetName) {
lapply(colnames(geneBack), function(mutName) {
impVec <- coefMat[targetName, ]
genoVec <- geneBack[, mutName]
resTab <- data.frame(
targetName = targetName, mutName = mutName,
stringsAsFactors = FALSE
)
if (length(unique(na.omit(genoVec))) == 2) {
res <- t.test(impVec ~ genoVec, var.equal = TRUE, na.action = na.omit)
resTab$p <- res$p.value
resTab$FC <- (res$estimate[[2]] - res$estimate[[1]]) / abs(res$estimate[[1]])
} else if (length(unique(na.omit(genoVec))) >= 3) {
res <- anova(lm(impVec ~ genoVec, na.action = na.omit))
diffTab <- data.frame(val = impVec, gr = genoVec) %>%
filter(!is.na(gr)) %>%
group_by(gr) %>%
summarise(meanVal = mean(val))
resTab$p <- res$`Pr(>F)`[1]
resTab$FC <- max(diffTab$meanVal) - min(diffTab$meanVal)
}
resTab
}) %>% bind_rows()
}) %>%
bind_rows() %>%
arrange(p) %>%
mutate(p.adj = p.adjust(p, method = "BH"))
pTab
}
## ----gdsc t-test--------------------------------------------------------------
testRes <- diffImportance(importanceTab, cell_anno_final)
## -----------------------------------------------------------------------------
CancerType <- testRes %>%
filter(mutName == "cancer_type") %>%
filter(p.adj < 0.05, FC > 0.1)
plotTab <- t(scale(t(importanceTab))) %>%
data.frame() %>%
rownames_to_column("target") %>%
gather(key = "CellLine", value = "coef", -target) %>%
mutate(Cancer_Type = mutation_GDSC[CellLine, ]$TCGA.classification) %>%
group_by(target, Cancer_Type) %>%
mutate(meanCoef = mean(coef)) %>%
arrange(meanCoef) %>%
ungroup() %>%
mutate(target = factor(target, levels = unique(target)))
plotTab <- plotTab %>% filter(target %in% CancerType$targetName)
plotTab$Cancer_Type <- factor(plotTab$Cancer_Type,
levels = c(
"ALL", "AML", "DLBC",
"BRCAHer-", "BRCAHer+"
)
)
## ----fig.height=4, fig.width=10, fig.cap="Distributions of the protein dependence coefficients for kinases with significant association with the cancer types. Within each panel, each point corresponds to a cell line. The coefficients were centered and scaled to obtain a per-protein z-score, and the points are grouped and colored by cancer type (ALL, red; AML, orange; DLBC, green, BRCAHer-, blue; BRCAHer+, purple). Only the associations past the criteria of BH adjusted p-value < 0.05, Fold Change > 0.1 from ANOVA tests are shown."----
ggplot(plotTab, aes(x = target, y = coef, group = Cancer_Type)) +
geom_jitter(aes(color = Cancer_Type),
position = position_jitterdodge(
jitter.width = 0.2,
dodge.width = 0.8
),
size = 1.2
) +
stat_summary(
fun = mean, fun.min = mean, fun.max = mean, colour = "grey25",
geom = "crossbar", size = 0.8,
position = position_dodge(0.8)
) +
scale_color_manual(
values = c(
"#BC3C29FF", "#E18727FF",
"#20854EFF", "#0072B5FF", "#7876B1FF"
),
guide = guide_legend(override.aes = list(size = 3))
) +
ggtitle("Protein dependence associated with cancer type") +
ylab("Protein dependence coefficient") +
xlab("Protein") +
theme_bw() +
geom_vline(xintercept = seq(from = 1.5, to = 8.5, by = 1), color = "darkgrey") +
labs(color = "Cancer Type")
## -----------------------------------------------------------------------------
colList2 <- c(
`not significant` = "grey80", mutated = "#BC3C29FF",
unmutated = "#0072B5FF"
)
pos <- position_jitter(width = 0.15, seed = 10)
plotTab <- testRes %>%
filter(mutName != "cancer_type") %>%
mutate(type = ifelse(p.adj > 0.1, "not significant",
ifelse(FC > 0, "mutated", "unmutated")
)) %>%
mutate(varName = ifelse(type == "not significant", "", targetName)) %>%
mutate(p.adj = ifelse(p.adj < 1e-5, 1e-5, p.adj))
# subset for mutation with at least one significant association
plotMut <- unique(filter(testRes, p.adj <= 0.1)$mutName)
plotTab <- plotTab %>% filter(mutName %in% plotMut)
plotTab$type <- factor(plotTab$type,
levels = c("mutated", "unmutated", "not significant")
)
## ----fig.height=6, fig.width=10, fig.cap = "Overview of the significant protein dependence versus gene mutation associations. This figure shows the -log10(adjusted P values) for each protein-mutation association. The color indicates the direction of association. Red indicates a higher dependence in the mutated samples and blue indicated a lower dependence in mutated samples. P values are from Student's t-test. The dashed line indicates 10% FDR."----
library(ggrepel)
p <- ggplot(data = plotTab, aes(
x = mutName, y = -log10(p.adj),
col = type, label = varName
)) +
geom_text_repel(position = pos, color = "black", size = 6, force = 3) +
geom_hline(yintercept = -log10(0.1), linetype = "dotted", color = "grey20") +
geom_point(size = 3, position = pos) +
ylab(expression(-log[10] * "(" * adjusted ~ italic("P") ~ value * ")")) +
xlab("Mutation") +
scale_color_manual(values = colList2) +
scale_y_continuous(trans = "exp", limits = c(0, 2.5), breaks = c(0, 1, 1.5, 2)) +
coord_flip() +
labs(col = "Higher dependence in") +
theme_bw() +
theme(
legend.position = c(0.80, 0.6),
legend.background = element_rect(fill = NA),
legend.text = element_text(size = 14),
legend.title = element_text(size = 16),
axis.title = element_text(size = 18),
axis.text = element_text(size = 18)
)
plot(p)
## -----------------------------------------------------------------------------
library(ggbeeswarm)
# Function to format floats
formatNum <- function(i, limit = 0.01, digits = 1, format = "e") {
r <- sapply(i, function(n) {
if (n < limit) {
formatC(n, digits = digits, format = format)
} else {
format(n, digits = digits)
}
})
return(r)
}
#function for plot boxplot
plotDiffBox <- function(pTab, coefMat, cellAnno, fdrCut = 0.05) {
pTab.sig <- filter(pTab, p.adj <= fdrCut)
geneBack <- cellAnno
geneBack <- geneBack[colnames(coefMat), ]
pList <- lapply(seq(nrow(pTab.sig)), function(i) {
geno <- pTab.sig[i, ]$mutName
target <- pTab.sig[i, ]$targetName
pval <- pTab.sig[i, ]$p
plotTab <- tibble(
id = colnames(coefMat),
mut = geneBack[, geno], val = coefMat[target, ]
) %>% filter(!is.na(mut))
plotTab <- mutate(plotTab, mut = ifelse(mut %in% c("M", "1", 1),
"Mutated", "Unmutated"
)) %>% mutate(mut = factor(mut, levels = c("Unmutated", "Mutated")))
numTab <- group_by(plotTab, mut) %>% summarise(n = length(id))
plotTab <- left_join(plotTab, numTab, by = "mut") %>%
mutate(mutNum = sprintf("%s\n(n=%s)", mut, n)) %>%
arrange(mut) %>%
mutate(mutNum = factor(mutNum, levels = unique(mutNum)))
titleText <- sprintf("%s ~ %s %s", target, geno, "mutations")
pval <- formatNum(pval, digits = 1, format = "e")
titleText <- bquote(atop(.(titleText), "(" ~ italic("P") ~ "=" ~ .(pval) ~ ")"))
ggplot(plotTab, aes(x = mutNum, y = val)) +
stat_boxplot(geom = "errorbar", width = 0.3) +
geom_boxplot(outlier.shape = NA, col = "black", width = 0.4) +
geom_beeswarm(cex = 2, size = 2, aes(col = mutNum)) +
theme_classic() +
xlab("") +
ylab("Protein dependence") +
ggtitle(titleText) +
xlab("") +
scale_color_manual(values = c("#0072B5FF", "#BC3C29FF")) +
theme(
axis.line.x = element_blank(), axis.ticks.x = element_blank(),
axis.title = element_text(size = 18), axis.text = element_text(size = 18),
plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
legend.position = "none"
)
})
names(pList) <- paste0(pTab.sig$targetName, "_", pTab.sig$mutName)
return(pList)
}
## ----fig.height=4.5, fig.width=5, fig.cap="The beeswarm plot shows the protein dependence coefficient of MAP2K2 protein in NRAS mutated and wildtype cell lines. P-value was from Student's t-test."----
pList <- plotDiffBox(testRes, importanceTab, cell_anno_final, 0.05)
pList$MAP2K2_NRAS
## -----------------------------------------------------------------------------
sessionInfo()