## ----style, echo = FALSE, results = 'asis'--------------------------------- BiocStyle::markdown() ## ----load_libraries, message=FALSE, warning=FALSE, echo=TRUE--------------- library("dplyr") library("ggplot2") library("viridis") library("tibble") library("gridExtra") library("stringr") library("depmap") library("ExperimentHub") ## ----load_data, message=FALSE, warning=FALSE, echo=TRUE-------------------- ## create ExperimentHub query object eh <- ExperimentHub() query(eh, "depmap") rnai <- eh[["EH2260"]] crispr <- eh[["EH2261"]] copyNumber <- eh[["EH2262"]] # note: the datasets listed above are from the 19Q1 release. Newer datasets, # such as 19Q2 and 19Q3 are available. ## ----dep_score_BRCA1_184A1Breast, echo=TRUE-------------------------------- dep_score_BRCA1_184A1Breast <- rnai %>% select(cell_line, gene_name, dependency) %>% filter(cell_line == "184A1_BREAST", gene_name == "BRCA1") dep_score_BRCA1_184A1Breast ## ---- BRCA1_Avg_Dep_Score, echo=TRUE--------------------------------------- brca1_dep_score_avg_rnai <- rnai %>% select(gene_name, dependency) %>% filter(gene_name == "BRCA1") %>% summarise(mean_dependency_brca1 = mean(dependency, na.rm=TRUE)) brca1_dep_score_avg_rnai ## ---- all_gene_ds_avg_rnai, echo=TRUE-------------------------------------- all_gene_dep_score_avg_rnai <- rnai %>% select(gene_name, dependency) %>% summarise(mean_dependency_all_genes_rnai = mean(dependency, na.rm=TRUE)) all_gene_dep_score_avg_rnai ## ---- soft_tissue_cell_lines, echo=TRUE------------------------------------ soft_tissue_dependency_rnai <- rnai %>% select(cell_line, gene_name, dependency) %>% filter(stringr::str_detect(cell_line, "SOFT_TISSUE")) %>% arrange(dependency) soft_tissue_dependency_rnai ## ----cell_lines_with_entrez_id_NRF2, echo=TRUE----------------------------- entrez_id_NRF2 <- rnai %>% select(entrez_id, cell_line, gene_name, dependency) %>% filter(entrez_id == "4780") entrez_id_NRF2 ## ----greatest_Dep_Score_NFE2L2, echo=TRUE---------------------------------- top_dep_score_NFE2L2_rnai <- rnai %>% select(cell_line, gene_name, dependency) %>% filter(gene_name == "NFE2L2") %>% arrange(dependency) top_dep_score_NFE2L2_rnai ## ----top_dep_score_NCIH2066_LUNG_rnai, echo=TRUE--------------------------- top_dep_score_NCIH2066_LUNG_rnai <- rnai %>% select(cell_line, gene_name, dependency) %>% filter(cell_line == "NCIH2066_LUNG") %>% arrange(dependency) top_dep_score_NCIH2066_LUNG_rnai ## ----greatest_dep_score_gene_rnai, echo=TRUE------------------------------- greatest_dep_score_gene_rnai <- rnai %>% select(cell_line, gene_name, dependency) %>% arrange(dependency) greatest_dep_score_gene_rnai ## ----cell_line_gene_rnai_lowest_dep_score, echo=TRUE----------------------- lowest_dep_score_gene_rnai <- rnai %>% select(cell_line, gene_name, dependency) %>% arrange(desc(dependency)) lowest_dep_score_gene_rnai ## ----cell_line_gene_crispr_greatest_dep_score, echo=TRUE------------------- greatest_dep_score_gene_crispr <- crispr %>% select(cell_line, gene_name, dependency) %>% arrange(dependency) greatest_dep_score_gene_crispr ## ----cell_line_gene_crispr_lowest_dep_score, echo=TRUE--------------------- lowest_dep_score_gene_crispr <- crispr %>% select(cell_line, gene_name, dependency) %>% arrange(desc(dependency)) lowest_dep_score_gene_crispr ## ----comparison_rnai_crispr_dep_scores, fig.height=6, fig.width=7, fig.align="center", echo=FALSE, message=FALSE---- # sort `crispr` dep scores by most cancer inducing top_20_dep_scores_crispr <- crispr %>% select(gene_name, dependency) %>% arrange(desc(dependency)) %>% top_n(20) # sort `crispr` dep scores by least cancer inducing lowest_20_dep_scores_crispr <- crispr %>% select(gene_name, dependency) %>% arrange(dependency) %>% top_n(-20) # sort `rnai` dep scores by most cancer inducing top_20_dep_scores_rnai <- rnai %>% select(gene_name, dependency) %>% arrange(desc(dependency)) %>% top_n(20) # shorten gene name so it fits on plot top_20_dep_scores_rnai[12, 1] <- "GAS6-AS2" # sort `rnai` dep scores by least cancer inducing lowest_20_dep_scores_rnai <- rnai %>% select(gene_name, dependency) %>% arrange(dependency) %>% top_n(-20) # rnai highest dep scores p1 <- ggplot(lowest_20_dep_scores_rnai) + aes(gene_name, dependency, color = dependency) + geom_point() + scale_colour_viridis() + theme(axis.text.x = element_text(angle=90, hjust=1)) + ggtitle("High DS genes rnai") # crispr highest dep scores p2 <- ggplot(lowest_20_dep_scores_crispr) + aes(gene_name, dependency, color = dependency) + geom_point() + scale_colour_viridis() + theme(axis.text.x = element_text(angle=90, hjust=1)) + ggtitle("High DS genes crispr ") # rnai lowest dep scores p3 <- ggplot(top_20_dep_scores_rnai) + aes(gene_name, dependency, color = dependency) + geom_point() + scale_colour_viridis() + theme(axis.text.x = element_text(angle=90, hjust=1)) + ggtitle("Low DS genes rnai") # crispr lowest dep scores p4 <- ggplot(top_20_dep_scores_crispr) + aes(gene_name, dependency, color = dependency) + geom_point() + scale_colour_viridis() + theme(axis.text.x = element_text(angle=90, hjust=1)) + ggtitle("Low DS genes crispr") #plot as 1x2 grid grid.arrange(p1, p2, p3, p4, nrow=2, top = "Most Extreme Dep Scores for CRISPR and RNAI") ## ----count_comp_rnai_crispr_dep_scores, fig.height=6, fig.width=7, fig.align="center", echo=FALSE, message=FALSE---- # get counts of top 50 genes in `crispr` that are most cancer-vitality inducing unique_lowest_dep_scores_gene_crispr <- crispr %>% select(gene_name, dependency) %>% arrange(desc(dependency)) %>% top_n(50) %>% count(gene_name) %>% arrange(desc(n)) # TBC1D3 appears to be an extremely common for `crispr` and dominates top 100 # most cancer-vitality-inducing genes. # get counts of top 50 genes in `rnai` that are most cancer-vitality inducing unique_lowest_dep_scores_gene_rnai <- rnai %>% select(gene_name, dependency) %>% arrange(desc(dependency)) %>% top_n(50) %>% count(gene_name) %>% arrange(desc(n)) # Whereas for `rnai` UBBP4 and ACTG1P4 are most common. # shorten gene name to fit on plot unique_lowest_dep_scores_gene_rnai[9, 1] <- "GAS6-AS2" # get counts of top 50 genes in `crispr` with greatest dependency unique_top_dep_scores_crispr <- crispr %>% select(gene_name, dependency) %>% arrange(dependency) %>% top_n(-50) %>% count(gene_name) %>% arrange(desc(n)) # Most common gene for top 100 most dependent genes in `crispr` is RAN # get counts of top 50 genes in `rnai` with greatest dependency unique_top_dep_scores_rnai <- rnai %>% select(gene_name, dependency) %>% arrange(dependency) %>% top_n(-50) %>% count(gene_name) %>% arrange(desc(n)) # Most common genes for top 100 most dependent genes in `rnai` are RPL7, EIF3B # and RPL14. # rnai highest dep scores p5 <- ggplot(unique_top_dep_scores_crispr, aes(x=gene_name, y=n)) + geom_bar(stat='identity', fill="steelblue2") + ylab("count") + theme(axis.text.x = element_text(angle=90, hjust=1)) + ggtitle("Highest DS genes crispr") p6 <- ggplot(unique_top_dep_scores_rnai, aes(x=gene_name, y=n)) + geom_bar(stat='identity', fill="steelblue2") + ylab("count") + theme(axis.text.x = element_text(angle=90, hjust=1)) + ggtitle("Highest DS genes rnai") p7 <- ggplot(unique_lowest_dep_scores_gene_crispr, aes(x=gene_name, y=n)) + geom_bar(stat='identity', fill="steelblue2") + ylab("count") + theme(axis.text.x = element_text(angle=90, hjust=1)) + ggtitle("Lowest DS genes crispr") p8 <- ggplot(unique_lowest_dep_scores_gene_rnai, aes(x=gene_name, y=n)) + geom_bar(stat='identity', fill="steelblue2") + ylab("count") + theme(axis.text.x = element_text(angle=90, hjust=1)) + ggtitle("Lowest DS genes rnai") ## plot as 1x2 grid grid.arrange(p5, p6, p7, p8, nrow=2, top = "Top 50 CRISPR and RNAI genes with High and Low Dep Score") ## ----mean_log_copy_num_gene, fig.height=4, fig.width=6, fig.align="center", echo=FALSE, message=FALSE---- # mean log copy number for all genes mean_log_copy_num_gene <- copyNumber %>% select(gene_name, log_copy_number) %>% summarise(mean_log_copy_number_all_genes = mean(log_copy_number, na.rm = TRUE)) # get average value val_mean_log_copy_num_gene <- as.numeric(mean_log_copy_num_gene[1,1]) # mean log copy number for each gene each_log_copy_num_gene <- copyNumber %>% select(gene_name, log_copy_number) %>% group_by(gene_name) %>% summarise(mean_log_copy_number = mean(log_copy_number, na.rm = TRUE)) # add an ID column all_log_copy_num_gene <- as.data.frame(na.omit(each_log_copy_num_gene$mean_log_copy_number)) %>% tibble::rowid_to_column(., "ID") # add col names colnames(all_log_copy_num_gene, do.NULL = FALSE) colnames(all_log_copy_num_gene) <- c("gene", "log_copy_number") # plot of mean copy number for every gene p9 <- ggplot(all_log_copy_num_gene) + aes(x=gene, y=log_copy_number, color = log_copy_number) + geom_point() + scale_colour_viridis() + geom_hline(yintercept = val_mean_log_copy_num_gene, linetype = "dashed", color = "red") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + ggtitle("Mean log copy number change for every gene") p9 ## ----greatest_log_copy_num_genes, fig.height=4, fig.width=6, fig.align="center", echo=FALSE, message=FALSE---- # mean log copy number for each gene greatest_log_copy_num_genes <- copyNumber %>% select(gene_name, log_copy_number) %>% group_by(gene_name) %>% summarise(mean_long_copy_num = mean(log_copy_number))%>% arrange(desc(mean_long_copy_num)) %>% top_n(20) p10 <- ggplot(greatest_log_copy_num_genes, aes(x=gene_name, y=mean_long_copy_num)) + geom_bar(stat='identity', fill="steelblue2") + ylab("log Copy Number") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + ggtitle("Top 20 genes with largest log copy number") p10 ## ----echo = FALSE---------------------------------------------------------- sessionInfo()