## ----setup-options, include = FALSE------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.align = "center", warning = FALSE, message = FALSE, out.width = "100%", dpi = 150 ) ## ----setup, message=FALSE----------------------------------------------------- library(PubMatrixR) library(knitr) library(kableExtra) library(dplyr) library(pheatmap) library(ggplot2) ## ----eval=FALSE--------------------------------------------------------------- # result <- PubMatrix( # A = gene_set_1, # B = gene_set_2, # API.key = "your_api_key_here", # Database = "pubmed" # ) ## ----gene_extraction, eval = FALSE-------------------------------------------- # # Extract WNT-related genes # A <- msigdf::msigdf.human %>% # dplyr::filter(grepl(geneset, pattern = "wnt", ignore.case = TRUE)) %>% # dplyr::pull(symbol) %>% # unique() # # # Extract obesity-related genes # B <- msigdf::msigdf.human %>% # dplyr::filter(grepl(geneset, pattern = "obesity", ignore.case = TRUE)) %>% # dplyr::pull(symbol) %>% # unique() # # # Sample genes for demonstration (making them equal in length) # A <- sample(A, 10, replace = FALSE) # B <- sample(B, 10, replace = FALSE) ## ----example_genes------------------------------------------------------------ # WNT signaling pathway genes A <- c("WNT1", "WNT2", "WNT3A", "WNT5A", "WNT7B", "CTNNB1", "DVL1") # Obesity-related genes B <- c("LEPR", "ADIPOQ", "PPARG", "TNF", "IL6", "ADRB2", "INSR") ## ----pubmatrix_analysis, eval = FALSE----------------------------------------- # # Intentionally not run during package checks: this performs live NCBI queries. # # Run actual PubMatrix analysis # current_year <- as.integer(format(Sys.Date(), "%Y")) # result <- PubMatrix( # A = A, # B = B, # Database = "pubmed", # daterange = c(2020, current_year), # outfile = "pubmatrix_result" # ) ## ----pubmatrix_analysis_offline----------------------------------------------- # Offline deterministic example used for vignette rendering/package checks. result <- outer(seq_along(B), seq_along(A), function(i, j) { 8 + (i * 6) + (j * 5) + ((i + j) %% 4) * 3 + ((i * j) %% 5) }) result <- as.data.frame(result, check.names = FALSE) colnames(result) <- A rownames(result) <- B ## ----results_table------------------------------------------------------------ kable(result, caption = "Co-occurrence Matrix: WNT Genes vs Obesity Genes (Publication Counts)", align = "c", format = if (knitr::pandoc_to() == "html") "html" else "markdown" ) %>% kableExtra::kable_styling( bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE, position = "center" ) %>% kableExtra::add_header_above(c(" " = 1, "Obesity Genes" = length(B))) ## ----bar_plots, fig.width=10, fig.height=7, out.width="100%", dpi=150--------- # Create data frame for List A genes (rows) colored by List B genes (columns) a_genes_data <- data.frame( gene = rownames(result), total_pubs = rowSums(result), stringsAsFactors = FALSE ) # Add color coding based on max overlap with B genes a_genes_data$max_b_gene <- apply(result, 1, function(x) colnames(result)[which.max(x)]) a_genes_data$max_overlap <- apply(result, 1, max) # Create data frame for List B genes (columns) colored by List A genes (rows) b_genes_data <- data.frame( gene = colnames(result), total_pubs = colSums(result), stringsAsFactors = FALSE ) # Add color coding based on max overlap with A genes b_genes_data$max_a_gene <- apply(result, 2, function(x) rownames(result)[which.max(x)]) b_genes_data$max_overlap <- apply(result, 2, max) bar_plot_theme <- theme_minimal(base_size = 12) + theme( plot.title = element_text(face = "bold", size = 15), plot.subtitle = element_text(size = 10), axis.title = element_text(size = 11), axis.text = element_text(size = 10), panel.grid.major.y = element_blank(), legend.position = "bottom", legend.title = element_text(size = 10), legend.text = element_text(size = 9), legend.box = "vertical" ) n_fill_a <- length(unique(a_genes_data$max_b_gene)) n_fill_b <- length(unique(b_genes_data$max_a_gene)) # Plot A genes colored by their strongest B gene partner p1 <- ggplot(a_genes_data, aes(x = reorder(gene, total_pubs), y = total_pubs, fill = max_b_gene)) + geom_col(width = 0.75) + coord_flip() + labs( title = "List A Genes by Publication Count", subtitle = "Colored by strongest List B partner", x = "Genes (List A)", y = "Total Publications", fill = "Strongest B Partner" ) + scale_fill_viridis_d(option = "D", end = 0.9) + guides(fill = guide_legend(nrow = max(1, ceiling(n_fill_a / 4)), byrow = TRUE)) + bar_plot_theme # Plot B genes colored by their strongest A gene partner p2 <- ggplot(b_genes_data, aes(x = reorder(gene, total_pubs), y = total_pubs, fill = max_a_gene)) + geom_col(width = 0.75) + coord_flip() + labs( title = "List B Genes by Publication Count", subtitle = "Colored by strongest List A partner", x = "Genes (List B)", y = "Total Publications", fill = "Strongest A Partner" ) + scale_fill_viridis_d(option = "D", end = 0.9) + guides(fill = guide_legend(nrow = max(1, ceiling(n_fill_b / 4)), byrow = TRUE)) + bar_plot_theme print(p1) print(p2) ## ----heatmap_with_numbers_asymmetric, fig.width=10.5, fig.height=7.5, out.width="100%", dpi=150, fig.cap="Overlap percentage heatmap with values displayed in each cell"---- plot_pubmatrix_heatmap(result, title = "WNT-Obesity Overlap (%)", show_numbers = TRUE, cellwidth = 44, cellheight = 32, width = 12, height = 10 ) ## ----heatmap_clean_asymmetric, fig.width=10.5, fig.height=7.5, out.width="100%", dpi=150, fig.cap="Co-occurrence heatmap without numbers for better visual clarity"---- plot_pubmatrix_heatmap(result, title = "WNT-Obesity Co-occurrence (Clean)", show_numbers = FALSE, cellwidth = 44, cellheight = 32, width = 12, height = 10 ) ## ----asymmetric_example, eval = FALSE----------------------------------------- # # Intentionally not run during package checks: this performs live NCBI queries. # A <- c("NCOR2", "NCSTN", "NKD1", "NOTCH1", "NOTCH4", "NUMB", "PPARD", "PSEN2", "PTCH1", "RBPJ", "SKP2", "TCF7", "TP53") # # # B <- c("EIF1", "EIF1AX", "EIF2B1", "EIF2B2", "EIF2B3", "EIF2B4", "EIF2B5", "EIF2S1", "EIF2S2", "EIF2S3", "ELAVL1") # # # # Run actual PubMatrix analysis # current_year <- as.integer(format(Sys.Date(), "%Y")) # result <- PubMatrix( # A = A, # B = B, # Database = "pubmed", # daterange = c(2020, current_year), # outfile = "pubmatrix_result" # ) ## ----asymmetric_example_offline----------------------------------------------- # Offline deterministic example used for vignette rendering/package checks. A <- c("NCOR2", "NCSTN", "NKD1", "NOTCH1", "NOTCH4", "NUMB", "PPARD", "PSEN2", "PTCH1", "RBPJ", "SKP2", "TCF7", "TP53") B <- c("EIF1", "EIF1AX", "EIF2B1", "EIF2B2", "EIF2B3", "EIF2B4", "EIF2B5", "EIF2S1", "EIF2S2", "EIF2S3", "ELAVL1") result <- outer(seq_along(B), seq_along(A), function(i, j) { 12 + (i * 4) + (j * 3) + ((i * j) %% 9) + ((i + 2 * j) %% 6) }) result <- as.data.frame(result, check.names = FALSE) colnames(result) <- A rownames(result) <- B ## ----results_table_asymmetric------------------------------------------------- kable(result, caption = "Co-occurrence Matrix: Longer Lists (Publication Counts)", align = "c", format = if (knitr::pandoc_to() == "html") "html" else "markdown" ) %>% kableExtra::kable_styling( bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE, position = "center" ) %>% kableExtra::add_header_above(c(" " = 1, "A Genes" = length(A))) ## ----bar_plots_asymmetric, fig.width=10, fig.height=8, out.width="100%", dpi=150---- # Create data frame for List A genes (rows) colored by List B genes (columns) a_genes_data2 <- data.frame( gene = rownames(result), total_pubs = rowSums(result), stringsAsFactors = FALSE ) # Add color coding based on max overlap with B genes a_genes_data2$max_b_gene <- apply(result, 1, function(x) colnames(result)[which.max(x)]) a_genes_data2$max_overlap <- apply(result, 1, max) # Create data frame for List B genes (columns) colored by List A genes (rows) b_genes_data2 <- data.frame( gene = colnames(result), total_pubs = colSums(result), stringsAsFactors = FALSE ) # Add color coding based on max overlap with A genes b_genes_data2$max_a_gene <- apply(result, 2, function(x) rownames(result)[which.max(x)]) b_genes_data2$max_overlap <- apply(result, 2, max) bar_plot_theme <- theme_minimal(base_size = 12) + theme( plot.title = element_text(face = "bold", size = 15), plot.subtitle = element_text(size = 10), axis.title = element_text(size = 11), axis.text = element_text(size = 10), panel.grid.major.y = element_blank(), legend.position = "bottom", legend.title = element_text(size = 10), legend.text = element_text(size = 8.5), legend.box = "vertical" ) n_fill_a2 <- length(unique(a_genes_data2$max_b_gene)) n_fill_b2 <- length(unique(b_genes_data2$max_a_gene)) # Plot A genes colored by their strongest B gene partner p3 <- ggplot(a_genes_data2, aes(x = reorder(gene, total_pubs), y = total_pubs, fill = max_b_gene)) + geom_col(width = 0.75) + coord_flip() + labs( title = "List A Genes by Publication Count (Asymmetric)", subtitle = "Colored by strongest List B partner", x = "Genes (List A)", y = "Total Publications", fill = "Strongest B Partner" ) + scale_fill_viridis_d(option = "D", end = 0.9) + guides(fill = guide_legend(nrow = max(1, ceiling(n_fill_a2 / 4)), byrow = TRUE)) + bar_plot_theme # Plot B genes colored by their strongest A gene partner p4 <- ggplot(b_genes_data2, aes(x = reorder(gene, total_pubs), y = total_pubs, fill = max_a_gene)) + geom_col(width = 0.75) + coord_flip() + labs( title = "List B Genes by Publication Count (Asymmetric)", subtitle = "Colored by strongest List A partner", x = "Genes (List B)", y = "Total Publications", fill = "Strongest A Partner" ) + scale_fill_viridis_d(option = "D", end = 0.9) + guides(fill = guide_legend(nrow = max(1, ceiling(n_fill_b2 / 4)), byrow = TRUE)) + bar_plot_theme print(p3) print(p4) ## ----heatmap_with_numbers_asymmetric2, fig.width=10.5, fig.height=7.5, out.width="100%", dpi=150, fig.cap="Overlap percentage heatmap with values displayed in each cell"---- plot_pubmatrix_heatmap(result, title = "Asymmetric Lists Overlap (%)", show_numbers = TRUE, cellwidth = 44, cellheight = 32, width = 12, height = 10 ) ## ----heatmap_clean_asymmetric2, fig.width=10.5, fig.height=7.5, out.width="100%", dpi=150, fig.cap="Co-occurrence heatmap without numbers for better visual clarity"---- plot_pubmatrix_heatmap(result, title = "Asymmetric Lists Co-occurrence (Clean)", show_numbers = FALSE, cellwidth = 44, cellheight = 32, width = 12, height = 10 ) ## ----system_info-------------------------------------------------------------- sessionInfo()