## ----setup, include=FALSE-------------------------------------------------- knitr::opts_chunk$set(echo = TRUE) knitr::opts_chunk$set(cache = FALSE) ## ---- message=FALSE-------------------------------------------------------- library(MMUPHin) # tidyverse packages for utilities library(magrittr) library(dplyr) library(ggplot2) ## ----Installation, eval = FALSE-------------------------------------------- # if (!requireNamespace("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # BiocManager::install("MMUPHin") ## ----load data------------------------------------------------------------- data("CRC_abd", "CRC_meta") # CRC_abd is the feature (species) abundance matrix. Rows are features and # columns are samples. CRC_abd[1:5, 1, drop = FALSE] # CRC_meta is the metadata data frame. Columns are samples. CRC_meta[1, 1:5] # A total of five studies are included table(CRC_meta$studyID) # The following were used to access and format the two objects # library(curatedMetagenomicData) # library(phyloseq) # library(genefilter) # datasets <- curatedMetagenomicData( # c("FengQ_2015.metaphlan_bugs_list.stool" , # "HanniganGD_2017.metaphlan_bugs_list.stool", # "VogtmannE_2016.metaphlan_bugs_list.stool", # "YuJ_2015.metaphlan_bugs_list.stool", # "ZellerG_2014.metaphlan_bugs_list.stool"), # dryrun = FALSE) # # Construct phyloseq object from the five datasets # physeq <- # # Aggregate the five studies into ExpressionSet # mergeData(datasets) %>% # # Convert to phyloseq object # ExpressionSet2phyloseq() %>% # # Subset samples to only CRC and controls # subset_samples(study_condition %in% c("CRC", "control")) %>% # # Subset features to species # subset_taxa(!is.na(Species) & is.na(Strain)) %>% # # Normalize abundances to relative abundance scale # transform_sample_counts(function(x) x / sum(x)) %>% # # Filter features to be of at least 1e-5 relative abundance in five samples # filter_taxa(kOverA(5, 1e-5), prune = TRUE) # CRC_abd <- otu_table(physeq)@.Data # CRC_meta <- data.frame(sample_data(physeq)) # CRC_meta$studyID <- factor(CRC_meta$studyID) ## ----adjust_batch---------------------------------------------------------- # The function call indicates for adjust_batch to correct for the effect # of the batch variable, studyID, while controlling for the effect of the # disease variable, study_condition. Many additional options are available # through the control parameter, here we specify verbose=FALSE to avoid # excessive messages, although they can often be helpful in practice! fit_adjust_batch <- adjust_batch(feature_abd = CRC_abd, batch = "studyID", covariates = "study_condition", data = CRC_meta, control = list(verbose = FALSE)) # Note that adjust_batch returns a list of more than one components, and # feature_abd_adj is the corrected feature abundance matrix. See # help(adjust_batch) for the meaning of other components. CRC_abd_adj <- fit_adjust_batch$feature_abd_adj ## ----permanova------------------------------------------------------------- library(vegan) # adonis requires as input sample-versus-sample dissimilarities between # microbial profiles D_before <- vegdist(t(CRC_abd)) D_after <- vegdist(t(CRC_abd_adj)) # fix random seed as adonis runs randomized permutations set.seed(1) fit_adonis_before <- adonis(D_before ~ studyID, data = CRC_meta) fit_adonis_after <- adonis(D_after ~ studyID, data = CRC_meta) print(fit_adonis_before) print(fit_adonis_after) ## ----lm_meta--------------------------------------------------------------- # lm_meta runs regression and meta-analysis models to identify consistent # effects of the exposure (study_condition, i.e., disease) on feature_abd # (microbial feature abundances). Batch variable (studyID) needs to be # specified to identify different studies. Additional covariates to include in # the regression model can be specified via covariates (here set to gender, # age, BMI). Check help(lm_meta) for additional parameter options. # Note the warnings: lm_meta can tell if a covariate cannot be meaningfully fit # within a batch and will inform the user of such cases through warnings. fit_lm_meta <- lm_meta(feature_abd = CRC_abd, batch = "studyID", exposure = "study_condition", covariates = c("gender", "age", "BMI"), data = CRC_meta, control = list(verbose = FALSE)) # Again, lm_meta returns a list of more than one components. # meta_fits provides the final meta-analytical testing results. See # help(lm_meta) for the meaning of other components. meta_fits <- fit_lm_meta$meta_fits ## ----significant differential abundant species----------------------------- meta_fits %>% filter(qval.fdr < 0.05) %>% arrange(coef) %>% mutate(feature = factor(feature, levels = feature)) %>% ggplot(aes(y = coef, x = feature)) + geom_bar(stat = "identity") + coord_flip() ## ----discrete_discover----------------------------------------------------- # First subset both feature abundance table and metadata to only control samples control_meta <- subset(CRC_meta, study_condition == "control") control_abd_adj <- CRC_abd_adj[, rownames(control_meta)] # discrete_discover takes as input sample-by-sample dissimilarity measurements # rather than abundance table. The former can be easily computed from the # latter with existing R packages. D_control <- vegdist(t(control_abd_adj)) fit_discrete <- discrete_discover(D = D_control, batch = "studyID", data = control_meta, control = list(k_max = 8, verbose = FALSE)) ## ----visualize discrete structure------------------------------------------ internal <- data.frame( # By default, fit_discrete evaluates cluster numbers 2-10 K = 2:8, statistic = fit_discrete$internal_mean[, "ZellerG_2014.metaphlan_bugs_list.stool"], se = fit_discrete$internal_se[, "ZellerG_2014.metaphlan_bugs_list.stool"], type = "internal") external <- data.frame( # By default, fit_discrete evaluates cluster numbers 2-10 K = 2:8, statistic = fit_discrete$external_mean[, "ZellerG_2014.metaphlan_bugs_list.stool"], se = fit_discrete$external_se[, "ZellerG_2014.metaphlan_bugs_list.stool"], type = "external") rbind(internal, external) %>% ggplot(aes(x = K, y = statistic, color = type)) + geom_point(position = position_dodge(width = 0.5)) + geom_line(position = position_dodge(width = 0.5)) + geom_errorbar(aes(ymin = statistic - se, ymax = statistic + se), position = position_dodge(width = 0.5), width = 0.5) + ggtitle("Evaluation of discrete structure in control stool microbiome (ZellerG_2014)") ## ----discrete_discover vaginal--------------------------------------------- # library(curatedMetagenomicData) # library(phyloseq) # datasets <- curatedMetagenomicData( # "*metaphlan_bugs_list.vagina*", # dryrun = FALSE) # # Construct phyloseq object from the five datasets # physeq <- # # Aggregate the five studies into ExpressionSet # mergeData(datasets) %>% # # Convert to phyloseq object # ExpressionSet2phyloseq() %>% # # Subset features to species # subset_taxa(!is.na(Species) & is.na(Strain)) %>% # # Normalize abundances to relative abundance scale # transform_sample_counts(function(x) x / sum(x)) %>% # # Filter features to be of at least 1e-5 relative abundance in two samples # filter_taxa(kOverA(2, 1e-5), prune = TRUE) # vaginal_abd <- otu_table(physeq)@.Data # vaginal_meta <- data.frame(sample_data(physeq)) # vaginal_meta$studyID <- factor(vaginal_meta$studyID) data("vaginal_abd", "vaginal_meta") D_vaginal <- vegdist(t(vaginal_abd)) fit_discrete_vag <- discrete_discover(D = D_vaginal, batch = "studyID", data = vaginal_meta, control = list(verbose = FALSE, k_max = 8)) # Examine results for the larger study, HMP_2012 data.frame( # By default, fit_discrete evaluates cluster numbers 2-10 K = 2:8, statistic = fit_discrete_vag$internal_mean[, "HMP_2012.metaphlan_bugs_list.vagina"], se = fit_discrete_vag$internal_se[, "HMP_2012.metaphlan_bugs_list.vagina"]) %>% ggplot(aes(x = K, y = statistic)) + geom_point() + geom_line() + geom_errorbar(aes(ymin = statistic - se, ymax = statistic + se), width = 0.5) + ggtitle("Evaluation of discrete structure in vaginal microbiome (HMP_2012)") ## ----continuos_structre---------------------------------------------------- # Much like adjust_batch and lm_meta, continuous_discover also takes # as input feature-by-sample abundances. control offers many tuning parameters # and here we set one of them, var_perc_cutoff, to 0.5, which asks the method # to include top principal components within each batch that in total explain # at least 50% of the total variability in the batch. See # help(continuosu_discover) for more details on the tuning parameters and # their interpretations. fit_continuous <- continuous_discover(feature_abd = control_abd_adj, batch = "studyID", data = control_meta, control = list(var_perc_cutoff = 0.5, verbose = FALSE)) ## ----visualize continuous structure---------------------------------------- # Examine top loadings loading <- data.frame(feature = rownames(fit_continuous$consensus_loadings), loading1 = fit_continuous$consensus_loadings[, 1]) loading %>% dplyr::arrange(-abs(loading1)) %>% dplyr::slice(1:20) %>% dplyr::arrange(loading1) %>% dplyr::mutate(feature = factor(feature, levels = feature)) %>% ggplot(aes(x = feature, y = loading1)) + geom_bar(stat = "identity") + coord_flip() + ggtitle("Features with top loadings") # Ordinate the samples mds <- cmdscale(d = D_control) colnames(mds) <- c("Axis1", "Axis2") as.data.frame(mds) %>% dplyr::mutate(score1 = fit_continuous$consensus_scores[, 1]) %>% ggplot(aes(x = Axis1, y = Axis2, color = score1)) + geom_point() + coord_fixed() ## ----sessioninfo----------------------------------------------------------- sessionInfo()