## ----setup, include=FALSE--------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(cache = FALSE)

## ---- message=FALSE--------------------------------------------------------
library(MMUPHin)
# tidyverse packages for utilities
library(magrittr)
library(dplyr)
library(ggplot2)

## ----Installation, eval = FALSE--------------------------------------------
#  if (!requireNamespace("BiocManager", quietly = TRUE))
#      install.packages("BiocManager")
#  BiocManager::install("MMUPHin")

## ----load data-------------------------------------------------------------
data("CRC_abd", "CRC_meta")
# CRC_abd is the feature (species) abundance matrix. Rows are features and 
# columns are samples.
CRC_abd[1:5, 1, drop = FALSE]
# CRC_meta is the metadata data frame. Columns are samples.
CRC_meta[1, 1:5]
# A total of five studies are included 
table(CRC_meta$studyID)
# The following were used to access and format the two objects
# library(curatedMetagenomicData)
# library(phyloseq)
# library(genefilter)
# datasets <- curatedMetagenomicData(
#   c("FengQ_2015.metaphlan_bugs_list.stool"  ,
#     "HanniganGD_2017.metaphlan_bugs_list.stool",
#     "VogtmannE_2016.metaphlan_bugs_list.stool",
#     "YuJ_2015.metaphlan_bugs_list.stool",
#     "ZellerG_2014.metaphlan_bugs_list.stool"),
#   dryrun = FALSE)
# # Construct phyloseq object from the five datasets
# physeq <-
#   # Aggregate the five studies into ExpressionSet
#   mergeData(datasets) %>%
#   # Convert to phyloseq object
#   ExpressionSet2phyloseq() %>%
#   # Subset samples to only CRC and controls
#   subset_samples(study_condition %in% c("CRC", "control")) %>%
#   # Subset features to species
#   subset_taxa(!is.na(Species) & is.na(Strain)) %>%
#   # Normalize abundances to relative abundance scale
#   transform_sample_counts(function(x) x / sum(x)) %>%
#   # Filter features to be of at least 1e-5 relative abundance in five samples
#   filter_taxa(kOverA(5, 1e-5), prune = TRUE)
# CRC_abd <- otu_table(physeq)@.Data
# CRC_meta <- data.frame(sample_data(physeq))
# CRC_meta$studyID <- factor(CRC_meta$studyID)

## ----adjust_batch----------------------------------------------------------
# The function call indicates for adjust_batch to correct for the effect
# of the batch variable, studyID, while controlling for the effect of the 
# disease variable, study_condition. Many additional options are available 
# through the control parameter, here we specify verbose=FALSE to avoid 
# excessive messages, although they can often be helpful in practice!
fit_adjust_batch <- adjust_batch(feature_abd = CRC_abd,
                                 batch = "studyID",
                                 covariates = "study_condition",
                                 data = CRC_meta,
                                 control = list(verbose = FALSE))
# Note that adjust_batch returns a list of more than one components, and 
# feature_abd_adj is the corrected feature abundance matrix. See 
# help(adjust_batch) for the meaning of other components.
CRC_abd_adj <- fit_adjust_batch$feature_abd_adj

## ----permanova-------------------------------------------------------------
library(vegan)
# adonis requires as input sample-versus-sample dissimilarities between 
# microbial profiles
D_before <- vegdist(t(CRC_abd))
D_after <- vegdist(t(CRC_abd_adj))
# fix random seed as adonis runs randomized permutations
set.seed(1)
fit_adonis_before <- adonis(D_before ~ studyID, data = CRC_meta)
fit_adonis_after <- adonis(D_after ~ studyID, data = CRC_meta)
print(fit_adonis_before)
print(fit_adonis_after)

## ----lm_meta---------------------------------------------------------------
# lm_meta runs regression and meta-analysis models to identify consistent 
# effects of the exposure (study_condition, i.e., disease) on feature_abd
# (microbial feature abundances). Batch variable (studyID) needs to be 
# specified to identify different studies. Additional covariates to include in 
# the regression model can be specified via covariates (here set to gender, 
# age, BMI). Check help(lm_meta) for additional parameter options.
# Note the warnings: lm_meta can tell if a covariate cannot be meaningfully fit
# within a batch and will inform the user of such cases through warnings.
fit_lm_meta <- lm_meta(feature_abd = CRC_abd,
                       batch = "studyID",
                       exposure = "study_condition",
                       covariates = c("gender", "age", "BMI"),
                       data = CRC_meta,
                       control = list(verbose = FALSE))
# Again, lm_meta returns a list of more than one components. 
# meta_fits provides the final meta-analytical testing results. See 
# help(lm_meta) for the meaning of other components.
meta_fits <- fit_lm_meta$meta_fits

## ----significant differential abundant species-----------------------------
meta_fits %>% 
  filter(qval.fdr < 0.05) %>% 
  arrange(coef) %>% 
  mutate(feature = factor(feature, levels = feature)) %>% 
  ggplot(aes(y = coef, x = feature)) +
  geom_bar(stat = "identity") +
  coord_flip()

## ----discrete_discover-----------------------------------------------------
# First subset both feature abundance table and metadata to only control samples
control_meta <- subset(CRC_meta, study_condition == "control")
control_abd_adj <- CRC_abd_adj[, rownames(control_meta)]
# discrete_discover takes as input sample-by-sample dissimilarity measurements 
# rather than abundance table. The former can be easily computed from the 
# latter with existing R packages.
D_control <- vegdist(t(control_abd_adj))
fit_discrete <- discrete_discover(D = D_control,
                                  batch = "studyID",
                                  data = control_meta,
                                  control = list(k_max = 8,
                                                 verbose = FALSE))

## ----visualize discrete structure------------------------------------------
internal <- data.frame(
  # By default, fit_discrete evaluates cluster numbers 2-10
  K = 2:8,
  statistic = 
    fit_discrete$internal_mean[, "ZellerG_2014.metaphlan_bugs_list.stool"],
  se = 
    fit_discrete$internal_se[, "ZellerG_2014.metaphlan_bugs_list.stool"],
  type = "internal")
external <- data.frame(
  # By default, fit_discrete evaluates cluster numbers 2-10
  K = 2:8,
  statistic = 
    fit_discrete$external_mean[, "ZellerG_2014.metaphlan_bugs_list.stool"],
  se = 
    fit_discrete$external_se[, "ZellerG_2014.metaphlan_bugs_list.stool"],
  type = "external")
rbind(internal, external) %>% 
  ggplot(aes(x = K, y = statistic, color = type)) +
  geom_point(position = position_dodge(width = 0.5)) + 
  geom_line(position = position_dodge(width = 0.5)) +
  geom_errorbar(aes(ymin = statistic - se, ymax = statistic + se),
                position = position_dodge(width = 0.5), width = 0.5) +
  ggtitle("Evaluation of discrete structure in control stool microbiome (ZellerG_2014)")

## ----discrete_discover vaginal---------------------------------------------
# library(curatedMetagenomicData)
# library(phyloseq)
# datasets <- curatedMetagenomicData(
#   "*metaphlan_bugs_list.vagina*",
#   dryrun = FALSE)
# # Construct phyloseq object from the five datasets
# physeq <-
#   # Aggregate the five studies into ExpressionSet
#   mergeData(datasets) %>%
#   # Convert to phyloseq object
#   ExpressionSet2phyloseq() %>%
#   # Subset features to species
#   subset_taxa(!is.na(Species) & is.na(Strain)) %>%
#   # Normalize abundances to relative abundance scale
#   transform_sample_counts(function(x) x / sum(x)) %>%
#   # Filter features to be of at least 1e-5 relative abundance in two samples
#   filter_taxa(kOverA(2, 1e-5), prune = TRUE)
# vaginal_abd <- otu_table(physeq)@.Data
# vaginal_meta <- data.frame(sample_data(physeq))
# vaginal_meta$studyID <- factor(vaginal_meta$studyID)
data("vaginal_abd", "vaginal_meta")
D_vaginal <- vegdist(t(vaginal_abd))
fit_discrete_vag <- discrete_discover(D = D_vaginal,
                                      batch = "studyID",
                                      data = vaginal_meta,
                                      control = list(verbose = FALSE,
                                                     k_max = 8))
# Examine results for the larger study, HMP_2012
data.frame(
  # By default, fit_discrete evaluates cluster numbers 2-10
  K = 2:8,
  statistic = 
    fit_discrete_vag$internal_mean[, "HMP_2012.metaphlan_bugs_list.vagina"],
  se = 
    fit_discrete_vag$internal_se[, "HMP_2012.metaphlan_bugs_list.vagina"]) %>% 
  ggplot(aes(x = K, y = statistic)) +
  geom_point() + geom_line() +
  geom_errorbar(aes(ymin = statistic - se, ymax = statistic + se), 
                width = 0.5) +
  ggtitle("Evaluation of discrete structure in vaginal microbiome (HMP_2012)")

## ----continuos_structre----------------------------------------------------
# Much like adjust_batch and lm_meta, continuous_discover also takes
# as input feature-by-sample abundances. control offers many tuning parameters
# and here we set one of them, var_perc_cutoff, to 0.5, which asks the method
# to include top principal components within each batch that in total explain
# at least 50% of the total variability in the batch. See 
# help(continuosu_discover) for more details on the tuning parameters and 
# their interpretations.
fit_continuous <- continuous_discover(feature_abd = control_abd_adj,
                                      batch = "studyID",
                                      data = control_meta,
                                      control = list(var_perc_cutoff = 0.5,
                                                     verbose = FALSE))

## ----visualize continuous structure----------------------------------------
# Examine top loadings
loading <- data.frame(feature = rownames(fit_continuous$consensus_loadings),
                      loading1 = fit_continuous$consensus_loadings[, 1])
loading %>%
    dplyr::arrange(-abs(loading1)) %>%
    dplyr::slice(1:20) %>%
    dplyr::arrange(loading1) %>%
    dplyr::mutate(feature = factor(feature, levels = feature)) %>%
    ggplot(aes(x = feature, y = loading1)) +
    geom_bar(stat = "identity") +
    coord_flip() +
    ggtitle("Features with top loadings")
# Ordinate the samples
mds <- cmdscale(d = D_control)
colnames(mds) <- c("Axis1", "Axis2")
as.data.frame(mds) %>% 
  dplyr::mutate(score1 = fit_continuous$consensus_scores[, 1]) %>% 
  ggplot(aes(x = Axis1, y = Axis2, color = score1)) +
  geom_point() +
  coord_fixed()

## ----sessioninfo-----------------------------------------------------------
sessionInfo()