## ----include=FALSE------------------------------------------------------------
knitr::opts_chunk$set(collapse=TRUE)

## ----setup, warning=FALSE, message=FALSE--------------------------------------
library("tidyverse")
library("SIAMCAT")
library("ggpubr")

## ----curateMGD, eval=FALSE----------------------------------------------------
#  library("curatedMetagenomicData")

## ----get_metadata, eval=FALSE-------------------------------------------------
#  meta.nielsen.full <- combined_metadata %>%
#      filter(dataset_name=='NielsenHB_2014')

## ----reason_meta_clean_2, eval=FALSE------------------------------------------
#  print(length(unique(meta.nielsen.full$subjectID)))
#  print(nrow(meta.nielsen.full))

## ----clean_metadata_1, eval=FALSE---------------------------------------------
#  meta.nielsen <- meta.nielsen.full %>%
#      select(sampleID, subjectID, study_condition, disease_subtype,
#          disease, age, country, number_reads, median_read_length, BMI) %>%
#      mutate(visit=str_extract(sampleID, '_[0-9]+$')) %>%
#      mutate(visit=str_remove(visit, '_')) %>%
#      mutate(visit=as.numeric(visit)) %>%
#      mutate(visit=case_when(is.na(visit)~0, TRUE~visit)) %>%
#      group_by(subjectID) %>%
#      filter(visit==min(visit)) %>%
#      ungroup() %>%
#      mutate(Sample_ID=sampleID) %>%
#      mutate(Group=case_when(disease=='healthy'~'CTR',
#                              TRUE~disease_subtype))

## ----clean_metadata_4, eval=FALSE---------------------------------------------
#  meta.nielsen <- meta.nielsen %>%
#      filter(Group %in% c('UC', 'CTR'))

## ----clean_metadata_3, eval=FALSE---------------------------------------------
#  meta.nielsen <- meta.nielsen %>%
#      mutate(Country=country)
#  meta.nielsen <- as.data.frame(meta.nielsen)
#  rownames(meta.nielsen) <- meta.nielsen$sampleID

## ----tax_profiles, eval=FALSE-------------------------------------------------
#  x <- 'NielsenHB_2014.metaphlan_bugs_list.stool'
#  feat <- curatedMetagenomicData(x=x, dryrun=FALSE)
#  feat <- feat[[x]]@assayData$exprs

## ----clean_tax_profiles, eval=FALSE-------------------------------------------
#  feat <- feat[grep(x=rownames(feat), pattern='s__'),]
#  feat <- feat[grep(x=rownames(feat),pattern='t__', invert = TRUE),]
#  feat <- t(t(feat)/100)

## ----clean_feature_names, eval=FALSE------------------------------------------
#  rownames(feat) <- str_extract(rownames(feat), 's__.*$')

## ----load_motus---------------------------------------------------------------
# base url for data download
data.loc <- 'https://zenodo.org/api/files/d81e429c-870f-44e0-a44a-2a4aa541b6c1/'
## metadata
meta.nielsen <- read_tsv(paste0(data.loc, 'meta_Nielsen.tsv'))
# also here, we have to remove repeated samplings and CD samples
meta.nielsen <- meta.nielsen %>%
    filter(Group %in% c('CTR', 'UC')) %>%
    group_by(Individual_ID) %>%
    filter(Sampling_day==min(Sampling_day)) %>%
    ungroup() %>%
    as.data.frame()
rownames(meta.nielsen) <- meta.nielsen$Sample_ID

## features
feat <- read.table(paste0(data.loc, 'metaHIT_motus.tsv'), 
                    stringsAsFactors = FALSE, sep='\t',
                    check.names = FALSE, quote = '', comment.char = '')
feat <- feat[,colSums(feat) > 0]
feat <- prop.table(as.matrix(feat), 2)

## ----siamcat------------------------------------------------------------------
# remove Danish samples
meta.nielsen.esp <- meta.nielsen[meta.nielsen$Country == 'ESP',]
sc.obj <- siamcat(feat=feat, meta=meta.nielsen.esp, label='Group', case='UC')

## ----feature_filtering--------------------------------------------------------
sc.obj <- filter.features(sc.obj, cutoff=1e-04,
                            filter.method = 'abundance')
sc.obj <- filter.features(sc.obj, cutoff=0.05,
                            filter.method='prevalence',
                            feature.type = 'filtered')

## ----assoc_plot, message=FALSE, warning=FALSE---------------------------------
sc.obj <- check.associations(sc.obj, log.n0 = 1e-06, alpha=0.1)
association.plot(sc.obj, fn.plot = './association_plot_nielsen.pdf', 
                panels = c('fc'))

## ----check_confounders, warning=FALSE-----------------------------------------
check.confounders(sc.obj, fn.plot = './confounders_nielsen.pdf')

## ----ml_workflow, eval=FALSE--------------------------------------------------
#  sc.obj <- normalize.features(sc.obj, norm.method = 'log.std',
#                              norm.param = list(log.n0=1e-06, sd.min.q=0))
#  ## Features normalized successfully.
#  sc.obj <- create.data.split(sc.obj, num.folds = 5, num.resample = 5)
#  ## Features splitted for cross-validation successfully.
#  sc.obj <- train.model(sc.obj, method='lasso')
#  ## Trained lasso models successfully.
#  sc.obj <- make.predictions(sc.obj)
#  ## Made predictions successfully.
#  sc.obj <- evaluate.predictions(sc.obj)
#  ## Evaluated predictions successfully.

## ----model_eval_plot, eval=FALSE----------------------------------------------
#  model.evaluation.plot(sc.obj, fn.plot = './eval_plot_nielsen.pdf')
#  ## Plotted evaluation of predictions successfully to: ./eval_plot_nielsen.pdf

## ----model_interpretation_plot, eval=FALSE------------------------------------
#  model.interpretation.plot(sc.obj, consens.thres = 0.8,
#                              fn.plot = './interpret_nielsen.pdf')
#  ## Successfully plotted model interpretation plot to: ./interpret_nielsen.pdf

## ----confounders--------------------------------------------------------------
table(meta.nielsen$Group, meta.nielsen$Country)

## ----conf_start---------------------------------------------------------------
sc.obj.full <- siamcat(feat=feat, meta=meta.nielsen,
                        label='Group', case='UC')
sc.obj.full <- filter.features(sc.obj.full, cutoff=1e-04,
                                filter.method = 'abundance')
sc.obj.full <- filter.features(sc.obj.full, cutoff=0.05,
                                filter.method='prevalence',
                                feature.type = 'filtered')

## ----conf_country, eval=FALSE-------------------------------------------------
#  check.confounders(sc.obj.full, fn.plot = './confounders_dnk.pdf')

## ----assoc_plot_2, warning=FALSE, message=FALSE-------------------------------
sc.obj.full <- check.associations(sc.obj.full, log.n0 = 1e-06, alpha=0.1) 

## ----conf_assoc_plot, warning=FALSE-------------------------------------------
assoc.sp <- associations(sc.obj)
assoc.sp$species <- rownames(assoc.sp)
assoc.sp_dnk <- associations(sc.obj.full)
assoc.sp_dnk$species <- rownames(assoc.sp_dnk)

df.plot <- full_join(assoc.sp, assoc.sp_dnk, by='species')
df.plot %>%
    mutate(highlight=str_detect(species, 'formicigenerans')) %>%
    ggplot(aes(x=-log10(p.adj.x), y=-log10(p.adj.y), col=highlight)) +
    geom_point(alpha=0.3) +
        xlab('Spanish samples only\n-log10(q)') +
        ylab('Spanish and Danish samples only\n-log10(q)') +
        theme_classic() +
        theme(panel.grid.major = element_line(colour='lightgrey'),
            aspect.ratio = 1.3) +
        scale_colour_manual(values=c('darkgrey', '#D41645'), guide='none') +
        annotate('text', x=0.7, y=8, label='Dorea formicigenerans')

## ----dorea_plot---------------------------------------------------------------
# extract information out of the siamcat object
feat.dnk <- get.filt_feat.matrix(sc.obj.full)
label.dnk <- label(sc.obj.full)$label
country <- meta(sc.obj.full)$Country
names(country) <- rownames(meta(sc.obj.full))

df.plot <- tibble(dorea=log10(feat.dnk[
    str_detect(rownames(feat.dnk),'formicigenerans'),
    names(label.dnk)] + 1e-05),
    label=label.dnk, country=country) %>%
    mutate(label=case_when(label=='-1'~'CTR', TRUE~"UC")) %>%
    mutate(x_value=paste0(country, '_', label))

df.plot %>%
    ggplot(aes(x=x_value, y=dorea)) +
        geom_boxplot(outlier.shape = NA) +
        geom_jitter(width = 0.08, stroke=0, alpha=0.2) +
        theme_classic() +
        xlab('') +
        ylab("log10(Dorea formicigenerans)") +
        stat_compare_means(comparisons = list(c('DNK_CTR', 'ESP_CTR'),
                                                c('DNK_CTR', 'ESP_UC'),
                                                c('ESP_CTR', 'ESP_UC')))

## ----ml_workflow_dnk, eval=FALSE----------------------------------------------
#  sc.obj.full <- normalize.features(sc.obj.full, norm.method = 'log.std',
#                                  norm.param = list(log.n0=1e-06, sd.min.q=0))
#  ## Features normalized successfully.
#  sc.obj.full <- create.data.split(sc.obj.full, num.folds = 5, num.resample = 5)
#  ## Features splitted for cross-validation successfully.
#  sc.obj.full <- train.model(sc.obj.full, method='lasso')
#  ## Trained lasso models successfully.
#  sc.obj.full <- make.predictions(sc.obj.full)
#  ## Made predictions successfully.
#  sc.obj.full <- evaluate.predictions(sc.obj.full)
#  ## Evaluated predictions successfully.

## ----eval_plot_comp, eval=FALSE-----------------------------------------------
#  model.evaluation.plot("Spanish samples only"=sc.obj,
#                      "Danish and Spanish samples"=sc.obj.full,
#                      fn.plot = './eval_plot_dnk.pdf')
#  ## Plotted evaluation of predictions successfully to: ./eval_plot_dnk.pdf

## ----ml_workflow_country, eval=FALSE------------------------------------------
#  meta.nielsen.country <- meta.nielsen[meta.nielsen$Group=='CTR',]
#  
#  sc.obj.country <- siamcat(feat=feat, meta=meta.nielsen.country,
#                              label='Country', case='ESP')
#  sc.obj.country <- filter.features(sc.obj.country, cutoff=1e-04,
#                              filter.method = 'abundance')
#  sc.obj.country <- filter.features(sc.obj.country, cutoff=0.05,
#                              filter.method='prevalence',
#                              feature.type = 'filtered')
#  sc.obj.country <- normalize.features(sc.obj.country, norm.method = 'log.std',
#                                      norm.param = list(log.n0=1e-06,
#                                          sd.min.q=0))
#  sc.obj.country <- create.data.split(sc.obj.country,
#                                      num.folds = 5, num.resample = 5)
#  sc.obj.country <- train.model(sc.obj.country, method='lasso')
#  sc.obj.country <- make.predictions(sc.obj.country)
#  sc.obj.country <- evaluate.predictions(sc.obj.country)
#  
#  print(eval_data(sc.obj.country)$auroc)
#  ## Area under the curve: 0.9701

## ----session_info-------------------------------------------------------------
sessionInfo()