## ----setup, include = FALSE, warning = FALSE----------------------------------
knitr::opts_chunk$set(comment = FALSE, 
                      warning = FALSE, 
                      message = FALSE)

## -----------------------------------------------------------------------------
require(IgGeneUsage)
require(knitr)
require(ggplot2)
require(ggforce)
require(gridExtra)
require(ggrepel)
require(rstan)
require(reshape2)
rstan_options(auto_write = TRUE)

## -----------------------------------------------------------------------------
data("d_zibb", package = "IgGeneUsage")
knitr::kable(head(d_zibb))

## ---- fig.width=6, fig.height=3-----------------------------------------------
ggplot(data = d_zibb)+
  geom_point(aes(x = gene_name, y = gene_usage_count, col = condition),
             position = position_dodge(width = .7), shape = 21)+
  theme_bw(base_size = 11)+
  ylab(label = "Gene usage")+
  xlab(label = '')+
  theme(legend.position = "top")+
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4))

## -----------------------------------------------------------------------------
M <- DGU(usage.data = d_zibb, # input data
         mcmc.warmup = 500, # how many MCMC warm-ups per chain (default: 500)
         mcmc.steps = 1500, # how many MCMC steps per chain (default: 1,500)
         mcmc.chains = 3, # how many MCMC chain to run (default: 4)
         mcmc.cores = 1, # how many PC cores to use? (e.g. parallel chains)
         hdi.level = 0.95, # highest density interval level (de fault: 0.95)
         adapt.delta = 0.8, # MCMC target acceptance rate (default: 0.95)
         max.treedepth = 10) # tree depth evaluated at each step (default: 12)

## -----------------------------------------------------------------------------
summary(M)

## -----------------------------------------------------------------------------
rstan::check_hmc_diagnostics(M$glm)

## ---- fig.height = 3, fig.width = 6-------------------------------------------
gridExtra::grid.arrange(rstan::stan_rhat(object = M$glm),
                        rstan::stan_ess(object = M$glm),
                        nrow = 1)

## ---- fig.height = 5.5, fig.width = 6-----------------------------------------
ggplot(data = M$ppc.data$ppc.repertoire)+
  facet_wrap(facets = ~sample_name, nrow = 3)+
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", col = "darkgray")+
  geom_errorbar(aes(x = observed_count, y = ppc_mean_count, 
                    ymin = ppc_L_count, ymax = ppc_H_count), col = "darkgray")+
  geom_point(aes(x = observed_count, y = ppc_mean_count, 
                 fill = condition), shape = 21, size = 1)+
  theme_bw(base_size = 11)+
  theme(legend.position = "top")+
  scale_x_log10()+
  scale_y_log10()+
  xlab(label = "Observed usage [counts]")+
  ylab(label = "Predicted usage [counts]")+
  annotation_logticks(base = 10, sides = "lb")

## ---- fig.height = 4, fig.width = 4-------------------------------------------
ggplot(data = M$ppc.data$ppc.gene)+
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", col = "darkgray")+
  geom_errorbar(aes(x = observed_prop*100, ymin = ppc_L_prop*100, 
                    ymax = ppc_H_prop*100), col = "darkgray")+
  geom_point(aes(x = observed_prop*100, y = ppc_mean_prop*100, 
                 col = condition), size = 2)+
  theme_bw(base_size = 11)+
  theme(legend.position = "top")+
  xlab(label = "Observed usage [%]")+
  ylab(label = "Predicted usage [%]")+
  scale_x_log10()+
  scale_y_log10()+
  annotation_logticks(base = 10, sides = "lb")

## -----------------------------------------------------------------------------
kable(x = head(M$glm.summary), row.names = FALSE, digits = 3)

## ---- fig.height = 4, fig.width = 6-------------------------------------------
# format data
stats <- M$glm.summary
stats <- stats[order(abs(stats$es_mean), decreasing = FALSE), ]
stats$gene_fac <- factor(x = stats$gene_name, levels = stats$gene_name)

stats <- merge(x = stats, y = M$test.summary, by = "gene_name")

ggplot(data = stats)+
  geom_hline(yintercept = 0, linetype = "dashed", col = "gray")+
  geom_errorbar(aes(x = pmax, y = es_mean, ymin = es_L, ymax = es_H), 
                col = "darkgray")+
  geom_point(aes(x = pmax, y = es_mean), col = "darkgray")+
  geom_text_repel(data = stats[stats$pmax >= 0.9, ],
                  aes(x = pmax, y = es_mean, label = gene_fac),
                  min.segment.length = 0, size = 2.75)+
  theme_bw(base_size = 11)+
  xlab(label = expression(pi))+
  xlim(c(0, 1))

## ---- fig.height = 2.5, fig.width = 3-----------------------------------------
promising.genes <- stats$gene_name[stats$pmax >= 0.9]

ppc.gene <- M$ppc.data$ppc.gene
ppc.gene <- ppc.gene[ppc.gene$gene_name %in% promising.genes, ]

ppc.rep <- M$ppc.data$ppc.repertoire
ppc.rep <- ppc.rep[ppc.rep$gene_name %in% promising.genes, ]



ggplot()+
  geom_point(data = ppc.rep,
             aes(x = gene_name, y = observed_prop*100, col = condition),
             size = 1.5, fill = "black",
             position = position_jitterdodge(jitter.width = 0.1, 
                                             jitter.height = 0, 
                                             dodge.width = 0.35))+
  geom_errorbar(data = ppc.gene, 
                aes(x = gene_name, ymin = ppc_L_prop*100, 
                    ymax = ppc_H_prop*100, group = condition),
                position = position_dodge(width = 0.35), width = 0.15)+
  theme_bw(base_size = 11)+
  theme(legend.position = "top")+
  ylab(label = "Usage [%]")+
  xlab(label = '')

## ---- fig.height = 2.5, fig.width = 3-----------------------------------------
ggplot()+
  geom_point(data = ppc.rep,
             aes(x = gene_name, y = observed_count, col = condition),
             size = 1.5, fill = "black",
             position = position_jitterdodge(jitter.width = 0.1, 
                                             jitter.height = 0, 
                                             dodge.width = 0.35))+
  theme_bw(base_size = 11)+
  theme(legend.position = "top")+
  ylab(label = "Usage count")+
  xlab(label = '')

## ---- fig.height = 4, fig.width = 4-------------------------------------------
ggplot()+
  geom_hline(yintercept = c(-log10(0.05), -log10(0.01)), 
             linetype = "dashed", col = "darkgray")+
  geom_point(data = stats, col = "red", size = 2,
             aes(x = pmax, y = -log10(t.test.fdr.pvalue)))+
  geom_text_repel(data = stats[stats$pmax >= 0.5, ], 
                  aes(x = pmax, y = -log10(t.test.fdr.pvalue), 
                      label = gene_name), size = 2.75, 
                  min.segment.length = 0)+
  xlim(0, 1)+
  ylab(label = "-log10 (P-value) from T-test [FDR corrected]")+
  xlab(label = expression(pi))+
  theme_bw(base_size = 11)+
  theme(legend.position = "top")+
  scale_color_discrete(name = '')

## ---- fig.height = 4, fig.width = 4-------------------------------------------
ggplot()+
  geom_hline(yintercept = c(-log10(0.05), -log10(0.01)), 
             linetype = "dashed", col = "darkgray")+
  geom_point(data = stats, col = "red", size = 2,
             aes(x = pmax, y = -log10(u.test.fdr.pvalue)))+
  geom_text_repel(data = stats[stats$pmax >= 0.5, ], 
                  aes(x = pmax, y = -log10(u.test.fdr.pvalue), 
                      label = gene_name), size = 2.75, 
                  min.segment.length = 0)+
  xlim(0, 1)+
  ylab(label = "-log10 (P-value) from U-test [FDR corrected]")+
  xlab(label = expression(pi))+
  theme_bw(base_size = 11)+
  theme(legend.position = "top")+
  scale_color_discrete(name = '')

## -----------------------------------------------------------------------------
sessionInfo()