## ----fig.align = "center", out.width = "50%", echo=FALSE----------------------
knitr::include_graphics("coseq_hex.png")
options(rmarkdown.html_vignette.check_title = FALSE)

## ----eval=FALSE---------------------------------------------------------------
# run <- coseq(counts, K=2:25)
# summary(run)
# plot(run)

## ----eval=FALSE---------------------------------------------------------------
# clusters(run)

## ----eval=FALSE---------------------------------------------------------------
# assay(run)

## ----eval=FALSE---------------------------------------------------------------
# run <- coseq(counts, K=2:25, seed=12345)

## ----message=FALSE, warning=FALSE---------------------------------------------
library(coseq)
library(Biobase)
library(corrplot)

data("fietz")
counts <- exprs(fietz)
conds <- pData(fietz)$tissue

## Equivalent to the following:
## counts <- read.table("http://www.math.univ-toulouse.fr/~maugis/coseq/Fietz_mouse_counts.txt",
##                       header=TRUE)
## conds <- c(rep("CP",5),rep("SVZ",5),rep("VZ",5))

## ----eval=FALSE---------------------------------------------------------------
# run <- coseq(..., parallel=TRUE)

## -----------------------------------------------------------------------------
runLogCLR <- coseq(counts, K=2:25, transformation="logclr",norm="TMM", 
                      meanFilterCutoff=50, model="kmeans",
                   nstart=1, iter.max=10)

## -----------------------------------------------------------------------------
runArcsin <- coseq(counts, K=2:20, model="Normal", transformation="arcsin", 
                   meanFilterCutoff=200, iter=10)
runLogit <- coseq(counts, K=2:20, model="Normal", transformation="logit", 
                  meanFilterCutoff=200, verbose=FALSE, iter=10)

## -----------------------------------------------------------------------------
class(runArcsin)
runArcsin

## ----figure=TRUE, fig.width=6, fig.height=4-----------------------------------
compareICL(list(runArcsin, runLogit))

## ----figure=TRUE, fig.width=7, fig.height=7-----------------------------------
compareARI(runArcsin, K=8:12)

## -----------------------------------------------------------------------------
summary(runArcsin)

## ----fig.width=6, fig.height=4------------------------------------------------
## To obtain all plots
## plot(runArcsin)
plot(runArcsin, graphs="boxplots")
plot(runArcsin, graphs="boxplots", add_lines = FALSE)
plot(runArcsin, graphs="boxplots", conds=conds)
plot(runArcsin, graphs="boxplots", conds=conds, collapse_reps = "sum")
plot(runArcsin, graphs="profiles")

plot(runArcsin, graphs="probapost_boxplots")
plot(runArcsin, graphs="probapost_histogram")

## ----figure=TRUE, fig.width=4, fig.height=4-----------------------------------
rho <- NormMixParam(runArcsin)$rho
## Covariance matrix for cluster 1
rho1 <- rho[,,1]
colnames(rho1) <- rownames(rho1) <- paste0(colnames(rho1), "\n", conds)
corrplot(rho1)

## -----------------------------------------------------------------------------
labels <- clusters(runArcsin)
table(labels)
probapost <- assay(runArcsin)
head(probapost)

metadata(runArcsin)
likelihood(runArcsin)
nbCluster(runArcsin)
ICL(runArcsin)
model(runArcsin)
transformationType(runArcsin)

## -----------------------------------------------------------------------------
## arcsine-transformed normalized profiles
tcounts(runArcsin)

## normalized profiles
profiles(runArcsin)

## -----------------------------------------------------------------------------
fullres <- coseqFullResults(runArcsin)
class(fullres)
length(fullres)
names(fullres)

## ----message=FALSE, warning=FALSE---------------------------------------------
library(DESeq2)
dds <- DESeqDataSetFromMatrix(counts, DataFrame(group=factor(conds)), ~group)
dds <- DESeq(dds, test="LRT", reduced = ~1)
res <- results(dds)
summary(res)

## By default, alpha = 0.10
run <- coseq(dds, K=2:15, verbose=FALSE)

## The following two lines provide identical results
run <- coseq(dds, K=2:15, alpha=0.05)
run <- coseq(dds, K=2:15, subset=results(dds, alpha=0.05))

## ----message=FALSE, warning=FALSE---------------------------------------------
library(edgeR)
y <- DGEList(counts=counts, group=factor(conds))
y <- calcNormFactors(y)
design <- model.matrix(~conds)
y <- estimateDisp(y, design)

## edgeR: QLF test
fit <- glmQLFit(y, design)
qlf <- glmQLFTest(fit, coef=2)

## edgeR: LRT test
fit <- glmFit(y,design)
lrt <- glmLRT(fit,coef=2)

run <- coseq(counts, K=2:15, subset=lrt, alpha=0.1)
run <- coseq(counts, K=2:15, subset=qlf, alpha=0.1)

## ----fig.width=6, fig.height=4------------------------------------------------
## Simulate toy data, n = 300 observations
set.seed(12345)
countmat <- matrix(runif(300*10, min=0, max=500), nrow=300, ncol=10)
countmat <- countmat[which(rowSums(countmat) > 0),]
conds <- c(rep("A", 4), rep("B", 3), rep("D", 3))

## Run coseq
coexp <- coseq(object=countmat, K=2:15, iter=5, transformation="logclr",
                    model="kmeans", seed=12345)

## Original boxplot
p <- plot(coexp, graphs = "boxplots", conds = conds, 
     profiles_order = sort(unique(clusters(coexp)), decreasing=TRUE), 
     collapse_reps = "average",
     n_row = 3, n_col = 4)
p$boxplots

## ----fig.width=6, fig.height=4------------------------------------------------
library(ggplot2)
p$boxplot + 
  theme_bw() + 
  coord_fixed(ratio = 20) +
  theme(axis.ticks = element_line(color="black", size=1.5),
        axis.line = element_line(color="black", size=1.5),
        text = element_text(size = 15)) +
  scale_fill_brewer(type = "qual")

## ----eval=FALSE---------------------------------------------------------------
# ## Plot and summarize results
# p <- plot(coexp, graph = "boxplots")$boxplots
# library(ggforce)
# pdf("coseq_boxplots_by_page.pdf")
# for(k in seq_len(ncol(assay(coexp))))
#     print(p + facet_wrap_paginate(~labels, page = k, nrow = 1, ncol = 1))
# dev.off()

## ----message=FALSE------------------------------------------------------------
## Reproducible example
set.seed(12345)
countmat <- matrix(runif(300*8, min=0, max=500), nrow=300, ncol=8)
countmat <- countmat[which(rowSums(countmat) > 0),]
conds <- rep(c("A","C","B","D"), each=2)
colnames(countmat) <- factor(1:8)
run <- coseq(object=countmat, K=2:4, iter=5, model = "Normal",
             transformation = "arcsin")
 
## Problem: Conditions B and C are not in the same order in these graphics
p1 <- plot(run, graph = "boxplots", conds = conds)$boxplots
p2 <- plot(run, graph = "boxplots", conds = conds, 
           collapse_reps = "average")$boxplots
 
## Solution 1 ------------------------------------------------------
## Create a new plot
dat_adjust <- p2$data
dat_adjust$conds <- factor(dat_adjust$conds, levels = c("A", "C", "B", "D"))
gg_color <- function(n) {
    hues = seq(15, 375, length = n + 1)
    hcl(h = hues, l = 65, c = 100)[1:n]
}
## p3 and p1 have conditions in the same order now
p3 <- ggplot(dat_adjust, aes(x=conds, y=y_prof)) +
    geom_boxplot(aes(fill = conds)) +
    scale_fill_manual(name = "Conditions",
                        values = gg_color(4)[c(1,3,2,4)]) +
    stat_summary(fun=mean, geom="line", aes(group=1), colour="red")  +
    stat_summary(fun=mean, geom="point", colour="red") +
    scale_y_continuous(name="Average expression profiles") +
    scale_x_discrete(name="Conditions") +
    facet_wrap(~labels)
 
## Solution 2 -------------------------------------------------------
## Order samples by conditions in the count matrix before running coseq
 
countmat_order <- countmat[,order(conds)]
conds_order <- conds[order(conds)]
run_order <- coseq(object=countmat_order, K=2:4, iter=5, model = "Normal",
             transformation = "arcsin")
## p4 and p5 have conditions in the same order too
p4 <- plot(run_order, graph = "boxplots", conds = conds_order)$boxplots
p5 <- plot(run_order, graph = "boxplots", conds = conds_order, 
           collapse_reps = "average")$boxplots

## ----message = FALSE----------------------------------------------------------
# Simulate toy data, n = 300 observations, factorial 2x2 design ---------------
set.seed(12345)
counts <- round(matrix(runif(300*8, min=0, max=500), nrow=300, ncol=8))
factor_1 <- rep(c("A","B"), each=4)
factor_2 <- rep(c("C", "D"), times=4)
## Create some "differential expression" for each factor
counts[1:50, 1:4] <- 10*counts[1:50, 1:4]
counts[51:100, c(1,3,5,7)] <- 10*counts[51:100, c(1,3,5,7)]
design <- model.matrix(~factor_1 + factor_2)

## Option 1: edgeR analysis (here with the QLF test) with several contrasts-----
y <- DGEList(counts=counts)
y <- calcNormFactors(y)
y <- estimateDisp(y, design)
fit <- glmQLFit(y, design)
## Test of significance for factor_1
qlf_1 <- glmQLFTest(fit, coef=2)
## Test of significance for factor_2
qlf_2 <- glmQLFTest(fit, coef=3)
## Make sure to use sort.by = "none" in topTags so that the indices are in the 
##   same order! Also only keep unique indices
de_indices <- c(unique(
    which(topTags(qlf_1, sort.by = "none", n=Inf)$table$FDR < 0.05),
    which(topTags(qlf_2, sort.by = "none", n=Inf)$table$FDR < 0.05)
))
## Now run coseq on the subset of DE genes, using prev normalization factors
run <- coseq(counts, K=2:25, 
             normFactors = y$samples$norm.factors,
             subset = de_indices)

## Option 2: edgeR analysis with overall LRT test-------------------------------
y <- DGEList(counts=counts)
y <- calcNormFactors(y)
y <- estimateDisp(y, design)
fit <- glmQLFit(y, design)
qlf_any <- glmQLFTest(fit, coef=2:3)
##  Now run coseq directly on the edgeR output
run2 <- coseq(counts, K=2:25, subset=qlf_any, alpha=0.05)