## ----setup, include=FALSE-----------------------------------------------------
options(fig_caption=TRUE)
library(knitr)
opts_chunk$set(out.extra='style="display:block; margin: auto"', fig.align="center")

## ----lib, warning=FALSE, message=FALSE, results="hide", include=FALSE---------
library(testthat)
library(BioQC)
library(hgu133plus2.db) ## to simulate an microarray expression dataset
library(lattice)
library(latticeExtra)
library(gridExtra)
library(gplots)
library(reshape2)
library(plyr)
library(ggplot2)
library(rbenchmark)

pdf.options(family="ArialMT", useDingbats=FALSE)

set.seed(1887)

## list human genes
humanGenes <- unique(na.omit(unlist(as.list(hgu133plus2SYMBOL))))

## read tissue-specific gene signatures
gmtFile <- system.file("extdata/exp.tissuemark.affy.roche.symbols.gmt",
                       package="BioQC")
gmt <- readGmt(gmtFile)

## ----helper, include=FALSE----------------------------------------------------
## Summarizes data.
## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%).
##   data: a data frame.
##   measurevar: the name of a column that contains the variable to be summariezed
##   groupvars: a vector containing names of columns that contain grouping variables
##   na.rm: a boolean that indicates whether to ignore NA's
##   conf.interval: the percent range of the confidence interval (default is 95%)
## 
## Source: http://www.cookbook-r.com/Manipulating_data/Summarizing_data/
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
        conf.interval=.95, .drop=TRUE) {

# New version of length which can handle NA's: if na.rm==T, don't count them
        length2 <- function (x, na.rm=FALSE) {
            if (na.rm) sum(!is.na(x))
            else       length(x)
        }

# This does the summary. For each group's data frame, return a vector with
# N, mean, and sd
    datac <- ddply(data, groupvars, .drop=.drop,
            .fun = function(xx, col) {
            c(N    = length2(xx[[col]], na.rm=na.rm),
                mean = mean   (xx[[col]], na.rm=na.rm),
                sd   = sd     (xx[[col]], na.rm=na.rm)
             )
            },
            measurevar
            )

# Rename the "mean" column    
        datac <- rename(datac, c("mean" = measurevar))

        datac$se <- datac$sd / sqrt(datac$N)  # Calculate standard error of the mean

# Confidence interval multiplier for standard error
# Calculate t-statistic for confidence interval: 
# e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
        ciMult <- qt(conf.interval/2 + .5, datac$N-1)
        datac$ci <- datac$se * ciMult

        return(datac)
}

## ----sensitivity_benchmark_fig, echo=FALSE, warning=FALSE, fig.width=8, fig.height=3.5, dev='png', fig.cap="**Figure 1:** Sensitivity benchmark. Expression levels of genes in the ovary signature are dedicately sampled randomly from normal distributions with different mean values. The lines show the enrichment score for the Wilcoxon-Mann-Whitney test, the t-test and the Kolmogorov-Smirnov test respectively. In the right panel, outliers were added by adding a random value to 1% of the simulated genes. "----

randomMatrixButOneSignature <- function(rows=humanGenes, signatureGenes,
                                        amplitudes=seq(0, 3, by=0.5)) {
  nrow <- length(rows)
  ncol <- length(amplitudes)
  mat <- matrix(rnorm(nrow*ncol),
                nrow=nrow, byrow=FALSE)
  rownames(mat) <- rows
  sigInd <- na.omit(match(signatureGenes, humanGenes))
  
  colClass <- factor(amplitudes)
  
  for(colInd in unique(colClass)) {
    isCurrCol <- colInd==colClass
    replaceMatrix <- matrix(rnorm(length(sigInd)*sum(isCurrCol),
                                  mean=amplitudes[isCurrCol][1]),
                            nrow=length(sigInd), byrow=FALSE)
    mat[sigInd, isCurrCol] <-  replaceMatrix
    }
  return(mat)
}

addNoise = function(matrix, fractionAffected=.1, stdv=1) {
  noise = matrix(rnorm(nrow(matrix)*ncol(matrix), sd=stdv), nrow=nrow(matrix), byrow=FALSE)
  addNoise = matrix(runif(nrow(matrix)*ncol(matrix)), nrow=nrow(matrix), byrow=FALSE) < fractionAffected
  matrix = matrix + addNoise*noise
  return(matrix)
}

do.test <- function(senseMat, tissueInd, test=t.test, alternative="greater") {
  bgInds = setdiff(1:nrow(senseMat), tissueInd)
  res = apply(senseMat, 2, function(col) {
    gs = col[tissueInd]
    bg = col[bgInds]
    return(test(gs, bg, alternative=alternative)$p.value)
  })
  return(res)
}


selGeneSet <- "Ovary_NGS_RNASEQATLAS_0.6_3"
selSignature <- gmt[[selGeneSet]]$genes
tissueInds <- sapply(gmt, function(x) match(x$genes, humanGenes))
senseAmplitudes <- rep(c(seq(0, 1, by=0.25), seq(1.5, 3, by=0.5)), each=50)
senseMat <- randomMatrixButOneSignature(rows=humanGenes,
                                        signatureGenes=selSignature,
                                        amplitudes=senseAmplitudes)
senseMatOutlier = addNoise(senseMat, stdv=15, fractionAffected=.01)

compare.tests = function(senseMat) {
  senseBioQC <- wmwTest(senseMat, tissueInds, valType="p.greater", simplify=TRUE)[selGeneSet,]
  senseTTest <- do.test(senseMat, tissueInds[[selGeneSet]], test=t.test)
  senseKS <- do.test(senseMat, tissueInds[[selGeneSet]], test=function(x, y, alternative) {
    ks.test(y, x, alternative=alternative)})
  
  comp = data.frame(BioQC=-log10(senseBioQC), t.test=-log10(senseTTest), ks.test=-log10(senseKS), senseAmplitudes=senseAmplitudes)
  comp.molten = melt(comp, id="senseAmplitudes")
  comp.summary = summarySE(comp.molten, measurevar="value", groupvars=c("senseAmplitudes", "variable"))
  return(comp.summary)
}

comp.summary = compare.tests(senseMat)
comp.summary.outlier = compare.tests(senseMatOutlier)

plot.comp = ggplot(comp.summary, aes(x=senseAmplitudes, y=value, color=variable)) + 
  geom_line() +
  geom_errorbar(aes(ymin=value-se, ymax=value+se), width=.1) + 
  geom_point() + xlab("Mean expression differnce") + ylab("Enrichment score") + ggtitle("without noise") + theme_bw() + scale_color_brewer(palette="Dark2")

plot.comp.outlier = ggplot(comp.summary.outlier, aes(x=senseAmplitudes, y=value, color=variable)) + 
  geom_line() +
  geom_errorbar(aes(ymin=value-se, ymax=value+se), width=.1) + 
  geom_point() + xlab("Mean expression differnce") + ylab("Enrichment score") + ggtitle("with noise") + theme_bw()  + scale_color_brewer(palette="Dark2")

grid.arrange(plot.comp, plot.comp.outlier, ncol=2)

## ----benchmark, include=FALSE-------------------------------------------------
runWMW = function() {return(wmwTest(senseMat, tissueInds, valType="p.greater", simplify=TRUE))}
runKS = function() {return(do.test(senseMat, tissueInds[[selGeneSet]], test=function(x, y, alternative) {
    ks.test(y, x, alternative=alternative)}))}
benchmark.res = benchmark(runWMW(), runKS(), columns=c("test", "replications", "elapsed", "relative"), replications=5)

## ----benchmark_res, echo=FALSE------------------------------------------------
benchmark.res

## ----session_info-------------------------------------------------------------
sessionInfo()