## ----vignetteSetup, echo=FALSE, message=FALSE, warning = FALSE-----------
## Track time spent on making the vignette
startTime <- Sys.time()

## Bib setup
library('knitcitations')

## Load knitcitations with a clean bibliography
cleanbib()
cite_options(hyperlink = 'to.doc', citation_format = 'text', style = 'html')
# Note links won't show for now due to the following issue
# https://github.com/cboettig/knitcitations/issues/63

## Write bibliography information
bibs <- c(knitcitations = citation('knitcitations'),
    derfinder = citation('derfinder')[1], 
    BiocStyle = citation('BiocStyle'),
    knitrBootstrap = citation('knitrBootstrap'), 
    knitr = citation('knitr')[3],
    rmarkdown = citation('rmarkdown'),
    brainspan = RefManageR::BibEntry(bibtype = 'Unpublished',
        key = 'brainspan',
        title = 'Atlas of the Developing Human Brain [Internet]. Funded by ARRA Awards 1RC2MH089921-01, 1RC2MH090047-01, and 1RC2MH089929-01.',
        author = 'BrainSpan', year = 2011, url = 'http://www.brainspan.org/'),
    originalder = citation('derfinder')[2],
    R = citation(),
    IRanges = citation('IRanges'),
    devtools = citation('devtools'),
    testthat = citation('testthat'),
    GenomeInfoDb = citation('GenomeInfoDb'),
    GenomicRanges = citation('GenomicRanges'),
    ggplot2 = citation('ggplot2'),
    biovizBase = citation('biovizBase'),
    bumphunter = citation('bumphunter')[1],
    TxDb.Hsapiens.UCSC.hg19.knownGene = citation('TxDb.Hsapiens.UCSC.hg19.knownGene'),
    AnnotationDbi = citation('AnnotationDbi'),
    BiocParallel = citation('BiocParallel'),
    derfinderHelper = citation('derfinderHelper'),
    GenomicAlignments = citation('GenomicAlignments'),
    GenomicFeatures = citation('GenomicFeatures'),
    GenomicFiles = citation('GenomicFiles'),
    Hmisc = citation('Hmisc'),
    qvalue = citation('qvalue'),
    Rsamtools = citation('Rsamtools'),
    rtracklayer = citation('rtracklayer'),
    S4Vectors = citation('S4Vectors'),
    bumphunterPaper = RefManageR::BibEntry(bibtype = 'article',
        key = 'bumphunterPaper',
        title = 'Bump hunting to identify differentially methylated regions in epigenetic epidemiology studies',
        author = 'Jaffe, Andrew E and Murakami, Peter and Lee, Hwajin and Leek, Jeffrey T and Fallin, M Daniele and Feinberg, Andrew P and Irizarry, Rafael A',
        year = 2012, journal = 'International Journal of Epidemiology'),
    derfinderData = citation('derfinderData')
)

write.bibtex(bibs,
    file = 'derfinderUsersGuideRed.bib')
bib <- read.bibtex('derfinderUsersGuideRed.bib')

## Assign short names
names(bib) <- names(bibs)

## Working on Windows?
windowsFlag <- .Platform$OS.type == 'windows'

## ----'start', message=FALSE----------------------------------------------
## Load libraries
library('derfinder')
library('derfinderData')
library('GenomicRanges')

## ----'phenoData', results = 'asis'---------------------------------------
library('knitr')
## Get pheno table
pheno <- subset(brainspanPheno, structure_acronym == 'AMY')

## Display the main information
p <- pheno[, -which(colnames(pheno) %in% c('structure_acronym',
    'structure_name', 'file'))]
rownames(p) <- NULL
kable(p, format = 'html', row.names = TRUE)

## ----'getData', eval = !windowsFlag--------------------------------------
## Determine the files to use and fix the names
files <- rawFiles(system.file('extdata', 'AMY', package = 'derfinderData'),
    samplepatt = 'bw', fileterm = NULL)
names(files) <- gsub('.bw', '', names(files))

## Load the data from disk
system.time(fullCov <- fullCoverage(files = files, chrs = 'chr21',
    totalMapped = rep(1, length(files)), targetSize = 1))

## ----'getDataWindows', eval = windowsFlag, echo = FALSE------------------
#  ## Load data in Windows case
#  foo <- function() {
#      load(system.file('extdata', 'fullCov', 'fullCovAMY.RData',
#          package = 'derfinderData'))
#      return(fullCovAMY)
#  }
#  fullCov <- foo()

## ----'webData', eval = FALSE---------------------------------------------
#  ## Determine the files to use and fix the names
#  files <- pheno$file
#  names(files) <- gsub('.AMY', '', pheno$lab)
#  
#  ## Load the data from the web
#  system.time(fullCov <- fullCoverage(files = files, chrs = 'chr21',
#      totalMapped = rep(1, length(files)), targetSize = 1))

## ----'exploreFullCov'----------------------------------------------------
## Lets explore it
fullCov

## ----'filterCov'---------------------------------------------------------
## Filter coverage
filteredCov <- lapply(fullCov, filterData, cutoff = 2)

## ----'exploreFilteredCov'------------------------------------------------
## Similar to fullCov but with $position
filteredCov

## ----'compareCov'--------------------------------------------------------
## Compare the size in Mb
round(c(fullCov = object.size(fullCov),
    filteredCov = object.size(filteredCov)) / 1024^2, 1)

## ----'regionMatrix'------------------------------------------------------
## Use regionMatrix()
system.time(regionMat <- regionMatrix(fullCov, cutoff = 30, L = 76))

## Explore results
class(regionMat)
names(regionMat$chr21)

## ----'exploreRegMatRegs'-------------------------------------------------
## regions output
regionMat$chr21$regions

## Number of regions
length(regionMat$chr21$regions)

## ----'exploreRegMatBP'---------------------------------------------------
## Base-level coverage matrices for each of the regions
## Useful for plotting
lapply(regionMat$chr21$bpCoverage[1:2], head, n = 2)

## Check dimensions. First region is 565 long, second one is 138 bp long.
## The columns match the number of samples (12 in this case).
lapply(regionMat$chr21$bpCoverage[1:2], dim)

## ----'exploreRegMatrix'--------------------------------------------------
## Dimensions of the coverage matrix
dim(regionMat$chr21$coverageMatrix)

## Coverage for each region. This matrix can then be used with limma or other pkgs
head(regionMat$chr21$coverageMatrix)

## ----'identifyDERsDESeq2'------------------------------------------------
## Required
library('DESeq2')

## Round matrix
counts <- round(regionMat$chr21$coverageMatrix)

## Round matrix and specify design
dse <- DESeqDataSetFromMatrix(counts, pheno, ~ group + gender)

## Perform DE analysis
dse <- DESeq(dse, test = 'LRT', reduced = ~ gender, fitType = 'local')

## Extract results
deseq <- regionMat$chr21$regions
mcols(deseq) <- c(mcols(deseq), results(dse))

## Explore the results
deseq

## ----'buildLimmaModels'--------------------------------------------------
## Build models
mod <- model.matrix(~ pheno$group + pheno$gender)
mod0 <- model.matrix(~ pheno$gender)

## ----'transformForLimma'-------------------------------------------------
## Transform coverage
transformedCov <- log2(regionMat$chr21$coverageMatrix + 32)

## ----'identifyDERsLimma'-------------------------------------------------
## Example using limma
library('limma')

## Run limma
fit <- lmFit(transformedCov, mod)
fit0 <- lmFit(transformedCov, mod0)

## Determine DE status for the regions
## Also in https://github.com/LieberInstitute/jaffelab with help and examples
getF  <- function(fit, fit0, theData) {
	rss1 = rowSums((fitted(fit)-theData)^2)
	df1 = ncol(fit$coef)
	rss0 = rowSums((fitted(fit0)-theData)^2)
	df0 = ncol(fit0$coef)
	fstat = ((rss0-rss1)/(df1-df0))/(rss1/(ncol(theData)-df1))
	f_pval = pf(fstat, df1-df0, ncol(theData)-df1,lower.tail=FALSE)
	fout = cbind(fstat,df1-1,ncol(theData)-df1,f_pval)
	colnames(fout)[2:3] = c("df1","df0")
	fout = data.frame(fout)
	return(fout)
}

ff <- getF(fit, fit0, transformedCov)

## Get the p-value and assign it to the regions
limma <- regionMat$chr21$regions
limma$fstat <- ff$fstat
limma$pvalue <- ff$f_pval
limma$padj <- p.adjust(ff$f_pval, 'BH')

## Explore the results
limma

## ----limmaVSdeseq2-------------------------------------------------------
table(limma$padj < 0.05, deseq$padj < 0.05)

## ----'createMeanBW', eval = !windowsFlag---------------------------------
## Calculate the mean: this step takes a long time with many samples
meanCov <- Reduce('+', fullCov$chr21) / ncol(fullCov$chr21)

## Save it on a bigwig file called meanChr21.bw
createBw(list('chr21' = DataFrame('meanChr21' = meanCov)), keepGR = 
    FALSE)

## ----'railMatrix', eval = !windowsFlag-----------------------------------
## Identify files to use
summaryFile <- 'meanChr21.bw'
## We had already found the sample BigWig files and saved it in the object 'files'
## Lets just rename it to sampleFiles for clarity.
sampleFiles <- files

## Get the regions
system.time( 
    regionMat.rail <- railMatrix(chrs = 'chr21', summaryFiles = summaryFile, 
        sampleFiles = sampleFiles, L = 76, cutoff = 30, maxClusterGap = 3000L)
)

## ----'checkDifferences', eval = !windowsFlag-----------------------------
## Overall not identical due to small rounding errors
identical(regionMat, regionMat.rail)

## Actual regions are the same
identical(ranges(regionMat$chr21$regions), ranges(regionMat.rail$chr21$regions))

## When you round, the small differences go away
identical(round(regionMat$chr21$regions$value, 4),
    round(regionMat.rail$chr21$regions$value, 4))

identical(round(regionMat$chr21$regions$area, 4),
    round(regionMat.rail$chr21$regions$area, 4))

## ----'libSize'-----------------------------------------------------------
## Get some idea of the library sizes
sampleDepths <- sampleDepth(collapseFullCoverage(fullCov), 1)
sampleDepths

## ----'makeModels'--------------------------------------------------------
## Define models
models <- makeModels(sampleDepths, testvars = pheno$group,
    adjustvars = pheno[, c('gender')]) 

## Explore the models
lapply(models, head)

## ----'analyze'-----------------------------------------------------------
## Create a analysis directory
dir.create('analysisResults')
originalWd <- getwd()
setwd(file.path(originalWd, 'analysisResults'))

## Perform differential expression analysis
system.time(results <- analyzeChr(chr = 'chr21', filteredCov$chr21, models,
    groupInfo = pheno$group, writeOutput = TRUE, cutoffFstat = 5e-02,
    nPermute = 20, seeds = 20140923 + seq_len(20), returnOutput = TRUE))

## ----'exploreResults'----------------------------------------------------
## Explore
names(results)

## ----'exploreOptionsStats'-----------------------------------------------
## Explore optionsStats
names(results$optionsStats)

## Call used
results$optionsStats$analyzeCall

## ----'exploreCovPrep'----------------------------------------------------
## Explore coveragePrep
names(results$coveragePrep)

## Group means
results$coveragePrep$groupMeans

## ----'exploreFstats'-----------------------------------------------------
## Explore optionsStats
results$fstats

## Note that the length matches the number of bases used
identical(length(results$fstats), sum(results$coveragePrep$position))

## ----'exploreRegs'-------------------------------------------------------
## Explore regions
names(results$regions)

## ----'exploreRegs2'------------------------------------------------------
## Permutation summary information
results$regions[2:4]

## ----'exploreRegs3'------------------------------------------------------
## Candidate DERs
results$regions$regions

## ----'sensitivity'-------------------------------------------------------
## Width of potential DERs
summary(width(results$regions$regions))
sum(width(results$regions$regions) > 50)

## Width of candidate DERs
sig <- as.logical(results$regions$regions$significant)
summary(width(results$regions$regions[ sig ]))
sum(width(results$regions$regions[ sig ]) > 50)

## ----'exploreAnnotation'-------------------------------------------------
## Nearest annotation
head(results$annotation)

## ----'exploreTime'-------------------------------------------------------
## Time spent
results$timeinfo

## Use this information to make a plot
timed <- diff(results$timeinfo)
timed.df <- data.frame(Seconds = as.numeric(timed), Step = factor(names(timed),
    levels = rev(names(timed))))
library('ggplot2')
ggplot(timed.df, aes(y = Step, x = Seconds)) + geom_point()

## ----'mergeResults'------------------------------------------------------
## Go back to the original directory
setwd(originalWd)

## Merge results from several chromosomes. In this case we only have one.
mergeResults(chrs='chr21', prefix="analysisResults",
    genomicState = genomicState$fullGenome, 
    optionsStats = results$optionsStats)

## Files created by mergeResults()
dir('analysisResults', pattern = '.Rdata')

## ----'optionsMerge'------------------------------------------------------
## Options used to merge
load(file.path('analysisResults', 'optionsMerge.Rdata'))

## Contents
names(optionsMerge)

## Merge call
optionsMerge$mergeCall

## ----'exploreFullRegs'---------------------------------------------------
## Load all the regions
load(file.path('analysisResults', 'fullRegions.Rdata'))

## Metadata columns
names(mcols(fullRegions))

## ----'exploreFullAnnoRegs'-----------------------------------------------
## Load annotateRegions() output
load(file.path('analysisResults', 'fullAnnotatedRegions.Rdata'))

## Information stored
names(fullAnnotatedRegions)

## Take a peak
lapply(fullAnnotatedRegions, head)

## ----'extra'-------------------------------------------------------------
## Find overlaps between regions and summarized genomic annotation
annoRegs <- annotateRegions(fullRegions, genomicState$fullGenome)

## Indeed, the result is the same because we only used chr21
identical(annoRegs, fullAnnotatedRegions)

## Get the region coverage
regionCov <- getRegionCoverage(fullCov, fullRegions)

## Explore the result
head(regionCov[[1]])

## ----'derfinderPlot', eval = FALSE---------------------------------------
#  library('derfinderPlot')
#  
#  ## Overview of the candidate DERs in the genome
#  plotOverview(regions = fullRegions, annotation = results$annotation,
#      type = 'fwer')
#  
#  suppressPackageStartupMessages(library('TxDb.Hsapiens.UCSC.hg19.knownGene'))
#  txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
#  
#  ## Base-levle coverage plots for the first 10 regions
#  plotRegionCoverage(regions = fullRegions, regionCoverage = regionCov,
#      groupInfo = pheno$group, nearestAnnotation = results$annotation,
#      annotatedRegions = annoRegs, whichRegions=1:10, txdb = txdb, scalefac = 1,
#      ask = FALSE)
#  
#  ## Cluster plot for the first region
#  plotCluster(idx = 1, regions = fullRegions, annotation = results$annotation,
#      coverageInfo = fullCov$chr21, txdb = txdb, groupInfo = pheno$group,
#      titleUse = 'fwer')

## ----'featureLevel'------------------------------------------------------
## Get the exon-level matrix
system.time(exonCov <- coverageToExon(fullCov, genomicState$fullGenome, L = 76))

## Dimensions of the matrix
dim(exonCov)

## Explore a little bit
tail(exonCov)

## ----'regionMatAnnotate'-------------------------------------------------
## Annotate regions as exonic, intronic or intergenic
system.time(annoGenome <- annotateRegions(regionMat$chr21$regions,
    genomicState$fullGenome))
## Note that the genomicState object included in derfinder only has information
## for chr21 (hg19).

## Identify closest genes to regions
suppressPackageStartupMessages(library('bumphunter'))
suppressPackageStartupMessages(library('TxDb.Hsapiens.UCSC.hg19.knownGene'))
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
genes <- annotateTranscripts(txdb)
system.time(annoNear <- matchGenes(regionMat$chr21$regions, genes))

## ----'static-vis', eval = FALSE------------------------------------------
#  ## Identify the top regions by highest total coverage
#  top <- order(regionMat$chr21$regions$area, decreasing = TRUE)[1:100]
#  
#  ## Base-level plots for the top 100 regions with transcript information
#  library('derfinderPlot')
#  plotRegionCoverage(regionMat$chr21$regions,
#      regionCoverage = regionMat$chr21$bpCoverage,
#      groupInfo = pheno$group, nearestAnnotation = annoNear,
#      annotatedRegions = annoGenome, whichRegions = top, scalefac = 1,
#      txdb = txdb, ask = FALSE)

## ----'epivizr', eval = FALSE---------------------------------------------
#  ## Load epivizr, it's available from Bioconductor
#  library('epivizr')
#  
#  ## Load data to your browser
#  mgr <- startEpiviz()
#  ders_dev <- mgr$addDevice(
#      fullRegions[as.logical(fullRegions$significantFWER) ], "Candidate DERs")
#  ders_potential_dev <- mgr$addDevice(
#      fullRegions[!as.logical(fullRegions$significantFWER) ], "Potential DERs")
#  regs_dev <- mgr$addDevice(regionMat$chr21$regions, "Region Matrix")
#  
#  ## Go to a place you like in the genome
#  mgr$navigate("chr21", start(regionMat$chr21$regions[top[1]]) - 100,
#      end(regionMat$chr21$regions[top[1]]) + 100)
#  
#  ## Stop the navigation
#  mgr$stopServer()

## ----'exportBigWig', eval = !windowsFlag---------------------------------
## Subset only the first sample
fullCovSmall <- lapply(fullCov, '[', 1)

## Export to BigWig
bw <- createBw(fullCovSmall)

## See the file. Note that the sample name is used to name the file.
dir(pattern = '.bw')

## Internally createBw() coerces each sample to a GRanges object before 
## exporting to a BigWig file. If more than one sample was exported, the
## GRangesList would have more elements.
bw

## ----'exampleNameStyle', eval = FALSE------------------------------------
#  ## Set global species and chrsStyle options
#  options(species = 'arabidopsis_thaliana')
#  options(chrsStyle = 'NCBI')
#  
#  ## Then proceed to load and analyze the data

## ----'analyzeNonHuman', eval = FALSE-------------------------------------
#  ## Load transcript database information
#  library('TxDb.Athaliana.BioMart.plantsmart28')
#  
#  ## Set organism options
#  options(species = 'arabidopsis_thaliana')
#  options(chrsStyle = 'NCBI')
#  
#  ## Run command
#  analyzeChr(txdb = TxDb.Athaliana.BioMart.plantsmart28, --Other arguments--)

## ----'runExample'--------------------------------------------------------
## Find some regions to work with
example('loadCoverage', 'derfinder')
example('getRegionCoverage', 'derfinder')

## ----'loadWhich'---------------------------------------------------------
## Illustrate reading data from a set of regions
test <- loadCoverage(files = files, chr = '21', cutoff = NULL, which = regions,
    protectWhich = 0, fileStyle = 'NCBI')

## Some reads were ignored and thus the coverage is lower as can be seen below:
sapply(test$coverage, max) - sapply(genomeDataRaw$coverage, max)

## ----'loadWhich2'--------------------------------------------------------
## Illustrate reading data from a set of regions

test2 <- loadCoverage(files = files, chr = '21', cutoff = NULL, 
    which = regions, protectWhich = 3e4, fileStyle = 'NCBI')

## Adding some padding to the regions helps get the same coverage
identical(sapply(test2$coverage, max), sapply(genomeDataRaw$coverage, max))

## A more detailed test reveals that the coverage matches at every base
all(mapply(function(x, y) { identical(x, y) }, test2$coverage, genomeDataRaw$coverage))

## ----'derfinder-analysis', eval = FALSE----------------------------------
#  ## Run derfinder's analysis steps with timing info
#  
#  ## Load libraries
#  library("getopt")
#  
#  ## Available at http:/bioconductor.org/packages/derfinder
#  library("derfinder")
#  
#  ## Specify parameters
#  spec <- matrix(c(
#  	'DFfile', 'd', 1, "character", "path to the .Rdata file with the results from loadCoverage()",
#  	'chr', 'c', 1, "character", "Chromosome under analysis. Use X instead of chrX.",
#  	'mcores', 'm', 1, "integer", "Number of cores",
#  	'verbose' , 'v', 2, "logical", "Print status updates",
#  	'help' , 'h', 0, "logical", "Display help"
#  ), byrow=TRUE, ncol=5)
#  opt <- getopt(spec)
#  
#  ## Testing the script
#  test <- FALSE
#  if(test) {
#  	## Speficy it using an interactive R session and testing
#  	test <- TRUE
#  }
#  
#  ## Test values
#  if(test){
#  	opt <- NULL
#  	opt$DFfile <- "/ProjectDir/derCoverageInfo/chr21Cov.Rdata"
#  	opt$chr <- "21"
#  	opt$mcores <- 1
#  	opt$verbose <- NULL
#  }
#  
#  ## if help was asked for print a friendly message
#  ## and exit with a non-zero error code
#  if (!is.null(opt$help)) {
#  	cat(getopt(spec, usage=TRUE))
#  	q(status=1)
#  }
#  
#  ## Default value for verbose = TRUE
#  if (is.null(opt$verbose)) opt$verbose <- TRUE
#  
#  if(opt$verbose) message("Loading Rdata file with the output from loadCoverage()")
#  load(opt$DFfile)
#  
#  ## Make it easy to use the name later. Here I'm assuming the names were generated using output='auto' in loadCoverage()
#  eval(parse(text=paste0("data <- ", "chr", opt$chr, "CovInfo")))
#  eval(parse(text=paste0("rm(chr", opt$chr, "CovInfo)")))
#  
#  ## Just for testing purposes
#  if(test) {
#  	tmp <- data
#  	tmp$coverage <- tmp$coverage[1:1e6, ]
#  	library("IRanges")
#  	tmp$position[which(tmp$pos)[1e6 + 1]:length(tmp$pos)] <- FALSE
#  	data <- tmp
#  }
#  
#  ## Load the models
#  load("models.Rdata")
#  
#  ## Load group information
#  load("groupInfo.Rdata")
#  
#  
#  ## Run the analysis with lowMemDir
#  analyzeChr(chr=opt$chr, coverageInfo = data, models = models,
#      cutoffFstat = 1e-06, cutoffType = "theoretical", nPermute = 1000,
#      seeds = seq_len(1000), maxClusterGap = 3000, groupInfo = groupInfo,
#      subject = "hg19", mc.cores = opt$mcores,
#      lowMemDir = file.path(tempdir(), paste0("chr", opt$chr) , "chunksDir")),
#      verbose = opt$verbose, chunksize = 1e5)
#  
#  ## Done
#  if(opt$verbose) {
#  	print(proc.time())
#  	print(sessionInfo(), locale=FALSE)
#  }
#  

## ----createVignette, eval=FALSE------------------------------------------
#  ## Create the vignette
#  library('rmarkdown')
#  system.time(render('derfinder-users-guide.Rmd', 'BiocStyle::html_document'))
#  
#  ## Extract the R code
#  library('knitr')
#  knit('derfinder-users-guide.Rmd', tangle = TRUE)

## ----createVignette2-----------------------------------------------------
## Clean up
file.remove('derfinderUsersGuideRed.bib')
unlink('analysisResults', recursive = TRUE)
file.remove('HSB113.bw')
file.remove('meanChr21.bw')

## ----reproducibility1, echo=FALSE----------------------------------------
## Date the vignette was generated
Sys.time()

## ----reproducibility2, echo=FALSE----------------------------------------
## Processing time in seconds
totalTime <- diff(c(startTime, Sys.time()))
round(totalTime, digits=3)

## ----reproducibility3, echo=FALSE-------------------------------------------------------------------------------------
## Session info
library('devtools')
options(width = 120)
session_info()

## ----vignetteBiblio, results = 'asis', echo = FALSE, warning = FALSE--------------------------------------------------
## Print bibliography
bibliography()