## ----vignetteSetup, echo=FALSE, message=FALSE, warning = FALSE------------- ## Track time spent on making the vignette startTime <- Sys.time() ## Bib setup library('knitcitations') ## Load knitcitations with a clean bibliography cleanbib() cite_options(hyperlink = 'to.doc', citation_format = 'text', style = 'html') # Note links won't show for now due to the following issue # https://github.com/cboettig/knitcitations/issues/63 ## Write bibliography information bib <- c(knitcitations = citation('knitcitations'), derfinder = citation('derfinder')[1], BiocStyle = citation('BiocStyle'), knitrBootstrap = citation('knitrBootstrap'), knitr = citation('knitr')[3], rmarkdown = citation('rmarkdown'), brainspan = RefManageR::BibEntry(bibtype = 'Unpublished', key = 'brainspan', title = 'Atlas of the Developing Human Brain [Internet]. Funded by ARRA Awards 1RC2MH089921-01, 1RC2MH090047-01, and 1RC2MH089929-01.', author = 'BrainSpan', year = 2011, url = 'http://www.brainspan.org/'), originalder = citation('derfinder')[2], R = citation(), IRanges = citation('IRanges'), sessioninfo = citation('sessioninfo'), testthat = citation('testthat'), GenomeInfoDb = RefManageR::BibEntry(bibtype = 'manual', key = 'GenomeInfoDb', author = 'Sonali Arora and Martin Morgan and Marc Carlson and H. Pagès', title = "GenomeInfoDb: Utilities for manipulating chromosome and other 'seqname' identifiers", year = 2017, doi = '10.18129/B9.bioc.GenomeInfoDb'), GenomicRanges = citation('GenomicRanges'), ggplot2 = citation('ggplot2'), biovizBase = citation('biovizBase'), bumphunter = citation('bumphunter')[1], TxDb.Hsapiens.UCSC.hg19.knownGene = citation('TxDb.Hsapiens.UCSC.hg19.knownGene'), AnnotationDbi = RefManageR::BibEntry(bibtype = 'manual', key = 'AnnotationDbi', author = 'Hervé Pagès and Marc Carlson and Seth Falcon and Nianhua Li', title = 'AnnotationDbi: Annotation Database Interface', year = 2017, doi = '10.18129/B9.bioc.AnnotationDbi'), BiocParallel = citation('BiocParallel'), derfinderHelper = citation('derfinderHelper'), GenomicAlignments = citation('GenomicAlignments'), GenomicFeatures = citation('GenomicFeatures'), GenomicFiles = citation('GenomicFiles'), Hmisc = citation('Hmisc'), qvalue = citation('qvalue'), Rsamtools = citation('Rsamtools'), rtracklayer = citation('rtracklayer'), S4Vectors = RefManageR::BibEntry(bibtype = 'manual', key = 'S4Vectors', author = 'Hervé Pagès and Michael Lawrence and Patrick Aboyoun', title = "S4Vectors: S4 implementation of vector-like and list-like objects", year = 2017, doi = '10.18129/B9.bioc.S4Vectors'), bumphunterPaper = RefManageR::BibEntry(bibtype = 'article', key = 'bumphunterPaper', title = 'Bump hunting to identify differentially methylated regions in epigenetic epidemiology studies', author = 'Jaffe, Andrew E and Murakami, Peter and Lee, Hwajin and Leek, Jeffrey T and Fallin, M Daniele and Feinberg, Andrew P and Irizarry, Rafael A', year = 2012, journal = 'International Journal of Epidemiology'), derfinderData = citation('derfinderData') ) write.bibtex(bib, file = 'derfinderUsersGuideRed.bib') ## Working on Windows? windowsFlag <- .Platform$OS.type == 'windows' ## ----'start', message=FALSE------------------------------------------------ ## Load libraries library('derfinder') library('derfinderData') library('GenomicRanges') ## ----'phenoData', results = 'asis'----------------------------------------- library('knitr') ## Get pheno table pheno <- subset(brainspanPheno, structure_acronym == 'AMY') ## Display the main information p <- pheno[, -which(colnames(pheno) %in% c('structure_acronym', 'structure_name', 'file'))] rownames(p) <- NULL kable(p, format = 'html', row.names = TRUE) ## ----'getData', eval = !windowsFlag---------------------------------------- ## Determine the files to use and fix the names files <- rawFiles(system.file('extdata', 'AMY', package = 'derfinderData'), samplepatt = 'bw', fileterm = NULL) names(files) <- gsub('.bw', '', names(files)) ## Load the data from disk system.time(fullCov <- fullCoverage(files = files, chrs = 'chr21', totalMapped = rep(1, length(files)), targetSize = 1)) ## ----'getDataWindows', eval = windowsFlag, echo = FALSE-------------------- # ## Load data in Windows case # foo <- function() { # load(system.file('extdata', 'fullCov', 'fullCovAMY.RData', # package = 'derfinderData')) # return(fullCovAMY) # } # fullCov <- foo() ## ----'webData', eval = FALSE----------------------------------------------- # ## Determine the files to use and fix the names # files <- pheno$file # names(files) <- gsub('.AMY', '', pheno$lab) # # ## Load the data from the web # system.time(fullCov <- fullCoverage(files = files, chrs = 'chr21', # totalMapped = rep(1, length(files)), targetSize = 1)) ## ----'exploreFullCov'------------------------------------------------------ ## Lets explore it fullCov ## ----'filterCov'----------------------------------------------------------- ## Filter coverage filteredCov <- lapply(fullCov, filterData, cutoff = 2) ## ----'exploreFilteredCov'-------------------------------------------------- ## Similar to fullCov but with $position filteredCov ## ----'compareCov'---------------------------------------------------------- ## Compare the size in Mb round(c(fullCov = object.size(fullCov), filteredCov = object.size(filteredCov)) / 1024^2, 1) ## ----'regionMatrix'-------------------------------------------------------- ## Use regionMatrix() system.time(regionMat <- regionMatrix(fullCov, cutoff = 30, L = 76)) ## Explore results class(regionMat) names(regionMat$chr21) ## ----'exploreRegMatRegs'--------------------------------------------------- ## regions output regionMat$chr21$regions ## Number of regions length(regionMat$chr21$regions) ## ----'exploreRegMatBP'----------------------------------------------------- ## Base-level coverage matrices for each of the regions ## Useful for plotting lapply(regionMat$chr21$bpCoverage[1:2], head, n = 2) ## Check dimensions. First region is 565 long, second one is 138 bp long. ## The columns match the number of samples (12 in this case). lapply(regionMat$chr21$bpCoverage[1:2], dim) ## ----'exploreRegMatrix'---------------------------------------------------- ## Dimensions of the coverage matrix dim(regionMat$chr21$coverageMatrix) ## Coverage for each region. This matrix can then be used with limma or other pkgs head(regionMat$chr21$coverageMatrix) ## ----'identifyDERsDESeq2'-------------------------------------------------- ## Required library('DESeq2') ## Round matrix counts <- round(regionMat$chr21$coverageMatrix) ## Round matrix and specify design dse <- DESeqDataSetFromMatrix(counts, pheno, ~ group + gender) ## Perform DE analysis dse <- DESeq(dse, test = 'LRT', reduced = ~ gender, fitType = 'local') ## Extract results deseq <- regionMat$chr21$regions mcols(deseq) <- c(mcols(deseq), results(dse)) ## Explore the results deseq ## ----'buildLimmaModels'---------------------------------------------------- ## Build models mod <- model.matrix(~ pheno$group + pheno$gender) mod0 <- model.matrix(~ pheno$gender) ## ----'transformForLimma'--------------------------------------------------- ## Transform coverage transformedCov <- log2(regionMat$chr21$coverageMatrix + 32) ## ----'identifyDERsLimma'--------------------------------------------------- ## Example using limma library('limma') ## Run limma fit <- lmFit(transformedCov, mod) fit0 <- lmFit(transformedCov, mod0) ## Determine DE status for the regions ## Also in https://github.com/LieberInstitute/jaffelab with help and examples getF <- function(fit, fit0, theData) { rss1 = rowSums((fitted(fit)-theData)^2) df1 = ncol(fit$coef) rss0 = rowSums((fitted(fit0)-theData)^2) df0 = ncol(fit0$coef) fstat = ((rss0-rss1)/(df1-df0))/(rss1/(ncol(theData)-df1)) f_pval = pf(fstat, df1-df0, ncol(theData)-df1,lower.tail=FALSE) fout = cbind(fstat,df1-1,ncol(theData)-df1,f_pval) colnames(fout)[2:3] = c("df1","df0") fout = data.frame(fout) return(fout) } ff <- getF(fit, fit0, transformedCov) ## Get the p-value and assign it to the regions limma <- regionMat$chr21$regions limma$fstat <- ff$fstat limma$pvalue <- ff$f_pval limma$padj <- p.adjust(ff$f_pval, 'BH') ## Explore the results limma ## ----limmaVSdeseq2--------------------------------------------------------- table(limma$padj < 0.05, deseq$padj < 0.05) ## ----'createMeanBW', eval = !windowsFlag----------------------------------- ## Calculate the mean: this step takes a long time with many samples meanCov <- Reduce('+', fullCov$chr21) / ncol(fullCov$chr21) ## Save it on a bigwig file called meanChr21.bw createBw(list('chr21' = DataFrame('meanChr21' = meanCov)), keepGR = FALSE) ## ----'railMatrix', eval = !windowsFlag------------------------------------- ## Identify files to use summaryFile <- 'meanChr21.bw' ## We had already found the sample BigWig files and saved it in the object 'files' ## Lets just rename it to sampleFiles for clarity. sampleFiles <- files ## Get the regions system.time( regionMat.rail <- railMatrix(chrs = 'chr21', summaryFiles = summaryFile, sampleFiles = sampleFiles, L = 76, cutoff = 30, maxClusterGap = 3000L) ) ## ----'checkDifferences', eval = !windowsFlag------------------------------- ## Overall not identical due to small rounding errors identical(regionMat, regionMat.rail) ## Actual regions are the same identical(ranges(regionMat$chr21$regions), ranges(regionMat.rail$chr21$regions)) ## When you round, the small differences go away identical(round(regionMat$chr21$regions$value, 4), round(regionMat.rail$chr21$regions$value, 4)) identical(round(regionMat$chr21$regions$area, 4), round(regionMat.rail$chr21$regions$area, 4)) ## ----'libSize'------------------------------------------------------------- ## Get some idea of the library sizes sampleDepths <- sampleDepth(collapseFullCoverage(fullCov), 1) sampleDepths ## ----'makeModels'---------------------------------------------------------- ## Define models models <- makeModels(sampleDepths, testvars = pheno$group, adjustvars = pheno[, c('gender')]) ## Explore the models lapply(models, head) ## ----'analyze'------------------------------------------------------------- ## Create a analysis directory dir.create('analysisResults') originalWd <- getwd() setwd(file.path(originalWd, 'analysisResults')) ## Perform differential expression analysis system.time(results <- analyzeChr(chr = 'chr21', filteredCov$chr21, models, groupInfo = pheno$group, writeOutput = TRUE, cutoffFstat = 5e-02, nPermute = 20, seeds = 20140923 + seq_len(20), returnOutput = TRUE)) ## ----'exploreResults'------------------------------------------------------ ## Explore names(results) ## ----'exploreOptionsStats'------------------------------------------------- ## Explore optionsStats names(results$optionsStats) ## Call used results$optionsStats$analyzeCall ## ----'exploreCovPrep'------------------------------------------------------ ## Explore coveragePrep names(results$coveragePrep) ## Group means results$coveragePrep$groupMeans ## ----'exploreFstats'------------------------------------------------------- ## Explore optionsStats results$fstats ## Note that the length matches the number of bases used identical(length(results$fstats), sum(results$coveragePrep$position)) ## ----'exploreRegs'--------------------------------------------------------- ## Explore regions names(results$regions) ## ----'exploreRegs2'-------------------------------------------------------- ## Permutation summary information results$regions[2:4] ## ----'exploreRegs3'-------------------------------------------------------- ## Candidate DERs results$regions$regions ## ----'sensitivity'--------------------------------------------------------- ## Width of potential DERs summary(width(results$regions$regions)) sum(width(results$regions$regions) > 50) ## Width of candidate DERs sig <- as.logical(results$regions$regions$significant) summary(width(results$regions$regions[ sig ])) sum(width(results$regions$regions[ sig ]) > 50) ## ----'exploreAnnotation'--------------------------------------------------- ## Nearest annotation head(results$annotation) ## ----'exploreTime', fig.cap = "Seconds used to run each step in analyzeChr()."---- ## Time spent results$timeinfo ## Use this information to make a plot timed <- diff(results$timeinfo) timed.df <- data.frame(Seconds = as.numeric(timed), Step = factor(names(timed), levels = rev(names(timed)))) library('ggplot2') ggplot(timed.df, aes(y = Step, x = Seconds)) + geom_point() ## ----'mergeResults'-------------------------------------------------------- ## Go back to the original directory setwd(originalWd) ## Merge results from several chromosomes. In this case we only have one. mergeResults(chrs='chr21', prefix="analysisResults", genomicState = genomicState$fullGenome, optionsStats = results$optionsStats) ## Files created by mergeResults() dir('analysisResults', pattern = '.Rdata') ## ----'optionsMerge'-------------------------------------------------------- ## Options used to merge load(file.path('analysisResults', 'optionsMerge.Rdata')) ## Contents names(optionsMerge) ## Merge call optionsMerge$mergeCall ## ----'exploreFullRegs'----------------------------------------------------- ## Load all the regions load(file.path('analysisResults', 'fullRegions.Rdata')) ## Metadata columns names(mcols(fullRegions)) ## ----'exploreFullAnnoRegs'------------------------------------------------- ## Load annotateRegions() output load(file.path('analysisResults', 'fullAnnotatedRegions.Rdata')) ## Information stored names(fullAnnotatedRegions) ## Take a peak lapply(fullAnnotatedRegions, head) ## ----'extra'--------------------------------------------------------------- ## Find overlaps between regions and summarized genomic annotation annoRegs <- annotateRegions(fullRegions, genomicState$fullGenome) ## Indeed, the result is the same because we only used chr21 identical(annoRegs, fullAnnotatedRegions) ## Get the region coverage regionCov <- getRegionCoverage(fullCov, fullRegions) ## Explore the result head(regionCov[[1]]) ## ----'derfinderPlot', eval = FALSE----------------------------------------- # library('derfinderPlot') # # ## Overview of the candidate DERs in the genome # plotOverview(regions = fullRegions, annotation = results$annotation, # type = 'fwer') # # suppressPackageStartupMessages(library('TxDb.Hsapiens.UCSC.hg19.knownGene')) # txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene # # ## Base-levle coverage plots for the first 10 regions # plotRegionCoverage(regions = fullRegions, regionCoverage = regionCov, # groupInfo = pheno$group, nearestAnnotation = results$annotation, # annotatedRegions = annoRegs, whichRegions=1:10, txdb = txdb, scalefac = 1, # ask = FALSE) # # ## Cluster plot for the first region # plotCluster(idx = 1, regions = fullRegions, annotation = results$annotation, # coverageInfo = fullCov$chr21, txdb = txdb, groupInfo = pheno$group, # titleUse = 'fwer') ## ----'featureLevel'-------------------------------------------------------- ## Get the exon-level matrix system.time(exonCov <- coverageToExon(fullCov, genomicState$fullGenome, L = 76)) ## Dimensions of the matrix dim(exonCov) ## Explore a little bit tail(exonCov) ## ----'regionMatAnnotate'--------------------------------------------------- ## Annotate regions as exonic, intronic or intergenic system.time(annoGenome <- annotateRegions(regionMat$chr21$regions, genomicState$fullGenome)) ## Note that the genomicState object included in derfinder only has information ## for chr21 (hg19). ## Identify closest genes to regions suppressPackageStartupMessages(library('bumphunter')) suppressPackageStartupMessages(library('TxDb.Hsapiens.UCSC.hg19.knownGene')) txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene genes <- annotateTranscripts(txdb) system.time(annoNear <- matchGenes(regionMat$chr21$regions, genes)) ## ----'static-vis', eval = FALSE-------------------------------------------- # ## Identify the top regions by highest total coverage # top <- order(regionMat$chr21$regions$area, decreasing = TRUE)[1:100] # # ## Base-level plots for the top 100 regions with transcript information # library('derfinderPlot') # plotRegionCoverage(regionMat$chr21$regions, # regionCoverage = regionMat$chr21$bpCoverage, # groupInfo = pheno$group, nearestAnnotation = annoNear, # annotatedRegions = annoGenome, whichRegions = top, scalefac = 1, # txdb = txdb, ask = FALSE) ## ----'epivizr', eval = FALSE----------------------------------------------- # ## Load epivizr, it's available from Bioconductor # library('epivizr') # # ## Load data to your browser # mgr <- startEpiviz() # ders_dev <- mgr$addDevice( # fullRegions[as.logical(fullRegions$significantFWER) ], "Candidate DERs") # ders_potential_dev <- mgr$addDevice( # fullRegions[!as.logical(fullRegions$significantFWER) ], "Potential DERs") # regs_dev <- mgr$addDevice(regionMat$chr21$regions, "Region Matrix") # # ## Go to a place you like in the genome # mgr$navigate("chr21", start(regionMat$chr21$regions[top[1]]) - 100, # end(regionMat$chr21$regions[top[1]]) + 100) # # ## Stop the navigation # mgr$stopServer() ## ----'exportBigWig', eval = !windowsFlag----------------------------------- ## Subset only the first sample fullCovSmall <- lapply(fullCov, '[', 1) ## Export to BigWig bw <- createBw(fullCovSmall) ## See the file. Note that the sample name is used to name the file. dir(pattern = '.bw') ## Internally createBw() coerces each sample to a GRanges object before ## exporting to a BigWig file. If more than one sample was exported, the ## GRangesList would have more elements. bw ## ----'exampleNameStyle', eval = FALSE-------------------------------------- # ## Set global species and chrsStyle options # options(species = 'arabidopsis_thaliana') # options(chrsStyle = 'NCBI') # # ## Then proceed to load and analyze the data ## ----'analyzeNonHuman', eval = FALSE--------------------------------------- # ## Load transcript database information # library('TxDb.Athaliana.BioMart.plantsmart28') # # ## Set organism options # options(species = 'arabidopsis_thaliana') # options(chrsStyle = 'NCBI') # # ## Run command # analyzeChr(txdb = TxDb.Athaliana.BioMart.plantsmart28, --Other arguments--) ## ----'runExample'---------------------------------------------------------- ## Find some regions to work with example('loadCoverage', 'derfinder') example('getRegionCoverage', 'derfinder') ## ----'loadWhich'----------------------------------------------------------- ## Illustrate reading data from a set of regions test <- loadCoverage(files = files, chr = '21', cutoff = NULL, which = regions, protectWhich = 0, fileStyle = 'NCBI') ## Some reads were ignored and thus the coverage is lower as can be seen below: sapply(test$coverage, max) - sapply(genomeDataRaw$coverage, max) ## ----'loadWhich2'---------------------------------------------------------- ## Illustrate reading data from a set of regions test2 <- loadCoverage(files = files, chr = '21', cutoff = NULL, which = regions, protectWhich = 3e4, fileStyle = 'NCBI') ## Adding some padding to the regions helps get the same coverage identical(sapply(test2$coverage, max), sapply(genomeDataRaw$coverage, max)) ## A more detailed test reveals that the coverage matches at every base all(mapply(function(x, y) { identical(x, y) }, test2$coverage, genomeDataRaw$coverage)) ## ----Figure1, out.width="100%", fig.align="center", fig.cap = "derfinder F-statistics flow chart.", echo = FALSE---- knitr::include_graphics("http://lcolladotor.github.io/derfinder/fig/DERpathway.png") ## ----Figure2, out.width="100%", fig.align="center", fig.cap = "analyzeChr() flow chart.", echo = FALSE---- knitr::include_graphics("http://lcolladotor.github.io/derfinder/fig/analyzeChr.png") ## ----Figure3, out.width="100%", fig.align="center", fig.cap = "regionMatrix() flow chart.", echo = FALSE---- knitr::include_graphics("http://lcolladotor.github.io/derfinder/fig/regionMatrix.png") ## ----'derfinder-analysis', eval = FALSE------------------------------------ # ## Run derfinder's analysis steps with timing info # # ## Load libraries # library("getopt") # # ## Available at http:/bioconductor.org/packages/derfinder # library("derfinder") # # ## Specify parameters # spec <- matrix(c( # 'DFfile', 'd', 1, "character", "path to the .Rdata file with the results from loadCoverage()", # 'chr', 'c', 1, "character", "Chromosome under analysis. Use X instead of chrX.", # 'mcores', 'm', 1, "integer", "Number of cores", # 'verbose' , 'v', 2, "logical", "Print status updates", # 'help' , 'h', 0, "logical", "Display help" # ), byrow=TRUE, ncol=5) # opt <- getopt(spec) # # ## Testing the script # test <- FALSE # if(test) { # ## Speficy it using an interactive R session and testing # test <- TRUE # } # # ## Test values # if(test){ # opt <- NULL # opt$DFfile <- "/ProjectDir/derCoverageInfo/chr21Cov.Rdata" # opt$chr <- "21" # opt$mcores <- 1 # opt$verbose <- NULL # } # # ## if help was asked for print a friendly message # ## and exit with a non-zero error code # if (!is.null(opt$help)) { # cat(getopt(spec, usage=TRUE)) # q(status=1) # } # # ## Default value for verbose = TRUE # if (is.null(opt$verbose)) opt$verbose <- TRUE # # if(opt$verbose) message("Loading Rdata file with the output from loadCoverage()") # load(opt$DFfile) # # ## Make it easy to use the name later. Here I'm assuming the names were generated using output='auto' in loadCoverage() # eval(parse(text=paste0("data <- ", "chr", opt$chr, "CovInfo"))) # eval(parse(text=paste0("rm(chr", opt$chr, "CovInfo)"))) # # ## Just for testing purposes # if(test) { # tmp <- data # tmp$coverage <- tmp$coverage[1:1e6, ] # library("IRanges") # tmp$position[which(tmp$pos)[1e6 + 1]:length(tmp$pos)] <- FALSE # data <- tmp # } # # ## Load the models # load("models.Rdata") # # ## Load group information # load("groupInfo.Rdata") # # # ## Run the analysis with lowMemDir # analyzeChr(chr=opt$chr, coverageInfo = data, models = models, # cutoffFstat = 1e-06, cutoffType = "theoretical", nPermute = 1000, # seeds = seq_len(1000), maxClusterGap = 3000, groupInfo = groupInfo, # subject = "hg19", mc.cores = opt$mcores, # lowMemDir = file.path(tempdir(), paste0("chr", opt$chr) , "chunksDir")), # verbose = opt$verbose, chunksize = 1e5) # # ## Done # if(opt$verbose) { # print(proc.time()) # print(sessionInfo(), locale=FALSE) # } # ## ----createVignette, eval=FALSE-------------------------------------------- # ## Create the vignette # library('rmarkdown') # system.time(render('derfinder-users-guide.Rmd', 'BiocStyle::html_document')) # # ## Extract the R code # library('knitr') # knit('derfinder-users-guide.Rmd', tangle = TRUE) ## ----createVignette2------------------------------------------------------- ## Clean up file.remove('derfinderUsersGuideRed.bib') unlink('analysisResults', recursive = TRUE) file.remove('HSB113.bw') file.remove('meanChr21.bw') ## ----reproducibility1, echo=FALSE------------------------------------------ ## Date the vignette was generated Sys.time() ## ----reproducibility2, echo=FALSE------------------------------------------ ## Processing time in seconds totalTime <- diff(c(startTime, Sys.time())) round(totalTime, digits=3) ## ----reproducibility3, echo=FALSE------------------------------------------------------------------------------------- ## Session info library('sessioninfo') options(width = 120) session_info() ## ----vignetteBiblio, results = 'asis', echo = FALSE, warning = FALSE, message = FALSE--------------------------------- ## Print bibliography bibliography()