## ---- echo=FALSE, results="hide", warning=FALSE------------------------------- suppressPackageStartupMessages({ library(ChIPpeakAnno) library(rtracklayer) library(EnsDb.Hsapiens.v75) library(TxDb.Hsapiens.UCSC.hg19.knownGene) library(org.Hs.eg.db) library(reactome.db) library(BSgenome.Hsapiens.UCSC.hg19) library(seqinr) library(UpSetR) library(trackViewer) }) knitr::opts_chunk$set(warning=FALSE, message=FALSE) ## ----workflow1, fig.cap="Venn diagram of overlaps for replicated experiments", fig.width=6, fig.height=6---- library(ChIPpeakAnno) bed <- system.file("extdata", "MACS_output.bed", package="ChIPpeakAnno") gr1 <- toGRanges(bed, format="BED", header=FALSE) ## one can also try import from rtracklayer gff <- system.file("extdata", "GFF_peaks.gff", package="ChIPpeakAnno") gr2 <- toGRanges(gff, format="GFF", header=FALSE, skip=3) ## must keep the class exactly same as gr1$score, i.e., numeric. gr2$score <- as.numeric(gr2$score) ol <- findOverlapsOfPeaks(gr1, gr2) ## add metadata (mean of score) to the overlapping peaks ol <- addMetadata(ol, colNames="score", FUN=mean) ol$peaklist[["gr1///gr2"]][1:2] makeVennDiagram(ol, fill=c("#009E73", "#F0E442"), # circle fill color col=c("#D55E00", "#0072B2"), #circle border color cat.col=c("#D55E00", "#0072B2")) # label color, keep same as circle border color ## ----annoData----------------------------------------------------------------- library(EnsDb.Hsapiens.v75) ##(hg19) ## create annotation file from EnsDb or TxDb annoData <- toGRanges(EnsDb.Hsapiens.v75, feature="gene") annoData[1:2] ## ----workflow2,fig.cap="Distribution of peaks around transcript start sites.",fig.width=8,fig.height=6---- overlaps <- ol$peaklist[["gr1///gr2"]] binOverFeature(overlaps, annotationData=annoData, radius=5000, nbins=20, FUN=length, errFun=0, xlab="distance from TSS (bp)", ylab="count", main="Distribution of aggregated peak numbers around TSS") ## ----workflow4,fig.cap="Peak distribution over different genomic features.",fig.width=10,fig.height=4---- ## check the genomic element distribution of the duplicates ## the genomic element distribution will indicates the ## the correlation between duplicates. library(TxDb.Hsapiens.UCSC.hg19.knownGene) peaks <- GRangesList(rep1=gr1, rep2=gr2) genomicElementDistribution(peaks, TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene, promoterRegion=c(upstream=2000, downstream=500), geneDownstream=c(upstream=0, downstream=2000)) ## check the genomic element distribution for the overlaps ## the genomic element distribution will indicates the ## the best methods for annotation. ## The percentages in the legend show the percentage of peaks in ## each category. out <- genomicElementDistribution(overlaps, TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene, promoterRegion=c(upstream=2000, downstream=500), geneDownstream=c(upstream=0, downstream=2000), promoterLevel=list( # from 5' -> 3', fixed precedence 3' -> 5' breaks = c(-2000, -1000, -500, 0, 500), labels = c("upstream 1-2Kb", "upstream 0.5-1Kb", "upstream <500b", "TSS - 500b"), colors = c("#FFE5CC", "#FFCA99", "#FFAD65", "#FF8E32"))) ## check the genomic element distribution by upset plot. ## by function genomicElementUpSetR, no precedence will be considered. library(UpSetR) x <- genomicElementUpSetR(overlaps, TxDb.Hsapiens.UCSC.hg19.knownGene) upset(x$plotData, nsets=13, nintersects=NA) ## ----------------------------------------------------------------------------- metagenePlot(peaks, TxDb.Hsapiens.UCSC.hg19.knownGene) ## ----workflow3---------------------------------------------------------------- overlaps.anno <- annotatePeakInBatch(overlaps, AnnotationData=annoData, output="nearestBiDirectionalPromoters", bindingRegion=c(-2000, 500)) library(org.Hs.eg.db) overlaps.anno <- addGeneIDs(overlaps.anno, "org.Hs.eg.db", IDs2Add = "entrez_id") head(overlaps.anno) ol.anno.out <- unname(overlaps.anno) ol.anno.out$peakNames <- NULL # remove the CharacterList to avoid error message. write.csv(as.data.frame(ol.anno.out), "anno.csv") ## ----workflow20,fig.cap="Pie chart of the distribution of common peaks around features."---- pie1(table(overlaps.anno$insideFeature)) ## ----enrichment--------------------------------------------------------------- over <- getEnrichedGO(overlaps.anno, orgAnn="org.Hs.eg.db", condense=TRUE) enrichmentPlot(over) library(reactome.db) path <- getEnrichedPATH(overlaps.anno, "org.Hs.eg.db", "reactome.db", maxP=.05) enrichmentPlot(path) ## ----fasta-------------------------------------------------------------------- library(BSgenome.Hsapiens.UCSC.hg19) seq <- getAllPeakSequence(overlaps, upstream=20, downstream=20, genome=Hsapiens) write2FASTA(seq, "test.fa") ## ----consensus,fig.cap="Histogram of Z-score of 6-mer",fig.height=6,fig.width=6---- ## summary of the short oligos library(seqinr) freqs <- oligoFrequency(Hsapiens$chr1, MarkovOrder=3) os <- oligoSummary(seq, oligoLength=6, MarkovOrder=3, quickMotif=FALSE, freqs=freqs) ## plot the results zscore <- sort(os$zscore) h <- hist(zscore, breaks=100, main="Histogram of Z-score") text(zscore[length(zscore)], h$counts[length(h$counts)]+1, labels=names(zscore[length(zscore)]), adj=0, srt=90) ## ----simulation, fig.cap="Histogram of Z-score of simulation data", fig.width=6, fig.height=6---- ## We can also try simulation data seq.sim.motif <- list(c("t", "g", "c", "a", "t", "g"), c("g", "c", "a", "t", "g", "c")) set.seed(1) seq.sim <- sapply(sample(c(2, 1, 0), 1000, replace=TRUE, prob=c(0.07, 0.1, 0.83)), function(x){ s <- sample(c("a", "c", "g", "t"), sample(100:1000, 1), replace=TRUE) if(x>0){ si <- sample.int(length(s), 1) if(si>length(s)-6) si <- length(s)-6 s[si:(si+5)] <- seq.sim.motif[[x]] } paste(s, collapse="") }) os <- oligoSummary(seq.sim, oligoLength=6, MarkovOrder=3, quickMotif=TRUE) zscore <- sort(os$zscore, decreasing=TRUE) h <- hist(zscore, breaks=100, main="Histogram of Z-score") text(zscore[1:2], rep(5, 2), labels=names(zscore[1:2]), adj=0, srt=90) ## ----simulation.motif, fig.cap="Motif of simulation data", fig.width=6, fig.height=6---- ## generate the motifs library(motifStack) pfms <- mapply(function(.ele, id) new("pfm", mat=.ele, name=paste("SAMPLE motif", id)), os$motifs, 1:length(os$motifs)) motifStack(pfms[[1]]) ## ----peaksNearBDP16----------------------------------------------------------- bdp <- peaksNearBDP(overlaps, annoData, maxgap=5000) c(bdp$percentPeaksWithBDP, bdp$n.peaks, bdp$n.peaksWithBDP) bdp$peaksWithBDP[1:2] ## ----findEnhancers------------------------------------------------------------ DNA5C <- system.file("extdata", "wgEncodeUmassDekker5CGm12878PkV2.bed.gz", package="ChIPpeakAnno") DNAinteractiveData <- toGRanges(gzfile(DNA5C)) findEnhancers(overlaps, annoData, DNAinteractiveData) ## ----importData--------------------------------------------------------------- path <- system.file("extdata", package="ChIPpeakAnno") files <- dir(path, "broadPeak") data <- sapply(file.path(path, files), toGRanges, format="broadPeak") names(data) <- gsub(".broadPeak", "", files) ## ----vennDiagram, fig.cap="Venn diagram of overlaps.", fig.width=6, fig.height=6---- ol <- findOverlapsOfPeaks(data, connectedPeaks="keepAll") averagePeakWidth <- mean(width(unlist(GRangesList(ol$peaklist)))) tot <- ceiling(3.3e+9 * .03 / averagePeakWidth / 24) makeVennDiagram(ol, totalTest=tot, connectedPeaks="keepAll", fill=c("#CC79A7", "#56B4E9", "#F0E442"), # circle fill color col=c("#D55E00", "#0072B2", "#E69F00"), #circle border color cat.col=c("#D55E00", "#0072B2", "#E69F00")) ## see the difference if we set connectedPeaks to "keepFirstListConsistent" ## set connectedPeaks to keepFirstListConsistent will show consistent total ## number of peaks for the first peak list. makeVennDiagram(ol, totalTest=tot, connectedPeaks="keepFirstListConsistent", fill=c("#CC79A7", "#56B4E9", "#F0E442"), col=c("#D55E00", "#0072B2", "#E69F00"), cat.col=c("#D55E00", "#0072B2", "#E69F00")) ## ----peakPermTest1, fig.cap="permutation test for YY1 and TEAD4"-------------- data(HOT.spots) data(wgEncodeTfbsV3) hotGR <- reduce(unlist(HOT.spots)) removeOl <- function(.ele){ ol <- findOverlaps(.ele, hotGR) if(length(ol)>0) .ele <- .ele[-unique(queryHits(ol))] .ele } TAF <- removeOl(data[["TAF"]]) TEAD4 <- removeOl(data[["Tead4"]]) YY1 <- removeOl(data[["YY1"]]) # we subset the pool to save demo time set.seed(1) wgEncodeTfbsV3.subset <- wgEncodeTfbsV3[sample.int(length(wgEncodeTfbsV3), 2000)] pool <- new("permPool", grs=GRangesList(wgEncodeTfbsV3.subset), N=length(YY1)) pt1 <- peakPermTest(YY1, TEAD4, pool=pool, seed=1, force.parallel=FALSE) plot(pt1) ## ----peakPermTest2, fig.cap="permutation test for YY1 and TAF"---------------- pt2 <- peakPermTest(YY1, TAF, pool=pool, seed=1, force.parallel=FALSE) plot(pt2) ## ----heatmap,fig.cap="Heatmap of aligned features sorted by signal of TAF",fig.width=4,fig.height=6---- features <- ol$peaklist[[length(ol$peaklist)]] feature.recentered <- reCenterPeaks(features, width=4000) ## here we also suggest importData function in bioconductor trackViewer package ## to import the coverage. ## compare rtracklayer, it will save you time when handle huge dataset. library(rtracklayer) files <- dir(path, "bigWig") if(.Platform$OS.type != "windows"){ cvglists <- sapply(file.path(path, files), import, format="BigWig", which=feature.recentered, as="RleList") }else{## rtracklayer can not import bigWig files on Windows load(file.path(path, "cvglist.rds")) } names(cvglists) <- gsub(".bigWig", "", files) feature.center <- reCenterPeaks(features, width=1) sig <- featureAlignedSignal(cvglists, feature.center, upstream=2000, downstream=2000) ##Because the bw file is only a subset of the original file, ##the signals are not exists for every peak. keep <- rowSums(sig[[2]]) > 0 sig <- sapply(sig, function(.ele) .ele[keep, ], simplify = FALSE) feature.center <- feature.center[keep] heatmap <- featureAlignedHeatmap(sig, feature.center, upstream=2000, downstream=2000, upper.extreme=c(3,.5,4)) ## ----sortHeatmapByHcluster,fig.cap="Heatmap of aligned features sorted by hclut",fig.width=4,fig.height=6---- sig.rowsums <- sapply(sig, rowSums, na.rm=TRUE) d <- dist(sig.rowsums) hc <- hclust(d) feature.center$order <- hc$order heatmap <- featureAlignedHeatmap(sig, feature.center, upstream=2000, downstream=2000, upper.extreme=c(3,.5,4), sortBy="order") ## ----distribution,fig.cap="Distribution of aligned features",fig.width=6,fig.height=6---- featureAlignedDistribution(sig, feature.center, upstream=2000, downstream=2000, type="l")