## ----setup, echo=FALSE--------------------------------------------------- library(UseBioconductor) stopifnot(BiocInstaller::biocVersion() == "3.1") ## ----style, echo = FALSE, results = 'asis'------------------------------- BiocStyle::markdown() ## ----benchmark----------------------------------------------------------- f0 <- function(n) { ## inefficient! ans <- numeric() for (i in seq_len(n)) ans <- c(ans, exp(i)) ans } f1 <- function(n) { ans <- numeric(n) for (i in seq_len(n)) ans[[i]] <- exp(i) ans } f2 <- function(n) sapply(seq_len(n), exp) f3 <- function(n) exp(seq_len(n)) ## ----parallel-sleep------------------------------------------------------ library(BiocParallel) fun <- function(i) { Sys.sleep(1) i } ## serial f0 <- function(n) lapply(seq_len(n), fun) ## parallel f1 <- function(n) bplapply(seq_len(n), fun) ## ----reduceByYield-setup------------------------------------------------- suppressPackageStartupMessages({ library(GenomicFiles) library(GenomicAlignments) library(Rsamtools) library(TxDb.Hsapiens.UCSC.hg19.knownGene) }) yield <- # how to input the next chunk of data function(X, ...) { readGAlignments(X) } map <- # what to do to each chunk function(VALUE, ..., roi) { olaps <- findOverlaps(VALUE, roi, type="within", ignore.strand=TRUE) count <- tabulate(subjectHits(olaps), subjectLength(olaps)) setNames(count, names(roi)) } reduce <- # how to combine mapped chunks `+` ## ----yieldFactory-------------------------------------------------------- yieldFactory <- # return a function with local state function() { n_records <- 0L function(X, ...) { aln <- readGAlignments(X) n_records <<- n_records + length(aln) message(n_records) aln } } ## ----count-overlaps-roi, eval=FALSE-------------------------------------- # exByTx <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, "tx") # # map0 <- read.delim("~/igv/genomes/hg19_alias.tab", header=FALSE, # stringsAsFactors=FALSE) # seqlevels(exByTx, force=TRUE) <- setNames(map0$V1, map0$V2) ## ----count-overlaps, eval=FALSE------------------------------------------ # count1 <- function(filename, roi) { # message(filename) # ## Create and open BAM file # bf <- BamFile(filename, yieldSize=1000000) # reduceByYield(bf, yieldFactory(), map, reduce, roi=roi) # } ## ----count-overlaps-doit, eval=FALSE------------------------------------- # filename <- "~/bam/SRR1039508_sorted.bam" # count <- count1(filename, exByTx) ## ----count-overlaps-parallel, eval=FALSE--------------------------------- # library(BiocParallel) # # ## all bam files # filenames <- dir("~/bam", pattern="bam$", full=TRUE) # names(filenames) <- sub("_sorted.bam", "", basename(filenames)) # # ## iterate # counts <- bplapply(filenames, count1, exByTx) # counts <- simplify2array(counts) # head(counts)