Martin Morgan
February 3, 2015
Tools and strategies for working with large data:
- Measure performance: system.time(), Rprof(), microbenchmark()
- Iterate through data in chunks: open(), read chunk(s), close(); the yieldSize argument to Rsamtools::BamFile(); GenomicFiles::reduceByYield()
- Restrict input to just the data needed: Rsamtools::ScanBamParam()
- Sample a subset of the data: ShortRead::FastqSampler()
- Parallel evaluation of lapply()-like operations: BiocParallel::bplapply() for lapply()-like functions, increasingly used by package developers to provide an easy, standard way of gaining parallel evaluation

Write the following as a function. Use system.time() to explore how long this takes to execute as n increases from 100 to 10000. Use identical() and microbenchmark() to compare the alternatives f1(), f2(), and f3() for both correctness and performance. What strategies are these functions using?
f0 <- function(n) {
## inefficient!
ans <- numeric()
for (i in seq_len(n))
ans <- c(ans, exp(i))
ans
}
f1 <- function(n) {
ans <- numeric(n)
for (i in seq_len(n))
ans[[i]] <- exp(i)
ans
}
f2 <- function(n)
sapply(seq_len(n), exp)
f3 <- function(n)
exp(seq_len(n))
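A minimal sketch of the comparison (assumes the microbenchmark package is installed; n = 1000 is an arbitrary choice):
library(microbenchmark)
n <- 1000
## all four implementations should return identical results
stopifnot(identical(f0(n), f1(n)), identical(f1(n), f2(n)), identical(f2(n), f3(n)))
## compare running times
microbenchmark(f0(n), f1(n), f2(n), f3(n), times=10)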
Go to sleep for 1 second, then return i. Applied serially to 8 elements this takes about 8 seconds; the parallel version takes a fraction of that.
library(BiocParallel)
fun <- function(i) {
Sys.sleep(1)
i
}
## serial
f0 <- function(n)
lapply(seq_len(n), fun)
## parallel
f1 <- function(n)
bplapply(seq_len(n), fun)
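A quick timing sketch (the parallel elapsed time depends on the registered back-end and number of workers, so the second figure is only indicative):
system.time(f0(8))  ## about 8 seconds elapsed
system.time(f1(8))  ## much less, e.g. ~2 seconds with 4 workers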
Iterate through files: GenomicFiles::reduceByYield()
suppressPackageStartupMessages({
library(GenomicFiles)
library(GenomicAlignments)
library(Rsamtools)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
})
yield <- # how to input the next chunk of data
function(X, ...)
{
readGAlignments(X)
}
map <- # what to do to each chunk
function(VALUE, ..., roi)
{
olaps <- findOverlaps(VALUE, roi, type="within", ignore.strand=TRUE)
count <- tabulate(subjectHits(olaps), subjectLength(olaps))
setNames(count, names(roi))
}
reduce <- # how to combine mapped chunks
`+`
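The reduce step combines the counts from successive chunks by element-wise addition of the named vectors; an illustration with made-up transcript names:
reduce(c(tx1=1L, tx2=0L), c(tx1=2L, tx2=5L))
## tx1 tx2
##   3   5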
Improvement: “yield factory” to keep track of how many records input
yieldFactory <- # return a function with local state
function()
{
n_records <- 0L
function(X, ...) {
aln <- readGAlignments(X)
n_records <<- n_records + length(aln)
message(n_records)
aln
}
}
Regions of interest, named like the chromosomes in the bam file.
exByTx <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, "tx")
map0 <- read.delim("~/igv/genomes/hg19_alias.tab", header=FALSE,
stringsAsFactors=FALSE)
seqlevels(exByTx, force=TRUE) <- setNames(map0$V1, map0$V2)
A function to iterate through a bam file
count1 <- function(filename, roi) {
message(filename)
## Create and open BAM file
bf <- BamFile(filename, yieldSize=1000000)
reduceByYield(bf, yieldFactory(), map, reduce, roi=roi)
}
In action
filename <- "~/bam/SRR1039508_sorted.bam"
count <- count1(filename, exByTx)
Parallelize
library(BiocParallel)
## all bam files
filenames <- dir("~/bam", pattern="bam$", full.names=TRUE)
names(filenames) <- sub("_sorted.bam", "", basename(filenames))
## iterate
counts <- bplapply(filenames, count1, roi=exByTx)
counts <- simplify2array(counts)
head(counts)
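bplapply() uses whatever parallel back-end is currently registered. To be explicit, one could register a back-end before iterating; a sketch (the worker count is an assumption, and SnowParam() would be the usual choice on Windows):
library(BiocParallel)
register(MulticoreParam(workers=4))  ## assumes 4 cores are available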