## ----style, eval=TRUE, echo=FALSE, results='asis'------------------------
options(max.print=1000)
BiocStyle::latex()
library(knitr)
opts_chunk$set(cache=TRUE, tidy=FALSE)


## ----packages, eval=TRUE, echo=FALSE, warning=FALSE, message=FALSE-------
suppressPackageStartupMessages({
    library(ShortRead)
    library(VariantAnnotation)
    library(parallel); options(mc.cores=detectCores())
    library(ggplot2)
    library(RNAseqData.HNRNPC.bam.chr14)
    library(org.Hs.eg.db)
    library(TxDb.Hsapiens.UCSC.hg19.knownGene)
    library(BSgenome.Hsapiens.UCSC.hg19)
    library(AnnotationHub)
    library(rtracklayer)
    library(EMBOBGI)
})


## ----select-setup--------------------------------------------------------
ensids <- c("ENSG00000130720", "ENSG00000103257", "ENSG00000156414", 
            "ENSG00000144644", "ENSG00000159307", "ENSG00000144485")


## ----select--------------------------------------------------------------
library(org.Hs.eg.db)
keytypes(org.Hs.eg.db)
columns(org.Hs.eg.db)
cols <- c("SYMBOL", "GENENAME")
select(org.Hs.eg.db, keys=ensids, columns=cols, keytype="ENSEMBL")


## ----biomaRt1, eval=FALSE, results="hide"--------------------------------
## ## NEEDS INTERNET ACCESS !!
## library(biomaRt)
## head(listMarts(), 3)                      ## list the marts
## head(listDatasets(useMart("ensembl")), 3) ## mart datasets
## ensembl <-                                ## fully specified mart
##     useMart("ensembl", dataset = "hsapiens_gene_ensembl")
## 
## head(listFilters(ensembl), 3)             ## filters
## myFilter <- "chromosome_name"
## head(filterOptions(myFilter, ensembl), 3) ## return values
## myValues <- c("21", "22")
## head(listAttributes(ensembl), 3)          ## attributes
## myAttributes <- c("ensembl_gene_id","chromosome_name")
## 
## ## assemble and query the mart
## res <- getBM(attributes =  myAttributes, filters =  myFilter,
##              values =  myValues, mart = ensembl)


## ----rtracklayer-roi-----------------------------------------------------
roi <- GRanges("chr10", IRanges(92106877, 112106876, names="ENSG00000099194"))


## ----rtracklayer-session-------------------------------------------------
library(rtracklayer) 
session <- browserSession()


## ----rtracklayer-marks---------------------------------------------------
trackName <- "wgEncodeRegTfbsClusteredV2"
tableName <- "wgEncodeRegTfbsClusteredV2"
trFactor <- "ERalpha_a"
ucscTable <- getTable(ucscTableQuery(session, track=trackName,
    range=roi, table=tableName, name=trFactor))


## ----rtracklayer-plot, fig.height=3--------------------------------------
plot(score ~ chromStart, ucscTable, pch="+")
abline(v=start(roi) + (end(roi) - start(roi) + 1) / 2, col="blue")


## ----setup,echo=FALSE----------------------------------------------------
library(GenomicRanges)
plotRanges <- 
    function(x, xlim = x, main = deparse(substitute(x)),
             col = "black", sep = 0.5, ...)
{
    height <- 1
    if (is(xlim, "Ranges"))
        xlim <- c(min(start(xlim)), max(end(xlim)))
    bins <- disjointBins(IRanges(start(x), end(x) + 1))
    plot.new()
    par(mai=c(0.6, 0.2, 0.6, 0.2))
    plot.window(xlim, c(0, max(bins)*(height + sep)))
    ybottom <- bins * (sep + height) - height
    rect(start(x)-0.5, ybottom, end(x)+0.5, ybottom + height, col = col, ...)
    title(main, cex.main=2.8, font.main=1)
    axis(1)
}


## ----GRanges-genes-------------------------------------------------------
genes <- GRanges(seqnames=c("chr3R", "chrX"),
                 ranges=IRanges(
                     start=c(19967117, 18962306),
                     end = c(19973212, 18962925)),
                 strand=c("+", "-"),
                 seqlengths=c(chr3R=27905053, chrX=22422827))


## ----GRanges-display-----------------------------------------------------
genes


## ----GRanges-vignettes, eval=FALSE---------------------------------------
## vignette(package="GenomicRanges")


## ----GRanges-basics------------------------------------------------------
genes[2]
strand(genes)
width(genes)
length(genes)
names(genes) <- c("FBgn0039155", "FBgn0085359")
genes  # now with names


## ----ranges-ir-----------------------------------------------------------
ir <- IRanges(start=c(7, 9, 12, 14, 22:24),
              end=c(15, 11, 12, 18, 26, 27, 28))


## ----ranges-ir-plot, results='hide', echo=FALSE--------------------------
png("ranges-ir-plot.png", width=800, height=160)
plotRanges(ir, xlim=c(5, 35), main="Original")
dev.off()
png("ranges-shift-ir-plot.png", width=800, height=160)
plotRanges(shift(ir, 5), xlim=c(5, 35), main="Shift")
dev.off()
png("ranges-reduce-ir-plot.png", width=800, height=160)
plotRanges(reduce(ir), xlim=c(5, 35), main="Reduce")
dev.off()


## ----ranges-shift-ir-----------------------------------------------------
shift(ir, 5)


## ----ranges-shift-ir-plot, results='hide', echo=FALSE--------------------
png("ranges-shift-ir-plot.png", width=800, height=160)
plotRanges(shift(ir, 5), xlim=c(5, 35), main="Shift")
dev.off()


## ----ranges-reduce-ir----------------------------------------------------
reduce(ir)


## ----ranges-reduce-ir-plot, results='hide', echo=FALSE-------------------
png("ranges-reduce-ir-plot.png", width=800, height=160)
plotRanges(reduce(ir), xlim=c(5, 35), main="Reduce")
dev.off()


## ----coverage------------------------------------------------------------
cvg <- coverage(ir)
cvg
## plot(as.integer(cvg), type="s", xlab="Coordinate", ylab="Depth of coverage")


## ----GRanges-mcols-------------------------------------------------------
mcols(genes) <- DataFrame(EntrezId=c("42865", "2768869"),
                          Symbol=c("kal-1", "CG34330"))


## ----GRanges-metadata----------------------------------------------------
metadata(genes) <- list(CreatedBy="A. User", Date=date())


## ----GRangesList-eg-setup, echo=FALSE------------------------------------
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene # alias
## need ENTREZID's 
egids <- select(org.Hs.eg.db, ensids, "ENTREZID", "ENSEMBL")$ENTREZID
exByGn <- exonsBy(txdb, "gene")[egids]


## ----GRangesList-eg, echo=FALSE------------------------------------------
exByGn


## ----SYMBOL-to-ENTREZID--------------------------------------------------
library(org.Hs.eg.db)
egid <- select(org.Hs.eg.db, "BRCA1", "ENTREZID", "SYMBOL")$ENTREZID
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
egToTx <- select(txdb, egid, "TXNAME", "GENEID")


## ----cdsBy---------------------------------------------------------------
cdsByTx <- cdsBy(txdb, "tx", use.names=TRUE)[egToTx$TXNAME]


## ----getsequence---------------------------------------------------------
library(BSgenome.Hsapiens.UCSC.hg19)
txx <- extractTranscriptsFromGenome(Hsapiens, cdsByTx)


## ----translate-----------------------------------------------------------
all(width(txx) %% 3 == 0)  # sanity check
translate(txx)             # amino acid sequence


## ----summarizeOverlaps-BAM-----------------------------------------------
library(RNAseqData.HNRNPC.bam.chr14)
fls <- RNAseqData.HNRNPC.bam.chr14_BAMFILES


## ----summarizeOverlaps-gene-model----------------------------------------
## library(parallel); options(mc.cores=detectCores())
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
ex <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, "gene")


## ----summarizeOverlaps-count---------------------------------------------
counts <- summarizeOverlaps(ex, fls)
colSums(assay(counts))
sum(rowSums(assay(counts)) != 0)