## ----eval = TRUE, echo = TRUE, message = FALSE-------------------------------- library(ORFik) library(GenomicFeatures) ## ----eval = TRUE, echo = TRUE------------------------------------------------- txdbFile <- system.file("extdata", "hg19_knownGene_sample.sqlite", package = "GenomicFeatures") txdb <- loadTxdb(txdbFile) fiveUTRs <- fiveUTRsByTranscript(txdb, use.names = TRUE) fiveUTRs[1] ## ----eval = TRUE, echo = TRUE, message = FALSE-------------------------------- if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) { # Extract sequences of fiveUTRs. # Either you import fasta file of ranges, or you have some BSgenome. tx_seqs <- extractTranscriptSeqs(BSgenome.Hsapiens.UCSC.hg19::Hsapiens, fiveUTRs) # Find all ORFs on those transcripts and get their genomic coordinates fiveUTR_ORFs <- findMapORFs(fiveUTRs, tx_seqs, groupByTx = FALSE) fiveUTR_ORFs } ## ----eval = TRUE, echo = TRUE, message = FALSE-------------------------------- if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) { orf_seqs <- extractTranscriptSeqs(BSgenome.Hsapiens.UCSC.hg19::Hsapiens, fiveUTR_ORFs[1]) # To save as .fasta do: # writeXStringSet(orf_seqs, filepath = "uorfs.fasta") orf_seqs[1] } ## ----eval = TRUE, echo = TRUE------------------------------------------------- if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) { # The ORFs are now grouped by transcripts, but we want them grouped by ORFs: # we use the orfs exon column called ($names) to group, it is made by ORFik. unlisted_ranges <- unlistGrl(fiveUTR_ORFs) test_ranges <- groupGRangesBy(unlisted_ranges, unlisted_ranges$names) print("Grouped by ORF") print(test_ranges[1:2]) # the orfs are now grouped by orfs. If we want to go back to transcripts we do: unlisted_ranges <- unlistGrl(test_ranges) test_ranges <- groupGRangesBy(unlisted_ranges) # <- defualt is tx grouping by names print("Grouped by Transcript") print(test_ranges) } ## ----eval = TRUE, echo = TRUE------------------------------------------------- if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) { # lets use the fiveUTR_ORFs #1. Group by ORFs, if ORFs are grouped by transcripts it would make no sense. unlisted_ranges <- unlistGrl(fiveUTR_ORFs) ORFs <- groupGRangesBy(unlisted_ranges, unlisted_ranges$names) print(length(ORFs)) #2. Remove widths < 60 ORFs <- ORFs[widthPerGroup(ORFs) >= 60] print(length(ORFs)) #3. Keep only ORFs with at least 2 exons ORFs <- ORFs[numExonsPerGroup(ORFs) > 1] print(length(ORFs)) #4. Keep only positive ORFs ORFs <- ORFs[strandPerGroup(ORFs) == "+"] # all remaining ORFs where on positive strand, so no change length(ORFs) } ## ----eval = TRUE, echo = TRUE------------------------------------------------- if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) { # let's use the ORFs from the previous examples #1. Find the start and stop sites as GRanges startSites(fiveUTR_ORFs, asGR = TRUE, keep.names = TRUE, is.sorted = TRUE) stopSites(fiveUTR_ORFs, asGR = TRUE, keep.names = TRUE, is.sorted = TRUE) #2. Lets find the start and stop codons, # this takes care of potential 1 base exons etc. starts <- startCodons(fiveUTR_ORFs, is.sorted = TRUE) stops <- stopCodons(fiveUTR_ORFs, is.sorted = TRUE) print("Start codon ranges:") print(starts[1:2]) #3. Lets get the bases of the start and stop codons from the fasta file # It's very important to check that ORFs are sorted here, set is.sorted to # FALSE if you are not certain if the exons are sorted. txSeqsFromFa(starts, BSgenome.Hsapiens.UCSC.hg19::Hsapiens, is.sorted = TRUE) print("Stop codons") txSeqsFromFa(stops, BSgenome.Hsapiens.UCSC.hg19::Hsapiens, is.sorted = TRUE) } ## ----eval = TRUE, echo = TRUE------------------------------------------------- library(Biostrings) library(S4Vectors) # strand with ORFs in both directions seqs <- DNAStringSet("ATGAAATGAAGTAAATCAAAACAT") ######################>######################< (< > is direction of ORF) # positive strands pos <- findORFs(seqs, startCodon = "ATG", minimumLength = 0) # negative strands neg <- findORFs(reverseComplement(seqs), startCodon = "ATG", minimumLength = 0) # make GRanges since we want strand information pos <- GRanges(pos, strand = "+") neg <- GRanges(neg, strand = "-") # as GRanges res <- c(pos, neg) # or merge together and make GRangesList res <- split(res, seq.int(1, length(pos) + length(neg))) res ## ----eval = TRUE, echo = TRUE------------------------------------------------- res[strandBool(res)] # Keep only + stranded ORFs ## ----eval = TRUE, echo = TRUE, warning = FALSE, message = FALSE--------------- # path to example CageSeq data from hg19 heart sample cageData <- system.file("extdata", "cage-seq-heart.bed.bgz", package = "ORFik") # get new Transcription Start Sites using CageSeq dataset newFiveUTRs <- reassignTSSbyCage(fiveUTRs, cageData) newFiveUTRs ## ----eval = TRUE, echo = TRUE------------------------------------------------- bam_file <- system.file("extdata", "ribo-seq.bam", package = "ORFik") footprints <- readBam(bam_file) ## ----eval = TRUE, echo = TRUE------------------------------------------------- table(readWidths(footprints)) ## ----eval = TRUE, echo = TRUE, warning = FALSE, message = FALSE--------------- gtf_file <- system.file("extdata", "annotations.gtf", package = "ORFik") txdb <- loadTxdb(gtf_file) tx <- exonsBy(txdb, by = "tx", use.names = TRUE) cds <- cdsBy(txdb, by = "tx", use.names = TRUE) trailers <- threeUTRsByTranscript(txdb, use.names = TRUE) cds[1] ## ----eval = TRUE, echo = TRUE------------------------------------------------- footprintsGR <- convertToOneBasedRanges(footprints, addSizeColumn = TRUE) footprintsGR ## ----eval = TRUE, echo = TRUE------------------------------------------------- hitMap <- windowPerReadLength(cds, tx, footprintsGR, pShifted = FALSE) coverageHeatMap(hitMap, scoring = "transcriptNormalized") ## ----eval = TRUE, echo = TRUE------------------------------------------------- footprints <- footprints[readWidths(footprints) == 29] footprintsGR <- footprintsGR[readWidths(footprintsGR) == 29] footprints ## ----eval = TRUE, echo = TRUE, warning = FALSE-------------------------------- txNames <- filterTranscripts(txdb) # <- get only transcripts that pass filter tx <- tx[txNames]; cds <- cds[txNames]; trailers <- trailers[txNames]; windowsStart <- startRegion(cds[txNames], tx, TRUE, upstream = 30, 29) windowsStop <- startRegion(trailers, tx, TRUE, upstream = 30, 29) windowsStart ## ----eval = TRUE, echo = TRUE, warning = FALSE-------------------------------- hitMapStart <- metaWindow(footprintsGR, windowsStart, withFrames = TRUE) hitMapStop <- metaWindow(footprintsGR, windowsStop, withFrames = TRUE) ## ----eval = TRUE, echo = TRUE, warning = FALSE-------------------------------- pSitePlot(hitMapStart) ## ----eval = TRUE, echo = TRUE, warning = FALSE-------------------------------- pSitePlot(hitMapStop, region = "stop") ## ----eval = TRUE, echo = TRUE, warning = FALSE-------------------------------- shifts <- detectRibosomeShifts(footprints, txdb, stop = TRUE) shifts ## ----eval = TRUE, echo = TRUE, warning = FALSE-------------------------------- shiftedFootprints <- shiftFootprints(footprints, shifts) shiftedFootprints ## ----eval = TRUE, echo = TRUE, warning = FALSE, message = FALSE--------------- if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) { library(GenomicFeatures) # Extract sequences of fiveUTRs. fiveUTRs <- fiveUTRs[1:10] faFile <- BSgenome.Hsapiens.UCSC.hg19::Hsapiens tx_seqs <- extractTranscriptSeqs(faFile, fiveUTRs) # Find all ORFs on those transcripts and get their genomic coordinates fiveUTR_ORFs <- findMapORFs(fiveUTRs, tx_seqs) unlistedORFs <- unlistGrl(fiveUTR_ORFs) # group GRanges by ORFs instead of Transcripts, use 4 first ORFs fiveUTR_ORFs <- groupGRangesBy(unlistedORFs, unlistedORFs$names)[1:4] # make some toy ribo seq and rna seq data starts <- unlist(ORFik:::firstExonPerGroup(fiveUTR_ORFs), use.names = FALSE) RFP <- promoters(starts, upstream = 0, downstream = 1) score(RFP) <- rep(29, length(RFP)) # the original read widths # set RNA seq to duplicate transcripts RNA <- unlist(exonsBy(txdb, by = "tx", use.names = TRUE), use.names = TRUE) # transcript database txdb <- loadTxdb(txdbFile) dt <- computeFeatures(fiveUTR_ORFs, RFP, RNA, txdb, faFile, orfFeatures = TRUE) dt } ## ----eval = TRUE, echo = TRUE------------------------------------------------- # In this example we will find kozak score of cds' if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) { cds <- cdsBy(txdb, by = "tx", use.names = TRUE)[1:10] tx <- exonsBy(txdb, by = "tx", use.names = TRUE)[names(cds)] faFile <- BSgenome.Hsapiens.UCSC.hg19::Hsapiens kozakSequenceScore(cds, tx, faFile, species = "human") # A few species are pre supported, if not, make your own input pfm. # here is an example where the human pfm is sent in again, even though # it is already supported. pfm <- t(matrix(as.integer(c(29,26,28,26,22,35,62,39,28,24,27,17, 21,26,24,16,28,32,5,23,35,12,42,21, 25,24,22,33,22,19,28,17,27,47,16,34, 25,24,26,25,28,14,5,21,10,17,15,28)), ncol = 4)) kozakSequenceScore(cds, tx, faFile, species = pfm) } ## ----eval = TRUE, echo = TRUE------------------------------------------------- # First make som toy example cds <- GRanges("chr1", IRanges(c(1, 10, 20, 30, 40, 50, 60, 70, 80), c(5, 15, 25, 35, 45, 55, 65, 75, 85)), "+") names(cds) <- c(rep("tx1", 3), rep("tx2", 3), rep("tx3", 3)) cds <- groupGRangesBy(cds) ribo <- GRanges("chr1", c(1, rep.int(23, 4), 30, 34, 34, 43, 60, 64, 71, 74), "+") # We could do a simplification and use the ORFik entropy function entropy(cds, ribo) # <- spread of reads ## ----eval = TRUE, echo = TRUE------------------------------------------------- tile <- tile1(cds, FALSE, FALSE) # tile them to 1 based positions tails <- tails(tile, 9) # get 9 last bases per cds stopOverlap <- countOverlaps(tails, ribo) allOverlap <- countOverlaps(cds, ribo) fractions <- (stopOverlap + 1) / (allOverlap + 1) # pseudocount 1 cdsToRemove <- fractions > 1 / 2 # filter with pseudocounts (1+1)/(3+1) cdsToRemove ## ----eval = TRUE, echo = TRUE, warning = FALSE, message = FALSE--------------- if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) { # Load data as shown before and pshift the Ribo-seq # Get the annotation txdb <- loadTxdb(gtf_file) # Lets take all valid transcripts, with size restrictions: # leader > 100 bases, cds > 100 bases, trailer > 100 bases txNames <- filterTranscripts(txdb, 100, 100, 100) # valid transcripts leaders = fiveUTRsByTranscript(txdb, use.names = TRUE)[txNames] cds <- cdsBy(txdb, "tx", use.names = TRUE)[txNames] trailers = threeUTRsByTranscript(txdb, use.names = TRUE)[txNames] tx <- exonsBy(txdb, by = "tx", use.names = TRUE) # Ribo-seq bam_file <- system.file("extdata", "ribo-seq.bam", package = "ORFik") reads <- readGAlignments(bam_file) shiftedReads <- shiftFootprints(reads, detectRibosomeShifts(reads, txdb)) } ## ----eval = TRUE, echo = TRUE------------------------------------------------- if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) { library(data.table) # Create meta coverage per part of transcript leaderCov <- metaWindow(shiftedReads, leaders, scoring = NULL, feature = "leaders") cdsCov <- metaWindow(shiftedReads, cds, scoring = NULL, feature = "cds") trailerCov <- metaWindow(shiftedReads, trailers, scoring = NULL, feature = "trailers") # bind together dt <- rbindlist(list(leaderCov, cdsCov, trailerCov)) # Now set info column dt[, `:=` (fraction = "Ribo-seq")] # NOTE: All of this is done in one line in function: windowPerTranscript # zscore gives shape, a good starting plot windowCoveragePlot(dt, scoring = "zscore", title = "Ribo-seq metaplot") } ## ----eval = TRUE, echo = TRUE------------------------------------------------- if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) { windowCoveragePlot(dt, scoring = "median", title = "Ribo-seq metaplot") } ## ----eval = TRUE, echo = TRUE------------------------------------------------- if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) { # size 100 window: 50 upstream, 49 downstream of TIS windowsStart <- startRegion(cds, tx, TRUE, upstream = 50, 49) hitMapStart <- metaWindow(shiftedReads, windowsStart, withFrames = TRUE) pSitePlot(hitMapStart, length = "meta coverage") } ## ----eval = TRUE, echo = TRUE, message = FALSE, fig.height=8------------------ if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) { # size 25 window (default): 5 upstream, 20 downstream of TIS hitMap <- windowPerReadLength(cds, tx, shiftedReads) coverageHeatMap(hitMap, addFracPlot = TRUE) } ## ----eval = TRUE, echo = TRUE, message=FALSE---------------------------------- if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) { # Load more files like above (Here I make sampled data from earlier Ribo-seq) dt2 <- copy(dt) dt2[, `:=` (fraction = "Ribo-seq2")] dt2$score <- dt2$score + sample(seq(-40, 40), nrow(dt2), replace = TRUE) dtl <- rbindlist(list(dt, dt2)) windowCoveragePlot(dtl, scoring = "median", title = "Ribo-seq metaplots") }