## ----eval = TRUE, echo = TRUE, message = FALSE--------------------------------
library(ORFik)
library(GenomicFeatures)

## ----eval = TRUE, echo = TRUE-------------------------------------------------
txdbFile <- system.file("extdata", "hg19_knownGene_sample.sqlite", 
                        package = "GenomicFeatures")
txdb <- loadTxdb(txdbFile)
fiveUTRs <- fiveUTRsByTranscript(txdb, use.names = TRUE)
fiveUTRs[1]

## ----eval = TRUE, echo = TRUE, message = FALSE--------------------------------
if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {

  # Extract sequences of fiveUTRs.
  # Either you import fasta file of ranges, or you have some BSgenome.
  tx_seqs <- extractTranscriptSeqs(BSgenome.Hsapiens.UCSC.hg19::Hsapiens, 
                                   fiveUTRs) 
  
  # Find all ORFs on those transcripts and get their genomic coordinates
  fiveUTR_ORFs <- findMapORFs(fiveUTRs, tx_seqs, groupByTx = FALSE)
  fiveUTR_ORFs
}

## ----eval = TRUE, echo = TRUE, message = FALSE--------------------------------
if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {
  orf_seqs <- extractTranscriptSeqs(BSgenome.Hsapiens.UCSC.hg19::Hsapiens,
                                    fiveUTR_ORFs[1])
  # To save as .fasta do:
  # writeXStringSet(orf_seqs, filepath = "uorfs.fasta")
  orf_seqs[1]
}

## ----eval = TRUE, echo = TRUE-------------------------------------------------

if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {
  # The ORFs are now grouped by transcripts, but we want them grouped by ORFs:
  # we use the orfs exon column called ($names) to group, it is made by ORFik.
  unlisted_ranges <- unlistGrl(fiveUTR_ORFs)
  test_ranges <- groupGRangesBy(unlisted_ranges, unlisted_ranges$names)
  print("Grouped by ORF")
  print(test_ranges[1:2])
  # the orfs are now grouped by orfs. If we want to go back to transcripts we do:
  unlisted_ranges <- unlistGrl(test_ranges)
  test_ranges <- groupGRangesBy(unlisted_ranges) # <- defualt is tx grouping by names
  print("Grouped by Transcript")
  print(test_ranges)
}

## ----eval = TRUE, echo = TRUE-------------------------------------------------

if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {
  # lets use the fiveUTR_ORFs
  #1. Group by ORFs, if ORFs are grouped by transcripts it would make no sense.
  unlisted_ranges <- unlistGrl(fiveUTR_ORFs)
  ORFs <- groupGRangesBy(unlisted_ranges, unlisted_ranges$names)
  print(length(ORFs))
  #2. Remove widths < 60
  ORFs <- ORFs[widthPerGroup(ORFs) >= 60]
  print(length(ORFs))
  #3. Keep only ORFs with at least 2 exons
  ORFs <- ORFs[numExonsPerGroup(ORFs) > 1]
  print(length(ORFs))
  
  #4. Keep only positive ORFs
  ORFs <- ORFs[strandPerGroup(ORFs) == "+"]
  # all remaining ORFs where on positive strand, so no change
  length(ORFs)
}

## ----eval = TRUE, echo = TRUE-------------------------------------------------
if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {
  # let's use the ORFs from the previous examples
  #1. Find the start and stop sites as GRanges
  startSites(fiveUTR_ORFs, asGR = TRUE, keep.names = TRUE, is.sorted = TRUE)
  stopSites(fiveUTR_ORFs, asGR = TRUE, keep.names = TRUE, is.sorted = TRUE)
  
  #2. Lets find the start and stop codons,
  # this takes care of potential 1 base exons etc. 
  starts <- startCodons(fiveUTR_ORFs, is.sorted = TRUE)
  stops <- stopCodons(fiveUTR_ORFs, is.sorted = TRUE)
  print("Start codon ranges:")
  print(starts[1:2])
  
  #3. Lets get the bases of the start and stop codons from the fasta file
  # It's very important to check that ORFs are sorted here, set is.sorted to 
  # FALSE if you are not certain if the exons are sorted.
  txSeqsFromFa(starts, BSgenome.Hsapiens.UCSC.hg19::Hsapiens, is.sorted = TRUE)
  print("Stop codons")
  txSeqsFromFa(stops, BSgenome.Hsapiens.UCSC.hg19::Hsapiens, is.sorted = TRUE)
  }

## ----eval = TRUE, echo = TRUE-------------------------------------------------
  library(Biostrings)
  library(S4Vectors)
  # strand with ORFs in both directions
  seqs <- DNAStringSet("ATGAAATGAAGTAAATCAAAACAT")
  ######################>######################< (< > is direction of ORF)
  
  # positive strands
  pos <- findORFs(seqs, startCodon = "ATG", minimumLength = 0)
  # negative strands
  neg <- findORFs(reverseComplement(seqs),
                  startCodon = "ATG", minimumLength = 0)
  # make GRanges since we want strand information
  pos <- GRanges(pos, strand = "+")
  neg <- GRanges(neg, strand = "-")
  # as GRanges
  res <- c(pos, neg)
  # or merge together and make GRangesList
  res <- split(res, seq.int(1, length(pos) + length(neg)))
  res

## ----eval = TRUE, echo = TRUE-------------------------------------------------
  res[strandBool(res)] # Keep only + stranded ORFs

## ----eval = TRUE, echo = TRUE, warning = FALSE, message = FALSE---------------
# path to example CageSeq data from hg19 heart sample
cageData <- system.file("extdata", "cage-seq-heart.bed.bgz", 
                        package = "ORFik")
# get new Transcription Start Sites using CageSeq dataset
newFiveUTRs <- reassignTSSbyCage(fiveUTRs, cageData)
newFiveUTRs

## ----eval = TRUE, echo = TRUE-------------------------------------------------
bam_file <- system.file("extdata", "ribo-seq.bam", package = "ORFik")
footprints <- readBam(bam_file)

## ----eval = TRUE, echo = TRUE-------------------------------------------------
table(readWidths(footprints))

## ----eval = TRUE, echo = TRUE, warning = FALSE, message = FALSE---------------
gtf_file <- system.file("extdata", "annotations.gtf", package = "ORFik")
txdb <- loadTxdb(gtf_file)
tx <- exonsBy(txdb, by = "tx", use.names = TRUE)
cds <- cdsBy(txdb, by = "tx", use.names = TRUE)
trailers <- threeUTRsByTranscript(txdb, use.names = TRUE)
cds[1]

## ----eval = TRUE, echo = TRUE-------------------------------------------------
footprintsGR <- convertToOneBasedRanges(footprints, addSizeColumn = TRUE)
footprintsGR

## ----eval = TRUE, echo = TRUE-------------------------------------------------
  hitMap <- windowPerReadLength(cds, tx,  footprintsGR, pShifted = FALSE)
  coverageHeatMap(hitMap, scoring = "transcriptNormalized")

## ----eval = TRUE, echo = TRUE-------------------------------------------------
footprints <- footprints[readWidths(footprints) == 29]
footprintsGR <- footprintsGR[readWidths(footprintsGR) == 29]
footprints

## ----eval = TRUE, echo = TRUE, warning = FALSE--------------------------------
txNames <- filterTranscripts(txdb) # <- get only transcripts that pass filter
tx <- tx[txNames]; cds <- cds[txNames]; trailers <- trailers[txNames];
windowsStart <- startRegion(cds[txNames], tx, TRUE, upstream = 30, 29)
windowsStop <- startRegion(trailers, tx, TRUE, upstream = 30, 29)
windowsStart

## ----eval = TRUE, echo = TRUE, warning = FALSE--------------------------------
hitMapStart <- metaWindow(footprintsGR, windowsStart, withFrames = TRUE)
hitMapStop <- metaWindow(footprintsGR, windowsStop, withFrames = TRUE)

## ----eval = TRUE, echo = TRUE, warning = FALSE--------------------------------
  pSitePlot(hitMapStart)

## ----eval = TRUE, echo = TRUE, warning = FALSE--------------------------------
  pSitePlot(hitMapStop, region = "stop")

## ----eval = TRUE, echo = TRUE, warning = FALSE--------------------------------
shifts <- detectRibosomeShifts(footprints, txdb, stop = TRUE)
shifts

## ----eval = TRUE, echo = TRUE, warning = FALSE--------------------------------
shiftedFootprints <- shiftFootprints(footprints, shifts)
shiftedFootprints

## ----eval = TRUE, echo = TRUE, warning = FALSE, message = FALSE---------------
 if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {
  library(GenomicFeatures)

  # Extract sequences of fiveUTRs.
  fiveUTRs <- fiveUTRs[1:10]
  faFile <- BSgenome.Hsapiens.UCSC.hg19::Hsapiens
  tx_seqs <- extractTranscriptSeqs(faFile, fiveUTRs)

  # Find all ORFs on those transcripts and get their genomic coordinates
  fiveUTR_ORFs <- findMapORFs(fiveUTRs, tx_seqs)
  unlistedORFs <- unlistGrl(fiveUTR_ORFs)
  # group GRanges by ORFs instead of Transcripts, use 4 first ORFs
  fiveUTR_ORFs <- groupGRangesBy(unlistedORFs, unlistedORFs$names)[1:4]

  # make some toy ribo seq and rna seq data
  starts <- unlist(ORFik:::firstExonPerGroup(fiveUTR_ORFs), use.names = FALSE)
  RFP <- promoters(starts, upstream = 0, downstream = 1)
  score(RFP) <- rep(29, length(RFP)) # the original read widths

  # set RNA seq to duplicate transcripts
  RNA <- unlist(exonsBy(txdb, by = "tx", use.names = TRUE), use.names = TRUE)
  # transcript database
  txdb <- loadTxdb(txdbFile)
  dt <- computeFeatures(fiveUTR_ORFs, RFP, RNA, txdb, faFile, 
                        orfFeatures =  TRUE)
  dt
}

## ----eval = TRUE, echo = TRUE-------------------------------------------------
  # In this example we will find kozak score of cds' 

  if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {

    cds <- cdsBy(txdb, by = "tx", use.names = TRUE)[1:10]
    tx <- exonsBy(txdb, by = "tx", use.names = TRUE)[names(cds)]
    faFile <- BSgenome.Hsapiens.UCSC.hg19::Hsapiens

    kozakSequenceScore(cds, tx, faFile, species = "human")

    # A few species are pre supported, if not, make your own input pfm.

    # here is an example where the human pfm is sent in again, even though
    # it is already supported.

    pfm <- t(matrix(as.integer(c(29,26,28,26,22,35,62,39,28,24,27,17,
                                 21,26,24,16,28,32,5,23,35,12,42,21,
                                 25,24,22,33,22,19,28,17,27,47,16,34,
                                 25,24,26,25,28,14,5,21,10,17,15,28)),
                    ncol = 4))

   kozakSequenceScore(cds, tx, faFile, species = pfm)

  }


## ----eval = TRUE, echo = TRUE-------------------------------------------------
  # First make som toy example
  cds <- GRanges("chr1", IRanges(c(1, 10, 20, 30, 40, 50, 60, 70, 80),
                                 c(5, 15, 25, 35, 45, 55, 65, 75, 85)),
                 "+")
  names(cds) <- c(rep("tx1", 3), rep("tx2", 3), rep("tx3", 3))
  cds <- groupGRangesBy(cds)
  ribo <- GRanges("chr1", c(1, rep.int(23, 4), 30, 34, 34, 43, 60, 64, 71, 74),
                  "+")
  # We could do a simplification and use the ORFik entropy function
  entropy(cds, ribo) # <- spread of reads

## ----eval = TRUE, echo = TRUE-------------------------------------------------
tile <- tile1(cds, FALSE, FALSE) # tile them to 1 based positions
tails <- tails(tile, 9) # get 9 last bases per cds
stopOverlap <- countOverlaps(tails, ribo)
allOverlap <- countOverlaps(cds, ribo)
fractions <- (stopOverlap + 1) / (allOverlap + 1) # pseudocount 1
cdsToRemove <- fractions > 1 / 2 # filter with pseudocounts (1+1)/(3+1) 
cdsToRemove

## ----eval = TRUE, echo = TRUE, warning = FALSE, message = FALSE---------------
if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {
  # Load data as shown before and pshift the Ribo-seq
  # Get the annotation
  txdb <- loadTxdb(gtf_file)
  # Lets take all valid transcripts, with size restrictions:
  # leader > 100 bases, cds > 100 bases, trailer > 100 bases
  txNames <- filterTranscripts(txdb, 100, 100, 100) # valid transcripts
  leaders = fiveUTRsByTranscript(txdb, use.names = TRUE)[txNames]
  cds <- cdsBy(txdb, "tx", use.names = TRUE)[txNames]
  trailers = threeUTRsByTranscript(txdb, use.names = TRUE)[txNames]
  tx <- exonsBy(txdb, by = "tx", use.names = TRUE)
  # Ribo-seq
  bam_file <- system.file("extdata", "ribo-seq.bam", package = "ORFik")
  reads <- readGAlignments(bam_file)
  shiftedReads <- shiftFootprints(reads, detectRibosomeShifts(reads, txdb))  
}

## ----eval = TRUE, echo = TRUE-------------------------------------------------
if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {
  library(data.table)
  # Create meta coverage per part of transcript
  leaderCov <- metaWindow(shiftedReads, leaders, scoring = NULL, 
                          feature = "leaders")

  cdsCov <- metaWindow(shiftedReads, cds, scoring = NULL, 
                       feature = "cds")
  
  trailerCov <- metaWindow(shiftedReads, trailers, scoring = NULL, 
                           feature = "trailers")
  # bind together
  dt <- rbindlist(list(leaderCov, cdsCov, trailerCov))
  # Now set info column
  dt[, `:=` (fraction = "Ribo-seq")]
  # NOTE: All of this is done in one line in function: windowPerTranscript
  
  # zscore gives shape, a good starting plot
  windowCoveragePlot(dt, scoring = "zscore", title = "Ribo-seq metaplot") 
}

## ----eval = TRUE, echo = TRUE-------------------------------------------------
if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {
  windowCoveragePlot(dt, scoring = "median", title = "Ribo-seq metaplot") 
}

## ----eval = TRUE, echo = TRUE-------------------------------------------------
if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {
  # size 100 window: 50 upstream, 49 downstream of TIS
  windowsStart <- startRegion(cds, tx, TRUE, upstream = 50, 49)
  hitMapStart <- metaWindow(shiftedReads, windowsStart, withFrames = TRUE)
  pSitePlot(hitMapStart, length = "meta coverage")
}

## ----eval = TRUE, echo = TRUE, message = FALSE, fig.height=8------------------
if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {
  # size 25 window (default): 5 upstream, 20 downstream of TIS
  hitMap <- windowPerReadLength(cds, tx,  shiftedReads)
  coverageHeatMap(hitMap, addFracPlot = TRUE)
}

## ----eval = TRUE, echo = TRUE, message=FALSE----------------------------------
if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {
  # Load more files like above (Here I make sampled data from earlier Ribo-seq)
  dt2 <- copy(dt)
  dt2[, `:=` (fraction = "Ribo-seq2")]
  dt2$score <- dt2$score + sample(seq(-40, 40), nrow(dt2), replace = TRUE)
  
  dtl <- rbindlist(list(dt, dt2))
  windowCoveragePlot(dtl, scoring = "median", title = "Ribo-seq metaplots") 
}