%\VignetteIndexEntry{A package for importing and analyzing data from Roche's Genome Sequencer System}
%\VignetteDepends{rtracklayer}
%\VignetteKeywords{Roche, AVA, GSM, something else}
%\VignettePackage{R453Plus1Toolbox}

\documentclass[10pt]{article}

\usepackage{times}
\usepackage{hyperref}
\usepackage{verbatim} %TEMP: only for block comments

\newcommand{\Rfunction}[1]{{\texttt{#1}}}
\newcommand{\Robject}[1]{{\texttt{#1}}}
\newcommand{\Rpackage}[1]{{\textit{#1}}}
\newcommand{\Rmethod}[1]{{\texttt{#1}}}
\newcommand{\Rfunarg}[1]{{\texttt{#1}}}
\newcommand{\Rclass}[1]{{\textit{#1}}}
\newcommand{\Rcode}[1]{{\texttt{#1}}}

\SweaveOpts{prefix.string=R453Plus1}


\begin{document}
\SweaveOpts{concordance=TRUE}

\title{R453Plus1Toolbox \\ A package for importing and analyzing data from Roche's Genome Sequencer System}
\author{Hans-Ulrich Klein, Christoph Bartenhagen, Christian Ruckert}
\date{December 05, 2013}

\maketitle

\tableofcontents

<<echo=FALSE>>=
options(width=60)
options(continue=" ")
@

\newpage

\section{Introduction}
The R453Plus1 Toolbox comprises useful functions for the analysis of data generated by Roche's 454 
sequencing platform. It adds functions for quality assurance as well as for annotation and 
visualization of detected variants, complementing the software tools shipped by Roche with their 
product. Further, a pipeline for the detection of structural variants is provided. \\

<<preliminaries>>=
library(R453Plus1Toolbox)
@

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%___Analysis of PCR amplicon projects___%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Analysis of PCR amplicon projects}
This section deals with the analysis of projects investigating massively parallel data generated 
from specifically designed PCR products.


\subsection{Importing a Roche Amplicon Variant Analyzer project}
\label{sec:AVASetFunctions}
The function \Rfunction{AVASet} imports data from Roche's Amplicon Variant Analyzer (AVA). This can be done in three ways, depending on the version
of your AVA software:
\subsubsection{Import from AVA without AVA-CLI (version 2.5 and lower)}
For projects created with the AVA software version $\leq$ 2.5, \Rfunction{AVASet} expects only a \Rfunarg{dirname} pointing 
to the project data, i.e. a directory that contains the following files and subdirectories: \\
\begin{itemize}
\item "Amplicons/\-ProjectDef/\-ampliconsProject.txt"
\item "Amplicons/\-Results/\-Variants/\-currentVariantDefs.txt"
\item "Amplicons/\-Results/\-Variants"
\item "Amplicons/\-Results/\-Align"
\end{itemize}
There is an example project "AVASet" included in the \Rpackage{R453Plus1Toolbox} installation directory:
<<label=createRocheAVASet1>>=
projectDir = system.file("extdata", "AVASet", package = "R453Plus1Toolbox") 
avaSet = AVASet(dirname=projectDir)
@
\subsubsection{Import from AVA with AVA-CLI}
The function \Rfunction{AVASet} can directly access the AVA Command Line Interface (AVA-CLI) from within R. If the AVA software is installed on the same machine 
that runs R, the easiest way to import a project is to specify the project directory with \Rfunarg{dirname} 
and the path to the binaries in the AVA software's installation directory with \Rfunarg{avaBin}. It is usually the directory "bin" 
containing the AVA-CLI command interpreter "doAmplicon".\\
Let's say the AVA software was installed to the directory "/home/User/AVA". Then, the function call looks like:
<<label=createRocheAVASet2, eval=FALSE>>=
projectDir = "My/AVA/Project"
avaSet = AVASet(dirname=projectDir, avaBin="/home/User/AVA/bin")
@

\subsubsection{Import from AVA with projects exported via AVA-CLI}
If the AVA software is not installed on the same machine that runs R, all data must be exported manually using AVA-CLI.
It can be accessed via the command line interpreter "doAmplicon" from the AVA software's installation directory. 
Within the AVA-CLI, load your project with the command "open". \Rfunction{AVASet} expects five files (variant information is optional): \\
\begin{table}[ht]
\begin{scriptsize}
	\begin{center}
		\begin{tabular}{l|l|p{3cm}}
		\hline                    
		\Rfunction{AVASet} argument
		& AVA-CLI command
		& Description \\
		\hline
		\Rfunarg{file\_sample}                     
		& list sample -outputFile sample.csv
		& Table with sample names and annotations \\
		\Rfunarg{file\_amp}                     
		& list amplicon -outputFile amp.csv
		& Table with primer sequences, positions and annotations \\
		\Rfunarg{file\_reference}                     
		& list reference -outputFile reference.csv
		& Reference sequences \\
		\Rfunarg{file\_variant}                     
		& list variant -outputFile variant.csv
		& Detected variants (if available) \\
		\Rfunarg{file\_variantHits}                     
		& report variantHits -outputFile variantHits.csv
		& Variant hits for all samples (if available) \\
		\hline
		\end{tabular}
	\end{center}
	\caption{\Rfunction{AVASet} function arguments for loading projects exported via AVA-CLI.}
	\label{table:AVASet_AVACLI}
\end{scriptsize}
\end{table} 
\\
Note, that all exported tables are expected to be in csv-format. \\
There is an example project "AVASet\_doAmplicon" included in the \Rpackage{R453Plus1Toolbox} installation directory:
<<label=createRocheAVASet3, eval=FALSE>>=
projectDir = system.file("extdata", "AVASet_doAmplicon", package="R453Plus1Toolbox")
avaSetExample = AVASet(dirname=projectDir, file_sample="sample.csv",
 file_amp="amp.csv", file_reference="reference.csv", file_variant="variant.csv", 
 file_variantHits="variantHits.csv")
@
\Rfunction{AVASet} searches the specified \Rfunarg{dirname} for the exported csv-files. \Rfunarg{file\_variant} \Rfunarg{file\_variantHits} can be omitted if no variant information is
available for the project.

\subsection{The AVASet class}
The \Rclass{AVASet} class defines a container to store data imported from projects conducted with 
Roche's AVA software. It extends the \Rpackage{Biobase} \Rclass{eSet} to store 
all relevant information.\\
<<label=showAVASet>>=
avaSet
@

An object of class \Rclass{AVASet} consists of three main components: \\
\begin{enumerate}
	\item Variants: \\
		The variants part stores data about the found variants and is accessible by the functions 
        \Rfunction{assayData}, \Rfunction{featureData} and 
		\Rfunction{phenoData} known from Biobase eSet. \\
		The \Rcode{assayData} slot contains four matrices with variants as rows and samples as 
        columns:
		\begin{itemize}
			\item variantForwCount: Matrix containing the number of reads with the respective 
				variant in forward direction.
			\item variantRevCount: Matrix containing the number of reads with the respective variant 
				in reverse direction.
			\item totalForwCount: Matrix containig the total coverage for every variant location in 
				forward direction.
			\item totalRevCount: Matrix containing the total coverage for every variant location in 
				reverse direction.
		\end{itemize}
<<label=assayDataAVA>>=
assayData(avaSet)$totalForwCount[1:3, ]
@
		The \Rcode{featureData} slot provides additional information on the variants. 
		\Rfunction{fData} returns a data frame with variants as rows and the following columns:
		\begin{itemize}
			\item name/canonicalPattern: Short identifiers of a variant including the position and 
				the bases changed.
			\item referenceSeq: Gives the identifier of the reference sequence (see below).
			\item start/end: The position of the variant relative to the reference sequence.
			\item variantBase/referenceBases: The bases changed in the variant.    
		\end{itemize}
<<label=fDataAVA>>=
fData(avaSet)[1:3, ]
@
		The \Rcode{phenoData} slot provides sample-IDs, multiplexer IDs (MID1, MID2), the pico titer 
		plate (PTP) accession number, the lane, the read group and additional textual annotation for 
		each sample. Most of these informations are imported directly from Roche's software.
<<label=pDataAVA>>=
pData(avaSet)
@
	\item Amplicons: \\
		This part stores information about the used amplicons and is accessible by the functions 
		\Rfunction{assayDataAmp} and \Rfunction{fDataAmp}. \\
		The slot \Rcode{assayDataAmp} contains two matrices with amplicons as rows and samples as 
		columns:
		\begin{itemize}
			\item forwCount: Matrix containing the number of reads for each amplicon and each sample 
				in forward direction.
			\item revCount: Matrix containing the number of reads for each amplicon and each sample 
				in reverse direction.
		\end{itemize}
<<label=assayDataAmpAVA>>=
assayDataAmp(avaSet)$forwCount
@
		The slot \Rcode{featureDataAmp} contains an AnnotatedDataFrame with additional information 
		on each amplicon: 
		\begin{itemize}
			\item ampID: The identifier of the current amplicon.
			\item primer1, primer2: The primer sequences for each amplicon.
			\item referenceSeqID: The identifier of the reference sequence (see below).
			\item targetStart/targetEnd: The coordinates of the target region.
		\end{itemize}
<<label=fDataAmpAVA>>=
fDataAmp(avaSet)
@
As both refer to the same samples, the variants phenoData slot is used for amplicons as well. 
	\item Reference sequences: \\
		This part stores data about the reference sequences the amplicons were selected from. All 
		information is stored into an object of class \Rclass{AlignedRead}. The reads are accessible 
		via	\Rfunction{sread}. To retrieve additional information from Ensembl about the chromosome, 
		the position and the strand of each reference sequence run function 
		\Rfunction{alignShortReads} (see section \ref{sec:Annotations} for details).
<<label=referenceSequences>>=
library(ShortRead)
referenceSequences(avaSet)
sread(referenceSequences(avaSet))
@
\end{enumerate}

The following table sums up the available slots and accessor functions: \\
\begin{table}[ht]
	\begin{center}
		\begin{tabular}{l|p{7cm}}
		\hline

		Function/Slot                    
		& Description \\

		\hline

		\Rfunction{assayData}                     
		& Contains the number of reads and the total coverage for every variant and each sample in forward and reverse direction.\\

		\Rfunction{fData}/\Rfunction{featureData}
		& Contains information about the type, position and reference of each variant. \\

		\Rfunction{pData}/\Rfunction{phenoData}                   
		& Contains sample-IDs, multiplexer IDs (MID1, MID2), the pico titer plate (PTP) accession number, the lane, the read group and additional 
		textual annotation for each sample. \\

		\Rfunction{assayDataAmp}                  
		& Contains the number of reads for every amplicon and each sample in forward/reverse direction.\\

		\Rfunction{fDataAmp}/\Rfunction{featureDataAmp} 
		& Contains the primer sequences, reference sequence and the coordinates of the target region for each amplicon.\\

		\Rfunction{referenceSequences}         
		& Contains the reference sequences for the amplicons together with additional annotations.\\

		\hline
		\end{tabular}
	\end{center}
	\caption{AVASet contents and accessor functions.}
	\label{table:AVASet}
\end{table}


\subsection{Subsetting an AVASet}
\label{sec:Subsetting}
A subset of an \Rclass{AVASet} object can be generated using the common \Rcode{"[]"}-notation: 
<<label=AVASubSet1>>=
avaSubSet = avaSet[1:10, "Sample_1"]
@
The first dimension refers to the variants and the second dimension to the samples, so an 
\Rclass{AVASet} with ten variants and one sample is returned.\\
This is a short and to some extend equivalent version of the function \Rfunction{subset}, which 
expects a \Rfunarg{subset} argument and the respective \Rfunarg{dimension} (either "variants", 
"samples" or "amplicons"):
<<label=AVASubSet2>>=
avaSubSet = subset(avaSet, subset=1:10, dimension="variants")
@ 
The following is equivalent to the \Rcode{"[]"}-example above:
<<label=AVASubSet3>>=
avaSubSet = subset(subset(avaSet, subset=1:10, dimension="variants"), subset="Sample_1", 
 dimension="samples")
@ 
In contrast to the \Rcode{"[]"}-Notation, the function \Rfunction{subset} allows further subsetting 
by amplicons:
<<label=AVASubSet4>>=
avaSubSet = subset(avaSet, subset=c("TET2_E11.04", "TET2_E06"), dimension="amplicons")
@ 
When subsetting by amplicons all variants referring to amplicons that are not in the subset will be 
excluded. \\


\subsection{Setting filters on an AVASet}
\label{sec:Filters}
Another way of generating a subset of an \Rclass{AVASet} object is filtering only those variants, 
whose coverage (in percent) in forward and reverse direction respectively is higher than a given 
\Rfunarg{filter} value in at least one sample. Here, the coverage is defined as the percentual 
amount of the reads with the given variant on the number of all reads covering the variant's 
position. \\
The function \Rfunction{setVariantFilter} returns an updated \Rclass{AVASet} object that meets the 
given requirements:
<<label=filterAVA1>>=
avaSetFiltered1 = setVariantFilter(avaSet, filter=0.05)
@
The above example returns an \Rclass{AVASet}, which only contains variants whose coverage is greater 
than 5\% in at least one sample.\\
Passing a vector of two \Rfunarg{filter} values applies filtering according to forward and reverse 
read direction separately:
<<label=filterAVA2>>=
avaSetFiltered2 = setVariantFilter(avaSet, filter=c(0.1, 0.05))
@
In fact, when filtering an \Rclass{AVASet}, the whole object is still availabe. The filter only 
affects the output given by accessor functions like \Rfunction{fData}, \Rfunction{featureData} and 
\Rfunction{assayData}.\\
The process can be reversed and the filter value(s) can be reset to zero by calling
<<label=filterAVA4>>=
avaSet = setVariantFilter(avaSetFiltered1, filter=0)
@
or simply
<<label=filterAVA5>>=
avaSet = setVariantFilter(avaSetFiltered2)
@


\subsection{Variant coverage}
The function \Rfunction{getVariantPercentages} displays the coverage of the variants for a given 
\Rfunarg{direction} (either "forward", "reverse", or "both"):
<<label=varPercentages1>>=
getVariantPercentages(avaSet, direction="both")[20:25, 1:4]
@ 
In the example above, \Rfunction{getVariantPercentages} is simply a short form of calculating
<<label=varPercentages2, results=hide>>=
(assayData(avaSet)[[1]] + assayData(avaSet)[[3]]) / (assayData(avaSet)[[2]]
 + assayData(avaSet)[[4]])
@ 


\subsection{Annotations and Variant Reports}
\label{sec:Annotations}

Before creating the variant and quality report, the reference sequences must be aligned against a 
reference genome and afterwards the variants have to be annotatetd. \\
The method \Rfunction{alignShortReads} aligns the reference sequences from an \Rclass{AVASet} 
against a given reference genome. Only exact (no errors) and unique matches are returned. In the 
example below the hg19 assembly as provided by UCSC from package 
\Rpackage{BSgenome.Hsapiens.UCSC.hg19} is used as reference:
<<label=alignShortReads, eval=FALSE>>=
library(BSgenome.Hsapiens.UCSC.hg19)
seqNames = names(Hsapiens)[1:24]
avaSet = alignShortReads(avaSet, bsGenome=Hsapiens, 
 seqNames=seqNames, ensemblNotation=TRUE)
@
The function \Rfunction{annotateVariants} annotates genomic variants (mutations) given in a data 
frame or more likely an \Rclass{AVASet}. Annotation includes affected genes, exons and codons. 
Resulting amino acid changes are returned as well as dbSNP identifiers if the mutation is already 
known. All information is fetched from Ensembl via \Rpackage{biomaRt} and returned in an object of 
class \Rclass{AnnotatedVariants}. It is advisible to filter the \Rclass{AVASet} (see section 
\ref{sec:Filters}) prior to that since the annotation process is very time consuming for a large 
number (>500) of variants.
<<label=annotateVariants, eval=FALSE>>=
avaSet = setVariantFilter(avaSet, filter=0.05)
avaAnnot = annotateVariants(avaSet)
@
For an \Rclass{AVASet} with corresponding annotated variants, the function \Rfunction{htmlReport} 
creates a html report containing variant and quality information. \\
The report is structured into three pages:
\begin{enumerate}
    \item Variant report by reference: This page sums up additional information for each variant 
		including name, type, reference gene, position, changed nucleotides and affected samples. In 
		addition, every variant is linked to a page with further details about the affected genes and 
		transcripts (e.g. Ensembl gene-IDs, transcript-IDs, codon sequences, changes of amino acids 
		(if coding)).
    \item Variant report by sample: The upper fraction of this page presents an overview of all 
		samples together with links to individual amplicon coverage plots for each sample. In the 
		lower fraction the found variants are listed for each sample seperately in the same way as 
		described in the variant report by reference above.
    \item Quality report: The report shows the coverage of every amplicon in forward and/or reverse 
		direction. Further plots display the coverage by MID and PTP (if this information is given 
		in the pheno data of the object).
\end{enumerate}
The following command creates a report containing only variants covered by at least 5\% of the reads 
using the argument \Rfunarg{minMut} (\Rfunarg{minMut}=3 is the default value). The argument 
\Rfunarg{blocks} can be used to structure the page by assigning each variant to a block. In this 
example the corresponding genes for each variant are used to create blocks, resulting in only one 
block in the example data set:
<<label=htmlReport, eval=FALSE>>=
blocks = as.character(sapply(annotatedVariants(avaAnnot), 
 function(x) x$genes$external_gene_id))
htmlReport(avaSet, annot=avaAnnot, blocks=blocks, dir="htmlReportExampleAVA", 
 title="htmlReport Example", minMut=3)
@


\subsection{Plotting}

\subsubsection{Plot amplicon coverage}
The function \Rfunction{plotAmpliconCoverage} creates a plot showing the coverage (number of reads) 
per amplicon, MID or PTP. This results in a barplot if the \Rclass{AVASet} contains only one sample 
or in a boxplot for all other cases.
<<label=plotAmpCov1,fig=TRUE,include=FALSE,eps=FALSE,pdf=TRUE>>=
plotAmpliconCoverage(avaSet[, 2], type="amplicon")
@
\begin{figure}[hp]
\begin{center}
  \includegraphics[height=8cm]{R453Plus1-plotAmpCov1}
  \caption{Barplot of the amplicon coverage for sample 2.}
\end{center}
\end{figure}
<<label=plotAmpCov2,fig=TRUE,include=FALSE,eps=FALSE,pdf=TRUE>>=
plotAmpliconCoverage(avaSet, bothDirections=TRUE, type="amplicon")
@
\begin{figure}[hp]
\begin{center}
  \includegraphics[height=8cm]{R453Plus1-plotAmpCov2}
  \caption{Boxplot of the coverage for four amplicons seperated by read direction.}
\end{center}
\end{figure}


\subsubsection{Plot variation frequency}
Given a Roche Amplicon Variant Analyzer Global Alignment export file, the function 
\Rfunction{plotVariationFrequency} creates a plot similar to the variation frequency plot in Roche's 
GS Amplicon Variant Analyzer. The plot shows the reference sequence along the x-axis and indicates 
variants as bars at the appropriate positions. The height of the bars corresponds to the percentage 
of reads carrying the variant. A second y-axis indicates the absolute number of reads covering the 
variant. \Rfunarg{plotRange} defines the start and end base of the reference sequence that should be 
plotted.
<<label=loadXLS>>=
file = system.file("extdata", "AVAVarFreqExport", "AVAVarFreqExport.xls", 
 package="R453Plus1Toolbox")
@
<<label=plotVarFreq,fig=TRUE,include=FALSE,eps=FALSE,pdf=TRUE>>=
plotVariationFrequency(file, plotRange=c(50, 150))
@

\begin{figure}[hp]
\begin{center}
  \includegraphics{R453Plus1-plotVarFreq}
  \caption{Plot of the variation frequency for a given reference sequence.}
\end{center}
\end{figure}


\subsubsection{Plot variant locations}
The function \Rfunction{plotVariants} illustrates the positions and types of mutations within a given \Rfunarg{gene} and \Rfunarg{transcript} (specified by an 
Ensembl gene/transcript id). The plot shows only coding regions (thus, units are amino acids / codons). The coding 
region is further divided into exons labeled with their rank in the transcript. An attribute \Rfunarg{regions} allows to highlight special, predefined areas on the transcript like for example protein domains. \\ \\
The function can be used in two ways: \\
It offers the most functionality when used as a "standalone" function by passing all mutations as a data frame. This mode allows an individual and detailed
annotation of the mutations like labels, colors and user defined mutation types. It requires the columns "label", "pos" "mutation" and "color". It is recommended to add more detailed info 
for each mutation type by preparing a data frame for the parameter \Rfunarg{mutationInfo} which requires the three columns "mutation", "legend" and "color". \\ 
The following example calls \Rfunction{plotVariants} for the gene TET2 having th Ensembl id "ENSG00000168769" and transcript "ENST00000513237" (see Figure 4 below):
<<label=loadXLS>>=
data(plotVariantsExample)
@
<<label=plotVariants,fig=TRUE,include=FALSE,eps=FALSE,pdf=TRUE>>=
geneInfo = plotVariants(data=variants, gene="ENSG00000168769", 
 transcript="ENST00000513237", regions=regions, 
 mutationInfo=mutationInfo, horiz=TRUE, cex=0.8)
@
\begin{figure}[hp]
\begin{center}
  \includegraphics{R453Plus1-plotVariants}
  \caption{Plot of the variants for gene TET2 by passing mutations as a data frame. This version includes mutation labels and allows user defined mutation types.}
\end{center}
\end{figure}
Especially for integration into the R453Plus1Toolbox and for compatibility to older versions \Rfunction{plotVariants} also accepts annotated variants of class 
\Rclass{annotatedVariants} (see section \ref{sec:Annotations}). The function then only distinguishes missense, nonsense and silent point mutations and deletions and does not include mutation labels.\\


\subsection{VCF export}
The variant call format (VCF) is a generic file format for storing DNA polymorphism data such as SNPs, insertions, deletions and structural variants, together with rich annotations (\cite{Danecek2011}). 
The following command exports all variants stored in an \Rclass{AVASet} object into a vcf file with the given name. Make sure to run \Rfunction{alignShortReads} first and optional add dbSNP identifiers with \Rfunction{annotateVariants} (see section \ref{sec:Annotations} for details)
<<label=vcfexport, eval=FALSE>>=
ava2vcf(avaSet, filename="variants.vcf", annot=avaAnnot)
@


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%___Analysis of GS Mapper projects___%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Analysis of GS Mapper projects}
Mapping projects allow the alignment of arbitrary reads from one or more sequencing runs to a given 
reference sequence.


\subsection{Importing a GS Reference Mapper project}
The function \Rfunction{MapperSet} imports data from Roche's GS Reference Mapper. The GS Mapper 
software stores information for each sample in a seperate directory, so \Rfunction{MapperSet} 
expects a character vector \Rfunarg{dirs} containing the directories of all samples to read in, i.e. 
directories containing the files: \\
\begin{itemize}
  \item "mapping/454HCDiffs.txt"
  \item "mapping/454NewblerMetrics.txt"
\end{itemize}
Furthermore the parameter \Rfunarg{samplenames} allows the seperate specification of sample names. 
if missing, the directory names are taken. The following example imports a project containing 3 
samples (N01, N03, N04) with a total of 111 variants:
<<label=gsmDir1>>=
dir_sample01 = system.file("extdata", "MapperSet", "N01", package = "R453Plus1Toolbox")
@ 
<<label=gsmDir2>>=
dir_sample03 = system.file("extdata", "MapperSet", "N03", package = "R453Plus1Toolbox")
@ 
<<label=gsmDir3>>=
dir_sample04 = system.file("extdata", "MapperSet", "N04", package = "R453Plus1Toolbox")
<<label=gsmDir4>>=
dirs = c(dir_sample01, dir_sample03, dir_sample04)
@
<<label=createRocheGSMSet>>= 
mapperSet = MapperSet(dirs=dirs, samplenames=c("N01", "N03", "N04"))
@


\subsection{The MapperSet class}
An object of class \Rclass{MapperSet} ia a container to store data imported from a project of 
Roche's GS Reference Mapper Software. It directly extends the \Rpackage{Biobase} \Rclass{eSet} class 
and as such provides the following slots:
\begin{enumerate}
	\item The \Rcode{assayData} slot contains four matrices with variants as rows and samples as 
		columns:
		\begin{itemize}
			\item variantForwCount/variantRevCount: Matrices containing the number of 
				reads with the respective variant in forward/reverse direction.
			\item totalForwCount/totalRevCount: Matrices containing the total read 
				coverage for every variant location in forward/reverse direction.
		\end{itemize}    
	\item The \Rcode{featureData} slot holds the variants as rows together with additional 
		information on each variant within the following columns:
		\begin{itemize}
			\item chromosome/start/end/strand: Give the location of each variant.
			\item referenceBases/variantBase: Show the base(s) changed in each variant.
			\item regName: The name of the region (gene) where the variant is located.
			\item knownSNP: Contains dbSNP reference cluster ids for known SNPs as given by the GS 
				Mapper software (if any).
		\end{itemize}
	\item The \Rcode{phenoData} slot contains additional information about the samples represented 
		as rows:
		\begin{itemize}
			\item By default, the phenoData slot only contains an accession number indicating the 
				PTP of every sample.
		\end{itemize}
\end{enumerate}
<<label=showMapperSet>>=
mapperSet
@

As the \Rclass{MapperSet} is derived from the \Rpackage{Biobase} \Rclass{eSet} the methods used to 
access or to manipulate the elements of a \Rclass{MapperSet} object remain the same:
<<label=assayDataMapper1>>=
assayData(mapperSet)$variantForwCount[1:4, ]
@ 
<<label=assayDataMapper2>>=
assayData(mapperSet)$totalForwCount[1:4, ]
@
<<label=fDataMapper>>=
fData(mapperSet)[1:4, ]
@ 
<<label=pDataMapper>>=
pData(mapperSet)
@ 


\subsection{Setting filters and subsetting a MapperSet}
\label{sec:FiltersMapper}

The \Rclass{MapperSet} uses the same methods for filtering and subsetting as the \Rclass{AVASet} 
(see section \ref{sec:Subsetting} and \ref{sec:Filters} for details).


\subsection{Annotations and Variant Reports}
\label{sec:AnnotationsMapper}

Before creating the variant and quality report, the variants have to be annotatetd using function 
\Rfunction{annotateVariants}. Annotation includes affected genes, exons and codons. Resulting amino 
acid changes are returned as well as dbSNP identifiers, if the mutation is already known. All 
information is fetched from Ensembl via \Rpackage{biomaRt} and returned in an object of class 
\Rclass{AnnotatedVariants}. It is advisible to filter the \Rclass{MapperSet} (see section 
\ref{sec:FiltersMapper}) since the annotation process is very time consuming for a large number 
(>500) of variants.
<<label=annotateVarMapper, eval=FALSE>>=
mapperAnnot = annotateVariants(mapperSet)
@

For a \Rclass{MapperSet} with corresponding annotated variants, the function \Rfunction{htmlReport} 
creates a html report containing detailed variant information. \\
The report is structured into two pages:
\begin{enumerate}
    \item Variant report by reference: This page sums up additional information for each variant 
		including name, type, reference gene, position, changed nucleotides and affected samples. 
		Furthermore every variant is linked to a page with further details about the affected genes 
		and transcripts (e.g. Ensembl gene-IDs, transcript-IDs, codon sequences, changes of amino 
		acids (if coding)). 
    \item Variant report by sample: The upper fraction of this page presents an overview of all 
		samples. In the lower fraction the found variants are listed for each sample seperately in 
		the same way as described in the variant report by reference above.
\end{enumerate}

The following command creates a report containing only variants covered by at least 3\% of the reads 
using the argument \Rfunarg{minMut} (\Rfunarg{minMut}=3 is also the default value):
<<label=htmlReportMapper, eval=FALSE>>=
htmlReport(mapperSet, annot=mapperAnnot, dir="htmlReportExampleMapper", 
 title="htmlReport Example", minMut=3)
@


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%___Detection of structural variants___%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Detection of structural variants}
Structural variants like translocations or inversions can be detected using non-paired reads if at 
least one read spans the breakpoint of the variant. These reads originate from two different 
locations on the reference genome and are called 'chimeric reads'. 


\subsection{Data preparation}
Before breakpoints can be detected, the generated raw sequences must be preprocessed and aligned. Of 
course, data preprocessing depends on the applied laboratory protocols. The exemplary data set used 
in this vignette is a subset of the data set presented by Kohlmann et al. (\cite{Kohlmann2009}) and 
is described in detail therein.\\

In our example data set, each region of the pico titer plate contains reads from three different 
samples which were loaded into that region. To reallocate reads to samples, each sample has a unique 
multiplex sequence prefixing all reads from that sample. This allocation process is called 
\emph{demultiplexing}. In the code section below, the multiplexed sequences are read in and 
demultiplexed according to the given multiplex sequences (MIDs) using the 
\Rfunction{demultiplexReads} method. The standard multiplex sequences used by the Genome Sequence 
MID library kits can be retrieved by calling \Rfunction{genomeSequencerMIDs}. The last two commands 
show that all reads could be successfully demultiplexed.\\

<<label=demultiplex, eval=TRUE>>=
fnaFile = system.file("extdata", "SVDetection", 
 "R_2009_07_30", 
 "D_2009_07_31", 
 "1.TCA.454Reads.fna", package="R453Plus1Toolbox")
seqs = readDNAStringSet(fnaFile, format="fasta")
MIDSeqs = genomeSequencerMIDs(c("MID1", "MID2", "MID3"))
dmReads = demultiplexReads(seqs, MIDSeqs, numMismatches=2, trim=TRUE)
length(seqs)
sum(sapply(dmReads, length))
@

A sequence capture array was used to ensure that the example data set predominantly contains reads 
from certain genomic regions of interest. The applied NimbleGen array captured short segments 
corresponding to all exon regions of 92 distinct target genes. In addition, contiguous genomic 
regions for three additional genes, i.e. \textit{CBFB}, \textit{MLL}, and \textit{RUNX1}, were 
present on the array. During sample preparation, linkers were ligated to the polished fragments in 
the library to provide a priming site as recommended by the NimbleGen protocol. These linker 
sequences were sequenced and are located at the 5 prime end of the reads. In case of long reads, the 
reverse complement of the linker may be located at the 3 prime end. The function 
\Rfunction{removeLinker} can be used to remove these linkers. Aditionally, very short reads are 
discarded in the following code snippet.
<<label=removeLinker, eval=TRUE>>=
minReadLength = 15
gSel3 = sequenceCaptureLinkers("gSel3")[[1]]
trimReads = lapply(dmReads, function (reads) {
reads = reads[width(reads) >= minReadLength]
reads = removeLinker(reads, gSel3)
reads = reads[width(reads) >= minReadLength]
readsRev = reverseComplement(reads)
readsRev = removeLinker(readsRev, gSel3)
reads = reverseComplement(readsRev)
reads = reads[width(reads) >= minReadLength]
return(reads)
})
@

Finally, the preprocessed reads must be aligned against a reference genome. For this purpose, we 
write the reads to a .fasta file and use the BWA-SW (\cite{Li2010}) algorithm for generating local alignments. 
The BWA-SW algorithm can be substituted by other local alignment algorithms. However, BWA-SW has the 
useful feature to only report the best local alignments. Hence, two local alignments do not overlap 
on the query sequence (they may overlap on the reference). This is an assumption made by the 
pipeline implemented in this package.
<<label=writeFASTA, eval=FALSE>>=
write.XStringSet(trimReads[["MID1"]], file="/tmp/N01.fasta", format="fasta")
@


\subsection{Computing and assessing putative structural variants}
As chimeric reads may also be caused by technical issues during sample preparation, the function
\Rfunction{filterChimericReads} implements several filter steps to remove artificial chimeric reads.
\\

The remaining reads are passed to the \Rfunction{detectBreakpoints} method to create clusters 
representing putative breakpoints. Each cluster contains all chimeric reads that span this 
breakpoint. Promising candidates are clusters with more than one read and ideally with reads from 
different strands. Some structural variations like translocations or inversion lead to two related 
breakpoints. In the context of fusion genes, these breakpoints are refered to as \emph{pathogenic} 
and \emph{reciprocal} breakpoint. By the use of read orientation and strand information during 
clustering, it is ensured that reads from the pathogenic breakpoint will not cluster together with 
reads from the reciprocal breakpoint, although their genomic coordinates may be close to each other 
or even equal. After clsutering, consensus breakpoint coordinates are computed for each cluster.\\

In the last step, the function \Rfunction{mergeBreakpoints} searches breakpoints that originate from 
the same structural variation (i.e. the pathogenic and the related reciprocal breakpoint) and merges
them. We observed, that the distance between two related breakpoints may be up to a few hundred 
basepairs, whereas the breakpoint coordinates of single reads spanning the same breakpoint vary
only by a very few bases due to sequencing errors or ambiguities during alignment.\\

In the following example, we use the reads from sample N01 presented in the previous section. The
reads have been aligned using BWA-SW:
<<label=readBam, eval=TRUE>>=
library("Rsamtools")
bamFile = system.file("extdata", "SVDetection", "bam", "N01.bam", 
 package="R453Plus1Toolbox")
parameters = ScanBamParam(what=scanBamWhat())
bam = scanBam(bamFile, param=parameters)
@ 

For the filtering step, we specify a target region, i.e. the used capture array in form of a
\Rclass{IntegerRangesList}. All chimeric reads not overlapping this region with at least one local
alignment are discarded. The following example creates a target region out of a given .bed file
containing region information using functions from package \Rpackage{rtracklayer}. 
<<label=filterReads, eval=TRUE>>=
library("rtracklayer")
bedFile = system.file("extdata", "SVDetection", "chip", 
 "CaptureArray_hg19.bed", package="R453Plus1Toolbox")
chip = import.ucsc(bedFile, subformat="bed")
chip = split(ranges(chip[[1]]), seqnames(chip[[1]]))
names(chip) = gsub("chr", "", names(chip))

linker = sequenceCaptureLinkers("gSel3")[[1]]

filterReads = filterChimericReads(bam, targetRegion=chip, linkerSeq=linker)
filterReads$log
@
The \Rfunarg{linkerSeq} argument allows to specify the linker sequence used during sample 
preparation. All chimeric reads that have this linker sequence between their local alignments are 
removed.\\

Finally, we call the \Rfunction{detectBreakpoints} and \Rfunction{mergeBreakpoints} functions:
<<label=detectBreakpoints, eval=TRUE>>=
bp = detectBreakpoints(filterReads, minClusterSize=1)
bp
table(bp)

mbp = mergeBreakpoints(bp)
summary(mbp)
@
One cluster of size 8 and another cluster of size 4 were detected. Both putative breakpoints span 
two regions on chromosome 16. Further, 11 clusters of size one were found. The 
\Rfunction{mergeBreakpoints} function merges the first two clusters. The summary reveals that the 
coordinates of the breakpoints only differ by two bases at each region on chromosome 16. Moreover, 
both strands from both breakpoints were sequenced. Obviously, we detected two related breakpoints 
caused by an inversion on chromsome 16.


\subsection{Visualization of breakpoints}
The function \Rfunction{plotChimericReads} takes the output of the function 
\Rfunction{mergeBreakpoints} and produces a plot of the breakpoint regions together with the 
aligned reads and marks deletions, insertions and mismatches. If a pathogenic and a reciprocal 
breakpoint exist, \Rfunction{plotChimericReads} creates two plots as shown in the example
below.\\

The following example shows the breakpoints (pathogenic and reciprocal) of an inversion on chromosome 16 where 12 reads 
aligned:\\
<<eval=TRUE, label=plotCR1,fig=TRUE,include=FALSE,eps=FALSE,pdf=TRUE,width=10,height=5>>=
plotChimericReads(mbp[1], legend=TRUE)
@
\begin{figure}[hbp]
\begin{center}
  \includegraphics[height=7cm, width=12cm]{R453Plus1-plotCR1}
  \caption{Plot of the breakpoint region.}
\end{center}
\end{figure}


Optionally (if the argument \Rfunarg{plotBasePairs} is \Rcode{TRUE}), \Rfunction{plotChimericReads} 
displays all base pairs within a given region of size \Rfunarg{maxBasePairs} around the breakpoint:

<<eval=TRUE, label=plotCR2,fig=TRUE,include=FALSE,eps=FALSE,pdf=TRUE>>=
plotChimericReads(mbp[1], plotBasePairs=TRUE, maxBasePairs=30)
@
\begin{figure}[hbp]
\begin{center}
  \includegraphics[height=8cm]{R453Plus1-plotCR2}
  \caption{Plot of the breakpoint region including base pairs.}
\end{center}
\end{figure}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%___Analysis and manipulation of SFF files___%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Analysis and manipulation of SFF files}

The Standard Flowgram Format(SFF) is a binary file format designed by Roche to store the homopolymer stretches typical for 454 sequencing. It consists of a common header section, containing general information (e.g. number of reads, nucleotides used for each flow) and for each read a read header (e.g. read length, read name) and read data section (e.g. called bases, flow values, quality scores). 

\subsection{Importing SFF files}
SFF files be imported using the \Rfunction{readSFF} function.
<<label=importSFFfiles, eval=TRUE>>=
file <- system.file("extdata", "SFF", "example.sff", package="R453Plus1Toolbox")
sffContainer <- readSFF(file)
@

\subsection{The SFF container}
The contents of the SFF file are stored in an object of class \Rclass{SFFContainer} with different slots:
<<label=SFFcontainer, eval=TRUE>>=
showClass("SFFContainer")
@

The most import slot is the reads slot containing the called bases and the corresponding quality measures:
<<label=sffreads, eval=TRUE>>=
reads(sffContainer)
@

An \Rclass{SFFContainer} object can be subsetted using the \Rcode{[} operator and some read names or numbers:
<<label=sffsubsetting, eval=TRUE>>=
subSffContainer <- sffContainer[1:5]
@

\subsection{Writing SFF files}
An \Rclass{SFFContainer} can be written back into a file using the \Rfunction{writeSFF} method:
<<label=sffsubsetting, eval=FALSE>>=
writeSFF(subSffContainer, subSffFile.sff)
@

\subsection{Quality control of SFF files}
The \Rfunction{qualityReportSFF} function creates a PDF document from one or more SFF files containing information relevant for quality control (e.g. read length distributions, quality histograms, GC content). Two example plots are shown below:
<<label=positionQualityBoxplot, eval=TRUE, fig=TRUE, include=FALSE, results=hide, eps=FALSE, pdf=TRUE>>=
positionQualityBoxplot(sffContainer)
@
\begin{figure}[hbp]
\begin{center}
  \includegraphics[height=8cm]{R453Plus1-positionQualityBoxplot}
  \caption{Position quality boxplot - One of the plots contained in the PDF quality report.}
\end{center}
\end{figure}
@

<<label=dinucleotideOddsRatio, eval=TRUE, results=hide, fig=TRUE, include=FALSE, eps=FALSE, pdf=TRUE>>=
dinucleotideOddsRatio(sffContainer)
@
\begin{figure}[hbp]
\begin{center}
  \includegraphics[height=8cm]{R453Plus1-dinucleotideOddsRatio}
  \caption{Dinucleotide odds ratio showing the over-/under-representation of dinucleotides - One of the plots contained in the PDF quality report.}
\end{center}
\end{figure}
@


\begin{thebibliography}{}
\bibitem[Kohlmann {\it et~al}., 2009]{Kohlmann2009}
Kohlmann,A. \textit{et~al}. (2009) Targeted next-generation sequencing (NGS) enables for the first time the detection of point mutations, molecular insertions and deletions, as well as leukemia-specific fusion genes in AML in a single procedure.
\textit{Blood (ASH Annual Meeting Abstracts)}, \textbf{114}(22), 294--295.

\bibitem[Li and Durbin, 2010]{Li2010}
Li,H. and Durbin,R. (2010) Fast and accurate long-read alignment with Burrows-Wheeler transform.
\textit{Bioinformatics}, \textbf{26}(5), 589--95. 

\bibitem[Danecek {\it et~al}., 2011]{Danecek2011}
Danecek,P. \textit{et~al}. (2011) The variant call format and VCFtools.
\textit{Bioinformatics}, \textbf{27}(15), 2156--2158. 
\end{thebibliography}

\end{document}