%% This BibTeX bibliography file in UTF-8 format was created using Papers.
%% http://mekentosj.com/papers/
%%
%% Conventions used in this file:
%%   - one entry per block, one field per line; citation keys are the ones
%%     generated by Papers and are referenced from documents, so do not rename them
%%   - author names use the unambiguous "Last, First" form
%%   - month uses the standard three-letter macros (jan ... dec), unbraced
%%   - words in titles whose capitalisation must survive sentence-casing
%%     styles (acronyms, software and proper names) are wrapped in braces
%%   - field text is plain ASCII with LaTeX escapes (for example \% and {\"a})
%%   - affiliation, annote, pii, pmid and language are informational fields
%%     that the standard styles ignore

@article{Lefrancois:2009p778,
  author       = {Lefran{\c{c}}ois, Philippe and others},
  title        = {Efficient yeast {ChIP-Seq} using multiplex short-read {DNA} sequencing},
  journal      = {BMC Genomics},
  year         = {2009},
  month        = jan,
  volume       = {10},
  pages        = {37},
  affiliation  = {Department of Molecular, Cellular and Developmental Biology, Yale University, New Haven, CT 06520, USA. philippe.lefrancois@yale.edu},
  abstract     = {BACKGROUND: Short-read high-throughput DNA sequencing technologies provide new tools to answer biological questions. However, high cost and low throughput limit their widespread use, particularly in organisms with smaller genomes such as S. cerevisiae. Although ChIP-Seq in mammalian cell lines is replacing array-based ChIP-chip as the standard for transcription factor binding studies, ChIP-Seq in yeast is still underutilized compared to ChIP-chip. We developed a multiplex barcoding system that allows simultaneous sequencing and analysis of multiple samples using Illumina's platform. We applied this method to analyze the chromosomal distributions of three yeast DNA binding proteins (Ste12, Cse4 and RNA PolII) and a reference sample (input DNA) in a single experiment and demonstrate its utility for rapid and accurate results at reduced costs. RESULTS: We developed a barcoding ChIP-Seq method for the concurrent analysis of transcription factor binding sites in yeast. Our multiplex strategy generated high quality data that was indistinguishable from data obtained with non-barcoded libraries. None of the barcoded adapters induced differences relative to a non-barcoded adapter when applied to the same DNA sample. We used this method to map the binding sites for Cse4, Ste12 and Pol II throughout the yeast genome and we found 148 binding targets for Cse4, 823 targets for Ste12 and 2508 targets for PolII. Cse4 was strongly bound to all yeast centromeres as expected and the remaining non-centromeric targets correspond to highly expressed genes in rich media. The presence of Cse4 non-centromeric binding sites was not reported previously. CONCLUSION: We designed a multiplex short-read DNA sequencing method to perform efficient ChIP-Seq in yeast and other small genome model organisms. This method produces accurate results with higher throughput and reduced cost. Given constant improvements in high-throughput sequencing technologies, increasing multiplexing will be possible to further decrease costs per sample and to accelerate the completion of large consortium projects such as modENCODE.},
}

@article{Kim:2010p572,
  author       = {Kim, Tae-Kyung and others},
  title        = {Widespread transcription at neuronal activity-regulated enhancers},
  journal      = {Nature},
  year         = {2010},
  month        = may,
  volume       = {465},
  number       = {7295},
  pages        = {182--187},
  affiliation  = {Department of Neurobiology, Harvard Medical School, 220 Longwood Avenue, Boston, Massachusetts 02115, USA.},
  abstract     = {We used genome-wide sequencing methods to study stimulus-dependent enhancer function in mouse cortical neurons. We identified approximately 12,000 neuronal activity-regulated enhancers that are bound by the general transcriptional co-activator CBP in an activity-dependent manner. A function of CBP at enhancers may be to recruit RNA polymerase II (RNAPII), as we also observed activity-regulated RNAPII binding to thousands of enhancers. Notably, RNAPII at enhancers transcribes bi-directionally a novel class of enhancer RNAs (eRNAs) within enhancer domains defined by the presence of histone H3 monomethylated at lysine 4. The level of eRNA expression at neuronal enhancers positively correlates with the level of messenger RNA synthesis at nearby genes, suggesting that eRNA synthesis occurs specifically at enhancers that are actively engaged in promoting mRNA synthesis. These findings reveal that a widespread mechanism of enhancer activation involves RNAPII binding and eRNA synthesis.},
}

@article{Li:2009p1662,
  author       = {Li, Heng and others},
  title        = {The {Sequence Alignment/Map} format and {SAMtools}},
  journal      = {Bioinformatics},
  year         = {2009},
  month        = aug,
  volume       = {25},
  number       = {16},
  pages        = {2078--2079},
  affiliation  = {Wellcome Trust Sanger Institute, Wellcome Trust Genome Campus, Cambridge, CB10 1SA, UK, Broad Institute of MIT and Harvard, Cambridge, MA 02141, USA.},
  annote       = {Standard for NGS. Keep in mind the CIGAR string, especially the padding and clipping. Keep in mind as well the additional fields. One could use them to encode the library information or the barcode. See to that with Jonathon.},
  abstract     = {SUMMARY: The Sequence Alignment/Map (SAM) format is a generic alignment format for storing read alignments against reference sequences, supporting short and long reads (up to 128 Mbp) produced by different sequencing platforms. It is flexible in style, compact in size, efficient in random access and is the format in which alignments from the 1000 Genomes Project are released. SAMtools implements various utilities for post-processing alignments in the SAM format, such as indexing, variant caller and alignment viewer, and thus provides universal tools for processing read alignments. AVAILABILITY: http://samtools.sourceforge.net.},
}

@article{Gentleman:2004p2013,
  author       = {Gentleman, Robert C and others},
  title        = {{Bioconductor}: open software development for computational biology and bioinformatics},
  journal      = {Genome Biology},
  year         = {2004},
  month        = jan,
  volume       = {5},
  number       = {10},
  pages        = {R80},
  affiliation  = {Department of Biostatistical Science, Dana-Farber Cancer Institute, 44 Binney St, Boston, MA 02115, USA. rgentlem@jimmy.harvard.edu},
  abstract     = {The Bioconductor project is an initiative for the collaborative creation of extensible software for computational biology and bioinformatics. The goals of the project include: fostering collaborative development and widespread use of innovative software, reducing barriers to entry into interdisciplinary scientific research, and promoting the achievement of remote reproducibility of research results. We describe details of our aims and methods, identify current challenges, compare Bioconductor to other open bioinformatics projects, and provide working examples.},
}

@article{Durinck:2005p1990,
  author       = {Durinck, Steffen and others},
  title        = {{BioMart} and {Bioconductor}: a powerful link between biological databases and microarray data analysis},
  journal      = {Bioinformatics},
  year         = {2005},
  month        = aug,
  volume       = {21},
  number       = {16},
  pages        = {3439--3440},
  affiliation  = {Department of Electronical Engineering, ESAT-SCD, K.U.Leuven, Kasteelpark Arenberg 10, 3001 Leuven-Heverlee, Belgium. steffen.durinck@esat.kuleuven.ac.be},
  abstract     = {biomaRt is a new Bioconductor package that integrates BioMart data resources with data analysis software in Bioconductor. It can annotate a wide range of gene or gene product identifiers (e.g. Entrez-Gene and Affymetrix probe identifiers) with information such as gene symbol, chromosomal coordinates, Gene Ontology and OMIM annotation. Furthermore biomaRt enables retrieval of genomic sequences and single nucleotide polymorphism information, which can be used in data analysis. Fast and up-to-date data retrieval is possible as the package executes direct SQL queries to the BioMart databases (e.g. Ensembl). The biomaRt package provides a tight integration of large, public or locally installed BioMart databases with data analysis in Bioconductor creating a powerful environment for biological data mining.},
}

@article{Hamming:1950p825,
  author       = {Hamming, Richard W.},
  title        = {Error detecting and error correcting codes},
  journal      = {Bell System Technical Journal},
  year         = {1950},
  volume       = {29},
  number       = {2},
  pages        = {147--160},
}

@article{Mortazavi:2008p740,
  author       = {Mortazavi, Ali and others},
  title        = {Mapping and quantifying mammalian transcriptomes by {RNA-Seq}},
  journal      = {Nature Methods},
  year         = {2008},
  month        = jul,
  volume       = {5},
  number       = {7},
  pages        = {621--628},
  affiliation  = {Division of Biology, MC 156-29, California Institute of Technology, Pasadena, California 91125, USA.},
  abstract     = {We have mapped and quantified mouse transcriptomes by deeply sequencing them and recording how frequently each gene is represented in the sequence sample (RNA-Seq). This provides a digital measure of the presence and prevalence of transcripts from known and previously unknown genes. We report reference measurements composed of 41-52 million mapped 25-base-pair reads for poly(A)-selected RNA from adult mouse brain, liver and skeletal muscle tissues. We used RNA standards to quantify transcript prevalence and to test the linear range of transcript detection, which spanned five orders of magnitude. Although >90\% of uniquely mapped reads fell within known exons, the remaining data suggest new and revised gene models, including changed or additional promoters, exons and 3' untranscribed regions, as well as new candidate microRNA precursors. RNA splice events, which are not readily measured by standard gene expression microarray or serial analysis of gene expression methods, were detected directly by mapping splice-crossing sequence reads. We observed $1.45 \times 10^{5}$ distinct splices, and alternative splices were prominent, with 3,500 different genes expressing one or more alternate internal splices.},
}

@article{Morgan:2009p739,
  author       = {Morgan, Martin and others},
  title        = {{ShortRead}: a bioconductor package for input, quality assessment and exploration of high-throughput sequence data},
  journal      = {Bioinformatics},
  year         = {2009},
  month        = oct,
  volume       = {25},
  number       = {19},
  pages        = {2607--2608},
  affiliation  = {Program in Computational Biology, Fred Hutchinson Cancer Research Center, Seattle, WA, USA. mtmorgan@fhcrc.org},
  abstract     = {ShortRead is a package for input, quality assessment, manipulation and output of high-throughput sequencing data. ShortRead is provided in the R and Bioconductor environments, allowing ready access to additional facilities for advanced statistical analysis, data transformation, visualization and integration with diverse genomic resources. AVAILABILITY AND IMPLEMENTATION: This package is implemented in R and available at the Bioconductor web site; the package contains a 'vignette' outlining typical work flows.},
}

@article{Robinson:2010p775,
  author       = {Robinson, Mark D and others},
  title        = {{edgeR}: a {Bioconductor} package for differential expression analysis of digital gene expression data},
  journal      = {Bioinformatics},
  year         = {2010},
  month        = jan,
  volume       = {26},
  number       = {1},
  pages        = {139--140},
  affiliation  = {Cancer Program, Garvan Institute of Medical Research, 384 Victoria Street, Darlinghurst, NSW 2010, Australia. mrobinson@wehi.edu.au},
  abstract     = {SUMMARY: It is expected that emerging digital gene expression (DGE) technologies will overtake microarray technologies in the near future for many functional genomics applications. One of the fundamental data analysis tasks, especially for gene expression studies, involves determining whether there is evidence that counts for a transcript or exon are significantly different across experimental conditions. edgeR is a Bioconductor software package for examining differential expression of replicated count data. An overdispersed Poisson model is used to account for both biological and technical variability. Empirical Bayes methods are used to moderate the degree of overdispersion across transcripts, improving the reliability of inference. The methodology can be used even with the most minimal levels of replication, provided at least one phenotype or experimental condition is replicated. The software may have other applications beyond sequencing data, such as proteome peptide count data. AVAILABILITY: The package is freely available under the LGPL licence from the Bioconductor web site (http://bioconductor.org).},
}

@article{Trapnell:2009p156,
  author       = {Trapnell, C and others},
  title        = {{TopHat}: discovering splice junctions with {RNA-Seq}},
  journal      = {Bioinformatics},
  year         = {2009},
  month        = may,
  volume       = {25},
  number       = {9},
  pages        = {1105--1111},
}

@article{Tweedie:2009p2014,
  author       = {Tweedie, Susan and others},
  title        = {{FlyBase}: enhancing {Drosophila} {Gene Ontology} annotations},
  journal      = {Nucleic Acids Research},
  year         = {2009},
  month        = jan,
  volume       = {37},
  number       = {Database issue},
  pages        = {D555--D559},
  affiliation  = {Department of Genetics, University of Cambridge, Downing Street, Cambridge CB2 3EH, UK. s.tweedie@gen.cam.ac.uk},
  abstract     = {FlyBase (http://flybase.org) is a database of Drosophila genetic and genomic information. Gene Ontology (GO) terms are used to describe three attributes of wild-type gene products: their molecular function, the biological processes in which they play a role, and their subcellular location. This article describes recent changes to the FlyBase GO annotation strategy that are improving the quality of the GO annotation data. Many of these changes stem from our participation in the GO Reference Genome Annotation Project--a multi-database collaboration producing comprehensive GO annotation sets for 12 diverse species.},
}

@article{Flicek:2011p2042,
  author       = {Flicek, Paul and others},
  title        = {{Ensembl} 2011},
  journal      = {Nucleic Acids Research},
  year         = {2011},
  month        = jan,
  volume       = {39},
  number       = {Database issue},
  pages        = {D800--D806},
  affiliation  = {European Bioinformatics Institute, Wellcome Trust Genome Campus, Hinxton, Cambridge CB10 1SD, UK. flicek@ebi.ac.uk},
  abstract     = {The Ensembl project (http://www.ensembl.org) seeks to enable genomic science by providing high quality, integrated annotation on chordate and selected eukaryotic genomes within a consistent and accessible infrastructure. All supported species include comprehensive, evidence-based gene annotations and a selected set of genomes includes additional data focused on variation, comparative, evolutionary, functional and regulatory annotation. The most advanced resources are provided for key species including human, mouse, rat and zebrafish reflecting the popularity and importance of these species in biomedical research. As of Ensembl release 59 (August 2010), 56 species are supported of which 5 have been added in the past year. Since our previous report, we have substantially improved the presentation and integration of both data of disease relevance and the regulatory state of different cell types.},
}

@article{Pilcher:2008p824,
  author       = {Pilcher, C. D. and others},
  title        = {Inferring {HIV} transmission dynamics from phylogenetic sequence relationships},
  journal      = {PLoS Med},
  year         = {2008},
  volume       = {5},
  number       = {3},
  pages        = {e69},
}

@article{Hansen:2010p495,
  author       = {Hansen, Kasper D and Brenner, Steven E and Dudoit, Sandrine},
  title        = {Biases in {Illumina} transcriptome sequencing caused by random hexamer priming},
  journal      = {Nucleic Acids Research},
  year         = {2010},
  month        = apr,
  affiliation  = {Division of Biostatistics, School of Public Health, UC Berkeley, 101 Haviland Hall, Berkeley, CA 94720-7358, Department of Plant and Microbial Biology, UC Berkeley, 461 Koshland Hall, Berkeley, CA 94720-3102 and Department of Statistics, UC Berkeley, 367 Evans Hall, Berkeley, CA 94720-3860, USA.},
  abstract     = {Generation of cDNA using random hexamer priming induces biases in the nucleotide composition at the beginning of transcriptome sequencing reads from the Illumina Genome Analyzer. The bias is independent of organism and laboratory and impacts the uniformity of the reads along the transcriptome. We provide a read count reweighting scheme, based on the nucleotide frequencies of the reads, that mitigates the impact of the bias.},
}

@article{Li:2010p732,
  author       = {Li, Jun and Jiang, Hui and Wong, Wing Hung},
  title        = {Modeling non-uniformity in short-read rates in {RNA-Seq} data},
  journal      = {Genome Biology},
  year         = {2010},
  month        = may,
  volume       = {11},
  number       = {5},
  pages        = {R50},
  abstract     = {ABSTRACT: After mapping, RNA-Seq data can be summarized by a sequence of read counts commonly modeled as Poisson variables with constant rates along each transcript, which actually fit data poorly. We suggest using variable rates for different positions, and propose two models to predict these rates based on local sequences. These models explain more than 50\% of the variations and can lead to improved estimates of gene and isoform expressions for both Illumina and Applied Biosystems (ABI) data.},
}

@manual{ref:R,
  author       = {{R Development Core Team}},
  title        = {{R}: A Language and Environment for Statistical Computing},
  organization = {R Foundation for Statistical Computing},
  address      = {Vienna, Austria},
  year         = {2009},
  note         = {{ISBN} 3-900051-07-0},
  url          = {http://www.R-project.org},
}

@article{Smith:2010p777,
  author       = {Smith, Andrew M and Heisler, Lawrence E and St Onge, Robert P and Farias-Hesson, Eveline and Wallace, Iain M and Bodeau, John and Harris, Adam N and Perry, Kathleen M and Giaever, Guri and Pourmand, Nader and Nislow, Corey},
  title        = {Highly-multiplexed barcode sequencing: an efficient method for parallel analysis of pooled samples},
  journal      = {Nucleic Acids Research},
  year         = {2010},
  month        = may,
  affiliation  = {Department of Molecular Genetics, University of Toronto, 1 King's College Circle, Toronto, Ontario M5S 1A8, Banting and Best Department of Medical Research, University of Toronto, 112 College Street, Toronto, Ontario M5G 1L6, Donnelly Centre for Cellular and Biomolecular Research, University of Toronto, 160 College Street, Toronto, Ontario M5S 3E1, Department of Pharmaceutical Sciences, University of Toronto, 144 College Street, Toronto, Ontario M5S 3M2, Canada, Department of Biochemistry, Stanford University, Stanford, CA 94305, Stanford Genome Technology Center, Stanford University, Palo Alto, CA 94304, Biomolecular Engineering, University of California at Santa Cruz, Santa Cruz, CA 95064, Life Technologies Corporation, 850 Lincoln Centre Drive, Foster City, CA 94404 and Life Technologies Corporation, 5791 Van Allen Way, Carlsbad, CA 92009, USA.},
  abstract     = {Next-generation sequencing has proven an extremely effective technology for molecular counting applications where the number of sequence reads provides a digital readout for RNA-seq, ChIP-seq, Tn-seq and other applications. The extremely large number of sequence reads that can be obtained per run permits the analysis of increasingly complex samples. For lower complexity samples, however, a point of diminishing returns is reached when the number of counts per sequence results in oversampling with no increase in data quality. A solution to making next-generation sequencing as efficient and affordable as possible involves assaying multiple samples in a single run. Here, we report the successful 96-plexing of complex pools of DNA barcoded yeast mutants and show that such 'Bar-seq' assessment of these samples is comparable with data provided by barcode microarrays, the current benchmark for this application. The cost reduction and increased throughput permitted by highly multiplexed sequencing will greatly expand the scope of chemogenomics assays and, equally importantly, the approach is suitable for other sequence counting applications that could benefit from massive parallelization.},
}

@article{Delhomme:2012p4678,
  author       = {Delhomme, Nicolas and Padioleau, Isma{\"e}l and Furlong, Eileen E and Steinmetz, Lars M},
  title        = {{easyRNASeq}: a bioconductor package for processing {RNA-Seq} data},
  journal      = {Bioinformatics},
  year         = {2012},
  month        = jul,
  doi          = {10.1093/bioinformatics/bts477},
  pii          = {bts477},
  pmid         = {22847932},
  language     = {eng},
  affiliation  = {Genome Biology Computational Support, European Molecular Biology Laboratory, Meyerhofstrasse 1, 69117 Heidelberg, Germany.},
  abstract     = {MOTIVATION: RNA sequencing is becoming a standard for expression profiling experiments and many tools have been developed in the past few years to analyze RNA-Seq data. Numerous Bioconductor packages are available for Next-Generation Sequencing data loading in R, e.g. ShortRead, Rsamtools, as well as to perform differential gene expression analyses, e.g. DESeq, edgeR. However, the processing tasks lying in between these requires the precise interplay of many Bioconductor packages, e.g. Biostrings, IRanges or external solutions are to be sought. RESULTS: We developed easyRNASeq, an R package that simplifies the processing of RNA sequencing data, hiding the complex interplay of the required packages behind a single functionality. AVAILABILITY: The package is implemented in R (as of version 2.15) and is available from Bioconductor (as of version 2.10) at the URL: http://bioconductor.org/packages/release/bioc/html/easyRNASeq.html, where installation and usage instructions can be found. CONTACT: delhomme@embl.de.},
}

@article{Jaager:2012p4708,
  author       = {J{\"a}{\"a}ger, Kersti and Islam, Saiful and Zajac, Pawel and Linnarsson, Sten and Neuman, Toomas},
  title        = {{RNA-seq} analysis reveals different dynamics of differentiation of human dermis- and adipose-derived stromal stem cells},
  journal      = {PLoS ONE},
  year         = {2012},
  month        = jan,
  volume       = {7},
  number       = {6},
  pages        = {e38833},
  pmid         = {22723894},
  language     = {eng},
  affiliation  = {Institute of Gene Technology, Tallinn University of Technology, Tallinn, Estonia. kersti.jaager@cellintechnologies.com},
  abstract     = {BACKGROUND: Tissue regeneration and recovery in the adult body depends on self-renewal and differentiation of stem and progenitor cells. Mesenchymal stem cells (MSCs) that have the ability to differentiate into various cell types, have been isolated from the stromal fraction of virtually all tissues. However, little is known about the true identity of MSCs. MSC populations exhibit great tissue-, location- and patient-specific variation in gene expression and are heterogeneous in cell composition. METHODOLOGY/PRINCIPAL FINDINGS: Our aim was to analyze the dynamics of differentiation of two closely related stromal cell types, adipose tissue-derived MSCs (AdMSCs) and dermal fibroblasts (FBs) along adipogenic, osteogenic and chondrogenic lineages using multiplex RNA-seq technology. We found that undifferentiated donor-matched AdMSCs and FBs are distinct populations that stay different upon differentiation into adipocytes, osteoblasts and chondrocytes. The changes in lineage-specific gene expression occur early in differentiation and persist over time in both AdMSCs and FBs. Further, AdMSCs and FBs exhibit similar dynamics of adipogenic and osteogenic differentiation but different dynamics of chondrogenic differentiation. CONCLUSIONS/SIGNIFICANCE: Our findings suggest that stromal stem cells including AdMSCs and dermal FBs exploit different molecular mechanisms of differentiation to reach a common cell fate. The early mechanisms of differentiation are lineage-specific and are similar for adipogenic and osteogenic differentiation but are distinct for chondrogenic differentiation between AdMSCs and FBs.},
}

@article{Soneson:2013p5778,
  author       = {Soneson, Charlotte and Delorenzi, Mauro},
  title        = {A comparison of methods for differential expression analysis of {RNA-seq} data},
  journal      = {BMC Bioinformatics},
  year         = {2013},
  month        = jan,
  volume       = {14},
  pages        = {91},
  doi          = {10.1186/1471-2105-14-91},
  pii          = {1471-2105-14-91},
  pmid         = {23497356},
  language     = {eng},
  affiliation  = {Bioinformatics Core Facility, SIB Swiss Institute of Bioinformatics, Lausanne, Switzerland. Charlotte.Soneson@isb-sib.ch},
  abstract     = {BACKGROUND: Finding genes that are differentially expressed between conditions is an integral part of understanding the molecular basis of phenotypic variation. In the past decades, DNA microarrays have been used extensively to quantify the abundance of mRNA corresponding to different genes, and more recently high-throughput sequencing of cDNA (RNA-seq) has emerged as a powerful competitor. As the cost of sequencing decreases, it is conceivable that the use of RNA-seq for differential expression analysis will increase rapidly. To exploit the possibilities and address the challenges posed by this relatively new type of data, a number of software packages have been developed especially for differential expression analysis of RNA-seq data. RESULTS: We conducted an extensive comparison of eleven methods for differential expression analysis of RNA-seq data. All methods are freely available within the R framework and take as input a matrix of counts, i.e. the number of reads mapping to each genomic feature of interest in each of a number of samples. We evaluate the methods based on both simulated data and real RNA-seq data. CONCLUSIONS: Very small sample sizes, which are still common in RNA-seq experiments, impose problems for all evaluated methods and any results obtained under such conditions should be interpreted with caution. For larger sample sizes, the methods combining a variance-stabilizing transformation with the 'limma' method for differential expression analysis perform well under many different conditions, as does the nonparametric SAMseq method.},
}

@article{Liao:2013p5892,
  author       = {Liao, Yang and Smyth, Gordon K and Shi, Wei},
  title        = {{featureCounts}: an efficient general purpose program for assigning sequence reads to genomic features},
  journal      = {Bioinformatics},
  year         = {2013},
  month        = nov,
  doi          = {10.1093/bioinformatics/btt656},
  pii          = {btt656},
  pmid         = {24227677},
  language     = {eng},
  affiliation  = {Bioinformatics Division, The Walter and Eliza Hall Institute of Medical Research, 1G Royal Parade, Parkville, VIC 3052, Department of Computing and Information Systems and Department of Mathematics and Statistics, The University of Melbourne, Parkville, VIC 3010, Australia.},
  abstract     = {MOTIVATION: Next-generation sequencing technologies generate millions of short sequence reads, which are usually aligned to a reference genome. In many applications, the key information required for downstream analysis is the number of reads mapping to each genomic feature, for example to each exon or each gene. The process of counting reads is called read summarization. Read summarization is required for a great variety of genomic analyses but has so far received relatively little attention in the literature. RESULTS: We present featureCounts, a read summarization program suitable for counting reads generated from either RNA or genomic DNA sequencing experiments. featureCounts implements highly efficient chromosome hashing and feature blocking techniques. It is considerably faster than existing methods (by an order of magnitude for gene-level summarization) and requires far less computer memory. It works with either single or paired-end reads and provides a wide range of options appropriate for different sequencing applications. Availability and implementation: featureCounts is available under GNU General Public License as part of the Subread (http://subread.sourceforge.net) or Rsubread (http://www.bioconductor.org) software packages. CONTACT: shi@wehi.edu.au.},
}

@article{Robinson:2014p6362,
  author       = {Robinson, Kathryn and Delhomme, Nicolas and M{\"a}hler, Niklas and Schiffthaler, Bastian and {\"O}nskog, Jenny and Albrectsen, Benedicte and Ingvarsson, P{\"a}r and Hvidsten, Torgeir and Jansson, Stefan and Street, Nathaniel},
  title        = {{Populus} tremula ({European} aspen) shows no evidence of sexual dimorphism},
  journal      = {BMC Plant Biology},
  year         = {2014},
  month        = oct,
  volume       = {14},
  number       = {1},
  pages        = {276},
  doi          = {10.1186/s12870-014-0276-5},
  url          = {http://www.biomedcentral.com/1471-2229/14/276/abstract},
  abstract     = {Evolutionary theory suggests that males and females may evolve sexually dimorphic phenotypic and biochemical traits concordant with each sex having different optimal strategies of resource investment to maximise reproductive success and fitness. Such sexual dimorphism would result in sex biased gene expression patterns in non-floral organs for autosomal genes associated with the control and development of such phenotypic traits.},
}

@article{Dobin:2013p5293,
  author       = {Dobin, Alexander and Davis, Carrie A and Schlesinger, Felix and Drenkow, Jorg and Zaleski, Chris and Jha, Sonali and Batut, Philippe and Chaisson, Mark and Gingeras, Thomas R},
  title        = {{STAR}: ultrafast universal {RNA-seq} aligner},
  journal      = {Bioinformatics},
  year         = {2013},
  month        = jan,
  volume       = {29},
  number       = {1},
  pages        = {15--21},
  doi          = {10.1093/bioinformatics/bts635},
  pii          = {bts635},
  pmid         = {23104886},
  affiliation  = {Cold Spring Harbor Laboratory, Cold Spring Harbor, NY, USA and Pacific Biosciences, Menlo Park, CA, USA.},
  abstract     = {MOTIVATION: Accurate alignment of high-throughput RNA-seq data is a challenging and yet unsolved problem because of the non-contiguous transcript structure, relatively short read lengths and constantly increasing throughput of the sequencing technologies. Currently available RNA-seq aligners suffer from high mapping error rates, low mapping speed, read length limitation and mapping biases. RESULTS: To align our large (>80 billion reads) ENCODE Transcriptome RNA-seq dataset, we developed the Spliced Transcripts Alignment to a Reference (STAR) software based on a previously undescribed RNA-seq alignment algorithm that uses sequential maximum mappable seed search in uncompressed suffix arrays followed by seed clustering and stitching procedure. STAR outperforms other aligners by a factor of >50 in mapping speed, aligning to the human genome 550 million $2 \times 76$ bp paired-end reads per hour on a modest 12-core server, while at the same time improving alignment sensitivity and precision. In addition to unbiased de novo detection of canonical junctions, STAR can discover non-canonical splices and chimeric (fusion) transcripts, and is also capable of mapping full-length RNA sequences. Using Roche 454 sequencing of reverse transcription polymerase chain reaction amplicons, we experimentally validated 1960 novel intergenic splice junctions with an 80-90\% success rate, corroborating the high precision of the STAR mapping strategy. Availability and implementation: STAR is implemented as a standalone C++ code. STAR is free open source software distributed under GPLv3 license and can be downloaded from http://code.google.com/p/rna-star/. CONTACT: dobin@cshl.edu.},
}