Martin Morgan
February 2, 2015
Analysis and comprehension of high-throughput genomic data
Packages, vignettes, work flows
Objects
getClass(), showMethods(..., where=search()),
selectMethod()method?"substr,<tab>" to select help on methods, class?D<tab>
for help on classesExample
suppressPackageStartupMessages({
    library(Biostrings)
})
data(phiX174Phage)                       # sample data, see ?phiX174Phage
phiX174Phage
##   A DNAStringSet instance of length 6
##     width seq                                          names               
## [1]  5386 GAGTTTTATCGCTTCCATGAC...ATTGGCGTATCCAACCTGCA Genbank
## [2]  5386 GAGTTTTATCGCTTCCATGAC...ATTGGCGTATCCAACCTGCA RF70s
## [3]  5386 GAGTTTTATCGCTTCCATGAC...ATTGGCGTATCCAACCTGCA SS78
## [4]  5386 GAGTTTTATCGCTTCCATGAC...ATTGGCGTATCCAACCTGCA Bull
## [5]  5386 GAGTTTTATCGCTTCCATGAC...ATTGGCGTATCCAACCTGCA G97
## [6]  5386 GAGTTTTATCGCTTCCATGAC...ATTGGCGTATCCAACCTGCA NEB03
m <- consensusMatrix(phiX174Phage)[1:4,] # nucl. x position counts
polymorphic <- which(colSums(m != 0) > 1)
m[, polymorphic]
##   [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## A    4    5    4    3    0    0    5    2    0
## C    0    0    0    0    5    1    0    0    5
## G    2    1    2    3    0    0    1    4    0
## T    0    0    0    0    1    5    0    0    1
showMethods(class=class(phiX174Phage), where=search())
Genomic range
seqnames), start, end, and optionally strandWhy genomic ranges?
Data objects
GenomicRanges::GRanges
seqnames()start(), end(), width()strand()mcols(): 'metadata' associated with each range, stored as a
DataFrameGenomicRanges::GRangesList
length(), names(), [, [[) <!– ] –>unlist() and relist().GenomicAlignments::GAlignments, GAlignmentsList, GAlignemntPairs; VariantAnnotation::VCF, VRanges
Example: GRanges
## 'Annotation' package; more later...
suppressPackageStartupMessages({
    library(TxDb.Hsapiens.UCSC.hg19.knownGene)
})
promoters <- promoters(TxDb.Hsapiens.UCSC.hg19.knownGene)
## 'GRanges' with 2 metadata columns
promoters
## GRanges object with 82960 ranges and 2 metadata columns:
##           seqnames               ranges strand   |     tx_id     tx_name
##              <Rle>            <IRanges>  <Rle>   | <integer> <character>
##       [1]     chr1     [  9874,  12073]      +   |         1  uc001aaa.3
##       [2]     chr1     [  9874,  12073]      +   |         2  uc010nxq.1
##       [3]     chr1     [  9874,  12073]      +   |         3  uc010nxr.1
##       [4]     chr1     [ 67091,  69290]      +   |         4  uc001aal.1
##       [5]     chr1     [319084, 321283]      +   |         5  uc001aaq.2
##       ...      ...                  ...    ... ...       ...         ...
##   [82956]     chrY [27605479, 27607678]      -   |     78803  uc004fwx.1
##   [82957]     chrY [27606222, 27608421]      -   |     78804  uc022cpc.1
##   [82958]     chrY [27607233, 27609432]      -   |     78805  uc004fwz.3
##   [82959]     chrY [27635755, 27637954]      -   |     78806  uc022cpd.1
##   [82960]     chrY [59360655, 59362854]      -   |     78807  uc011ncc.1
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome
head(table(seqnames(promoters)))
## 
## chr1 chr2 chr3 chr4 chr5 chr6 
## 7967 5092 4328 2888 3366 4220
table(strand(promoters))
## 
##     +     -     * 
## 42198 40762     0
seqinfo(promoters)
## Seqinfo object with 93 sequences (1 circular) from hg19 genome:
##   seqnames       seqlengths isCircular genome
##   chr1            249250621      FALSE   hg19
##   chr2            243199373      FALSE   hg19
##   chr3            198022430      FALSE   hg19
##   chr4            191154276      FALSE   hg19
##   chr5            180915260      FALSE   hg19
##   ...                   ...        ...    ...
##   chrUn_gl000245      36651      FALSE   hg19
##   chrUn_gl000246      38154      FALSE   hg19
##   chrUn_gl000247      36422      FALSE   hg19
##   chrUn_gl000248      39786      FALSE   hg19
##   chrUn_gl000249      38502      FALSE   hg19
## vector-like access
promoters[ seqnames(promoters) %in% c("chr1", "chr2") ]
## GRanges object with 13059 ranges and 2 metadata columns:
##           seqnames                 ranges strand   |     tx_id     tx_name
##              <Rle>              <IRanges>  <Rle>   | <integer> <character>
##       [1]     chr1       [  9874,  12073]      +   |         1  uc001aaa.3
##       [2]     chr1       [  9874,  12073]      +   |         2  uc010nxq.1
##       [3]     chr1       [  9874,  12073]      +   |         3  uc010nxr.1
##       [4]     chr1       [ 67091,  69290]      +   |         4  uc001aal.1
##       [5]     chr1       [319084, 321283]      +   |         5  uc001aaq.2
##       ...      ...                    ...    ... ...       ...         ...
##   [13055]     chr2 [242617330, 242619529]      -   |     13055  uc002wcb.2
##   [13056]     chr2 [242751523, 242753722]      -   |     13056  uc002wck.1
##   [13057]     chr2 [242794933, 242797132]      -   |     13057  uc010fzs.3
##   [13058]     chr2 [242800859, 242803058]      -   |     13058  uc002wcq.4
##   [13059]     chr2 [242800859, 242803058]      -   |     13059  uc010fzt.3
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome
## metadata
mcols(promoters)
## DataFrame with 82960 rows and 2 columns
##           tx_id     tx_name
##       <integer> <character>
## 1             1  uc001aaa.3
## 2             2  uc010nxq.1
## 3             3  uc010nxr.1
## 4             4  uc001aal.1
## 5             5  uc001aaq.2
## ...         ...         ...
## 82956     78803  uc004fwx.1
## 82957     78804  uc022cpc.1
## 82958     78805  uc004fwz.3
## 82959     78806  uc022cpd.1
## 82960     78807  uc011ncc.1
length(unique(promoters$tx_name))
## [1] 82960
## exons, grouped by transcript
exByTx <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, "tx", use.names=TRUE)
## list-like subsetting
exByTx[1:10]              # also logical, character, ...
## GRangesList object of length 10:
## $uc001aaa.3 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand |   exon_id   exon_name exon_rank
##          <Rle>      <IRanges>  <Rle> | <integer> <character> <integer>
##   [1]     chr1 [11874, 12227]      + |         1        <NA>         1
##   [2]     chr1 [12613, 12721]      + |         3        <NA>         2
##   [3]     chr1 [13221, 14409]      + |         5        <NA>         3
## 
## $uc010nxq.1 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [11874, 12227]      + |       1      <NA>         1
##   [2]     chr1 [12595, 12721]      + |       2      <NA>         2
##   [3]     chr1 [13403, 14409]      + |       6      <NA>         3
## 
## $uc010nxr.1 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [11874, 12227]      + |       1      <NA>         1
##   [2]     chr1 [12646, 12697]      + |       4      <NA>         2
##   [3]     chr1 [13221, 14409]      + |       5      <NA>         3
## 
## ...
## <7 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
exByTx[["uc001aaa.3"]]    # also numeric
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand |   exon_id   exon_name exon_rank
##          <Rle>      <IRanges>  <Rle> | <integer> <character> <integer>
##   [1]     chr1 [11874, 12227]      + |         1        <NA>         1
##   [2]     chr1 [12613, 12721]      + |         3        <NA>         2
##   [3]     chr1 [13221, 14409]      + |         5        <NA>         3
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome
## accessors return typed-List, e.g., IntegerList
width(exByTx)
## IntegerList of length 82960
## [["uc001aaa.3"]] 354 109 1189
## [["uc010nxq.1"]] 354 127 1007
## [["uc010nxr.1"]] 354 52 1189
## [["uc001aal.1"]] 918
## [["uc001aaq.2"]] 32
## [["uc001aar.2"]] 62
## [["uc009vjk.2"]] 192 58 2500
## [["uc001aau.3"]] 169 58 4143
## [["uc021oeh.1"]] 58 248 406 514
## [["uc021oei.1"]] 894
## ...
## <82950 more elements>
log10(width(exByTx))
## NumericList of length 82960
## [["uc001aaa.3"]] 2.54900326202579 2.03742649794062 3.07518185461869
## [["uc010nxq.1"]] 2.54900326202579 2.10380372095596 3.00302947055362
## [["uc010nxr.1"]] 2.54900326202579 1.7160033436348 3.07518185461869
## [["uc001aal.1"]] 2.96284268120124
## [["uc001aaq.2"]] 1.50514997831991
## [["uc001aar.2"]] 1.79239168949825
## [["uc009vjk.2"]] 2.28330122870355 1.76342799356294 3.39794000867204
## [["uc001aau.3"]] 2.22788670461367 1.76342799356294 3.61731493329829
## [["uc021oeh.1"]] 1.76342799356294 2.39445168082622 2.60852603357719 2.71096311899528
## [["uc021oei.1"]] 2.95133751879592
## ...
## <82950 more elements>
## 'easy' to ask basic questions, e.g., ...
hist(unlist(log10(width(exByTx))))         # widths of exons
 
exByTx[which.max(max(width(exByTx)))]      # transcript with largest exon
## GRangesList object of length 1:
## $uc031qjh.1 
## GRanges object with 1 range and 3 metadata columns:
##       seqnames                 ranges strand |   exon_id   exon_name
##          <Rle>              <IRanges>  <Rle> | <integer> <character>
##   [1]    chr12 [102591363, 102796374]      + |    164764        <NA>
##       exon_rank
##       <integer>
##   [1]         1
## 
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
exByTx[which.max(elementLengths(exByTx))]  # transcript with most exons
## GRangesList object of length 1:
## $uc031qqx.1 
## GRanges object with 5065 ranges and 3 metadata columns:
##          seqnames                 ranges strand   |   exon_id   exon_name
##             <Rle>              <IRanges>  <Rle>   | <integer> <character>
##      [1]    chr14 [107283004, 107283085]      -   |    192985        <NA>
##      [2]    chr14 [107282819, 107282992]      -   |    192984        <NA>
##      [3]    chr14 [107281146, 107281249]      -   |    192983        <NA>
##      [4]    chr14 [107281126, 107281145]      -   |    192982        <NA>
##      [5]    chr14 [107276018, 107276044]      -   |    192981        <NA>
##      ...      ...                    ...    ... ...       ...         ...
##   [5061]    chr14 [106067906, 106068064]      -   |    187862        <NA>
##   [5062]    chr14 [106054457, 106054734]      -   |    187853        <NA>
##   [5063]    chr14 [106052986, 106052998]      -   |    187849        <NA>
##   [5064]    chr14 [105994262, 105994283]      -   |    187848        <NA>
##   [5065]    chr14 [105994256, 105994261]      -   |    187847        <NA>
##          exon_rank
##          <integer>
##      [1]         1
##      [2]         2
##      [3]         3
##      [4]         4
##      [5]         5
##      ...       ...
##   [5061]      5061
##   [5062]      5062
##   [5063]      5063
##   [5064]      5064
##   [5065]      5065
## 
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
There are many neat range-based operations (more later)!
Some detail
What is an experiment?
Why integrate?
Data objects
exprs()): matrix of expression valuesfeatureData(); fData()): probeset or gene
identifiersphenoData(); pData(): data.frame of relevant
informationexptData()): Instance of class MIAME.assay(), assays()): arbitrary matrix-like objectrowData()): GRanges or GRangesList;
use GRangesList with names and 0-length elements to represent
assays without ranges.colData()): DataFrame of relevant information.exptData()): List of arbitrary information.Example: ExpressionSet (see vignettes in Biobase).
suppressPackageStartupMessages({
    library(ALL)
})
data(ALL)
ALL
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 12625 features, 128 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: 01005 01010 ... LAL4 (128 total)
##   varLabels: cod diagnosis ... date last seen (21 total)
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
##   pubMedIds: 14684422 16243790 
## Annotation: hgu95av2
## 'Phenotype' (sample) and 'feature' data
head(pData(ALL))
##        cod diagnosis sex age BT remission CR   date.cr t(4;11) t(9;22)
## 01005 1005 5/21/1997   M  53 B2        CR CR  8/6/1997   FALSE    TRUE
## 01010 1010 3/29/2000   M  19 B2        CR CR 6/27/2000   FALSE   FALSE
## 03002 3002 6/24/1998   F  52 B4        CR CR 8/17/1998      NA      NA
## 04006 4006 7/17/1997   M  38 B1        CR CR  9/8/1997    TRUE   FALSE
## 04007 4007 7/22/1997   M  57 B2        CR CR 9/17/1997   FALSE   FALSE
## 04008 4008 7/30/1997   M  17 B1        CR CR 9/27/1997   FALSE   FALSE
##       cyto.normal        citog mol.biol fusion protein mdr   kinet   ccr
## 01005       FALSE      t(9;22)  BCR/ABL           p210 NEG dyploid FALSE
## 01010       FALSE  simple alt.      NEG           <NA> POS dyploid FALSE
## 03002          NA         <NA>  BCR/ABL           p190 NEG dyploid FALSE
## 04006       FALSE      t(4;11) ALL1/AF4           <NA> NEG dyploid FALSE
## 04007       FALSE      del(6q)      NEG           <NA> NEG dyploid FALSE
## 04008       FALSE complex alt.      NEG           <NA> NEG hyperd. FALSE
##       relapse transplant               f.u date last seen
## 01005   FALSE       TRUE BMT / DEATH IN CR           <NA>
## 01010    TRUE      FALSE               REL      8/28/2000
## 03002    TRUE      FALSE               REL     10/15/1999
## 04006    TRUE      FALSE               REL      1/23/1998
## 04007    TRUE      FALSE               REL      11/4/1997
## 04008    TRUE      FALSE               REL     12/15/1997
head(featureNames(ALL))
## [1] "1000_at"   "1001_at"   "1002_f_at" "1003_s_at" "1004_at"   "1005_at"
## access to pData columns; matrix-like subsetting; exprs()
ALL[, ALL$sex %in% "M"]
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 12625 features, 83 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: 01005 01010 ... 83001 (83 total)
##   varLabels: cod diagnosis ... date last seen (21 total)
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
##   pubMedIds: 14684422 16243790 
## Annotation: hgu95av2
range(exprs(ALL))
## [1]  1.984919 14.126571
## 30% 'most variable' features (c.f., genefilter::varFilter)
iqr <- apply(exprs(ALL), 1, IQR)
ALL[iqr > quantile(iqr, 0.7), ]
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 3788 features, 128 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: 01005 01010 ... LAL4 (128 total)
##   varLabels: cod diagnosis ... date last seen (21 total)
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
##   pubMedIds: 14684422 16243790 
## Annotation: hgu95av2
Example: SummarizedExperiment (see vignettes in GenomicRanges).
suppressPackageStartupMessages({
    library(airway)
})
data(airway)
airway
## class: SummarizedExperiment 
## dim: 64102 8 
## exptData(1): ''
## assays(1): counts
## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
## rowData metadata column names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
## column and row data
colData(airway)
## DataFrame with 8 rows and 9 columns
##            SampleName     cell      dex    albut        Run avgLength
##              <factor> <factor> <factor> <factor>   <factor> <integer>
## SRR1039508 GSM1275862   N61311    untrt    untrt SRR1039508       126
## SRR1039509 GSM1275863   N61311      trt    untrt SRR1039509       126
## SRR1039512 GSM1275866  N052611    untrt    untrt SRR1039512       126
## SRR1039513 GSM1275867  N052611      trt    untrt SRR1039513        87
## SRR1039516 GSM1275870  N080611    untrt    untrt SRR1039516       120
## SRR1039517 GSM1275871  N080611      trt    untrt SRR1039517       126
## SRR1039520 GSM1275874  N061011    untrt    untrt SRR1039520       101
## SRR1039521 GSM1275875  N061011      trt    untrt SRR1039521        98
##            Experiment    Sample    BioSample
##              <factor>  <factor>     <factor>
## SRR1039508  SRX384345 SRS508568 SAMN02422669
## SRR1039509  SRX384346 SRS508567 SAMN02422675
## SRR1039512  SRX384349 SRS508571 SAMN02422678
## SRR1039513  SRX384350 SRS508572 SAMN02422670
## SRR1039516  SRX384353 SRS508575 SAMN02422682
## SRR1039517  SRX384354 SRS508576 SAMN02422673
## SRR1039520  SRX384357 SRS508579 SAMN02422683
## SRR1039521  SRX384358 SRS508580 SAMN02422677
rowData(airway)
## GRangesList object of length 64102:
## $ENSG00000000003 
## GRanges object with 17 ranges and 2 metadata columns:
##        seqnames               ranges strand   |   exon_id       exon_name
##           <Rle>            <IRanges>  <Rle>   | <integer>     <character>
##    [1]        X [99883667, 99884983]      -   |    667145 ENSE00001459322
##    [2]        X [99885756, 99885863]      -   |    667146 ENSE00000868868
##    [3]        X [99887482, 99887565]      -   |    667147 ENSE00000401072
##    [4]        X [99887538, 99887565]      -   |    667148 ENSE00001849132
##    [5]        X [99888402, 99888536]      -   |    667149 ENSE00003554016
##    ...      ...                  ...    ... ...       ...             ...
##   [13]        X [99890555, 99890743]      -   |    667156 ENSE00003512331
##   [14]        X [99891188, 99891686]      -   |    667158 ENSE00001886883
##   [15]        X [99891605, 99891803]      -   |    667159 ENSE00001855382
##   [16]        X [99891790, 99892101]      -   |    667160 ENSE00001863395
##   [17]        X [99894942, 99894988]      -   |    667161 ENSE00001828996
## 
## ...
## <64101 more elements>
## -------
## seqinfo: 722 sequences (1 circular) from an unspecified genome
## access colData; matrix-like subsetting; assay() / assays()
airway[, airway$dex %in% "trt"]
## class: SummarizedExperiment 
## dim: 64102 4 
## exptData(1): ''
## assays(1): counts
## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
## rowData metadata column names(0):
## colnames(4): SRR1039509 SRR1039513 SRR1039517 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
head(assay(airway))
##                 SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003        679        448        873        408       1138
## ENSG00000000005          0          0          0          0          0
## ENSG00000000419        467        515        621        365        587
## ENSG00000000457        260        211        263        164        245
## ENSG00000000460         60         55         40         35         78
## ENSG00000000938          0          0          2          0          1
##                 SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003       1047        770        572
## ENSG00000000005          0          0          0
## ENSG00000000419        799        417        508
## ENSG00000000457        331        233        229
## ENSG00000000460         63         76         60
## ENSG00000000938          0          0          0
assays(airway)
## List of length 1
## names(1): counts
## library size
colSums(assay(airway))
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 
##   20637971   18809481   25348649   15163415   24448408   30818215 
## SRR1039520 SRR1039521 
##   19126151   21164133
hist(rowMeans(log10(assay(airway))))
 
Calculate the GC content of human chr1 in the hg19 build, excluding regions where the sequence is “N”. You will need to
[[, chromosome 1 (“chr1”). <!– ]] –>alphabetFrequency() to calculate the count or frequency
of the nucleotides in chr1library(BSgenome.Hsapiens.UCSC.hg19)
## Loading required package: BSgenome
## Loading required package: rtracklayer
chr1seq <- BSgenome.Hsapiens.UCSC.hg19[["chr1"]]
chr1alf <- alphabetFrequency(chr1seq)
chr1gc <- sum(chr1alf[c("G", "C")]) / sum(chr1alf[c("A", "C", "G", "T")])
Calculate the GC content of 'exome' (approximately, all genic regions) on chr1. You will need to
genes() to extract genic regions of all genes, then
subsetting operations to restrict to chromosome 1.getSeq,BSgenome-method to extract sequences from
chromosome 1 of the BSgenome object.alphabetFrequency() (with the argument collapse=TRUE –
why?) and standard R operations to extract the gc content of
the genes.library(TxDb.Hsapiens.UCSC.hg19.knownGene)
genes <- genes(TxDb.Hsapiens.UCSC.hg19.knownGene)
genes1 <- genes[seqnames(genes) %in% "chr1"]
seq1 <- getSeq(BSgenome.Hsapiens.UCSC.hg19, genes1)
alf1 <- alphabetFrequency(seq1, collapse=TRUE)
gc1 <- sum(alf1[c("G", "C")]) / sum(alf1[c("A", "C", "G", "T")])
How does the GC content just calculated compare to the average of
the GC content of each exon? Answer this using
alphabetFrequency() but with collapse=FALSE), and adjust the
calculation of GC content to act on a matrix, rather than
vector. Why are these numbers different?
alf2 <- alphabetFrequency(seq1, collapse=FALSE)
gc2 <- rowSums(alf2[, c("G", "C")]) / rowSums(alf2[,c("A", "C", "G", "T")])
Plot a histogram of per-gene GC content, annotating with
information about chromosome and exome GC content. Use base
graphics hist(), abline(), plot(density(...)),
plot(ecdf(...)), etc. (one example is below). If this is too
easy, prepare a short presentation for the class illustrating how
to visualize this type of information using another R graphics
package, e.g., ggplot2, {r CRANpkg("ggvis"), or
{r CRANpkg(“lattice”)}.
plot(density(gc2))
abline(v=c(chr1gc, gc1), col=c("red", "blue"), lwd=2)
 
This exercise illustrates how integrated containers can be used to effectively manage data; it does NOT represent a suitable way to analyze RNASeq differential expression data.
Load the airway package and airway data
set. Explore it a litte, e.g., determining its dimensions (number
of regions of interest and samples), the information describing
samples, and the range of values in the count assay. The data are
from an RNA-seq experiment. The colData() describe treatment
groups and other information. The assay() is the (raw) number of
short reads overlapping each region of interest, in each
sample. The solution to this exercise is summarized above.
Create a subset of the data set that contains only the 30% most variable (using IQR as a metric) observations. Plot the distribution of asinh-transformed (a log-like transformation, except near 0) row mean counts
iqr <- apply(assay(airway), 1, IQR)
airway1 <- airway[iqr > quantile(iqr, 0.7),]
plot(density(rowMeans(asinh(assay(airway1)))))
 
Use the genefilter package rowttests function
(consult it's help page!) to compare asinh-transformed read counts
between the two dex treatment groups for each row. Explore the
result in various ways, e.g., finding the 'most' differentially
expressed genes, the genes with largest (absolute) difference
between treatment groups, adding adjusted P values (via
p.adjust(), in the stats package), etc. Can you obtain the
read counts for each treatment group, for the most differentially
expressed gene?
suppressPackageStartupMessages({
    library(genefilter)
})
ttest <- rowttests(asinh(assay(airway1)), airway1$dex)
ttest$p.adj <- p.adjust(ttest$p.value, method="BH")
ttest[head(order(ttest$p.adj)),]
##                 statistic        dm      p.value       p.adj
## ENSG00000179593  24.61562  5.463536 2.956680e-07 0.005671209
## ENSG00000101342  15.22622  2.697598 5.065386e-06 0.014773935
## ENSG00000101347  15.69846  2.485261 4.233805e-06 0.014773935
## ENSG00000134253 -15.46740 -1.483468 4.619047e-06 0.014773935
## ENSG00000143494 -14.93364 -2.760705 5.675890e-06 0.014773935
## ENSG00000152583  17.03482  3.054983 2.617828e-06 0.014773935
split(assay(airway1)[order(ttest$p.adj)[1], ], airway1$dex)
## $trt
## SRR1039509 SRR1039513 SRR1039517 SRR1039521 
##         81         87        129        213 
## 
## $untrt
## SRR1039508 SRR1039512 SRR1039516 SRR1039520 
##          0          0          0          0
Add the statistics of differential expression to the airway1
SummarizedExperiment. Confirm that the statistics have been
added.
mcols(rowData(airway1)) <- ttest
head(mcols(airway1))
## DataFrame with 6 rows and 4 columns
##     statistic          dm    p.value     p.adj
##     <numeric>   <numeric>  <numeric> <numeric>
## 1 -1.62895587 -0.38927224 0.15444536 0.6325992
## 2  0.09683305  0.01803387 0.92601249 0.9799343
## 3 -0.67163380 -0.09939038 0.52681603 0.8637800
## 4 -0.81787455 -0.16759915 0.44468651 0.8286889
## 5  0.55526482  0.17003218 0.59878922 0.8865085
## 6 -2.55199111 -0.29224482 0.04337408 0.4154906
Publications (General Bioconductor)
Other