Day 3 - Gene Expression

Load the GEOquery package.

library(GEOquery)

## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     grep, grepl, intersect, is.unsorted, lapply, lengths, Map,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
##     setdiff, sort, table, tapply, union, unique, unlist, unsplit
## 
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Setting options('download.file.method.GEOquery'='auto')

Get the data

x = getGEOSuppFiles("GSE20986")

## ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE20nnn/GSE20986/suppl/

##                                           size isdir mode
## /tmp/houtan/GSE20986/GSE20986_RAW.tar 56360960 FALSE  644
## /tmp/houtan/GSE20986/filelist.txt          740 FALSE  644
##                                                     mtime
## /tmp/houtan/GSE20986/GSE20986_RAW.tar 2015-10-14 05:16:54
## /tmp/houtan/GSE20986/filelist.txt     2015-10-14 05:16:57
##                                                     ctime
## /tmp/houtan/GSE20986/GSE20986_RAW.tar 2015-10-14 05:16:54
## /tmp/houtan/GSE20986/filelist.txt     2015-10-14 05:16:57
##                                                     atime   uid   gid
## /tmp/houtan/GSE20986/GSE20986_RAW.tar 2015-10-14 05:14:50 37962 37962
## /tmp/houtan/GSE20986/filelist.txt     2015-10-14 05:08:01 37962 37962
##                                          uname     grname
## /tmp/houtan/GSE20986/GSE20986_RAW.tar mtmorgan g_mtmorgan
## /tmp/houtan/GSE20986/filelist.txt     mtmorgan g_mtmorgan

Untarring the data

untar("GSE20986/GSE20986_RAW.tar", exdir = "data")

gunzipping the data

cels = list.files("data/", pattern = "[gz]")
sapply(paste("data", cels, sep = "/"), gunzip)

## data/GSM524662.CEL.gz data/GSM524663.CEL.gz data/GSM524664.CEL.gz 
##              13555726              13555055              13555639 
## data/GSM524665.CEL.gz data/GSM524666.CEL.gz data/GSM524667.CEL.gz 
##              13560122              13555663              13557614 
## data/GSM524668.CEL.gz data/GSM524669.CEL.gz data/GSM524670.CEL.gz 
##              13556090              13560054              13555971 
## data/GSM524671.CEL.gz data/GSM524672.CEL.gz data/GSM524673.CEL.gz 
##              13554926              13555042              13555290

creating your phenodata

phenodata = matrix(rep(list.files("data"), 2), ncol =2)
class(phenodata)

## [1] "matrix"

phenodata <- as.data.frame(phenodata)
colnames(phenodata) <- c("Name", "FileName")
phenodata$Targets <- c("iris", 
                       "retina", 
                       "retina", 
                       "iris", 
                       "retina", 
                       "iris", 
                       "choroid", 
                       "choroid", 
                       "choroid", 
                       "huvec", 
                       "huvec", 
                       "huvec")
write.table(phenodata, "data/phenodata.txt", quote = F, sep = "\t", row.names = F)

library(simpleaffy)

## Loading required package: affy
## Loading required package: genefilter
## 
## Attaching package: 'genefilter'
## 
## The following object is masked from 'package:base':
## 
##     anyNA
## 
## Loading required package: gcrma

celfiles <- read.affy(covdesc = "phenodata.txt", path = "data")
boxplot(celfiles)

## Warning: replacing previous import by 'utils::tail' when loading
## 'hgu133plus2cdf'

## Warning: replacing previous import by 'utils::head' when loading
## 'hgu133plus2cdf'

##

library(RColorBrewer)
cols = brewer.pal(8, "Set1")
eset <- exprs(celfiles)
samples <- celfiles$Targets
colnames(eset)

##  [1] "GSM524662.CEL" "GSM524663.CEL" "GSM524664.CEL" "GSM524665.CEL"
##  [5] "GSM524666.CEL" "GSM524667.CEL" "GSM524668.CEL" "GSM524669.CEL"
##  [9] "GSM524670.CEL" "GSM524671.CEL" "GSM524672.CEL" "GSM524673.CEL"

colnames(eset) <- samples

boxplot(celfiles, col = cols, las = 2)

distance <- dist(t(eset), method = "maximum")
clusters <- hclust(distance)
plot(clusters)

require(simpleaffy)
require(affyPLM)

## Loading required package: affyPLM
## Loading required package: preprocessCore

celfiles.gcrma = gcrma(celfiles)

## Adjusting for optical effect............Done.
## Computing affinities

## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: S4Vectors
## 
## Attaching package: 'IRanges'
## 
## The following object is masked from 'package:simpleaffy':
## 
##     members

## .Done.
## Adjusting for non-specific binding............Done.
## Normalizing
## Calculating Expression

par(mfrow=c(1,2))
boxplot(celfiles.gcrma, col = cols, las = 2, main = "Post-Normalization");
boxplot(celfiles, col = cols, las = 2, main = "Pre-Normalization")

dev.off()

## null device 
##           1

distance <- dist(t(exprs(celfiles.gcrma)), method = "maximum")
clusters <- hclust(distance)
plot(clusters)

library(limma)

## 
## Attaching package: 'limma'
## 
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA

phenodata

##             Name      FileName Targets
## 1  GSM524662.CEL GSM524662.CEL    iris
## 2  GSM524663.CEL GSM524663.CEL  retina
## 3  GSM524664.CEL GSM524664.CEL  retina
## 4  GSM524665.CEL GSM524665.CEL    iris
## 5  GSM524666.CEL GSM524666.CEL  retina
## 6  GSM524667.CEL GSM524667.CEL    iris
## 7  GSM524668.CEL GSM524668.CEL choroid
## 8  GSM524669.CEL GSM524669.CEL choroid
## 9  GSM524670.CEL GSM524670.CEL choroid
## 10 GSM524671.CEL GSM524671.CEL   huvec
## 11 GSM524672.CEL GSM524672.CEL   huvec
## 12 GSM524673.CEL GSM524673.CEL   huvec

samples <- as.factor(samples)
design <- model.matrix(~0+samples)
colnames(design)

## [1] "sampleschoroid" "sampleshuvec"   "samplesiris"    "samplesretina"

colnames(design) <- c("choroid", "huvec", "iris", "retina")
design

##    choroid huvec iris retina
## 1        0     0    1      0
## 2        0     0    0      1
## 3        0     0    0      1
## 4        0     0    1      0
## 5        0     0    0      1
## 6        0     0    1      0
## 7        1     0    0      0
## 8        1     0    0      0
## 9        1     0    0      0
## 10       0     1    0      0
## 11       0     1    0      0
## 12       0     1    0      0
## attr(,"assign")
## [1] 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$samples
## [1] "contr.treatment"

contrast.matrix = makeContrasts(
              huvec_choroid = huvec - choroid, 
              huvec_retina = huvec - retina, 
              huvec_iris <- huvec - iris, 
              levels = design)

fit = lmFit(celfiles.gcrma, design)
huvec_fit <- contrasts.fit(fit, contrast.matrix)
huvec_ebay <- eBayes(huvec_fit)

library(hgu133plus2.db)

## Loading required package: org.Hs.eg.db
## Loading required package: DBI

library(annotate)

## Loading required package: XML

probenames.list <- rownames(topTable(huvec_ebay, number = 100000))
getsymbols <- getSYMBOL(probenames.list, "hgu133plus2")
results <- topTable(huvec_ebay, number = 100000, coef = "huvec_choroid")
results <- cbind(results, getsymbols)

To make thresholds

summary(results)

##      logFC             AveExpr             t                P.Value      
##  Min.   :-9.19111   Min.   : 2.279   Min.   :-39.77473   Min.   :0.0000  
##  1st Qu.:-0.05967   1st Qu.: 2.281   1st Qu.: -0.70649   1st Qu.:0.1523  
##  Median : 0.00000   Median : 2.480   Median :  0.00000   Median :0.5079  
##  Mean   :-0.02353   Mean   : 4.375   Mean   :  0.07441   Mean   :0.5346  
##  3rd Qu.: 0.03986   3rd Qu.: 6.241   3rd Qu.:  0.67455   3rd Qu.:1.0000  
##  Max.   : 8.67086   Max.   :15.541   Max.   :296.84201   Max.   :1.0000  
##                                                                          
##    adj.P.Val            B             getsymbols   
##  Min.   :0.0000   Min.   :-7.710   YME1L1  :   22  
##  1st Qu.:0.6036   1st Qu.:-7.710   HFE     :   15  
##  Median :1.0000   Median :-7.451   CFLAR   :   14  
##  Mean   :0.7436   Mean   :-6.582   NRP2    :   14  
##  3rd Qu.:1.0000   3rd Qu.:-6.498   ARHGEF12:   13  
##  Max.   :1.0000   Max.   :21.290   (Other) :42280  
##                                    NA's    :12317

results$threshold <- "1"
a <- subset(results, adj.P.Val < 0.05 & logFC > 5)
results[rownames(a), "threshold"] <- "2"
b <- subset(results, adj.P.Val < 0.05 & logFC < -5)
results[rownames(b), "threshold"] <- "3"
table(results$threshold)

## 
##     1     2     3 
## 54587    33    55

To make ggplot

library(ggplot2)
volcano <- ggplot(data = results, 
                  aes(x = logFC, y = -1*log10(adj.P.Val), 
                      colour = threshold, 
                      label = getsymbols))

volcano <- volcano + 
  geom_point() + 
  scale_color_manual(values = c("black", "red", "green"), 
                     labels = c("Not Significant", "Upregulated", "Downregulated"), 
                     name = "Key/Legend")

volcano + 
  geom_text(data = subset(results, logFC > 5 & -1*log10(adj.P.Val) > 5), aes(x = logFC, y = -1*log10(adj.P.Val), colour = threshold, label = getsymbols)  )

## Warning: Removed 4 rows containing missing values (geom_text).

GEO Link