Load the GEOquery package.
library(GEOquery)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
## grep, grepl, intersect, is.unsorted, lapply, lengths, Map,
## mapply, match, mget, order, paste, pmax, pmax.int, pmin,
## pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
## setdiff, sort, table, tapply, union, unique, unlist, unsplit
##
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Setting options('download.file.method.GEOquery'='auto')
Get the data
x = getGEOSuppFiles("GSE20986")
## ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE20nnn/GSE20986/suppl/
x
## size isdir mode
## /tmp/houtan/GSE20986/GSE20986_RAW.tar 56360960 FALSE 644
## /tmp/houtan/GSE20986/filelist.txt 740 FALSE 644
## mtime
## /tmp/houtan/GSE20986/GSE20986_RAW.tar 2015-10-14 05:16:54
## /tmp/houtan/GSE20986/filelist.txt 2015-10-14 05:16:57
## ctime
## /tmp/houtan/GSE20986/GSE20986_RAW.tar 2015-10-14 05:16:54
## /tmp/houtan/GSE20986/filelist.txt 2015-10-14 05:16:57
## atime uid gid
## /tmp/houtan/GSE20986/GSE20986_RAW.tar 2015-10-14 05:14:50 37962 37962
## /tmp/houtan/GSE20986/filelist.txt 2015-10-14 05:08:01 37962 37962
## uname grname
## /tmp/houtan/GSE20986/GSE20986_RAW.tar mtmorgan g_mtmorgan
## /tmp/houtan/GSE20986/filelist.txt mtmorgan g_mtmorgan
Untarring the data
untar("GSE20986/GSE20986_RAW.tar", exdir = "data")
gunzipping the data
cels = list.files("data/", pattern = "[gz]")
sapply(paste("data", cels, sep = "/"), gunzip)
## data/GSM524662.CEL.gz data/GSM524663.CEL.gz data/GSM524664.CEL.gz
## 13555726 13555055 13555639
## data/GSM524665.CEL.gz data/GSM524666.CEL.gz data/GSM524667.CEL.gz
## 13560122 13555663 13557614
## data/GSM524668.CEL.gz data/GSM524669.CEL.gz data/GSM524670.CEL.gz
## 13556090 13560054 13555971
## data/GSM524671.CEL.gz data/GSM524672.CEL.gz data/GSM524673.CEL.gz
## 13554926 13555042 13555290
creating your phenodata
phenodata = matrix(rep(list.files("data"), 2), ncol =2)
class(phenodata)
## [1] "matrix"
phenodata <- as.data.frame(phenodata)
colnames(phenodata) <- c("Name", "FileName")
phenodata$Targets <- c("iris",
"retina",
"retina",
"iris",
"retina",
"iris",
"choroid",
"choroid",
"choroid",
"huvec",
"huvec",
"huvec")
write.table(phenodata, "data/phenodata.txt", quote = F, sep = "\t", row.names = F)
library(simpleaffy)
## Loading required package: affy
## Loading required package: genefilter
##
## Attaching package: 'genefilter'
##
## The following object is masked from 'package:base':
##
## anyNA
##
## Loading required package: gcrma
celfiles <- read.affy(covdesc = "phenodata.txt", path = "data")
boxplot(celfiles)
## Warning: replacing previous import by 'utils::tail' when loading
## 'hgu133plus2cdf'
## Warning: replacing previous import by 'utils::head' when loading
## 'hgu133plus2cdf'
##
library(RColorBrewer)
cols = brewer.pal(8, "Set1")
eset <- exprs(celfiles)
samples <- celfiles$Targets
colnames(eset)
## [1] "GSM524662.CEL" "GSM524663.CEL" "GSM524664.CEL" "GSM524665.CEL"
## [5] "GSM524666.CEL" "GSM524667.CEL" "GSM524668.CEL" "GSM524669.CEL"
## [9] "GSM524670.CEL" "GSM524671.CEL" "GSM524672.CEL" "GSM524673.CEL"
colnames(eset) <- samples
boxplot(celfiles, col = cols, las = 2)
distance <- dist(t(eset), method = "maximum")
clusters <- hclust(distance)
plot(clusters)
require(simpleaffy)
require(affyPLM)
## Loading required package: affyPLM
## Loading required package: preprocessCore
celfiles.gcrma = gcrma(celfiles)
## Adjusting for optical effect............Done.
## Computing affinities
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'IRanges'
##
## The following object is masked from 'package:simpleaffy':
##
## members
## .Done.
## Adjusting for non-specific binding............Done.
## Normalizing
## Calculating Expression
par(mfrow=c(1,2))
boxplot(celfiles.gcrma, col = cols, las = 2, main = "Post-Normalization");
boxplot(celfiles, col = cols, las = 2, main = "Pre-Normalization")
dev.off()
## null device
## 1
distance <- dist(t(exprs(celfiles.gcrma)), method = "maximum")
clusters <- hclust(distance)
plot(clusters)
library(limma)
##
## Attaching package: 'limma'
##
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
phenodata
## Name FileName Targets
## 1 GSM524662.CEL GSM524662.CEL iris
## 2 GSM524663.CEL GSM524663.CEL retina
## 3 GSM524664.CEL GSM524664.CEL retina
## 4 GSM524665.CEL GSM524665.CEL iris
## 5 GSM524666.CEL GSM524666.CEL retina
## 6 GSM524667.CEL GSM524667.CEL iris
## 7 GSM524668.CEL GSM524668.CEL choroid
## 8 GSM524669.CEL GSM524669.CEL choroid
## 9 GSM524670.CEL GSM524670.CEL choroid
## 10 GSM524671.CEL GSM524671.CEL huvec
## 11 GSM524672.CEL GSM524672.CEL huvec
## 12 GSM524673.CEL GSM524673.CEL huvec
samples <- as.factor(samples)
design <- model.matrix(~0+samples)
colnames(design)
## [1] "sampleschoroid" "sampleshuvec" "samplesiris" "samplesretina"
colnames(design) <- c("choroid", "huvec", "iris", "retina")
design
## choroid huvec iris retina
## 1 0 0 1 0
## 2 0 0 0 1
## 3 0 0 0 1
## 4 0 0 1 0
## 5 0 0 0 1
## 6 0 0 1 0
## 7 1 0 0 0
## 8 1 0 0 0
## 9 1 0 0 0
## 10 0 1 0 0
## 11 0 1 0 0
## 12 0 1 0 0
## attr(,"assign")
## [1] 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$samples
## [1] "contr.treatment"
contrast.matrix = makeContrasts(
huvec_choroid = huvec - choroid,
huvec_retina = huvec - retina,
huvec_iris <- huvec - iris,
levels = design)
fit = lmFit(celfiles.gcrma, design)
huvec_fit <- contrasts.fit(fit, contrast.matrix)
huvec_ebay <- eBayes(huvec_fit)
library(hgu133plus2.db)
## Loading required package: org.Hs.eg.db
## Loading required package: DBI
library(annotate)
## Loading required package: XML
probenames.list <- rownames(topTable(huvec_ebay, number = 100000))
getsymbols <- getSYMBOL(probenames.list, "hgu133plus2")
results <- topTable(huvec_ebay, number = 100000, coef = "huvec_choroid")
results <- cbind(results, getsymbols)
To make thresholds
summary(results)
## logFC AveExpr t P.Value
## Min. :-9.19111 Min. : 2.279 Min. :-39.77473 Min. :0.0000
## 1st Qu.:-0.05967 1st Qu.: 2.281 1st Qu.: -0.70649 1st Qu.:0.1523
## Median : 0.00000 Median : 2.480 Median : 0.00000 Median :0.5079
## Mean :-0.02353 Mean : 4.375 Mean : 0.07441 Mean :0.5346
## 3rd Qu.: 0.03986 3rd Qu.: 6.241 3rd Qu.: 0.67455 3rd Qu.:1.0000
## Max. : 8.67086 Max. :15.541 Max. :296.84201 Max. :1.0000
##
## adj.P.Val B getsymbols
## Min. :0.0000 Min. :-7.710 YME1L1 : 22
## 1st Qu.:0.6036 1st Qu.:-7.710 HFE : 15
## Median :1.0000 Median :-7.451 CFLAR : 14
## Mean :0.7436 Mean :-6.582 NRP2 : 14
## 3rd Qu.:1.0000 3rd Qu.:-6.498 ARHGEF12: 13
## Max. :1.0000 Max. :21.290 (Other) :42280
## NA's :12317
results$threshold <- "1"
a <- subset(results, adj.P.Val < 0.05 & logFC > 5)
results[rownames(a), "threshold"] <- "2"
b <- subset(results, adj.P.Val < 0.05 & logFC < -5)
results[rownames(b), "threshold"] <- "3"
table(results$threshold)
##
## 1 2 3
## 54587 33 55
To make ggplot
library(ggplot2)
volcano <- ggplot(data = results,
aes(x = logFC, y = -1*log10(adj.P.Val),
colour = threshold,
label = getsymbols))
volcano <- volcano +
geom_point() +
scale_color_manual(values = c("black", "red", "green"),
labels = c("Not Significant", "Upregulated", "Downregulated"),
name = "Key/Legend")
volcano +
geom_text(data = subset(results, logFC > 5 & -1*log10(adj.P.Val) > 5), aes(x = logFC, y = -1*log10(adj.P.Val), colour = threshold, label = getsymbols) )
## Warning: Removed 4 rows containing missing values (geom_text).