% -*- mode: noweb; noweb-default-code-mode: R-mode; -*- %\VignetteIndexEntry{ecolitk} %\VignetteKeywords{} %\VignetteDepends{ecolitk, Biobase, multtest} %\VignettePackage{ecolitk} \documentclass[12pt]{article} \usepackage{amsmath} \usepackage[authoryear,round]{natbib} \usepackage{hyperref} \textwidth=6.2in \textheight=8.5in %\parskip=.3cm \oddsidemargin=.1in \evensidemargin=.1in \headheight=-.3in \newcommand{\scscst}{\scriptscriptstyle} \newcommand{\scst}{\scriptstyle} \newcommand{\Rfunction}[1]{{\texttt{#1}}} \newcommand{\Robject}[1]{{\texttt{#1}}} \newcommand{\Rpackage}[1]{{\textit{#1}}} \newcommand{\Rmethod}[1]{{\texttt{#1}}} \newcommand{\Rfunarg}[1]{{\texttt{#1}}} \newcommand{\Rclass}[1]{{\textit{#1}}} \bibliographystyle{plainnat} \author{Laurent} \begin{document} \title{A package to work with {\it E. coli} genome (and other circular DNA entities)} \maketitle \section*{Introduction} This package is a mixture of data (close the the data in the meta-data packages) and of code. The code was not included in other packages because: \begin{itemize} \item much is still to be done, much is likely to change. \item some features do are appear completely compatible with what is existing elsewhere \end{itemize} Things will settle in time\ldots The package contains data on {\it Escherichia coli} but can be used with other bacterial genomes or plamids, provided one supplies the required data. To load the package, do: \begin{Sinput} R> library(ecolitk) \end{Sinput} <>= library(ecolitk) @ \section*{{\it E. coli} data} Data related to {\it E. coli} are included in the package. Effort was made to follow the convention used in the meta-data packages available on bioconductor. On should refer to the help files for further details. In the following example, we explain how to find the positions of the genes of the operon lactose on the genome. A first step is to load needed data structures: <<>>= data(ecoligenomeSYMBOL2AFFY) data(ecoligenomeCHRLOC) @ This done, we want to find the genes with names like \verb+lac*+. The \Rpackage{base} function \Rfunction{grep} provides a convenient way to do it. Because the data structures are centered on Affymetrix probeset identifiers, an extra step is need to convert the gene name into the corresponding identifier. The locations of the genes can then be extracted from the environment \Robject{ecoligenomeCHRLOC}. We store them in the \Robject{beg.end}. <<>>= lac.i <- grep("^lac", ls(ecoligenomeSYMBOL2AFFY)) lac.symbol <- ls(ecoligenomeSYMBOL2AFFY)[lac.i] lac.affy <- unlist(lapply(lac.symbol, get, envir=ecoligenomeSYMBOL2AFFY)) beg.end <- lapply(lac.affy, get, envir=ecoligenomeCHRLOC) beg.end <- matrix(unlist(beg.end), nc=2, byrow=TRUE) @ \section*{Primitives for circular entities} <>= par(mfrow=c(2,2)) n <- 10 thetas <- rev(seq(0, 2 * pi, length=n)) rhos <- rev(seq(1, n) / n) xy <- polar2xy(rhos, thetas) colo <- heat.colors(n) plot(0, 0, xlim=c(-2, 2), ylim=c(-2, 2), type="n") for (i in 1:n) linesCircle(rhos[i]/2, xy$x[i], xy$y[i]) plot(0, 0, xlim=c(-2, 2), ylim=c(-2, 2), type="n") for (i in 1:n) polygonDisk(rhos[i]/2, xy$x[i], xy$y[i], col=colo[i]) plot(0, 0, xlim=c(-2, 2), ylim=c(-2, 2), type="n", xlab="", ylab="") for (i in 1:n) polygonArc(0, thetas[i], rhos[i]/2, rhos[i], center.x = xy$x[i], center.y = xy$y[i], col=colo[i]) plot(0, 0, xlim=c(-2, 2), ylim=c(-2, 2), type="n", xlab="", ylab="") for (i in (1:n)[-1]) { linesCircle(rhos[i-1], col="gray", lty=2) polygonArc(thetas[i-1], thetas[i], rhos[i-1], rhos[i], col=colo[i], edges=20) arrowsArc(thetas[i-1], thetas[i], rhos[i] + 1, col=colo[i], edges=20) } @ \section*{Plotting toolbox for circular genomes} \subsection*{Plot a (circular) genome} The function \Rfunction{cPlotCircle} draws a circular chromosome. <>= cPlotCircle(main.inside = "E. coli - K12") @ The drawing of the chromosome is thought as a first step when plotting informations on a circular genome. The following example shows how to plot the locations of the genes for the operon lactose on the genome. (the steps leading to their respective positions were detailed above). \subsection*{plot genes on a genome} We sort the genes the \verb+lac*+ genes discussed in an example above according to their positions on the genome. <<>>= lac.o <- order(beg.end[, 1]) lac.i <- lac.i[lac.o] lac.symbol <- lac.symbol[lac.o] lac.affy <- lac.affy[lac.o] beg.end <- beg.end[lac.o, ] @ This done, we plot them as polygons: <>= cPlotCircle(main.inside = "E. coli - K12", main = "lac genes") polygonChrom(beg.end[, 1], beg.end[, 2], ecoli.len, 1, 1.4) @ Zoom in can be achieved by playing with \Rfunarg{xlim} and \Rfunarg{ylim}. <>= l <- data.frame(x=c(0.470, 0.48), y=c(0.87, 0.90)) cPlotCircle(xlim=range(l$x), ylim=range(l$y), main = "lac genes") polygonChrom(beg.end[, 1], beg.end[, 2], ecoli.len, 1, 1.007, col=rainbow(4)) legend(0.47, 0.9, legend=lac.symbol, fill=rainbow(4)) @ \subsection*{More plots} The following plot uses several different annotation sources. It tries to show a spatial pattern for the genes annotated as - Macromolecules (cellular constituent) biosynthesis -. In the following plot those genes are plotted according to their position on the genome and according to their respective strand. <>= library(Biobase) data(ecoligenomeBNUM2STRAND) data(ecoligenomeBNUM) data(ecoligenomeBNUM2MULTIFUN) data(ecoligenomeCHRLOC) affyids <- ls(ecoligenomeCHRLOC) affypos <- mget(affyids, ecoligenomeCHRLOC, ifnotfound=NA) ## 'unlist' as the mapping is one-to-one bnums <- unlist(mget(affyids, ecoligenomeBNUM, ifnotfound=NA)) strands <- unlist(mget(bnums, ecoligenomeBNUM2STRAND, ifnotfound=NA)) ## multifun <- mget(bnums, ecoligenomeBNUM2MULTIFUN, ifnotfound=NA) ## select the entries in the categorie "1.6" ## ("Macromolecules (cellular constituent) biosynthesis") f <- function(x) { if (all(is.na(x))) return(FALSE) length(grep("^1\\.6", x) > 0) } is.selected <- unlist(lapply(multifun, f)) cPlotCircle(main.inside="E.coli K12") beg.end <- matrix(unlist(affypos), nc=2, byrow=TRUE) ## plot 'bnums'... strand + good <- strands == ">" & is.selected linesChrom(beg.end[good, 1], beg.end[good, 2], ecoli.len, 1.4, col="red", lwd=3) ## plot 'bnums'... strand - good <- strands == "<" & is.selected linesChrom(beg.end[good, 1], beg.end[good, 2], ecoli.len, 1.5, col="blue", lwd=3) @ \subsection*{More complex plot} The next example is more complex: we want to compute and display the GC content over a fragment of the genome. The fragment is decided to be of size $1$ million bases, with the origin of replication (base zero) in the middle of the fragment. We limit the size to lower the computing resources needed to built this document and to demonstrate how to perform something not so trivial. \begin{Scode} cPlotCircle(main.inside = "E. coli - K12") data(ecoli.m52.genome) size.frag <- 1000000 fragment.r <- substring(ecoli.m52.genome, 1, size.frag/2) fragment.l <- substring(ecoli.m52.genome, ecoli.len - size.frag / 2, ecoli.len) fragment <- paste(fragment.l, fragment.r, sep="") library(matchprobes) tmp <- wstringapply(fragment, 400, 200, basecontent) gccontent <- unlist(lapply(tmp, function(x) sum(x[3:4]) / sum(x))) theta0 <- chromPos2angle(0 - size.frag/2, ecoli.len) theta1 <- chromPos2angle(size.frag/2, ecoli.len) linesCircle(1.5, col="gray", lty = 2) linesArc(theta0, theta1, gccontent + 1) \end{Scode} Zooming can be performed by setting the \Rfunarg{xlim} and \Rfunarg{ylim} parameters of the function \Rfunction{cPlotCircle}. In our example, there seem to be a CG rich island we would like to have a closer look at: \begin{Scode} par(mfrow=c(1,2)) cPlotCircle(main.inside = "E. coli - K12") l <- data.frame(x=c(-0.7737990, -0.5286815), y=c(1.521509, 1.304151)) linesCircle(1.5, col="gray", lty = 2) linesArc(theta0, theta1, gccontent + 1) rect(l$x[1], l$y[1], l$x[2], l$y[2], border="red") cPlotCircle(xlim=range(l$x), ylim=range(l$y)) box(col="red") linesCircle(1.5, col="gray", lty = 2) linesArc(theta0, theta1, gccontent + 1) \end{Scode} As the parameter \Rfunarg{SLIDE} was set to 200, one can estimate from the close up that the size of the island is roughly over 1500 base pairs. \section*{Genes in operons} In procaryotes, some genes are {\it bundled} in operons. This means that a some genes are physically located near each others on the genome, and that they are transcribed\footnote{The {\it transcription} is the copy of DNA into RNA} together. Complex mechanisms can regulate the translation\footnote{The {\it translation} is the making of a protein from RNA}. Microarrays are designed to measure the relative abundance of transcripts, therefore genes of the same operon should have the same expression level. First one has to find the Affymetrix identifiers for the genes known to be in operons: <<>>= library(Biobase) data(ecoligenome.operon) data(ecoligenomeSYMBOL2AFFY) tmp <- mget(ecoligenome.operon$gene.name, ecoligenomeSYMBOL2AFFY, ifnotfound=NA) ecoligenome.operon$affyid <- unname(unlist(tmp)) # clean up NAs ecoligenome.operon <- subset(ecoligenome.operon, !is.na(affyid)) @ For convenience, the Affymetrix probe set identifiers are stored in the \Rclass{data.frame}. As an exercise, the reader can write the few lines of code needed to plot the operon on the genome (see above for plotting examples). Once this done, grouping the Affymetrix identifiers according to the operon they belong to is done simply: <<>>= attach(ecoligenome.operon) affyoperons <- split(affyid, operon.name) detach(ecoligenome.operon) ## a sample of 5 operons affyoperons[18:22] @ Since we are going to use Affymetrix data, loading the affy package is needed: <<>>= library(affy) @ A bioconductor package of experimental E.coli data is used: <<>>= library(ecoliLeucine) data(ecoliLeucine) @ First, one should normalize the data: <<>>= abatch.nqt <- normalize(ecoliLeucine, method="quantiles") @ The summary statistics for a probe set can be obtained simply: <<>>= ## the operon taken as an example: names(affyoperons)[18] #colnames(abatch.nqt@exprs) <- NULL eset <- computeExprSet(abatch.nqt, pmcorrect.method="pmonly", summary.method="medianpolish", ids = affyoperons[[18]]) @ <>= operons.eset <- computeExprSet(abatch.nqt, pmcorrect.method="pmonly", summary.method="medianpolish", ids = unlist(affyoperons)) @ We assumed that genes within operons should be all differentially expressed, or all not differentially expressed. <<>>= library(multtest) my.ttest <- function(x, i, j) { pval <- t.test(x[i], x[j])$p.value return(pval) } is.lrpplus <- pData(operons.eset)$strain == "lrp+" is.lrpmoins <- pData(operons.eset)$strain == "lrp-" operons.ttest <- esApply(operons.eset, 1, my.ttest, is.lrpplus, is.lrpmoins) ## adjustment for multiple testing. operons.ttest.adj <- mt.rawp2adjp(operons.ttest, "BY")$adjp ## flag whether or not the probeset can be considered differentially expressed operons.diff.expr <- operons.ttest.adj < 0.01 @ One needs a little bit of bookkeeping to know what belongs to which operon. We can build a list of indexes to know what belongs to where: <<>>= operons.i <- split(seq(along=operons.ttest), ecoligenome.operon$operon.name) @ The thrill is then to see if the results for differential expression (or non-differential expression) are homogeneous among the genes within the same operon\ldots %% First, we want to apply the vsn transformation to our data: %% <<>>= %% library(vsn) %% abatch.n <- normalize(ecoliLeucine, method="vsn") %% @ %% Obtaining the intensities for the probe sets in a particular operon can be done simply %% (here for the $18^{\mathtt{th}}$ operon in our list). %% <<>>= %% ## name of the operon %% names(affyoperons)[18] %% ## get the probe sets %% ppsetoperons <- probeset(abatch.n, affyoperons[[18]]) %% @ %% To compute on the probe intensities %% within an operon, the method \Rfunction{ppsetApply} is helpful. %% The following example shows how to compute the variance for the probes %% within an operon: %% <<>>= %% ppset.fun <- function(ppset, fun, pmcorrect.fun = pmcorrect.pmonly, transfo.fun=log, ...) { %% probes <- do.call("pmcorrect.fun", list(ppset)) %% probes <- transfo.fun(probes) %% r <- apply(probes, 1, fun, ...) %% return(r) %% } %% r <- ppsetApply(ecoliLeucine, ppset.fun, affyoperons[[18]], fun=var, na.rm=TRUE) %% @ %% Note that we log-transform the probe intensities to work on the the generalized-log transform %% (as 'vsn' returns the exponential of the g-log transform) %% <>= %% boxplot(r) %% @ %% <<>>= %% var.boot <- function(x, na.rm=TRUE) { %% f <- function(d, i) { var(d[i], na.rm=na.rm) } %% boot(x, f, R=999, stype="i") %% } %% r.boot <- ppsetApply(ecoliLeucine, ppset.fun, affyoperons[[18]], fun=var.boot, na.rm=TRUE) %% @ \end{document}