## ----style, eval=TRUE, echo=FALSE, results="asis"------------------------ BiocStyle::latex() ## ------------------------------------------------------------------------ # load the R package SeqArray library(SeqArray) ## ------------------------------------------------------------------------ gds.fn <- seqExampleFileName("gds") # or gds.fn <- "C:/YourFolder/Your_GDS_File.gds" gds.fn seqSummary(gds.fn) ## ------------------------------------------------------------------------ # open a GDS file genofile <- seqOpen(gds.fn) # display the contents of the GDS file in a hierarchical structure genofile ## ------------------------------------------------------------------------ # display all contents of the GDS file print(genofile, all=TRUE) # close the GDS file seqClose(genofile) ## ------------------------------------------------------------------------ # the VCF file, using the example in the SeqArray package vcf.fn <- seqExampleFileName("vcf") # or vcf.fn <- "C:/YourFolder/Your_VCF_File.vcf" vcf.fn # parse the header seqVCF.Header(vcf.fn) ## ------------------------------------------------------------------------ # convert, save in "tmp.gds" seqVCF2GDS(vcf.fn, "tmp.gds") seqSummary("tmp.gds") ## ----echo=FALSE---------------------------------------------------------- unlink("tmp.gds") ## ------------------------------------------------------------------------ # the file of GDS gds.fn <- seqExampleFileName("gds") # or gds.fn <- "C:/YourFolder/Your_GDS_File.gds" # open a GDS file genofile <- seqOpen(gds.fn) # convert seqGDS2VCF(genofile, "tmp.vcf.gz") # read z <- readLines("tmp.vcf.gz", n=20) for (i in z) cat(substr(i, 1, 76), " ...\n", sep="") # output BN,GP,AA,HM2 in INFO (the variables are in this order), no FORMAT seqGDS2VCF(genofile, "tmp2.vcf.gz", info.var=c("BN","GP","AA","HM2"), fmt.var=character(0)) # read z <- readLines("tmp2.vcf.gz", n=15) for (i in z) cat(substr(i, 1, 56), " ...\n", sep="") # close the GDS file seqClose(genofile) ## ------------------------------------------------------------------------ # delete temporary files unlink(c("tmp.vcf.gz", "tmp1.vcf.gz", "tmp2.vcf.gz")) ## ------------------------------------------------------------------------ # the file of VCF vcf.fn <- seqExampleFileName("vcf") # or vcf.fn <- "C:/YourFolder/Your_VCF_File.vcf" # convert seqVCF2GDS(vcf.fn, "tmp.gds", verbose=FALSE) # make sure that open with "readonly=FALSE" genofile <- seqOpen("tmp.gds", readonly=FALSE) # display the original structure genofile # delete "HM2", "HM3", "AA", "OR" and "DP" seqDelete(genofile, info.varname=c("HM2", "HM3", "AA", "OR"), format.varname="DP") # display genofile # close the GDS file seqClose(genofile) # clean up the fragments, reduce the file size cleanup.gds("tmp.gds") ## ----echo=FALSE---------------------------------------------------------- unlink("tmp.gds") ## ------------------------------------------------------------------------ # open a GDS file gds.fn <- seqExampleFileName("gds") genofile <- seqOpen(gds.fn) ## ------------------------------------------------------------------------ # take out sample id head(samp.id <- seqGetData(genofile, "sample.id")) # take out variant id head(variant.id <- seqGetData(genofile, "variant.id")) # get "chromosome" table(seqGetData(genofile, "chromosome")) # get "allele" head(seqGetData(genofile, "allele")) # get "annotation/info/GP" head(seqGetData(genofile, "annotation/info/GP")) ## ------------------------------------------------------------------------ # set sample and variant filters seqSetFilter(genofile, sample.id=samp.id[c(2,4,6)]) set.seed(100) seqSetFilter(genofile, variant.id=sample(variant.id, 4)) # get "allele" seqGetData(genofile, "allele") ## ------------------------------------------------------------------------ # get genotypic data seqGetData(genofile, "genotype") ## ------------------------------------------------------------------------ # get "annotation/info/AA", a variable-length dataset seqGetData(genofile, "annotation/info/AA") ## ------------------------------------------------------------------------ # get "annotation/format/DP", a variable-length dataset seqGetData(genofile, "annotation/format/DP") ## ------------------------------------------------------------------------ # set sample and variant filters set.seed(100) seqSetFilter(genofile, sample.id=samp.id[c(2,4,6)], variant.id=sample(variant.id, 4)) ## ------------------------------------------------------------------------ # read multiple variables variant by variant seqApply(genofile, c(geno="genotype", id="annotation/id"), FUN=function(x) print(x), margin="by.variant", as.is="none") ## ------------------------------------------------------------------------ # remove the sample and variant filters seqSetFilter(genofile) # get the numbers of alleles per variant z <- seqApply(genofile, "allele", FUN=function(x) length(unlist(strsplit(x,","))), as.is="integer") table(z) ## ------------------------------------------------------------------------ HM3 <- seqGetData(genofile, "annotation/info/HM3") ## ------------------------------------------------------------------------ # Now HM3 is a global variable # print out RS id if the frequency of reference allele is less than 0.5% and it is HM3 seqApply(genofile, c(geno="genotype", id="annotation/id"), FUN = function(index, x) { p <- mean(x$geno == 0, na.rm=TRUE) # the frequency of reference allele if ((p < 0.005) & HM3[index]) print(x$id) }, as.is="none", var.index="relative", margin="by.variant") ## ------------------------------------------------------------------------ # calculate the frequency of reference allele, afreq <- seqApply(genofile, "genotype", FUN=function(x) mean(x==0, na.rm=TRUE), as.is="double", margin="by.variant") length(afreq) summary(afreq) ## ------------------------------------------------------------------------ # load the "parallel" package library(parallel) # Use option cl.core to choose an appropriate cluster size (or # of cores) cl <- makeCluster(getOption("cl.cores", 2)) ## ------------------------------------------------------------------------ # run in parallel afreq <- seqParallel(cl, genofile, FUN = function(gdsfile) { seqApply(gdsfile, "genotype", as.is="double", FUN=function(x) mean(x==0, na.rm=TRUE)) }, split = "by.variant") length(afreq) summary(afreq) ## ----eval=FALSE---------------------------------------------------------- ## # Since we created the cluster manually, we should stop it: ## stopCluster(cl) ## ------------------------------------------------------------------------ seqClose(genofile) ## ------------------------------------------------------------------------ # open a GDS file gds.fn <- seqExampleFileName("gds") genofile <- seqOpen(gds.fn) ## ------------------------------------------------------------------------ m.variant <- local({ # get ploidy, the number of samples and variants z <- seqSummary(genofile, "genotype") # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- z$seldim # ploidy ploidy <- z$dim[1] # apply the function marginally m <- seqApply(genofile, "genotype", function(x) { sum(is.na(x)) }, margin="by.variant", as.is="integer") # output m / (ploidy * dm[1]) }) length(m.variant) summary(m.variant) ## ------------------------------------------------------------------------ m.sample <- local({ # get ploidy, the number of samples and variants z <- seqSummary(genofile, "genotype") # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- z$seldim # ploidy ploidy <- z$dim[1] # need a global variable (only available in the bracket of "local") n <- integer(dm[1]) # apply the function marginally # use "<<-" operator to find "n" in the parent environment seqApply(genofile, "genotype", function(x) { n <<- n + colSums(is.na(x)) }, margin="by.variant", as.is="none") # output n / (ploidy * dm[2]) }) length(m.sample) summary(m.sample) ## ----eval=FALSE---------------------------------------------------------- ## # load the "parallel" package ## library(parallel) ## ## # Use option cl.core to choose an appropriate cluster size (or # of cores) ## cl <- makeCluster(getOption("cl.cores", 2)) ## ------------------------------------------------------------------------ # run in parallel m.variant <- local({ # get ploidy, the number of samples and variants z <- seqSummary(genofile, "genotype") # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- z$seldim # ploidy ploidy <- z$dim[1] # run in parallel m <- seqParallel(cl, genofile, FUN = function(gdsfile) { # apply the function marginally seqApply(gdsfile, "genotype", function(x) { sum(is.na(x)) }, margin="by.variant", as.is="integer") }, split = "by.variant") # output m / (ploidy * dm[1]) }) summary(m.variant) ## ------------------------------------------------------------------------ m.sample <- local({ # run in parallel m <- seqParallel(cl, genofile, FUN = function(gdsfile) { # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- seqSummary(gdsfile, "genotype")$seldim # need a global variable (only available in the bracket of "local") n <- integer(dm[1]) # apply the function marginally # use "<<-" operator to find "n" in the parent environment seqApply(gdsfile, "genotype", function(x) { n <<- n + colSums(is.na(x)) }, margin="by.variant", as.is="none") # output n }, .combine = "+", # sum all variables "n" of different processes split = "by.variant") # get ploidy, the number of samples and variants z <- seqSummary(genofile, "genotype") # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- z$seldim # ploidy ploidy <- z$dim[1] # output m / (ploidy * dm[2]) }) summary(m.sample) ## ----eval=FALSE---------------------------------------------------------- ## # Since we created the cluster manually, we should stop it: ## stopCluster(cl) ## ------------------------------------------------------------------------ # apply the function variant by variant afreq <- seqApply(genofile, "genotype", function(x) { mean(x==0, na.rm=TRUE) }, as.is="double", margin="by.variant") length(afreq) summary(afreq) ## ----eval=FALSE---------------------------------------------------------- ## # load the "parallel" package ## library(parallel) ## ## # Use option cl.core to choose an appropriate cluster size (or # of cores) ## cl <- makeCluster(getOption("cl.cores", 2)) ## ------------------------------------------------------------------------ # run in parallel afreq <- seqParallel(cl, genofile, FUN = function(gdsfile) { seqApply(gdsfile, "genotype", as.is="double", FUN=function(x) mean(x==0, na.rm=TRUE)) }, split = "by.variant") length(afreq) summary(afreq) ## ----eval=FALSE---------------------------------------------------------- ## # Since we created the cluster manually, we should stop it: ## stopCluster(cl) ## ------------------------------------------------------------------------ # genetic correlation matrix genmat <- local({ # get the number of samples and variants # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- seqSummary(genofile, "genotype")$seldim # need a global variable (only available in the bracket of "local") s <- matrix(0.0, nrow=dm[1], ncol=dm[1]) # apply the function variant by variant # use "<<-" operator to find "s" in the parent environment seqApply(genofile, "genotype", function(x) { g <- (x==0) # indicator of reference allele p <- mean(g, na.rm=TRUE) # allele frequency g2 <- colSums(g) - 2*p # genotypes adjusted by allele frequency m <- (g2 %o% g2) / (p*(1-p)) # genetic correlation matrix m[!is.finite(m)] <- 0 # correct missing values s <<- s + m # output to the global variable "s" }, margin="by.variant", as.is="none") # output, scaled by the trace of matrix "s" over the number of samples s / (sum(diag(s)) / dm[1]) }) # eigen-decomposition eig <- eigen(genmat) # eigenvalues head(eig$value) ## ----fig.width=5, fig.height=5, fig.align='center'----------------------- # eigenvectors plot(eig$vectors[,1], eig$vectors[,2], xlab="PC 1", ylab="PC 2") ## ----eval=FALSE---------------------------------------------------------- ## # load the "parallel" package ## library(parallel) ## ## # Use option cl.core to choose an appropriate cluster size (or # of cores) ## cl <- makeCluster(getOption("cl.cores", 2)) ## ------------------------------------------------------------------------ genmat <- seqParallel(cl, genofile, FUN = function(gdsfile) { # get the number of samples and variants # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- seqSummary(gdsfile, "genotype")$seldim # need a global variable (only available in the bracket of "local") s <- matrix(0.0, nrow=dm[1], ncol=dm[1]) # apply the function variant by variant # use "<<-" operator to find "s" in the parent environment seqApply(gdsfile, "genotype", function(x) { g <- (x==0) # indicator of reference allele p <- mean(g, na.rm=TRUE) # allele frequency g2 <- colSums(g) - 2*p # genotypes adjusted by allele frequency m <- (g2 %o% g2) / (p*(1-p)) # genetic correlation matrix m[!is.finite(m)] <- 0 # correct missing values s <<- s + m # output to the global variable "s" }, margin="by.variant", as.is="none") # output s }, .combine = "+", # sum all variables "s" of different processes split = "by.variant") # finally, scaled by the trace of matrix "s" over the number of samples dm <- seqSummary(genofile, "genotype")$seldim genmat <- genmat / (sum(diag(genmat)) / dm[1]) # eigen-decomposition eig <- eigen(genmat) # eigenvalues head(eig$value) ## ----eval=FALSE---------------------------------------------------------- ## # Since we created the cluster manually, we should stop it: ## stopCluster(cl) ## ------------------------------------------------------------------------ coeff <- local({ # get the number of samples and variants # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- seqSummary(genofile, "genotype")$seldim # need global variables (only available in the bracket of "local") n <- integer(dm[1]) s <- double(dm[1]) # apply the function variant by variant # use "<<-" operator to find "n" and "s" in the parent environment seqApply(genofile, "genotype", function(x) { p <- mean(x==0, na.rm=TRUE) # allele frequency g <- colSums(x==0) # genotypes: # of reference allele d <- (g*g - g*(1 + 2*p) + 2*p*p) / (2*p*(1-p)) n <<- n + is.finite(d) # output to the global variable "n" d[!is.finite(d)] <- 0 s <<- s + d # output to the global variable "s" }, margin="by.variant", as.is="none") # output s / n }) length(coeff) summary(coeff) ## ----eval=FALSE---------------------------------------------------------- ## # load the "parallel" package ## library(parallel) ## ## # Use option cl.core to choose an appropriate cluster size (or # of cores) ## cl <- makeCluster(getOption("cl.cores", 2)) ## ------------------------------------------------------------------------ coeff <- seqParallel(cl, genofile, FUN = function(gdsfile) { # get the number of samples and variants # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- seqSummary(gdsfile, "genotype")$seldim # need global variables (only available in the bracket) n <- integer(dm[1]) s <- double(dm[1]) # apply the function variant by variant # use "<<-" operator to find "n" and "s" in the parent environment seqApply(gdsfile, "genotype", function(x) { p <- mean(x==0, na.rm=TRUE) # allele frequency g <- colSums(x==0) # genotypes: # of reference allele d <- (g*g - g*(1 + 2*p) + 2*p*p) / (2*p*(1-p)) n <<- n + is.finite(d) # output to the global variable "n" d[!is.finite(d)] <- 0 s <<- s + d # output to the global variable "s" }, margin="by.variant", as.is="none") # output list(s=s, n=n) }, # sum all variables "s" and "n" of different processes .combine = function(x1,x2) { list(s = x1$s + x2$s, n = x1$n + x2$n) }, split = "by.variant") # finally, average! coeff <- coeff$s / coeff$n summary(coeff) ## ------------------------------------------------------------------------ # Since we created the cluster manually, we should stop it: stopCluster(cl) ## ----eval=FALSE---------------------------------------------------------- ## library(Rcpp) ## ----eval=FALSE---------------------------------------------------------- ## cppFunction('double RefAlleleFreq(IntegerMatrix x) ## { ## int nrow = x.nrow(), ncol = x.ncol(); ## int cnt=0, zero_cnt=0, g; ## for (int i = 0; i < nrow; i++) ## { ## for (int j = 0; j < ncol; j++) ## { ## if ((g = x(i, j)) != NA_INTEGER) ## { ## cnt ++; ## if (g == 0) zero_cnt ++; ## } ## } ## } ## return double(zero_cnt) / cnt; ## }') ## ----eval=FALSE---------------------------------------------------------- ## RefAlleleFreq ## ## afreq <- seqApply(genofile, "genotype", RefAlleleFreq, ## as.is="double", margin="by.variant") ## ## summary(afreq) ## ------------------------------------------------------------------------ # close the GDS file seqClose(genofile) ## ----sessioninfo, results="asis"----------------------------------------- toLatex(sessionInfo())