## ----style, eval=TRUE, echo=FALSE, results="asis"------------------------
BiocStyle::latex()


## ------------------------------------------------------------------------
# load the R package SeqArray
library(SeqArray)


## ------------------------------------------------------------------------
gds.fn <- seqExampleFileName("gds")
# or gds.fn <- "C:/YourFolder/Your_GDS_File.gds"
gds.fn

seqSummary(gds.fn)


## ------------------------------------------------------------------------
# open a GDS file
genofile <- seqOpen(gds.fn)

# display the contents of the GDS file in a hierarchical structure
genofile


## ------------------------------------------------------------------------
# display all contents of the GDS file
print(genofile, all=TRUE)

# close the GDS file
seqClose(genofile)


## ------------------------------------------------------------------------
# the VCF file, using the example in the SeqArray package
vcf.fn <- seqExampleFileName("vcf")
# or vcf.fn <- "C:/YourFolder/Your_VCF_File.vcf"
vcf.fn

# parse the header
seqVCF.Header(vcf.fn)


## ------------------------------------------------------------------------
# convert, save in "tmp.gds"
seqVCF2GDS(vcf.fn, "tmp.gds")
seqSummary("tmp.gds")


## ----echo=FALSE----------------------------------------------------------
unlink("tmp.gds")


## ------------------------------------------------------------------------
# the file of GDS
gds.fn <- seqExampleFileName("gds")
# or gds.fn <- "C:/YourFolder/Your_GDS_File.gds"

# open a GDS file
genofile <- seqOpen(gds.fn)

# convert
seqGDS2VCF(genofile, "tmp.vcf.gz")
# read
z <- readLines("tmp.vcf.gz", n=20)
for (i in z) cat(substr(i, 1, 76), " ...\n", sep="")

# output BN,GP,AA,HM2 in INFO (the variables are in this order), no FORMAT
seqGDS2VCF(genofile, "tmp2.vcf.gz", info.var=c("BN","GP","AA","HM2"),
    fmt.var=character(0))
# read
z <- readLines("tmp2.vcf.gz", n=15)
for (i in z) cat(substr(i, 1, 56), " ...\n", sep="")

# close the GDS file
seqClose(genofile)


## ------------------------------------------------------------------------
# delete temporary files
unlink(c("tmp.vcf.gz", "tmp1.vcf.gz", "tmp2.vcf.gz"))


## ------------------------------------------------------------------------
# the file of VCF
vcf.fn <- seqExampleFileName("vcf")
# or vcf.fn <- "C:/YourFolder/Your_VCF_File.vcf"

# convert
seqVCF2GDS(vcf.fn, "tmp.gds", verbose=FALSE)

# make sure that open with "readonly=FALSE"
genofile <- seqOpen("tmp.gds", readonly=FALSE)

# display the original structure
genofile

# delete "HM2", "HM3", "AA", "OR" and "DP"
seqDelete(genofile, info.varname=c("HM2", "HM3", "AA", "OR"),
    format.varname="DP")

# display
genofile

# close the GDS file
seqClose(genofile)

# clean up the fragments, reduce the file size
cleanup.gds("tmp.gds")


## ----echo=FALSE----------------------------------------------------------
unlink("tmp.gds")


## ------------------------------------------------------------------------
# open a GDS file
gds.fn <- seqExampleFileName("gds")
genofile <- seqOpen(gds.fn)


## ------------------------------------------------------------------------
# take out sample id
head(samp.id <- seqGetData(genofile, "sample.id"))

# take out variant id
head(variant.id <- seqGetData(genofile, "variant.id"))

# get "chromosome"
table(seqGetData(genofile, "chromosome"))

# get "allele"
head(seqGetData(genofile, "allele"))

# get "annotation/info/GP"
head(seqGetData(genofile, "annotation/info/GP"))


## ------------------------------------------------------------------------
# set sample and variant filters
seqSetFilter(genofile, sample.id=samp.id[c(2,4,6)])
set.seed(100)
seqSetFilter(genofile, variant.id=sample(variant.id, 4))

# get "allele"
seqGetData(genofile, "allele")


## ------------------------------------------------------------------------
# get genotypic data
seqGetData(genofile, "genotype")


## ------------------------------------------------------------------------
# get "annotation/info/AA", a variable-length dataset
seqGetData(genofile, "annotation/info/AA")


## ------------------------------------------------------------------------
# get "annotation/format/DP", a variable-length dataset
seqGetData(genofile, "annotation/format/DP")


## ------------------------------------------------------------------------
# set sample and variant filters
set.seed(100)
seqSetFilter(genofile, sample.id=samp.id[c(2,4,6)],
    variant.id=sample(variant.id, 4))

## ------------------------------------------------------------------------
# read multiple variables variant by variant
seqApply(genofile, c(geno="genotype", id="annotation/id"),
    FUN=function(x) print(x), margin="by.variant", as.is="none")


## ------------------------------------------------------------------------
# remove the sample and variant filters
seqSetFilter(genofile)

# get the numbers of alleles per variant
z <- seqApply(genofile, "allele",
    FUN=function(x) length(unlist(strsplit(x,","))), as.is="integer")
table(z)


## ------------------------------------------------------------------------
HM3 <- seqGetData(genofile, "annotation/info/HM3")

## ------------------------------------------------------------------------
# Now HM3 is a global variable
# print out RS id if the frequency of reference allele is less than 0.5% and it is HM3
seqApply(genofile, c(geno="genotype", id="annotation/id"),
    FUN = function(index, x) {
        p <- mean(x$geno == 0, na.rm=TRUE)  # the frequency of reference allele
        if ((p < 0.005) & HM3[index]) print(x$id) },
    as.is="none", var.index="relative", margin="by.variant")


## ------------------------------------------------------------------------
# calculate the frequency of reference allele,
afreq <- seqApply(genofile, "genotype", FUN=function(x) mean(x==0, na.rm=TRUE),
    as.is="double", margin="by.variant")
length(afreq)
summary(afreq)


## ------------------------------------------------------------------------
# load the "parallel" package
library(parallel)

# Use option cl.core to choose an appropriate cluster size (or # of cores)
cl <- makeCluster(getOption("cl.cores", 2))

## ------------------------------------------------------------------------
# run in parallel
afreq <- seqParallel(cl, genofile, FUN = function(gdsfile) {
        seqApply(gdsfile, "genotype", as.is="double",
            FUN=function(x) mean(x==0, na.rm=TRUE))
    }, split = "by.variant")

length(afreq)
summary(afreq)


## ----eval=FALSE----------------------------------------------------------
## # Since we created the cluster manually, we should stop it:
## stopCluster(cl)


## ------------------------------------------------------------------------
seqClose(genofile)


## ------------------------------------------------------------------------
# open a GDS file
gds.fn <- seqExampleFileName("gds")
genofile <- seqOpen(gds.fn)


## ------------------------------------------------------------------------
m.variant <- local({
    # get ploidy, the number of samples and variants
    z <- seqSummary(genofile, "genotype")
    # dm[1] -- # of selected samples, dm[2] -- # of selected variants
    dm <- z$seldim
    # ploidy
    ploidy <- z$dim[1]

    # apply the function marginally
    m <- seqApply(genofile, "genotype", function(x) { sum(is.na(x)) },
        margin="by.variant", as.is="integer")

    # output
    m / (ploidy * dm[1])
})

length(m.variant)
summary(m.variant)


## ------------------------------------------------------------------------
m.sample <- local({
    # get ploidy, the number of samples and variants
    z <- seqSummary(genofile, "genotype")
    # dm[1] -- # of selected samples, dm[2] -- # of selected variants
    dm <- z$seldim
    # ploidy
    ploidy <- z$dim[1]

    # need a global variable (only available in the bracket of "local")
    n <- integer(dm[1])

    # apply the function marginally
    # use "<<-" operator to find "n" in the parent environment
    seqApply(genofile, "genotype", function(x) { n <<- n + colSums(is.na(x)) },
        margin="by.variant", as.is="none")

    # output
    n / (ploidy * dm[2])
})

length(m.sample)
summary(m.sample)


## ----eval=FALSE----------------------------------------------------------
## # load the "parallel" package
## library(parallel)
## 
## # Use option cl.core to choose an appropriate cluster size (or # of cores)
## cl <- makeCluster(getOption("cl.cores", 2))


## ------------------------------------------------------------------------
# run in parallel
m.variant <- local({
    # get ploidy, the number of samples and variants
    z <- seqSummary(genofile, "genotype")
    # dm[1] -- # of selected samples, dm[2] -- # of selected variants
    dm <- z$seldim
    # ploidy
    ploidy <- z$dim[1]

    # run in parallel
    m <- seqParallel(cl, genofile, FUN = function(gdsfile) {
        # apply the function marginally
        seqApply(gdsfile, "genotype", function(x) { sum(is.na(x)) },
            margin="by.variant", as.is="integer")
    }, split = "by.variant")

    # output
    m / (ploidy * dm[1])
})

summary(m.variant)


## ------------------------------------------------------------------------
m.sample <- local({
    # run in parallel
    m <- seqParallel(cl, genofile, FUN = function(gdsfile)
        {
            # dm[1] -- # of selected samples, dm[2] -- # of selected variants
            dm <- seqSummary(gdsfile, "genotype")$seldim

            # need a global variable (only available in the bracket of "local")
            n <- integer(dm[1])

            # apply the function marginally
            # use "<<-" operator to find "n" in the parent environment
            seqApply(gdsfile, "genotype", function(x) { n <<- n + colSums(is.na(x)) },
                margin="by.variant", as.is="none")

            # output
            n
        }, .combine = "+",     # sum all variables "n" of different processes
        split = "by.variant")

    # get ploidy, the number of samples and variants
    z <- seqSummary(genofile, "genotype")
    # dm[1] -- # of selected samples, dm[2] -- # of selected variants
    dm <- z$seldim
    # ploidy
    ploidy <- z$dim[1]

    # output
    m / (ploidy * dm[2])
})

summary(m.sample)


## ----eval=FALSE----------------------------------------------------------
## # Since we created the cluster manually, we should stop it:
## stopCluster(cl)


## ------------------------------------------------------------------------
# apply the function variant by variant
afreq <- seqApply(genofile, "genotype", function(x) { mean(x==0, na.rm=TRUE) },
    as.is="double", margin="by.variant")

length(afreq)
summary(afreq)


## ----eval=FALSE----------------------------------------------------------
## # load the "parallel" package
## library(parallel)
## 
## # Use option cl.core to choose an appropriate cluster size (or # of cores)
## cl <- makeCluster(getOption("cl.cores", 2))

## ------------------------------------------------------------------------
# run in parallel
afreq <- seqParallel(cl, genofile, FUN = function(gdsfile) {
        seqApply(gdsfile, "genotype", as.is="double",
            FUN=function(x) mean(x==0, na.rm=TRUE))
    }, split = "by.variant")

length(afreq)
summary(afreq)


## ----eval=FALSE----------------------------------------------------------
## # Since we created the cluster manually, we should stop it:
## stopCluster(cl)


## ------------------------------------------------------------------------
# genetic correlation matrix
genmat <- local({
    # get the number of samples and variants
    # dm[1] -- # of selected samples, dm[2] -- # of selected variants
    dm <- seqSummary(genofile, "genotype")$seldim

    # need a global variable (only available in the bracket of "local")
    s <- matrix(0.0, nrow=dm[1], ncol=dm[1])

    # apply the function variant by variant
    # use "<<-" operator to find "s" in the parent environment
    seqApply(genofile, "genotype", function(x) {
        g <- (x==0)                   # indicator of reference allele
        p <- mean(g, na.rm=TRUE)      # allele frequency
        g2 <- colSums(g) - 2*p        # genotypes adjusted by allele frequency
        m <- (g2 %o% g2) / (p*(1-p))  # genetic correlation matrix
        m[!is.finite(m)] <- 0         # correct missing values
        s <<- s + m                   # output to the global variable "s"
    }, margin="by.variant", as.is="none")

    # output, scaled by the trace of matrix "s" over the number of samples
    s / (sum(diag(s)) / dm[1])
})

# eigen-decomposition
eig <- eigen(genmat)

# eigenvalues
head(eig$value)

## ----fig.width=5, fig.height=5, fig.align='center'-----------------------
# eigenvectors
plot(eig$vectors[,1], eig$vectors[,2], xlab="PC 1", ylab="PC 2")


## ----eval=FALSE----------------------------------------------------------
## # load the "parallel" package
## library(parallel)
## 
## # Use option cl.core to choose an appropriate cluster size (or # of cores)
## cl <- makeCluster(getOption("cl.cores", 2))


## ------------------------------------------------------------------------
genmat <- seqParallel(cl, genofile, FUN = function(gdsfile)
    {
        # get the number of samples and variants
        # dm[1] -- # of selected samples, dm[2] -- # of selected variants
        dm <- seqSummary(gdsfile, "genotype")$seldim

        # need a global variable (only available in the bracket of "local")
        s <- matrix(0.0, nrow=dm[1], ncol=dm[1])

        # apply the function variant by variant
        # use "<<-" operator to find "s" in the parent environment
        seqApply(gdsfile, "genotype", function(x) {
            g <- (x==0)                   # indicator of reference allele
            p <- mean(g, na.rm=TRUE)      # allele frequency
            g2 <- colSums(g) - 2*p        # genotypes adjusted by allele frequency
            m <- (g2 %o% g2) / (p*(1-p))  # genetic correlation matrix
            m[!is.finite(m)] <- 0         # correct missing values
            s <<- s + m                   # output to the global variable "s"
        }, margin="by.variant", as.is="none")

        # output
        s
    }, .combine = "+",     # sum all variables "s" of different processes
    split = "by.variant")

# finally, scaled by the trace of matrix "s" over the number of samples
dm <- seqSummary(genofile, "genotype")$seldim
genmat <- genmat / (sum(diag(genmat)) / dm[1])

# eigen-decomposition
eig <- eigen(genmat)
# eigenvalues
head(eig$value)


## ----eval=FALSE----------------------------------------------------------
## # Since we created the cluster manually, we should stop it:
## stopCluster(cl)


## ------------------------------------------------------------------------
coeff <- local({
    # get the number of samples and variants
    # dm[1] -- # of selected samples, dm[2] -- # of selected variants
    dm <- seqSummary(genofile, "genotype")$seldim

    # need global variables (only available in the bracket of "local")
    n <- integer(dm[1])
    s <- double(dm[1])

    # apply the function variant by variant
    # use "<<-" operator to find "n" and "s" in the parent environment
    seqApply(genofile, "genotype", function(x) {
        p <- mean(x==0, na.rm=TRUE)      # allele frequency
        g <- colSums(x==0)               # genotypes: # of reference allele
        d <- (g*g - g*(1 + 2*p) + 2*p*p) / (2*p*(1-p))
        n <<- n + is.finite(d)           # output to the global variable "n"
        d[!is.finite(d)] <- 0
        s <<- s + d                      # output to the global variable "s"
    }, margin="by.variant", as.is="none")

    # output
    s / n
})

length(coeff)
summary(coeff)


## ----eval=FALSE----------------------------------------------------------
## # load the "parallel" package
## library(parallel)
## 
## # Use option cl.core to choose an appropriate cluster size (or # of cores)
## cl <- makeCluster(getOption("cl.cores", 2))


## ------------------------------------------------------------------------
coeff <- seqParallel(cl, genofile, FUN = function(gdsfile)
    {
        # get the number of samples and variants
        # dm[1] -- # of selected samples, dm[2] -- # of selected variants
        dm <- seqSummary(gdsfile, "genotype")$seldim

        # need global variables (only available in the bracket)
        n <- integer(dm[1])
        s <- double(dm[1])

        # apply the function variant by variant
        # use "<<-" operator to find "n" and "s" in the parent environment
        seqApply(gdsfile, "genotype", function(x) {
            p <- mean(x==0, na.rm=TRUE)      # allele frequency
            g <- colSums(x==0)               # genotypes: # of reference allele
            d <- (g*g - g*(1 + 2*p) + 2*p*p) / (2*p*(1-p))
            n <<- n + is.finite(d)           # output to the global variable "n"
            d[!is.finite(d)] <- 0
            s <<- s + d                      # output to the global variable "s"
        }, margin="by.variant", as.is="none")

        # output
        list(s=s, n=n)
    }, # sum all variables "s" and "n" of different processes
    .combine = function(x1,x2) { list(s = x1$s + x2$s, n = x1$n + x2$n) },
    split = "by.variant")

# finally, average!
coeff <- coeff$s / coeff$n

summary(coeff)


## ------------------------------------------------------------------------
# Since we created the cluster manually, we should stop it:
stopCluster(cl)


## ----eval=FALSE----------------------------------------------------------
## library(Rcpp)


## ----eval=FALSE----------------------------------------------------------
## cppFunction('double RefAlleleFreq(IntegerMatrix x)
## {
##     int nrow = x.nrow(), ncol = x.ncol();
##     int cnt=0, zero_cnt=0, g;
##     for (int i = 0; i < nrow; i++)
##     {
##         for (int j = 0; j < ncol; j++)
##         {
##             if ((g = x(i, j)) != NA_INTEGER)
##             {
##                 cnt ++;
##                 if (g == 0) zero_cnt ++;
##             }
##         }
##     }
##     return double(zero_cnt) / cnt;
## }')


## ----eval=FALSE----------------------------------------------------------
## RefAlleleFreq
## 
## afreq <- seqApply(genofile, "genotype", RefAlleleFreq,
##     as.is="double", margin="by.variant")
## 
## summary(afreq)


## ------------------------------------------------------------------------
# close the GDS file
seqClose(genofile)


## ----sessioninfo, results="asis"-----------------------------------------
toLatex(sessionInfo())