## ----style-knitr, eval=TRUE, echo=FALSE, results="asis"--------------------
BiocStyle::latex(use.unsrturl=FALSE)

## ----setup_knitr, include=FALSE, cache=FALSE-------------------------------
library(knitr)
opts_chunk$set(cache = TRUE, tidy = FALSE, tidy.opts = list(blank = FALSE, 
  width.cutoff=70), highlight = FALSE, out.width = "7cm", out.height = "7cm", 
  fig.align = "center")

## ----eval = TRUE, message = FALSE------------------------------------------
library(GenomicRanges)
library(rtracklayer)

## ----annotation, eval=FALSE------------------------------------------------
# gtf_dir <- "gencode.v12.annotation.gtf"
# 
# gtf <- import(gtf_dir)
# 
# ### Keep protein coding genes
# keep_index <- mcols(gtf)$gene_type == "protein_coding" &
#   mcols(gtf)$type == "gene"
# gtf <- gtf[keep_index]
# ### Remove 'chr' from chromosome names
# seqlevels(gtf) <- gsub(pattern = "chr", replacement = "", x = seqlevels(gtf))
# 
# genes_bed <- data.frame(chr = seqnames(gtf), start =  start(gtf),
#   end = end(gtf), geneId = mcols(gtf)$gene_id,
#   stringsAsFactors = FALSE)
# 
# for(i in as.character(1:22)){
#   genes_bed_sub <- genes_bed[genes_bed$chr == i, ]
#   write.table(genes_bed_sub, "genes_chr", i ,".bed", quote = FALSE,
#     sep = "\t", row.names = FALSE, col.names = FALSE)
# }
# 

## ----eval = TRUE, message = FALSE------------------------------------------
library(limma)

## ----eval=FALSE------------------------------------------------------------
# metadata_dir <- "E-GEUV-1.sdrf.txt"
# 
# samples <- read.table(metadata_dir, header = TRUE, sep = "\t", as.is = TRUE)
# 
# samples <- unique(samples[c("Assay.Name", "Characteristics.population.")])
# colnames(samples) <- c("sample_id", "population")
# 
# samples$sample_id_short <- strsplit2(samples$sample_id, "\\.")[,1]
# 

## ----eval = FALSE----------------------------------------------------------
# expr_dir <- "GD660.TrQuantCount.txt"
# 
# expr_all <- read.table(expr_dir, header = TRUE, sep="\t", as.is = TRUE)
# 
# expr_all <- expr_all[, c("TargetID", "Gene_Symbol", "Chr",
#   samples$sample_id)]
# colnames(expr_all) <- c("TargetID", "Gene_Symbol", "Chr",
#   samples$sample_id_short)
# 
# for(j in "CEU"){
#   for(i in 1:22){
#     expr <- expr_all[expr_all$Chr == i, c("TargetID", "Gene_Symbol",
#       samples$sample_id_short[samples$population == j])]
#     write.table(expr, paste0("TrQuantCount_", j, "_chr", i, ".tsv"),
#       quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
#   }
# }

## ----eval = TRUE, message = FALSE------------------------------------------
library(Rsamtools)

## ----eval = FALSE----------------------------------------------------------
# files <- list.files(path = ".", pattern = "genotypes.vcf.gz",
#   full.names = TRUE, include.dirs = FALSE)
# 
# ### bgzip and index the vcf files
# for(i in 1:length(files)){
#   zipped <- bgzip(files[i])
#   idx <- indexTabix(zipped, format = "vcf")
# }

## ----eval = TRUE, message = FALSE------------------------------------------
library(VariantAnnotation)
library(tools)

## ----eval = FALSE----------------------------------------------------------
# ### Extended gene ranges
# window <- 5000
# gene_ranges <- resize(gtf, GenomicRanges::width(gtf) + 2 * window,
#   fix = "center")
# 
# chr <- gsub("chr", "", strsplit2(files, split = "\\.")[, 2])
# 
# for(j in "CEU"){
#   for(i in 1:length(files)){
#     cat(j, chr[i], fill = TRUE)
# 
#     zipped <- paste0(file_path_sans_ext(files[i]), ".bgz")
#     idx <- paste0(file_path_sans_ext(files[i]), ".bgz.tbi")
#     tab <- TabixFile(zipped, idx)
# 
#     ### Explore the file header with scanVcfHeader
#     hdr <- scanVcfHeader(tab)
#     print(all(samples$sample_id_short %in% samples(hdr)))
# 
#     ### Read a subset of VCF file
#     gene_ranges_tmp <- gene_ranges[seqnames(gene_ranges) == chr[i]]
#     param <- ScanVcfParam(which = gene_ranges_tmp, samples =
#         samples$sample_id_short[samples$population == j])
#     vcf <- readVcf(tab, "hg19", param)
# 
#     ### Keep only the bi-allelic SNPs
#     # width of ref seq
#     rw <- width(ref(vcf))
#     # width of first alt seq
#     aw <- unlist(lapply(alt(vcf), function(x) {width(x[1])}))
#     # number of alternate genotypes
#     nalt <- elementLengths(alt(vcf))
#     # select only bi-allelic SNPs (monomorphic OK, so aw can be 0 or 1)
#     snp <- rw == 1 & aw <= 1 & nalt == 1
#     # subset vcf
#     vcfbi <- vcf[snp,]
# 
#     rowdata <- rowData(vcfbi)
# 
#     ### Convert genotype into number of alleles different from reference
#     geno <- geno(vcfbi)$GT
#     geno01 <- geno
#     geno01[,] <- -1
#     geno01[geno %in% c("0/0", "0|0")] <- 0 # REF/REF
#     geno01[geno %in% c("0/1", "0|1", "1/0", "1|0")] <- 1 # REF/ALT
#     geno01[geno %in% c("1/1", "1|1")] <- 2 # ALT/ALT
#     mode(geno01) <- "integer"
# 
#     genotypes <- unique(data.frame(chr = seqnames(rowdata),
#       start = start(rowdata), end = end(rowdata), snpId = rownames(geno01),
#       geno01, stringsAsFactors = FALSE))
# 
#     ### Sort SNPs by position
#     genotypes <- genotypes[order(genotypes[ ,2]), ]
# 
#     write.table(genotypes, file = paste0("genotypes_", j, "_chr",
#       chr[i], ".tsv"), quote = FALSE, sep = "\t", row.names = FALSE,
#       col.names = TRUE)
# 
#   }
# }
# 

## ----sessionInfo-----------------------------------------------------------
sessionInfo()