## ----style, eval=TRUE, echo=FALSE, results="asis"------------------------
BiocStyle::latex()


## ----library-------------------------------------------------------------
library(HIBAG)


## ----plot, fig.width=8, fig.height=4, fig.align='center'-----------------
# load the published parameter estimates from European ancestry
#   e.g., filename <- "HumanOmniExpressExome-European-HLA4-hg19.RData"
#   here, we use example data in the package
filename <- system.file("extdata", "ModelList.RData", package="HIBAG")
model.list <- get(load(filename))

# HLA imputation at HLA-A
hla.id <- "A"
model <- hlaModelFromObj(model.list[[hla.id]])
summary(model)

# SNPs in the model
head(model$snp.id)

head(model$snp.position)

# plot SNP information
plot(model)


## ----eval=FALSE----------------------------------------------------------
## #########################################################################
## # Import your PLINK BED file
## #
## yourgeno <- hlaBED2Geno(bed.fn=".bed", fam.fn=".fam", bim.fn=".bim")
## summary(yourgeno)
## 
## # best-guess genotypes and all posterior probabilities
## pred.guess <- predict(model, yourgeno, type="response+prob")
## summary(pred.guess)
## 
## pred.guess$value
## pred.guess$postprob


## ----build-model, eval=FALSE---------------------------------------------
## library(HIBAG)
## 
## # load HLA types and SNP genotypes in the package
## data(HLA_Type_Table, package="HIBAG")
## data(HapMap_CEU_Geno, package="HIBAG")
## 
## # a list of HLA genotypes
## # e.g., train.HLA <- hlaAllele(sample.id, H1=c("01:02", "05:01", ...),
## #                        H2=c("02:01", "03:01", ...), locus="A")
## #     the HLA type of the first individual is 01:02/02:01,
## #                 the second is 05:01/03:01, ...
## hla.id <- "A"
## train.HLA <- hlaAllele(HLA_Type_Table$sample.id,
##     H1 = HLA_Type_Table[, paste(hla.id, ".1", sep="")],
##     H2 = HLA_Type_Table[, paste(hla.id, ".2", sep="")],
##     locus=hla.id, assembly="hg19")
## 
## # selected SNPs:
## #    the flanking region of 500kb on each side,
## #    or an appropriate flanking size without sacrificing predictive accuracy
## region <- 500   # kb
## snpid <- hlaFlankingSNP(HapMap_CEU_Geno$snp.id,
##     HapMap_CEU_Geno$snp.position, hla.id, region*1000, assembly="hg19")
## length(snpid)
## 
## # training genotypes
## #   import your PLINK BED file,
## #   e.g., train.geno <- hlaBED2Geno(bed.fn=".bed", fam.fn=".fam", bim.fn=".bim")
## #   or,
## train.geno <- hlaGenoSubset(HapMap_CEU_Geno,
##     snp.sel = match(snpid, HapMap_CEU_Geno$snp.id))
## summary(train.geno)
## 
## # train a HIBAG model
## set.seed(100)
## model <- hlaAttrBagging(train.HLA, train.geno, nclassifier=100)

## ----echo=FALSE----------------------------------------------------------
mobj <- get(load(system.file("extdata", "ModelList.RData", package="HIBAG")))
model <- hlaModelFromObj(mobj$A)

## ------------------------------------------------------------------------
summary(model)


## ----build-model-parallel, eval=FALSE------------------------------------
## library(HIBAG)
## library(parallel)
## 
## # load HLA types and SNP genotypes in the package
## data(HLA_Type_Table, package="HIBAG")
## data(HapMap_CEU_Geno, package="HIBAG")
## 
## # a list of HLA genotypes
## # e.g., train.HLA <- hlaAllele(sample.id, H1=c("01:02", "05:01", ...),
## #                        H2=c("02:01", "03:01", ...), locus="A")
## #     the HLA type of the first individual is 01:02/02:01,
## #                 the second is 05:01/03:01, ...
## hla.id <- "A"
## train.HLA <- hlaAllele(HLA_Type_Table$sample.id,
##     H1 = HLA_Type_Table[, paste(hla.id, ".1", sep="")],
##     H2 = HLA_Type_Table[, paste(hla.id, ".2", sep="")],
##     locus=hla.id, assembly="hg19")
## 
## # selected SNPs:
## #    the flanking region of 500kb on each side,
## #    or an appropriate flanking size without sacrificing predictive accuracy
## region <- 500   # kb
## snpid <- hlaFlankingSNP(HapMap_CEU_Geno$snp.id,
##     HapMap_CEU_Geno$snp.position, hla.id, region*1000, assembly="hg19")
## length(snpid)
## 
## # training genotypes
## #   import your PLINK BED file,
## #   e.g., train.geno <- hlaBED2Geno(bed.fn=".bed", fam.fn=".fam", bim.fn=".bim")
## #   or,
## train.geno <- hlaGenoSubset(HapMap_CEU_Geno,
##     snp.sel = match(snpid, HapMap_CEU_Geno$snp.id))
## summary(train.geno)
## 
## 
## # Create an environment with an appropriate cluster size,
## #   e.g., 2 -- # of (CPU) cores
## cl <- makeCluster(2)
## 
## # Building ...
## set.seed(1000)
## hlaParallelAttrBagging(cl, train.HLA, train.geno, nclassifier=100,
##     auto.save="AutoSaveModel.RData", stop.cluster=TRUE)
## model.obj <- get(load("AutoSaveModel.RData"))
## model <- hlaModelFromObj(model.obj)
## summary(model)


## ----echo=FALSE----------------------------------------------------------
unlink("AutoSaveModel.RData", force=TRUE)


## ----evaluate------------------------------------------------------------
library(HIBAG)

# load HLA types and SNP genotypes in the package
data(HLA_Type_Table, package="HIBAG")
data(HapMap_CEU_Geno, package="HIBAG")

# make a list of HLA types
hla.id <- "A"
hla <- hlaAllele(HLA_Type_Table$sample.id,
    H1 = HLA_Type_Table[, paste(hla.id, ".1", sep="")],
    H2 = HLA_Type_Table[, paste(hla.id, ".2", sep="")],
    locus=hla.id, assembly="hg19")

# divide HLA types randomly
set.seed(100)
hlatab <- hlaSplitAllele(hla, train.prop=0.5)
names(hlatab)
summary(hlatab$training)
summary(hlatab$validation)

# SNP predictors within the flanking region on each side
region <- 500   # kb
snpid <- hlaFlankingSNP(HapMap_CEU_Geno$snp.id,
    HapMap_CEU_Geno$snp.position, hla.id, region*1000, assembly="hg19")
length(snpid)

# training and validation genotypes
train.geno <- hlaGenoSubset(HapMap_CEU_Geno,
    snp.sel = match(snpid, HapMap_CEU_Geno$snp.id),
    samp.sel = match(hlatab$training$value$sample.id,
        HapMap_CEU_Geno$sample.id))
summary(train.geno)

test.geno <- hlaGenoSubset(HapMap_CEU_Geno, samp.sel=match(
    hlatab$validation$value$sample.id, HapMap_CEU_Geno$sample.id))

## ----eval=FALSE----------------------------------------------------------
## # train a HIBAG model
## set.seed(100)
## model <- hlaAttrBagging(hlatab$training, train.geno, nclassifier=100)

## ----echo=FALSE----------------------------------------------------------
mobj <- get(load(system.file("extdata", "OutOfBag.RData", package="HIBAG")))
model <- hlaModelFromObj(mobj)


## ------------------------------------------------------------------------
summary(model)

# validation
pred <- predict(model, test.geno)

# compare
comp <- hlaCompareAllele(hlatab$validation, pred, allele.limit=model,
    call.threshold=0)
comp$overall


## ----out-text------------------------------------------------------------
# report overall accuracy, per-allele sensitivity, specificity, etc
hlaReport(comp, type="txt")


## ----out-tex, results="asis"---------------------------------------------
# report overall accuracy, per-allele sensitivity, specificity, etc
hlaReport(comp, type="tex", header=FALSE)


## ----release-model, eval=FALSE-------------------------------------------
## library(HIBAG)
## 
## # make a list of HLA types
## hla.id <- "DQA1"
## hla <- hlaAllele(HLA_Type_Table$sample.id,
##     H1 = HLA_Type_Table[, paste(hla.id, ".1", sep="")],
##     H2 = HLA_Type_Table[, paste(hla.id, ".2", sep="")],
##     locus=hla.id, assembly="hg19")
## 
## # training genotypes
## region <- 500   # kb
## snpid <- hlaFlankingSNP(HapMap_CEU_Geno$snp.id, HapMap_CEU_Geno$snp.position,
##     hla.id, region*1000, assembly="hg19")
## train.geno <- hlaGenoSubset(HapMap_CEU_Geno,
##     snp.sel = match(snpid, HapMap_CEU_Geno$snp.id),
##     samp.sel = match(hla$value$sample.id, HapMap_CEU_Geno$sample.id))
## 
## set.seed(1000)
## model <- hlaAttrBagging(hla, train.geno, nclassifier=100)
## summary(model)
## 
## # remove unused SNPs and sample IDs from the model
## mobj <- hlaPublish(model,
##     platform = "Illumina 1M Duo",
##     information = "Training set -- HapMap Phase II",
##     warning = NULL,
##     rm.unused.snp=TRUE, anonymize=TRUE)
## 
## save(mobj, file="Your_HIBAG_Model.RData")


## ----echo=FALSE----------------------------------------------------------
unlink("Your_HIBAG_Model.RData", force=TRUE)


## ----eval=FALSE----------------------------------------------------------
## # assume the HIBAG models are stored in R objects: mobj.A, mobj.B, ...
## 
## ModelList <- list()
## ModelList[["A"]] <- mobj.A
## ModelList[["B"]] <- mobj.B
## ...
## 
## # save to an R data file
## save(ModelList, file="HIBAG_Model_List.RData")


## ----sessioninfo, results="asis"-----------------------------------------
toLatex(sessionInfo())