## ----echo=FALSE--------------------------------------------------------------------------------------------- options(width=110) ## ----------------------------------------------------------------------------------------------------------- library(HIBAG) ## ----fig.width=10, fig.height=5, fig.align='center'--------------------------------------------------------- # load the published parameter estimates from European ancestry # e.g., filename <- "HumanOmniExpressExome-European-HLA4-hg19.RData" # here, we use example data in the package filename <- system.file("extdata", "ModelList.RData", package="HIBAG") model.list <- get(load(filename)) # HLA imputation at HLA-A hla.id <- "A" model <- hlaModelFromObj(model.list[[hla.id]]) summary(model) # SNPs in the model head(model$snp.id) head(model$snp.position) ## ----eval=FALSE--------------------------------------------------------------------------------------------- # ######################################################################### # # Import your PLINK BED file # # # yourgeno <- hlaBED2Geno(bed.fn=".bed", fam.fn=".fam", bim.fn=".bim") # summary(yourgeno) # # # best-guess genotypes and all posterior probabilities # pred.guess <- predict(model, yourgeno, type="response+prob") # summary(pred.guess) # # pred.guess$value # pred.guess$postprob ## ----eval=FALSE--------------------------------------------------------------------------------------------- # library(HIBAG) # # # load HLA types and SNP genotypes in the package # data(HLA_Type_Table, package="HIBAG") # data(HapMap_CEU_Geno, package="HIBAG") # # # a list of HLA genotypes # # e.g., train.HLA <- hlaAllele(sample.id, H1=c("01:02", "05:01", ...), # # H2=c("02:01", "03:01", ...), locus="A") # # the HLA type of the first individual is 01:02/02:01, # # the second is 05:01/03:01, ... # hla.id <- "A" # train.HLA <- hlaAllele(HLA_Type_Table$sample.id, # H1 = HLA_Type_Table[, paste(hla.id, ".1", sep="")], # H2 = HLA_Type_Table[, paste(hla.id, ".2", sep="")], # locus=hla.id, assembly="hg19") # # # selected SNPs: # # the flanking region of 500kb on each side, # # or an appropriate flanking size without sacrificing predictive accuracy # region <- 500 # kb # snpid <- hlaFlankingSNP(HapMap_CEU_Geno$snp.id, # HapMap_CEU_Geno$snp.position, hla.id, region*1000, assembly="hg19") # length(snpid) # # # training genotypes # # import your PLINK BED file, # # e.g., train.geno <- hlaBED2Geno(bed.fn=".bed", fam.fn=".fam", bim.fn=".bim") # # or, # train.geno <- hlaGenoSubset(HapMap_CEU_Geno, # snp.sel = match(snpid, HapMap_CEU_Geno$snp.id)) # summary(train.geno) # # # train a HIBAG model # set.seed(100) # model <- hlaAttrBagging(train.HLA, train.geno, nclassifier=100) ## ----echo=FALSE--------------------------------------------------------------------------------------------- mobj <- get(load(system.file("extdata", "ModelList.RData", package="HIBAG"))) model <- hlaModelFromObj(mobj$A) ## ----------------------------------------------------------------------------------------------------------- summary(model) ## ----eval=FALSE--------------------------------------------------------------------------------------------- # library(HIBAG) # library(parallel) # # # load HLA types and SNP genotypes in the package # data(HLA_Type_Table, package="HIBAG") # data(HapMap_CEU_Geno, package="HIBAG") # # # a list of HLA genotypes # # e.g., train.HLA <- hlaAllele(sample.id, H1=c("01:02", "05:01", ...), # # H2=c("02:01", "03:01", ...), locus="A") # # the HLA type of the first individual is 01:02/02:01, # # the second is 05:01/03:01, ... # hla.id <- "A" # train.HLA <- hlaAllele(HLA_Type_Table$sample.id, # H1 = HLA_Type_Table[, paste(hla.id, ".1", sep="")], # H2 = HLA_Type_Table[, paste(hla.id, ".2", sep="")], # locus=hla.id, assembly="hg19") # # # selected SNPs: # # the flanking region of 500kb on each side, # # or an appropriate flanking size without sacrificing predictive accuracy # region <- 500 # kb # snpid <- hlaFlankingSNP(HapMap_CEU_Geno$snp.id, # HapMap_CEU_Geno$snp.position, hla.id, region*1000, assembly="hg19") # length(snpid) # # # training genotypes # # import your PLINK BED file, # # e.g., train.geno <- hlaBED2Geno(bed.fn=".bed", fam.fn=".fam", bim.fn=".bim") # # or, # train.geno <- hlaGenoSubset(HapMap_CEU_Geno, # snp.sel = match(snpid, HapMap_CEU_Geno$snp.id)) # summary(train.geno) # # # # Create an environment with an appropriate cluster size, # # e.g., 2 -- # of (CPU) cores # cl <- makeCluster(2) # # # Building ... # set.seed(1000) # hlaParallelAttrBagging(cl, train.HLA, train.geno, nclassifier=100, # auto.save="AutoSaveModel.RData", stop.cluster=TRUE) # model.obj <- get(load("AutoSaveModel.RData")) # model <- hlaModelFromObj(model.obj) # summary(model) ## ----------------------------------------------------------------------------------------------------------- library(HIBAG) # load HLA types and SNP genotypes in the package data(HLA_Type_Table, package="HIBAG") data(HapMap_CEU_Geno, package="HIBAG") # make a list of HLA types hla.id <- "A" hla <- hlaAllele(HLA_Type_Table$sample.id, H1 = HLA_Type_Table[, paste(hla.id, ".1", sep="")], H2 = HLA_Type_Table[, paste(hla.id, ".2", sep="")], locus=hla.id, assembly="hg19") # divide HLA types randomly set.seed(100) hlatab <- hlaSplitAllele(hla, train.prop=0.5) names(hlatab) summary(hlatab$training) summary(hlatab$validation) # SNP predictors within the flanking region on each side region <- 500 # kb snpid <- hlaFlankingSNP(HapMap_CEU_Geno$snp.id, HapMap_CEU_Geno$snp.position, hla.id, region*1000, assembly="hg19") length(snpid) # training and validation genotypes train.geno <- hlaGenoSubset(HapMap_CEU_Geno, snp.sel = match(snpid, HapMap_CEU_Geno$snp.id), samp.sel = match(hlatab$training$value$sample.id, HapMap_CEU_Geno$sample.id)) summary(train.geno) test.geno <- hlaGenoSubset(HapMap_CEU_Geno, samp.sel=match( hlatab$validation$value$sample.id, HapMap_CEU_Geno$sample.id)) ## ----eval=FALSE--------------------------------------------------------------------------------------------- # # train a HIBAG model # set.seed(100) # model <- hlaAttrBagging(hlatab$training, train.geno, nclassifier=100) ## ----echo=FALSE--------------------------------------------------------------------------------------------- mobj <- get(load(system.file("extdata", "OutOfBag.RData", package="HIBAG"))) model <- hlaModelFromObj(mobj) ## ----------------------------------------------------------------------------------------------------------- summary(model) # validation pred <- predict(model, test.geno) summary(pred) # compare comp <- hlaCompareAllele(hlatab$validation, pred, allele.limit=model, call.threshold=0) comp$overall ## ----fig.align="center", fig.height=4, fig.width=5---------------------------------------------------------- # hierarchical cluster analysis d <- hlaDistance(model) p <- hclust(as.dist(d)) plot(p, xlab="HLA alleles") ## ----fig.align="center", fig.height=3.3, fig.width=5-------------------------------------------------------- # violin plot hlaReportPlot(pred, model=model, fig="matching") ## ----fig.align="center", fig.height=3.3, fig.width=5-------------------------------------------------------- hlaReportPlot(pred, hlatab$validation, fig="call.rate") hlaReportPlot(pred, hlatab$validation, fig="call.threshold") ## ----------------------------------------------------------------------------------------------------------- # report overall accuracy, per-allele sensitivity, specificity, etc hlaReport(comp, type="txt") ## ----results='asis'----------------------------------------------------------------------------------------- # report overall accuracy, per-allele sensitivity, specificity, etc hlaReport(comp, type="markdown") ## ----------------------------------------------------------------------------------------------------------- # report overall accuracy, per-allele sensitivity, specificity, etc hlaReport(comp, type="tex", header=FALSE) ## ----eval=FALSE--------------------------------------------------------------------------------------------- # library(HIBAG) # # # make a list of HLA types # hla.id <- "DQA1" # hla <- hlaAllele(HLA_Type_Table$sample.id, # H1 = HLA_Type_Table[, paste(hla.id, ".1", sep="")], # H2 = HLA_Type_Table[, paste(hla.id, ".2", sep="")], # locus = hla.id, assembly = "hg19") # # # training genotypes # region <- 500 # kb # snpid <- hlaFlankingSNP(HapMap_CEU_Geno$snp.id, HapMap_CEU_Geno$snp.position, # hla.id, region*1000, assembly="hg19") # train.geno <- hlaGenoSubset(HapMap_CEU_Geno, # snp.sel = match(snpid, HapMap_CEU_Geno$snp.id), # samp.sel = match(hla$value$sample.id, HapMap_CEU_Geno$sample.id)) # # set.seed(1000) # model <- hlaAttrBagging(hla, train.geno, nclassifier=100) # summary(model) # # # remove unused SNPs and sample IDs from the model # mobj <- hlaPublish(model, # platform = "Illumina 1M Duo", # information = "Training set -- HapMap Phase II", # warning = NULL, # rm.unused.snp=TRUE, anonymize=TRUE) # # save(mobj, file="Your_HIBAG_Model.RData") ## ----eval=FALSE--------------------------------------------------------------------------------------------- # # assume the HIBAG models are stored in R objects: mobj.A, mobj.B, ... # # ModelList <- list() # ModelList[["A"]] <- mobj.A # ModelList[["B"]] <- mobj.B # ... # # # save to an R data file # save(ModelList, file="HIBAG_Model_List.RData") ## ----echo=FALSE--------------------------------------------------------------------------------------------- fn <- system.file("doc", "case_control.txt", package="HIBAG") ## ----------------------------------------------------------------------------------------------------------- dat <- read.table(fn, header=TRUE, stringsAsFactors=FALSE) head(dat) # make an object for hlaAssocTest hla <- hlaAllele(dat$sample.id, H1=dat$A, H2=dat$A.1, locus="A", assembly="hg19", prob=dat$prob) summary(hla) ## ----------------------------------------------------------------------------------------------------------- hlaAssocTest(hla, disease ~ h, data=dat) # 95% confidence interval (h.2.5%, h.97.5%) # show details print(hlaAssocTest(hla, disease ~ h, data=dat, verbose=FALSE)) hlaAssocTest(hla, disease ~ h, data=dat, prob.threshold=0.5) # regression with a threshold hlaAssocTest(hla, disease ~ h, data=dat, showOR=TRUE) # report odd ratio instead of log odd ratio hlaAssocTest(hla, disease ~ h + pc1, data=dat) # confounding variable pc1 hlaAssocTest(hla, disease ~ h, data=dat, model="additive") # use an additive model hlaAssocTest(hla, trait ~ h, data=dat) # continuous outcome ## ----------------------------------------------------------------------------------------------------------- aa <- hlaConvSequence(hla, code="P.code.merge") ## ----------------------------------------------------------------------------------------------------------- head(c(aa$value$allele1, aa$value$allele2)) # show cross tabulation at each amino acid position summary(aa) # association tests hlaAssocTest(aa, disease ~ h, data=dat, model="dominant") # try dominant models hlaAssocTest(aa, disease ~ h, data=dat, model="dominant", prob.threshold=0.5) # try dominant models hlaAssocTest(aa, disease ~ h, data=dat, model="recessive") # try recessive models ## ----------------------------------------------------------------------------------------------------------- sessionInfo()