## ----style, echo = FALSE, results = 'asis'---------------------------------
BiocStyle::markdown()

## ----global_options, include=FALSE--------------------------------------------------------------------------
knitr::opts_chunk$set(fig.width=10, fig.height=7, warning=FALSE, message=FALSE)
options(width=110)
set.seed(123)

## -----------------------------------------------------------------------------------------------------------
## load ABAEnrichment package
library(ABAEnrichment)
## create input data.frame with candidate genes
gene_ids = c('NCAPG', 'APOL4', 'NGFR', 'NXPH4', 'C21orf59', 'CACNG2', 'AGTR1',
    'ANO1', 'BTBD3', 'MTUS1', 'CALB1', 'GYG1', 'PAX2')
input_hyper = data.frame(gene_ids, is_candidate=1)
head(input_hyper)

## ---- eval=FALSE--------------------------------------------------------------------------------------------
#  ## run enrichment analyses with default parameters
#  ## for the adult and developing human brain
#  res_adult = aba_enrich(input_hyper, dataset='adult')
#  res_devel = aba_enrich(input_hyper, dataset='5_stages')

## ----results='hide'-----------------------------------------------------------------------------------------
## run enrichment analysis with less cutoffs and randomsets
## to save computation time
res_devel = aba_enrich(input_hyper, dataset='5_stages',
    cutoff_quantiles=c(0.5,0.7,0.9), n_randsets=100)

## -----------------------------------------------------------------------------------------------------------
## extract first element from the output list, which contains the statistics
fwers_devel = res_devel[[1]]
## see results for the brain regions with highest enrichment
## for children (3-11 yrs, age_category 3)
fwers_3 = fwers_devel[fwers_devel[,1]==3, ]
head(fwers_3)

## -----------------------------------------------------------------------------------------------------------
res_devel[2:3]

## ----eval=FALSE---------------------------------------------------------------------------------------------
#  ## run enrichment analysis, with randomsets dependent on gene length
#  res_len = aba_enrich(input_hyper, gene_len=TRUE)
#  ## run the same analysis using gene-coordinates
#  ## from GRCh38 instead of the default GRCh37
#  res_len_grch38 = aba_enrich(input_hyper, gene_len=TRUE, ref_genome='grch38')

## -----------------------------------------------------------------------------------------------------------
## create input vector with a candidate region on chromosome 8
## and background regions on chromosome 7, 8 and 9
regions = c('8:82000000-83000000', '7:1300000-56800000',
    '7:74900000-148700000', '8:7400000-44300000', '8:47600000-146300000',
    '9:0-39200000', '9:69700000-140200000')
is_candidate = c(1, rep(0,6))
input_region = data.frame(regions, is_candidate)

## ----results='hide'-----------------------------------------------------------------------------------------
## run enrichment analysis for the adult human brain
res_region = aba_enrich(input_region, dataset='adult',
    cutoff_quantiles=c(0.5,0.7,0.9), n_randsets=100)

## -----------------------------------------------------------------------------------------------------------
## look at the results from the enrichment analysis
fwers_region = res_region[[1]]
head(fwers_region)
## see which genes are located in the candidate region
input_genes = res_region[[2]]
candidate_genes = input_genes[input_genes[,2]==1,]
candidate_genes

## ----echo=FALSE---------------------------------------------------------------------------------------------
gene = c('NCAPG','APOL4','NGFR','NXPH4','C21orf59','CACNG2')
chr = c('chr4', 'chr22', 'chr17', 'chr12', 'chr21', 'chr22')
start = c(17812436, 36585176, 47572655, 57610578, 33954510, 36956916)
end = c(17846487, 36600879, 47592382, 57620232, 33984918, 37098690)
custom_coords = data.frame(gene, chr, start, end, stringsAsFactors=FALSE)

## -----------------------------------------------------------------------------------------------------------
## example for a dataframe with custom gene-coordinates
head(custom_coords)

## ----eval=FALSE---------------------------------------------------------------------------------------------
#  ## use correction for gene-length based on custom gene-coordinates
#  res_len_cc = aba_enrich(input_hyper, gene_len=TRUE, gene_coords=custom_coords)

## -----------------------------------------------------------------------------------------------------------
## assign random scores to the genes used above
scores = sample(1:50, length(gene_ids))
input_wicox = data.frame(gene_ids, scores)
head(input_wicox)

## ---- results='hide'----------------------------------------------------------------------------------------
## test for enrichment of expressed genes with high scores in the adult brain
## using the Wilcoxon rank-sum test
res_wilcox = aba_enrich(input_wicox, test='wilcoxon',
    cutoff_quantiles=c(0.5,0.7,0.9), n_randsets=100)

## -----------------------------------------------------------------------------------------------------------
head(res_wilcox[[1]])

## -----------------------------------------------------------------------------------------------------------
## create a toy example dataset with two counts per gene
high_A_genes = c('RFFL', 'NTS', 'LIPE', 'GALNT6', 'GSN', 'BTBD16', 'CERS2')
low_A_genes = c('GDA', 'ENC1', 'EGR4', 'VIPR1', 'DOC2A', 'OASL', 'FRY', 'NAV3')
A_counts = c(sample(20:30, length(high_A_genes)),
    sample(5:15, length(low_A_genes)))
B_counts = c(sample(5:15, length(high_A_genes)),
    sample(20:30, length(low_A_genes)))
input_binom = data.frame(gene_ids=c(high_A_genes, low_A_genes),
    A_counts, B_counts)
head(input_binom)

## -----------------------------------------------------------------------------------------------------------
## run binomial test
res_binom = aba_enrich(input_binom, cutoff_quantiles=c(0.2,0.9),
    test='binomial', n_randsets=100, silent=TRUE)
head(res_binom[[1]])

## -----------------------------------------------------------------------------------------------------------
## create a toy example with four counts per gene
high_substi_genes = c('RFFL', 'NTS', 'LIPE', 'GALNT6', 'GSN', 'BTBD16', 'CERS2')
low_substi_genes = c('ENC1', 'EGR4', 'NPTX1', 'DOC2A', 'OASL', 'FRY', 'NAV3')
subs_non_syn = c(sample(5:15, length(high_substi_genes), replace=TRUE),
    sample(0:5, length(low_substi_genes), replace=TRUE))
subs_syn = sample(5:15, length(c(high_substi_genes, low_substi_genes)),
    replace=TRUE)
vari_non_syn = c(sample(0:5, length(high_substi_genes), replace=TRUE),
    sample(0:10, length(low_substi_genes), replace=TRUE))
vari_syn = sample(5:15, length(c(high_substi_genes, low_substi_genes)),
    replace=TRUE)
input_conti = data.frame(gene_ids=c(high_substi_genes, low_substi_genes),
    subs_non_syn, subs_syn, vari_non_syn, vari_syn)
head(input_conti)

## the corresponding contingency table for the first gene would be:
matrix(input_conti[1, 2:5], ncol=2, dimnames=list(c('non-synonymous',
    'synonymous'), c('substitution','variable')))

## ---- results='hide'----------------------------------------------------------------------------------------
res_conti = aba_enrich(input_conti, test='contingency',
    cutoff_quantiles=c(0.7,0.8,0.9), n_randset=100)


## -----------------------------------------------------------------------------------------------------------
head(res_conti[[1]])

## -----------------------------------------------------------------------------------------------------------
## get expression data for the top 5 regions
## and all input genes
## of the last aba_enrich analysis (res_conti)
top_regions = res_conti[[1]][1:5, 'structure_id']
gene_ids = res_conti[[2]][,1]
expr = get_expression(structure_ids=top_regions, gene_ids=gene_ids,
    dataset='adult')
expr[,1:6]

## -----------------------------------------------------------------------------------------------------------
get_expression(structure_ids=c('Allen:10657','Allen:10208'),
    gene_ids=c('RFFL', 'NTS', 'LIPE'), dataset='5_stages')

## -----------------------------------------------------------------------------------------------------------
## plot the expression data from above
plot_expression(expr, main="microarray expression data for adult brain")

## -----------------------------------------------------------------------------------------------------------
## plot expression data for the top 5 regions
## of age-category 2 and all input genes
## of the developmental stage aba_enrich analysis above (res_devel)

## extract brain regions
devel_stats = res_devel[[1]]
devel_stats_2 = devel_stats[devel_stats$age_category==2,]
top_regions_2 = devel_stats_2[1:5,'structure_id']
## extract genes
genes = res_devel[[2]][,1]
## get expression for all 5 age categories
expr_all = get_expression(top_regions_2, genes, "5_stages")
## subset to age-cateogry 2
expr_2 = expr_all[["age_category_2"]]
## plot heatmap
plot_expression(expr_2, main="RPKM from RNA-seq for age_category_2")

## -----------------------------------------------------------------------------------------------------------
## plot expression data for the top 5 regions
## and all input genes
## from the 2x2-contingency table analysis (res_conti)

## extract brain regions
top_regions = res_conti[[1]][1:5, 'structure_id']
## extract genes
gene_ids = res_conti[[2]][,1]
## get expression 
expr = get_expression(structure_ids=top_regions, gene_ids=gene_ids, dataset='adult')
## plot expression with sidebar
plot_expression(expr, gene_vars=res_conti[[2]], main="heatmap with sidebar")

## -----------------------------------------------------------------------------------------------------------
## use heatmap.2 to allow adjusting other parameters like color
gplots::heatmap.2(expr, scale="none", col=gplots::greenred(100),
    main="custom heatmap", trace="none",
    keysize=1.4, key.xlab="expression",
    dendrogram="row", margins=c(6, 8))

## -----------------------------------------------------------------------------------------------------------
## get IDs of the substructures of the frontal neocortex (Allen:10161)
## for which expression data are available
subs = get_sampled_substructures('Allen:10161')
subs
## get the full name of those substructures
get_name(subs)
## get the superstructures of the frontal neocortex (from general to special)
supers = get_superstructures('Allen:10161')
supers
## get the full names of those superstructures
get_name(supers)

## -----------------------------------------------------------------------------------------------------------
## get structure IDs of brain regions that contain 'accumbens' in their name
get_id('accumbens')
## get structure IDs of brain regions that contain 'telencephalon' in their name
get_id('telencephalon')

## -----------------------------------------------------------------------------------------------------------
## get all brain regions included in ABAEnrichment together with thier IDs
all_regions = get_id('')
head(all_regions)

## ---- results='hide'----------------------------------------------------------------------------------------
## run an enrichment analysis with 7 candidate and 7 background genes
## for the developing brain
gene_ids = c('FOXJ1', 'NTS', 'LTBP1', 'STON2', 'KCNJ6', 'AGT', 
    'ANO3', 'TTR', 'ELAVL4', 'BEAN1', 'TOX', 'EPN3', 'PAX2', 'KLHL1')
is_candidate = rep(c(1,0), each=7)
input_hyper = data.frame(gene_ids, is_candidate)
res = aba_enrich(input_hyper, dataset='5_stages',
    cutoff_quantiles=c(0.5,0.7,0.9), n_randsets=100)

## -----------------------------------------------------------------------------------------------------------
head(res[[1]])
## see which candidate genes are annotated to brain regions with a FWER < 0.01
anno = get_annotated_genes(res, fwer_threshold=0.1)
anno

## -----------------------------------------------------------------------------------------------------------
## find out which of the above genes have expression
## above the 70% and 90% expression-cutoff
## in the basal ganglia of the adult human brain (Allen:4276)
anno2 = get_annotated_genes(structure_ids='Allen:4276', dataset='adult', 
    cutoff_quantiles=c(0.7,0.9), genes=gene_ids)
anno2

## -----------------------------------------------------------------------------------------------------------
## use previously defined input genes dataframe
head(input_hyper)

## ----results='hide'-----------------------------------------------------------------------------------------
## run enrichment analysis with developmental effect score
res_dev_effect = aba_enrich(input_hyper, dataset='dev_effect',
    cutoff_quantiles=c(0.5,0.7,0.9), n_randsets=100)

## -----------------------------------------------------------------------------------------------------------
## see the 5 brain regions with the lowest FWERs
top_regions = res_dev_effect[[1]][1:5, ]
top_regions

## ---- eval=F------------------------------------------------------------------------------------------------
#  ## plot developmental effect score of the 5 brain regions with the lowest FWERs
#  plot_expression(top_regions[ ,'structure_id'])

## -----------------------------------------------------------------------------------------------------------
sessionInfo()