## ----init, results='hide', echo=FALSE, warning=FALSE, message=FALSE--------
library(knitr)
opts_chunk$set(warning=FALSE, message=FALSE)
BiocStyle::markdown()
## ----install_bioc, eval=FALSE----------------------------------------------
# if (!require("BiocManager"))
# install.packages("BiocManager")
# BiocManager::install('GenomicDataCommons')
## ----libraries, message=FALSE----------------------------------------------
library(GenomicDataCommons)
## ----statusQS--------------------------------------------------------------
GenomicDataCommons::status()
## ----statusCheck-----------------------------------------------------------
stopifnot(GenomicDataCommons::status()$status=="OK")
## ----manifest--------------------------------------------------------------
ge_manifest = files() %>%
filter( cases.project.project_id == 'TCGA-OV') %>%
filter( type == 'gene_expression' ) %>%
filter( analysis.workflow_type == 'HTSeq - Counts') %>%
manifest()
head(ge_manifest)
fnames = lapply(ge_manifest$id[1:20],gdcdata)
## ----downloadQS------------------------------------------------------------
fnames = lapply(ge_manifest$id[1:20],gdcdata)
## ----gdc_clinical----------------------------------------------------------
case_ids = cases() %>% results(size=10) %>% ids()
clindat = gdc_clinical(case_ids)
names(clindat)
## ----clinData--------------------------------------------------------------
head(clindat[["main"]])
head(clindat[["diagnoses"]])
## ----metadataQS------------------------------------------------------------
expands = c("diagnoses","annotations",
"demographic","exposures")
clinResults = cases() %>%
GenomicDataCommons::select(NULL) %>%
GenomicDataCommons::expand(expands) %>%
results(size=50)
str(clinResults[[1]],list.len=6)
# or listviewer::jsonedit(clinResults)
## ----projectquery----------------------------------------------------------
pquery = projects()
## ----pquery----------------------------------------------------------------
str(pquery)
## ----pquerycount-----------------------------------------------------------
pcount = count(pquery)
# or
pcount = pquery %>% count()
pcount
## ----pqueryresults---------------------------------------------------------
presults = pquery %>% results()
## ----presultsstr-----------------------------------------------------------
str(presults)
## ----presultsall-----------------------------------------------------------
length(ids(presults))
presults = pquery %>% results_all()
length(ids(presults))
# includes all records
length(ids(presults)) == count(pquery)
## ----defaultfields---------------------------------------------------------
default_fields('files')
# The number of fields available for files endpoint
length(available_fields('files'))
# The first few fields available for files endpoint
head(available_fields('files'))
## ----selectexample---------------------------------------------------------
# Default fields here
qcases = cases()
qcases$fields
# set up query to use ALL available fields
# Note that checking of fields is done by select()
qcases = cases() %>% GenomicDataCommons::select(available_fields('cases'))
head(qcases$fields)
## ----aggexample------------------------------------------------------------
# total number of files of a specific type
res = files() %>% facet(c('type','data_type')) %>% aggregations()
res$type
## ----allfilesunfiltered----------------------------------------------------
qfiles = files()
qfiles %>% count() # all files
## ----onlyGeneExpression----------------------------------------------------
qfiles = files() %>% filter( type == 'gene_expression')
# here is what the filter looks like after translation
str(get_filter(qfiles))
## ----filtAvailFields-------------------------------------------------------
grep('pro',available_fields('files'),value=TRUE) %>%
head()
## ----filtProgramID---------------------------------------------------------
files() %>%
facet('cases.project.project_id') %>%
aggregations() %>%
head()
## ----filtfinal-------------------------------------------------------------
qfiles = files() %>%
filter( cases.project.project_id == 'TCGA-OV' & type == 'gene_expression')
str(get_filter(qfiles))
qfiles %>% count()
## ----filtChain-------------------------------------------------------------
qfiles2 = files() %>%
filter( cases.project.project_id == 'TCGA-OV') %>%
filter( type == 'gene_expression')
qfiles2 %>% count()
(qfiles %>% count()) == (qfiles2 %>% count()) #TRUE
## ----filtAndManifest-------------------------------------------------------
manifest_df = qfiles %>% manifest()
head(manifest_df)
## ----filterForHTSeqCounts--------------------------------------------------
qfiles = files() %>% filter( ~ cases.project.project_id == 'TCGA-OV' &
type == 'gene_expression' &
analysis.workflow_type == 'HTSeq - Counts')
manifest_df = qfiles %>% manifest()
nrow(manifest_df)
## ----authenNoRun, eval=FALSE-----------------------------------------------
# token = gdc_token()
# transfer(...,token=token)
# # or
# transfer(...,token=get_token())
## ----singlefileDL----------------------------------------------------------
fnames = gdcdata(manifest_df$id[1:2],progress=FALSE)
## ----bulkDL----------------------------------------------------------------
fnames = gdcdata(manifest_df$id[3:10], access_method = 'client')
## ----casesPerProject-------------------------------------------------------
res = cases() %>% facet("project.project_id") %>% aggregations()
head(res)
library(ggplot2)
ggplot(res$project.project_id,aes(x = key, y = doc_count)) +
geom_bar(stat='identity') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## ----casesInTCGA-----------------------------------------------------------
cases() %>% filter(~ project.program.name=='TARGET') %>% count()
## ----casesInTARGET---------------------------------------------------------
cases() %>% filter(~ project.program.name=='TCGA') %>% count()
## ----casesTCGABRCASampleTypes----------------------------------------------
# The need to do the "&" here is a requirement of the
# current version of the GDC API. I have filed a feature
# request to remove this requirement.
resp = cases() %>% filter(~ project.project_id=='TCGA-BRCA' &
project.project_id=='TCGA-BRCA' ) %>%
facet('samples.sample_type') %>% aggregations()
resp$samples.sample_type
## ----casesTCGABRCASolidNormal----------------------------------------------
# The need to do the "&" here is a requirement of the
# current version of the GDC API. I have filed a feature
# request to remove this requirement.
resp = cases() %>% filter(~ project.project_id=='TCGA-BRCA' &
samples.sample_type=='Solid Tissue Normal') %>%
GenomicDataCommons::select(c(default_fields(cases()),'samples.sample_type')) %>%
response_all()
count(resp)
res = resp %>% results()
str(res[1],list.len=6)
head(ids(resp))
## ----filesVCFCount---------------------------------------------------------
res = files() %>% facet('type') %>% aggregations()
res$type
ggplot(res$type,aes(x = key,y = doc_count)) + geom_bar(stat='identity') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## ----filesRNAseqGeneGBM----------------------------------------------------
q = files() %>%
GenomicDataCommons::select(available_fields('files')) %>%
filter(~ cases.project.project_id=='TCGA-GBM' &
data_type=='Gene Expression Quantification')
q %>% facet('analysis.workflow_type') %>% aggregations()
# so need to add another filter
file_ids = q %>% filter(~ cases.project.project_id=='TCGA-GBM' &
data_type=='Gene Expression Quantification' &
analysis.workflow_type == 'HTSeq - Counts') %>%
GenomicDataCommons::select('file_id') %>%
response_all() %>%
ids()
## ----filesRNAseqGeneGBMforBAM----------------------------------------------
q = files() %>%
GenomicDataCommons::select(available_fields('files')) %>%
filter(~ cases.project.project_id == 'TCGA-GBM' &
data_type == 'Aligned Reads' &
experimental_strategy == 'RNA-Seq' &
data_format == 'BAM')
file_ids = q %>% response_all() %>% ids()
## ----slicing10, eval=FALSE-------------------------------------------------
# bamfile = slicing(file_ids[1],regions="chr12:6534405-6538375",token=gdc_token())
# library(GenomicAlignments)
# aligns = readGAlignments(bamfile)
## ----sessionInfo-----------------------------------------------------------
sessionInfo()