## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set(dpi = 300) knitr::opts_chunk$set(cache=FALSE) ## ---- echo = FALSE,hide=TRUE, message=FALSE,warning=FALSE--------------------- devtools::load_all(".") ## ----message=FALSE, warning=FALSE, include=FALSE------------------------------ library(SummarizedExperiment) library(dplyr) library(DT) ## ---- eval = FALSE------------------------------------------------------------ # # You can define a list of samples to query and download providing relative TCGA barcodes. # listSamples <- c("TCGA-E9-A1NG-11A-52R-A14M-07","TCGA-BH-A1FC-11A-32R-A13Q-07", # "TCGA-A7-A13G-11A-51R-A13Q-07","TCGA-BH-A0DK-11A-13R-A089-07", # "TCGA-E9-A1RH-11A-34R-A169-07","TCGA-BH-A0AU-01A-11R-A12P-07", # "TCGA-C8-A1HJ-01A-11R-A13Q-07","TCGA-A7-A13D-01A-13R-A12P-07", # "TCGA-A2-A0CV-01A-31R-A115-07","TCGA-AQ-A0Y5-01A-11R-A14M-07") # # # Query platform Illumina HiSeq with a list of barcode # query <- GDCquery(project = "TCGA-BRCA", # data.category = "Gene expression", # data.type = "Gene expression quantification", # experimental.strategy = "RNA-Seq", # platform = "Illumina HiSeq", # file.type = "results", # barcode = listSamples, # legacy = TRUE) # # # Download a list of barcodes with platform IlluminaHiSeq_RNASeqV2 # GDCdownload(query) # # # Prepare expression matrix with geneID in the rows and samples (barcode) in the columns # # rsem.genes.results as values # BRCARnaseqSE <- GDCprepare(query) # # BRCAMatrix <- assay(BRCARnaseqSE,"raw_count") # or BRCAMatrix <- assay(BRCARnaseqSE,"raw_count") # # # For gene expression if you need to see a boxplot correlation and AAIC plot to define outliers you can run # BRCARnaseq_CorOutliers <- TCGAanalyze_Preprocessing(BRCARnaseqSE) ## ---- eval = TRUE, echo = FALSE,size = 8-------------------------------------- library(TCGAbiolinks) dataGE <- dataBRCA[sample(rownames(dataBRCA),10),sample(colnames(dataBRCA),7)] knitr::kable(dataGE[1:10,2:3], digits = 2, caption = "Example of a matrix of gene expression (10 genes in rows and 2 samples in columns)", row.names = TRUE) ## ---- fig.width=6, fig.height=4, echo=FALSE, fig.align="center"--------------- library(png) library(grid) img <- readPNG("PreprocessingOutput.png") grid.raster(img) ## ---- eval = FALSE------------------------------------------------------------ # # Downstream analysis using gene expression data # # TCGA samples from IlluminaHiSeq_RNASeqV2 with type rsem.genes.results # # save(dataBRCA, geneInfo , file = "dataGeneExpression.rda") # library(TCGAbiolinks) # # # normalization of genes # dataNorm <- TCGAanalyze_Normalization(tabDF = dataBRCA, geneInfo = geneInfo) # # # quantile filter of genes # dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm, # method = "quantile", # qnt.cut = 0.25) # # # selection of normal samples "NT" # samplesNT <- TCGAquery_SampleTypes(barcode = colnames(dataFilt), # typesample = c("NT")) # # # selection of tumor samples "TP" # samplesTP <- TCGAquery_SampleTypes(barcode = colnames(dataFilt), # typesample = c("TP")) # # # Diff.expr.analysis (DEA) # dataDEGs <- TCGAanalyze_DEA(mat1 = dataFilt[,samplesNT], # mat2 = dataFilt[,samplesTP], # Cond1type = "Normal", # Cond2type = "Tumor", # fdr.cut = 0.01 , # logFC.cut = 1, # method = "glmLRT") # # # DEGs table with expression values in normal and tumor samples # dataDEGsFiltLevel <- TCGAanalyze_LevelTab(dataDEGs,"Tumor","Normal", # dataFilt[,samplesTP],dataFilt[,samplesNT]) # ## ---- eval = TRUE, echo = FALSE----------------------------------------------- library(TCGAbiolinks) dataDEGsFiltLevel$FDR <- format(dataDEGsFiltLevel$FDR, scientific = TRUE) knitr::kable(dataDEGsFiltLevel[1:10,], digits = 2, caption = "Table of DEGs after DEA", row.names = FALSE) ## ----eval=FALSE,echo=TRUE,message=FALSE,warning=FALSE------------------------- # # CancerProject <- "TCGA-BRCA" # DataDirectory <- paste0("../GDC/",gsub("-","_",CancerProject)) # FileNameData <- paste0(DataDirectory, "_","HTSeq_Counts",".rda") # # query <- GDCquery(project = CancerProject, # data.category = "Transcriptome Profiling", # data.type = "Gene Expression Quantification", # workflow.type = "HTSeq - Counts") # # samplesDown <- getResults(query,cols=c("cases")) # # dataSmTP <- TCGAquery_SampleTypes(barcode = samplesDown, # typesample = "TP") # # dataSmNT <- TCGAquery_SampleTypes(barcode = samplesDown, # typesample = "NT") # dataSmTP_short <- dataSmTP[1:10] # dataSmNT_short <- dataSmNT[1:10] # # queryDown <- GDCquery(project = CancerProject, # data.category = "Transcriptome Profiling", # data.type = "Gene Expression Quantification", # workflow.type = "HTSeq - Counts", # barcode = c(dataSmTP_short, dataSmNT_short)) # # GDCdownload(query = queryDown, # directory = DataDirectory) # # dataPrep <- GDCprepare(query = queryDown, # save = TRUE, # directory = DataDirectory, # save.filename = FileNameData) # # dataPrep <- TCGAanalyze_Preprocessing(object = dataPrep, # cor.cut = 0.6, # datatype = "HTSeq - Counts") # # dataNorm <- TCGAanalyze_Normalization(tabDF = dataPrep, # geneInfo = geneInfoHT, # method = "gcContent") # # boxplot(dataPrep, outline = FALSE) # # boxplot(dataNorm, outline = FALSE) # # dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm, # method = "quantile", # qnt.cut = 0.25) # # dataDEGs <- TCGAanalyze_DEA(mat1 = dataFilt[,dataSmTP_short], # mat2 = dataFilt[,dataSmNT_short], # Cond1type = "Normal", # Cond2type = "Tumor", # fdr.cut = 0.01 , # logFC.cut = 1, # method = "glmLRT") # ## ----eval=FALSE,echo=TRUE,message=FALSE,warning=FALSE------------------------- # require(TCGAbiolinks) # # CancerProject <- "TCGA-BRCA" # DataDirectory <- paste0("../GDC/",gsub("-","_",CancerProject)) # FileNameData <- paste0(DataDirectory, "_","miRNA_gene_quantification",".rda") # # query.miR <- GDCquery(project = CancerProject, # data.category = "Gene expression", # data.type = "miRNA gene quantification", # file.type = "hg19.mirna", # legacy = TRUE) # # samplesDown.miR <- getResults(query.miR,cols=c("cases")) # # dataSmTP.miR <- TCGAquery_SampleTypes(barcode = samplesDown.miR, # typesample = "TP") # # dataSmNT.miR <- TCGAquery_SampleTypes(barcode = samplesDown.miR, # typesample = "NT") # dataSmTP_short.miR <- dataSmTP.miR[1:10] # dataSmNT_short.miR <- dataSmNT.miR[1:10] # # queryDown.miR <- GDCquery(project = CancerProject, # data.category = "Gene expression", # data.type = "miRNA gene quantification", # file.type = "hg19.mirna", # legacy = TRUE, # barcode = c(dataSmTP_short.miR, dataSmNT_short.miR)) # # GDCdownload(query = queryDown.miR, # directory = DataDirectory) # # dataAssy.miR <- GDCprepare(query = queryDown.miR, # save = TRUE, # save.filename = FileNameData, # summarizedExperiment = TRUE, # directory =DataDirectory ) # rownames(dataAssy.miR) <- dataAssy.miR$miRNA_ID # # # using read_count's data # read_countData <- colnames(dataAssy.miR)[grep("count", colnames(dataAssy.miR))] # dataAssy.miR <- dataAssy.miR[,read_countData] # colnames(dataAssy.miR) <- gsub("read_count_","", colnames(dataAssy.miR)) # # dataFilt <- TCGAanalyze_Filtering(tabDF = dataAssy.miR, # method = "quantile", # qnt.cut = 0.25) # # dataDEGs <- TCGAanalyze_DEA(mat1 = dataFilt[,dataSmNT_short.miR], # mat2 = dataFilt[,dataSmTP_short.miR], # Cond1type = "Normal", # Cond2type = "Tumor", # fdr.cut = 0.01 , # logFC.cut = 1, # method = "glmLRT") # ## ---- eval = FALSE------------------------------------------------------------ # library(TCGAbiolinks) # # Enrichment Analysis EA # # Gene Ontology (GO) and Pathway enrichment by DEGs list # Genelist <- rownames(dataDEGsFiltLevel) # # system.time(ansEA <- TCGAanalyze_EAcomplete(TFname="DEA genes Normal Vs Tumor",Genelist)) # # # Enrichment Analysis EA (TCGAVisualize) # # Gene Ontology (GO) and Pathway enrichment barPlot # # TCGAvisualize_EAbarplot(tf = rownames(ansEA$ResBP), # GOBPTab = ansEA$ResBP, # GOCCTab = ansEA$ResCC, # GOMFTab = ansEA$ResMF, # PathTab = ansEA$ResPat, # nRGTab = Genelist, # nBar = 10) # ## ---- fig.width=6, fig.height=4, echo=FALSE, fig.align="center"--------------- library(png) library(grid) img <- readPNG("EAplot.png") grid.raster(img) ## ---- eval = FALSE------------------------------------------------------------ # clin.gbm <- GDCquery_clinic("TCGA-GBM", "clinical") # TCGAanalyze_survival(clin.gbm, # "gender", # main = "TCGA Set\n GBM",height = 10, width=10) ## ---- fig.width=6, fig.height=4, echo = FALSE, fig.align="center",hide=TRUE, message=FALSE,warning=FALSE---- library(png) library(grid) img <- readPNG("case2_surv.png") grid.raster(img) ## ---- eval = FALSE------------------------------------------------------------ # library(TCGAbiolinks) # # Survival Analysis SA # # clinical_patient_Cancer <- GDCquery_clinic("TCGA-BRCA","clinical") # dataBRCAcomplete <- log2(BRCA_rnaseqv2) # # tokenStop<- 1 # # tabSurvKMcomplete <- NULL # # for( i in 1: round(nrow(dataBRCAcomplete)/100)){ # message( paste( i, "of ", round(nrow(dataBRCAcomplete)/100))) # tokenStart <- tokenStop # tokenStop <-100*i # tabSurvKM<-TCGAanalyze_SurvivalKM(clinical_patient_Cancer, # dataBRCAcomplete, # Genelist = rownames(dataBRCAcomplete)[tokenStart:tokenStop], # Survresult = F, # ThreshTop=0.67, # ThreshDown=0.33) # # tabSurvKMcomplete <- rbind(tabSurvKMcomplete,tabSurvKM) # } # # tabSurvKMcomplete <- tabSurvKMcomplete[tabSurvKMcomplete$pvalue < 0.01,] # tabSurvKMcomplete <- tabSurvKMcomplete[order(tabSurvKMcomplete$pvalue, decreasing=F),] # # tabSurvKMcompleteDEGs <- tabSurvKMcomplete[ # rownames(tabSurvKMcomplete) %in% dataDEGsFiltLevel$mRNA, # ] ## ---- fig.width=6, fig.height=4, echo=FALSE, fig.align="center"--------------- tabSurvKMcompleteDEGs$pvalue <- format(tabSurvKMcompleteDEGs$pvalue, scientific = TRUE) knitr::kable(tabSurvKMcompleteDEGs[1:5,1:4], digits = 2, caption = "Table KM-survival genes after SA", row.names = TRUE) knitr::kable(tabSurvKMcompleteDEGs[1:5,5:7], digits = 2, row.names = TRUE) ## ---- eval = FALSE------------------------------------------------------------ # data <- TCGAanalyze_DMR(data, groupCol = "methylation_subtype", # group1 = "CIMP.H", # group2="CIMP.L", # p.cut = 10^-5, # diffmean.cut = 0.25, # legend = "State", # plot.filename = "coad_CIMPHvsCIMPL_metvolcano.png") ## ---- fig.width=6, fig.height=4, echo = FALSE, fig.align="center",hide=TRUE, message=FALSE,warning=FALSE---- library(png) library(grid) img <- readPNG("figure5met.png") grid.raster(img) ## ---- fig.width=6, fig.height=4, echo = FALSE, fig.align="center",hide=TRUE, message=FALSE,warning=FALSE---- library(jpeg) library(grid) img <- readJPEG("case2_Heatmap.jpg") grid.raster(img) ## ---- eval = FALSE------------------------------------------------------------ # # normalization of genes # dataNorm <- TCGAbiolinks::TCGAanalyze_Normalization(dataBRCA, geneInfo) # # # quantile filter of genes # dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm, # method = "quantile", # qnt.cut = 0.25) # # # selection of normal samples "NT" # group1 <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("NT")) # # selection of normal samples "TP" # group2 <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("TP")) # # # Principal Component Analysis plot for ntop selected DEGs # pca <- TCGAvisualize_PCA(dataFilt,dataDEGsFiltLevel, ntopgenes = 200, group1, group2) ## ---- fig.width=6, fig.height=4, echo=FALSE, fig.align="center"--------------- library(png) library(grid) img <- readPNG("PCAtop200DEGs.png") grid.raster(img) ## ----include=FALSE,echo=FALSE, fig.height=5, message=FALSE, warning=FALSE----- data <- tryCatch({ query <- GDCquery(project = "TCGA-GBM", data.category = "DNA methylation", platform = "Illumina Human Methylation 27", legacy = TRUE, barcode = c("TCGA-02-0058-01A-01D-0186-05", "TCGA-12-1597-01B-01D-0915-05", "TCGA-12-0829-01A-01D-0392-05", "TCGA-06-0155-01B-01D-0521-05", "TCGA-02-0099-01A-01D-0199-05", "TCGA-19-4068-01A-01D-1228-05", "TCGA-19-1788-01A-01D-0595-05", "TCGA-16-0848-01A-01D-0392-05")) GDCdownload(query, method = "api", chunks.per.download = 2) GDCdownload(query, method = "api") data <- GDCprepare(query) data }, error = function(e) { nrows <- 200; ncols <- 21 counts <- matrix(runif(nrows * ncols, 0, 1), nrows) rowRanges <- GenomicRanges::GRanges(rep(c("chr1", "chr2"), c(50, 150)), IRanges::IRanges(floor(runif(200, 1e5, 1e6)), width=100), strand=sample(c("+", "-"), 200, TRUE), feature_id=sprintf("ID%03d", 1:200)) colData <- S4Vectors::DataFrame(shortLetterCode=rep(c("NT", "TP","TR"), 7), row.names=LETTERS[1:21], subtype_Pan.Glioma.DNA.Methylation.Cluster=rep(c("group1","group2","group3"),c(7,7,7)), vital_status=rep(c("DEAD","ALIVE","DEAD"),7)) data <- SummarizedExperiment::SummarizedExperiment( assays=S4Vectors::SimpleList(counts=counts), rowRanges=rowRanges, colData=colData) data } ) ## ---- eval=FALSE, echo=TRUE, fig.height=5, message=FALSE, warning=FALSE------- # query <- GDCquery(project = "TCGA-GBM", # data.category = "DNA methylation", # platform = "Illumina Human Methylation 27", # legacy = TRUE, # barcode = c("TCGA-02-0058-01A-01D-0186-05", "TCGA-12-1597-01B-01D-0915-05", # "TCGA-12-0829-01A-01D-0392-05", "TCGA-06-0155-01B-01D-0521-05", # "TCGA-02-0099-01A-01D-0199-05", "TCGA-19-4068-01A-01D-1228-05", # "TCGA-19-1788-01A-01D-0595-05", "TCGA-16-0848-01A-01D-0392-05")) # GDCdownload(query, method = "api") # data <- GDCprepare(query) ## ---- echo=TRUE, fig.height=4, fig.width=3, out.width = 3, out.heigh=5, message=FALSE, warning=FALSE---- # "shortLetterCode" is a column in the SummarizedExperiment::colData(data) matrix TCGAvisualize_meanMethylation(data, groupCol = "shortLetterCode",filename = NULL) ## ---- echo=TRUE, fig.height=4, fig.width=7, out.width = 7, out.heigh=5, message=FALSE, warning=FALSE---- # setting y limits: lower 0, upper 1 TCGAvisualize_meanMethylation(data,groupCol = "shortLetterCode", filename = NULL, y.limits = c(0,1)) # setting y limits: lower 0 TCGAvisualize_meanMethylation(data,groupCol = "shortLetterCode", filename = NULL, y.limits = 0) # Changing shapes of jitters to show subgroups TCGAvisualize_meanMethylation(data, groupCol = "subtype_Pan.Glioma.DNA.Methylation.Cluster", subgroupCol ="vital_status", filename = NULL) # Sorting bars by descending mean methylation TCGAvisualize_meanMethylation(data, groupCol = "subtype_Pan.Glioma.DNA.Methylation.Cluster", sort="mean.desc", filename=NULL) # Sorting bars by asc mean methylation TCGAvisualize_meanMethylation(data, groupCol = "subtype_Pan.Glioma.DNA.Methylation.Cluster", sort = "mean.asc", filename=NULL) TCGAvisualize_meanMethylation(data, groupCol = "vital_status", sort = "mean.asc", filename=NULL) ## ---- eval = FALSE------------------------------------------------------------ # starburst <- TCGAvisualize_starburst(coad.SummarizeExperiment, # different.experssion.analysis.data, # group1 = "CIMP.H", # group2 = "CIMP.L", # met.platform = "450K", # genome = "hg19", # met.p.cut = 10^-5, # exp.p.cut = 10^-5, # names = TRUE) ## ---- fig.width=6, fig.height=4, echo = FALSE, fig.align="center",hide=TRUE, message=FALSE,warning=FALSE---- library(png) library(grid) img <- readPNG("figure5star.png") grid.raster(img) ## ----sessionInfo-------------------------------------------------------------- sessionInfo()