## ----eval=FALSE--------------------------------------------------------------- # # suppressWarnings(suppressMessages(require(netDx))) # suppressMessages(require(curatedTCGAData)) # # # fetch data for example # brca <- suppressMessages( # curatedTCGAData("BRCA",c("mRNAArray"),FALSE, version="1.1.38")) # # # reformat for this example # staget <- sub("[abcd]","", # sub("t","",colData(brca)$pathology_T_stage)) # staget <- suppressWarnings(as.integer(staget)) # colData(brca)$STAGE <- staget # # pam50 <- colData(brca)$PAM50.mRNA # # binarize class # pam50[which(!pam50 %in% "Luminal A")] <- "notLumA" # pam50[which(pam50 %in% "Luminal A")] <- "LumA" # colData(brca)$pam_mod <- pam50 # tmp <- colData(brca)$PAM50.mRNA # idx <- union(which(tmp %in% c("Normal-like", # "Luminal B","HER2-enriched")), # which(is.na(staget))) # pID <- colData(brca)$patientID # tokeep <- setdiff(pID, pID[idx]) # brca <- brca[,tokeep,] # # remove duplicate assays mapped to the same sample # smp <- sampleMap(brca) # samps <- smp[which(smp$assay=="BRCA_mRNAArray-20160128"),] # notdup <- samps[which(!duplicated(samps$primary)),"colname"] # brca[[1]] <- suppressMessages(brca[[1]][,notdup]) # pam50 <- colData(brca)$pam_mod # pheno <- colData(brca) # # # create holdout set # set.seed(123) # make reproducible # idx_holdout <- c( # sample(which(pam50 == "LumA"),10,F), # sample(which(pam50 == "notLumA"),10,F) # ) # holdout <- brca[,rownames(pheno)[idx_holdout]] # colData(holdout)$ID <- as.character(colData(holdout)$patientID) # colData(holdout)$STATUS <- colData(holdout)$pam_mod # # # remove holdout samples from training set # tokeep <- setdiff(1:length(pam50),idx_holdout) # brca <- brca[,rownames(pheno)[tokeep]] # pID <- as.character(colData(brca)$patientID) # colData(brca)$ID <- pID # colData(brca)$STATUS <- colData(brca)$pam_mod # # # feature design # # genes in mRNA data are grouped by pathways # pathList <- readPathways(fetchPathwayDefinitions("January",2018)) # # groupList <- list() # groupList[["BRCA_mRNAArray-20160128"]] <- pathList[1:3] # # clinical data is not grouped; each variable is its own feature # groupList[["clinical"]] <- list( # age="patient.age_at_initial_pathologic_diagnosis", # stage="STAGE" # ) # # # define function to create nets # makeNets <- function(dataList, groupList, netDir, ...) { # netList <- c() # initialize before is.null() check # # make RNA nets (NOTE: the check for is.null() is important!) # # (Pearson correlation) # if (!is.null(groupList[["BRCA_mRNAArray-20160128"]])) { # netList <- makePSN_NamedMatrix( # dataList[["BRCA_mRNAArray-20160128"]], # rownames(dataList[["BRCA_mRNAArray-20160128"]]), # groupList[["BRCA_mRNAArray-20160128"]], # netDir, verbose = FALSE, # writeProfiles = TRUE, runSerially=TRUE, ...) # } # # # make clinical nets (normalized difference) # netList2 <- c() # if (!is.null(groupList[["clinical"]])) { # netList2 <- makePSN_NamedMatrix(dataList$clinical, # rownames(dataList$clinical), # groupList[["clinical"]], netDir, # simMetric = "custom", # customFunc = normDiff, # custom function # writeProfiles = FALSE, # sparsify = TRUE, verbose = TRUE, runSerially=TRUE, ...) # } # netList <- c(unlist(netList), unlist(netList2)) # return(netList) # } # # # run predictor # set.seed(42) # make results reproducible # outDir <- paste(tempdir(),randAlphanumString(), # "pred_output",sep=getFileSep()) # if (file.exists(outDir)) unlink(outDir,recursive=TRUE) # numSplits <- 2L # out <- suppressMessages( # buildPredictor( # dataList=brca,groupList=groupList, # makeNetFunc=makeNets, # outDir=outDir, # numSplits=numSplits, # featScoreMax=2L, # featSelCutoff=1L, # numCores=1L,debugMode=FALSE, # logging="none") # ) # # # compile consistently high-scoring functions # featScores <- list() # feature scores per class # st <- unique(colData(brca)$STATUS) # for (k in 1:numSplits) { # # feature scores # for (cur in unique(st)) { # if (is.null(featScores[[cur]])) featScores[[cur]] <- list() # tmp <- out[[sprintf("Split%i",k)]] # tmp2 <- tmp[["featureScores"]][[cur]] # colnames(tmp2) <- c("PATHWAY_NAME","SCORE") # featScores[[cur]][[sprintf("Split%i",k)]] <- tmp2 # } # } # featScores2 <- lapply(featScores, getNetConsensus) # featSelNet <- lapply(featScores2, function(x) { # callFeatSel(x, fsCutoff=1, fsPctPass=0) # }) # # # predict performance on holdout set # outDir <- paste(tempdir(), randAlphanumString(), # sep = getFileSep()) # if (file.exists(outDir)) unlink(outDir,recursive=TRUE) # dir.create(outDir) # predModel <- suppressMessages( # predict(brca, holdout, groupList, # featSelNet, makeNets, # outDir, verbose = FALSE) # ) # # # compute performance for prediction # perf <- getPerformance(predModel, c("LumA", "notLumA")) # message(sprintf("AUROC=%1.2f", perf$auroc)) # message(sprintf("AUPR=%1.2f", perf$aupr)) # message(sprintf("Accuracy=%1.1f%%", perf$acc)) # # # plot ROC and PR curve # plotPerf_multi(list(perf$rocCurve), # plotTitle = sprintf( # "BRCA Validation: %i samples", # nrow(colData(holdout)))) # plotPerf_multi(list(perf$prCurve), # plotType = "PR", # plotTitle = sprintf( # "BRCA Validation: %i samples", # nrow(colData(holdout)))) # ## ----eval=TRUE---------------------------------------------------------------- suppressWarnings(suppressMessages(require(netDx))) suppressMessages(require(curatedTCGAData)) brca <- suppressMessages( curatedTCGAData("BRCA",c("mRNAArray"),FALSE, version="1.1.38")) staget <- sub("[abcd]","", sub("t","",colData(brca)$pathology_T_stage)) staget <- suppressWarnings(as.integer(staget)) colData(brca)$STAGE <- staget pam50 <- colData(brca)$PAM50.mRNA pam50[which(!pam50 %in% "Luminal A")] <- "notLumA" pam50[which(pam50 %in% "Luminal A")] <- "LumA" colData(brca)$pam_mod <- pam50 tmp <- colData(brca)$PAM50.mRNA idx <- union(which(tmp %in% c("Normal-like", "Luminal B","HER2-enriched")), which(is.na(staget))) pID <- colData(brca)$patientID tokeep <- setdiff(pID, pID[idx]) brca <- brca[,tokeep,] # remove duplicate assays mapped to the same sample smp <- sampleMap(brca) samps <- smp[which(smp$assay=="BRCA_mRNAArray-20160128"),] notdup <- samps[which(!duplicated(samps$primary)),"colname"] brca[[1]] <- suppressMessages(brca[[1]][,notdup]) pam50 <- colData(brca)$pam_mod pheno <- colData(brca) ## ----eval=TRUE---------------------------------------------------------------- set.seed(123) # make reproducible idx_holdout <- c( sample(which(pam50 == "LumA"),10,F), sample(which(pam50 == "notLumA"),10,F) ) holdout <- brca[,rownames(pheno)[idx_holdout]] colData(holdout)$ID <- as.character(colData(holdout)$patientID) colData(holdout)$STATUS <- colData(holdout)$pam_mod tokeep <- setdiff(1:length(pam50),idx_holdout) brca <- brca[,rownames(pheno)[tokeep]] pID <- as.character(colData(brca)$patientID) colData(brca)$ID <- pID colData(brca)$STATUS <- colData(brca)$pam_mod ## ----eval=TRUE---------------------------------------------------------------- # genes in mRNA data are grouped by pathways pathList <- readPathways(fetchPathwayDefinitions("January",2018)) groupList <- list() groupList[["BRCA_mRNAArray-20160128"]] <- pathList[1:3] # clinical data is not grouped; each variable is its own feature groupList[["clinical"]] <- list( age="patient.age_at_initial_pathologic_diagnosis", stage="STAGE" ) ## ----eval=TRUE---------------------------------------------------------------- makeNets <- function(dataList, groupList, netDir, ...) { netList <- c() # initialize before is.null() check # make RNA nets (NOTE: the check for is.null() is important!) # (Pearson correlation) if (!is.null(groupList[["BRCA_mRNAArray-20160128"]])) { netList <- makePSN_NamedMatrix( dataList[["BRCA_mRNAArray-20160128"]], rownames(dataList[["BRCA_mRNAArray-20160128"]]), groupList[["BRCA_mRNAArray-20160128"]], netDir, verbose = FALSE, writeProfiles = TRUE, runSerially=TRUE, ...) } # make clinical nets (normalized difference) netList2 <- c() if (!is.null(groupList[["clinical"]])) { netList2 <- makePSN_NamedMatrix(dataList$clinical, rownames(dataList$clinical), groupList[["clinical"]], netDir, simMetric = "custom", customFunc = normDiff, # custom function writeProfiles = FALSE, sparsify = TRUE, verbose = TRUE, runSerially=TRUE, ...) } netList <- c(unlist(netList), unlist(netList2)) return(netList) } ## ----eval=TRUE---------------------------------------------------------------- set.seed(42) # make results reproducible outDir <- paste(tempdir(),randAlphanumString(), "pred_output",sep=getFileSep()) if (file.exists(outDir)) unlink(outDir,recursive=TRUE) numSplits <- 2L out <- suppressMessages( buildPredictor( dataList=brca,groupList=groupList, makeNetFunc=makeNets, outDir=outDir, numSplits=numSplits, featScoreMax=2L, featSelCutoff=1L, numCores=1L,debugMode=FALSE, logging="none") ) ## ----eval=TRUE---------------------------------------------------------------- featScores <- list() # feature scores per class st <- unique(colData(brca)$STATUS) for (k in 1:numSplits) { # feature scores for (cur in unique(st)) { if (is.null(featScores[[cur]])) featScores[[cur]] <- list() tmp <- out[[sprintf("Split%i",k)]] tmp2 <- tmp[["featureScores"]][[cur]] colnames(tmp2) <- c("PATHWAY_NAME","SCORE") featScores[[cur]][[sprintf("Split%i",k)]] <- tmp2 } } # compile scores across runs featScores2 <- lapply(featScores, getNetConsensus) ## ----eval=TRUE---------------------------------------------------------------- featSelNet <- lapply(featScores2, function(x) { callFeatSel(x, fsCutoff=1, fsPctPass=0) }) ## ----eval=TRUE---------------------------------------------------------------- outDir <- paste(tempdir(), randAlphanumString(), sep = getFileSep()) if (file.exists(outDir)) unlink(outDir,recursive=TRUE) dir.create(outDir) predModel <- suppressMessages( predict(brca, holdout, groupList, featSelNet, makeNets, outDir, verbose = FALSE) ) ## ----eval=TRUE---------------------------------------------------------------- perf <- getPerformance(predModel, c("LumA", "notLumA")) message(sprintf("AUROC=%1.2f", perf$auroc)) message(sprintf("AUPR=%1.2f", perf$aupr)) message(sprintf("Accuracy=%1.1f%%", perf$acc)) ## ----eval=TRUE---------------------------------------------------------------- plotPerf_multi(list(perf$rocCurve), plotTitle = sprintf( "BRCA Validation: %i samples", nrow(colData(holdout)))) plotPerf_multi(list(perf$prCurve), plotType = "PR", plotTitle = sprintf( "BRCA Validation: %i samples", nrow(colData(holdout)))) ## ----eval=TRUE---------------------------------------------------------------- sessionInfo()