## ----eval=FALSE---------------------------------------------------------------
#
# suppressWarnings(suppressMessages(require(netDx)))
# suppressMessages(require(curatedTCGAData))
#
# # fetch data for example
# brca <- suppressMessages(
# curatedTCGAData("BRCA",c("mRNAArray"),FALSE, version="1.1.38"))
#
# # reformat for this example
# staget <- sub("[abcd]","",
# sub("t","",colData(brca)$pathology_T_stage))
# staget <- suppressWarnings(as.integer(staget))
# colData(brca)$STAGE <- staget
#
# pam50 <- colData(brca)$PAM50.mRNA
# # binarize class
# pam50[which(!pam50 %in% "Luminal A")] <- "notLumA"
# pam50[which(pam50 %in% "Luminal A")] <- "LumA"
# colData(brca)$pam_mod <- pam50
# tmp <- colData(brca)$PAM50.mRNA
# idx <- union(which(tmp %in% c("Normal-like",
# "Luminal B","HER2-enriched")),
# which(is.na(staget)))
# pID <- colData(brca)$patientID
# tokeep <- setdiff(pID, pID[idx])
# brca <- brca[,tokeep,]
# # remove duplicate assays mapped to the same sample
# smp <- sampleMap(brca)
# samps <- smp[which(smp$assay=="BRCA_mRNAArray-20160128"),]
# notdup <- samps[which(!duplicated(samps$primary)),"colname"]
# brca[[1]] <- suppressMessages(brca[[1]][,notdup])
# pam50 <- colData(brca)$pam_mod
# pheno <- colData(brca)
#
# # create holdout set
# set.seed(123) # make reproducible
# idx_holdout <- c(
# sample(which(pam50 == "LumA"),10,F),
# sample(which(pam50 == "notLumA"),10,F)
# )
# holdout <- brca[,rownames(pheno)[idx_holdout]]
# colData(holdout)$ID <- as.character(colData(holdout)$patientID)
# colData(holdout)$STATUS <- colData(holdout)$pam_mod
#
# # remove holdout samples from training set
# tokeep <- setdiff(1:length(pam50),idx_holdout)
# brca <- brca[,rownames(pheno)[tokeep]]
# pID <- as.character(colData(brca)$patientID)
# colData(brca)$ID <- pID
# colData(brca)$STATUS <- colData(brca)$pam_mod
#
# # feature design
# # genes in mRNA data are grouped by pathways
# pathList <- readPathways(fetchPathwayDefinitions("January",2018))
#
# groupList <- list()
# groupList[["BRCA_mRNAArray-20160128"]] <- pathList[1:3]
# # clinical data is not grouped; each variable is its own feature
# groupList[["clinical"]] <- list(
# age="patient.age_at_initial_pathologic_diagnosis",
# stage="STAGE"
# )
#
# # define function to create nets
# makeNets <- function(dataList, groupList, netDir, ...) {
# netList <- c() # initialize before is.null() check
# # make RNA nets (NOTE: the check for is.null() is important!)
# # (Pearson correlation)
# if (!is.null(groupList[["BRCA_mRNAArray-20160128"]])) {
# netList <- makePSN_NamedMatrix(
# dataList[["BRCA_mRNAArray-20160128"]],
# rownames(dataList[["BRCA_mRNAArray-20160128"]]),
# groupList[["BRCA_mRNAArray-20160128"]],
# netDir, verbose = FALSE,
# writeProfiles = TRUE, runSerially=TRUE, ...)
# }
#
# # make clinical nets (normalized difference)
# netList2 <- c()
# if (!is.null(groupList[["clinical"]])) {
# netList2 <- makePSN_NamedMatrix(dataList$clinical,
# rownames(dataList$clinical),
# groupList[["clinical"]], netDir,
# simMetric = "custom",
# customFunc = normDiff, # custom function
# writeProfiles = FALSE,
# sparsify = TRUE, verbose = TRUE, runSerially=TRUE, ...)
# }
# netList <- c(unlist(netList), unlist(netList2))
# return(netList)
# }
#
# # run predictor
# set.seed(42) # make results reproducible
# outDir <- paste(tempdir(),randAlphanumString(),
# "pred_output",sep=getFileSep())
# if (file.exists(outDir)) unlink(outDir,recursive=TRUE)
# numSplits <- 2L
# out <- suppressMessages(
# buildPredictor(
# dataList=brca,groupList=groupList,
# makeNetFunc=makeNets,
# outDir=outDir,
# numSplits=numSplits,
# featScoreMax=2L,
# featSelCutoff=1L,
# numCores=1L,debugMode=FALSE,
# logging="none")
# )
#
# # compile consistently high-scoring functions
# featScores <- list() # feature scores per class
# st <- unique(colData(brca)$STATUS)
# for (k in 1:numSplits) {
# # feature scores
# for (cur in unique(st)) {
# if (is.null(featScores[[cur]])) featScores[[cur]] <- list()
# tmp <- out[[sprintf("Split%i",k)]]
# tmp2 <- tmp[["featureScores"]][[cur]]
# colnames(tmp2) <- c("PATHWAY_NAME","SCORE")
# featScores[[cur]][[sprintf("Split%i",k)]] <- tmp2
# }
# }
# featScores2 <- lapply(featScores, getNetConsensus)
# featSelNet <- lapply(featScores2, function(x) {
# callFeatSel(x, fsCutoff=1, fsPctPass=0)
# })
#
# # predict performance on holdout set
# outDir <- paste(tempdir(), randAlphanumString(),
# sep = getFileSep())
# if (file.exists(outDir)) unlink(outDir,recursive=TRUE)
# dir.create(outDir)
# predModel <- suppressMessages(
# predict(brca, holdout, groupList,
# featSelNet, makeNets,
# outDir, verbose = FALSE)
# )
#
# # compute performance for prediction
# perf <- getPerformance(predModel, c("LumA", "notLumA"))
# message(sprintf("AUROC=%1.2f", perf$auroc))
# message(sprintf("AUPR=%1.2f", perf$aupr))
# message(sprintf("Accuracy=%1.1f%%", perf$acc))
#
# # plot ROC and PR curve
# plotPerf_multi(list(perf$rocCurve),
# plotTitle = sprintf(
# "BRCA Validation: %i samples",
# nrow(colData(holdout))))
# plotPerf_multi(list(perf$prCurve),
# plotType = "PR",
# plotTitle = sprintf(
# "BRCA Validation: %i samples",
# nrow(colData(holdout))))
#
## ----eval=TRUE----------------------------------------------------------------
suppressWarnings(suppressMessages(require(netDx)))
suppressMessages(require(curatedTCGAData))
brca <- suppressMessages(
curatedTCGAData("BRCA",c("mRNAArray"),FALSE, version="1.1.38"))
staget <- sub("[abcd]","",
sub("t","",colData(brca)$pathology_T_stage))
staget <- suppressWarnings(as.integer(staget))
colData(brca)$STAGE <- staget
pam50 <- colData(brca)$PAM50.mRNA
pam50[which(!pam50 %in% "Luminal A")] <- "notLumA"
pam50[which(pam50 %in% "Luminal A")] <- "LumA"
colData(brca)$pam_mod <- pam50
tmp <- colData(brca)$PAM50.mRNA
idx <- union(which(tmp %in% c("Normal-like",
"Luminal B","HER2-enriched")),
which(is.na(staget)))
pID <- colData(brca)$patientID
tokeep <- setdiff(pID, pID[idx])
brca <- brca[,tokeep,]
# remove duplicate assays mapped to the same sample
smp <- sampleMap(brca)
samps <- smp[which(smp$assay=="BRCA_mRNAArray-20160128"),]
notdup <- samps[which(!duplicated(samps$primary)),"colname"]
brca[[1]] <- suppressMessages(brca[[1]][,notdup])
pam50 <- colData(brca)$pam_mod
pheno <- colData(brca)
## ----eval=TRUE----------------------------------------------------------------
set.seed(123) # make reproducible
idx_holdout <- c(
sample(which(pam50 == "LumA"),10,F),
sample(which(pam50 == "notLumA"),10,F)
)
holdout <- brca[,rownames(pheno)[idx_holdout]]
colData(holdout)$ID <- as.character(colData(holdout)$patientID)
colData(holdout)$STATUS <- colData(holdout)$pam_mod
tokeep <- setdiff(1:length(pam50),idx_holdout)
brca <- brca[,rownames(pheno)[tokeep]]
pID <- as.character(colData(brca)$patientID)
colData(brca)$ID <- pID
colData(brca)$STATUS <- colData(brca)$pam_mod
## ----eval=TRUE----------------------------------------------------------------
# genes in mRNA data are grouped by pathways
pathList <- readPathways(fetchPathwayDefinitions("January",2018))
groupList <- list()
groupList[["BRCA_mRNAArray-20160128"]] <- pathList[1:3]
# clinical data is not grouped; each variable is its own feature
groupList[["clinical"]] <- list(
age="patient.age_at_initial_pathologic_diagnosis",
stage="STAGE"
)
## ----eval=TRUE----------------------------------------------------------------
makeNets <- function(dataList, groupList, netDir, ...) {
netList <- c() # initialize before is.null() check
# make RNA nets (NOTE: the check for is.null() is important!)
# (Pearson correlation)
if (!is.null(groupList[["BRCA_mRNAArray-20160128"]])) {
netList <- makePSN_NamedMatrix(
dataList[["BRCA_mRNAArray-20160128"]],
rownames(dataList[["BRCA_mRNAArray-20160128"]]),
groupList[["BRCA_mRNAArray-20160128"]],
netDir, verbose = FALSE,
writeProfiles = TRUE, runSerially=TRUE, ...)
}
# make clinical nets (normalized difference)
netList2 <- c()
if (!is.null(groupList[["clinical"]])) {
netList2 <- makePSN_NamedMatrix(dataList$clinical,
rownames(dataList$clinical),
groupList[["clinical"]], netDir,
simMetric = "custom",
customFunc = normDiff, # custom function
writeProfiles = FALSE,
sparsify = TRUE, verbose = TRUE, runSerially=TRUE, ...)
}
netList <- c(unlist(netList), unlist(netList2))
return(netList)
}
## ----eval=TRUE----------------------------------------------------------------
set.seed(42) # make results reproducible
outDir <- paste(tempdir(),randAlphanumString(),
"pred_output",sep=getFileSep())
if (file.exists(outDir)) unlink(outDir,recursive=TRUE)
numSplits <- 2L
out <- suppressMessages(
buildPredictor(
dataList=brca,groupList=groupList,
makeNetFunc=makeNets,
outDir=outDir,
numSplits=numSplits,
featScoreMax=2L,
featSelCutoff=1L,
numCores=1L,debugMode=FALSE,
logging="none")
)
## ----eval=TRUE----------------------------------------------------------------
featScores <- list() # feature scores per class
st <- unique(colData(brca)$STATUS)
for (k in 1:numSplits) {
# feature scores
for (cur in unique(st)) {
if (is.null(featScores[[cur]])) featScores[[cur]] <- list()
tmp <- out[[sprintf("Split%i",k)]]
tmp2 <- tmp[["featureScores"]][[cur]]
colnames(tmp2) <- c("PATHWAY_NAME","SCORE")
featScores[[cur]][[sprintf("Split%i",k)]] <- tmp2
}
}
# compile scores across runs
featScores2 <- lapply(featScores, getNetConsensus)
## ----eval=TRUE----------------------------------------------------------------
featSelNet <- lapply(featScores2, function(x) {
callFeatSel(x, fsCutoff=1, fsPctPass=0)
})
## ----eval=TRUE----------------------------------------------------------------
outDir <- paste(tempdir(), randAlphanumString(),
sep = getFileSep())
if (file.exists(outDir)) unlink(outDir,recursive=TRUE)
dir.create(outDir)
predModel <- suppressMessages(
predict(brca, holdout, groupList,
featSelNet, makeNets,
outDir, verbose = FALSE)
)
## ----eval=TRUE----------------------------------------------------------------
perf <- getPerformance(predModel, c("LumA", "notLumA"))
message(sprintf("AUROC=%1.2f", perf$auroc))
message(sprintf("AUPR=%1.2f", perf$aupr))
message(sprintf("Accuracy=%1.1f%%", perf$acc))
## ----eval=TRUE----------------------------------------------------------------
plotPerf_multi(list(perf$rocCurve),
plotTitle = sprintf(
"BRCA Validation: %i samples",
nrow(colData(holdout))))
plotPerf_multi(list(perf$prCurve),
plotType = "PR",
plotTitle = sprintf(
"BRCA Validation: %i samples",
nrow(colData(holdout))))
## ----eval=TRUE----------------------------------------------------------------
sessionInfo()