mlr3fselect

Package website: release | dev

tic CRAN Status StackOverflow Mattermost CodeFactor

This package provides feature selection for mlr3. It offers various feature selection wrappers, e.g. random search and sequential feature selection, and different termination criteria can be set and combined. ‘AutoFSelector’ provides a convenient way to perform nested resampling in combination with ‘mlr3’. The package is built on bbotk, which provides a common framework for optimization. For feature filters and embedded methods, see mlr3filters.

Resources

Installation

Install the latest release from CRAN:

install.packages("mlr3fselect")

Install the development version from GitHub:

remotes::install_github("mlr-org/mlr3fselect")

Example

Basic feature selection

library("mlr3fselect")

# feature selection on the Pima Indians diabetes data set
instance = fselect(
  method = "random_search",
  task =  tsk("pima"),
  learner = lrn("classif.rpart"),
  resampling = rsmp("holdout"),
  measure = msr("classif.ce"),
  term_evals = 10,
  batch_size = 5
)

# best performing feature set
instance$result
##     age glucose insulin mass pedigree pregnant pressure triceps                                  features classif.ce
## 1: TRUE    TRUE    TRUE TRUE     TRUE    FALSE    FALSE    TRUE age,glucose,insulin,mass,pedigree,triceps  0.1757812
# all evaluated feature sets
as.data.table(instance$archive)
##       age glucose insulin  mass pedigree pregnant pressure triceps classif.ce runtime_learners           timestamp batch_nr      resample_result
##  1:  TRUE    TRUE   FALSE  TRUE    FALSE     TRUE     TRUE   FALSE  0.2031250            0.051 2021-09-02 13:52:48        1 <ResampleResult[20]>
##  2: FALSE   FALSE    TRUE FALSE    FALSE     TRUE     TRUE   FALSE  0.2578125            0.066 2021-09-02 13:52:48        1 <ResampleResult[20]>
##  3:  TRUE    TRUE    TRUE FALSE     TRUE     TRUE    FALSE   FALSE  0.2070312            0.049 2021-09-02 13:52:48        1 <ResampleResult[20]>
##  4:  TRUE    TRUE   FALSE  TRUE    FALSE     TRUE    FALSE   FALSE  0.2031250            0.050 2021-09-02 13:52:48        1 <ResampleResult[20]>
##  5:  TRUE   FALSE   FALSE FALSE    FALSE    FALSE    FALSE   FALSE  0.2968750            0.054 2021-09-02 13:52:48        1 <ResampleResult[20]>
##  6: FALSE    TRUE   FALSE FALSE     TRUE    FALSE    FALSE   FALSE  0.2031250            0.050 2021-09-02 13:52:49        2 <ResampleResult[20]>
##  7:  TRUE   FALSE   FALSE FALSE     TRUE     TRUE     TRUE    TRUE  0.3203125            0.050 2021-09-02 13:52:49        2 <ResampleResult[20]>
##  8:  TRUE   FALSE    TRUE  TRUE     TRUE     TRUE     TRUE    TRUE  0.2578125            0.054 2021-09-02 13:52:49        2 <ResampleResult[20]>
##  9:  TRUE    TRUE    TRUE  TRUE     TRUE    FALSE    FALSE    TRUE  0.1757812            0.051 2021-09-02 13:52:49        2 <ResampleResult[20]>
## 10:  TRUE   FALSE    TRUE  TRUE     TRUE     TRUE     TRUE    TRUE  0.2578125            0.053 2021-09-02 13:52:49        2 <ResampleResult[20]>

Automatic feature selection

# task
task = tsk("pima")

# construct auto feature selector
afs = auto_fselector(
  method = "random_search",
  learner = lrn("classif.rpart"),
  resampling = rsmp("holdout"),
  measure = msr("classif.ce"),
  term_evals = 10,
  batch_size = 5
)

# train/test split
train_set = sample(task$nrow, 0.8 * task$nrow)
test_set = setdiff(seq_len(task$nrow), train_set)

# select the feature set and fit the final model on the complete training set in one go
afs$train(task, row_ids = train_set)

# best performing feature set
afs$fselect_result
##      age glucose insulin mass pedigree pregnant pressure triceps                                            features classif.ce
## 1: FALSE    TRUE    TRUE TRUE     TRUE     TRUE     TRUE    TRUE glucose,insulin,mass,pedigree,pregnant,pressure,...  0.2829268
# all evaluated feature sets
as.data.table(afs$archive)
##       age glucose insulin  mass pedigree pregnant pressure triceps classif.ce runtime_learners           timestamp batch_nr      resample_result
##  1:  TRUE    TRUE    TRUE  TRUE    FALSE     TRUE    FALSE    TRUE  0.2878049            0.050 2021-09-02 13:52:50        1 <ResampleResult[20]>
##  2: FALSE    TRUE    TRUE  TRUE     TRUE     TRUE     TRUE    TRUE  0.2829268            0.050 2021-09-02 13:52:50        1 <ResampleResult[20]>
##  3:  TRUE   FALSE    TRUE FALSE     TRUE    FALSE    FALSE    TRUE  0.3073171            0.083 2021-09-02 13:52:50        1 <ResampleResult[20]>
##  4: FALSE    TRUE    TRUE  TRUE    FALSE    FALSE     TRUE   FALSE  0.2829268            0.050 2021-09-02 13:52:50        1 <ResampleResult[20]>
##  5:  TRUE    TRUE    TRUE  TRUE    FALSE     TRUE     TRUE    TRUE  0.2878049            0.054 2021-09-02 13:52:50        1 <ResampleResult[20]>
##  6: FALSE    TRUE    TRUE  TRUE     TRUE     TRUE     TRUE    TRUE  0.2829268            0.051 2021-09-02 13:52:51        2 <ResampleResult[20]>
##  7: FALSE   FALSE    TRUE FALSE    FALSE    FALSE     TRUE   FALSE  0.3658537            0.049 2021-09-02 13:52:51        2 <ResampleResult[20]>
##  8: FALSE   FALSE    TRUE  TRUE     TRUE    FALSE    FALSE   FALSE  0.2926829            0.049 2021-09-02 13:52:51        2 <ResampleResult[20]>
##  9: FALSE   FALSE   FALSE FALSE    FALSE     TRUE     TRUE   FALSE  0.3658537            0.050 2021-09-02 13:52:51        2 <ResampleResult[20]>
## 10: FALSE   FALSE   FALSE FALSE    FALSE    FALSE    FALSE    TRUE  0.3268293            0.048 2021-09-02 13:52:51        2 <ResampleResult[20]>
# predict new data
afs$predict(task, row_ids = test_set)
## <PredictionClassif> for 154 observations:
##     row_ids truth response
##           2   neg      neg
##          12   pos      pos
##          16   pos      neg
## ---                       
##         748   neg      neg
##         751   pos      pos
##         766   neg      neg

Nested resampling

# nested resampling
rr = fselect_nested(
  method = "random_search",
  task =  tsk("pima"),
  learner = lrn("classif.rpart"),
  inner_resampling = rsmp("holdout"),
  outer_resampling = rsmp("cv", folds = 3),
  measure = msr("classif.ce"),
  term_evals = 10,
  batch_size = 5
)

# aggregated performance of all outer resampling iterations
rr$aggregate()
## classif.ce 
##  0.2617188
# performance scores of the outer resampling
rr$score()
##                 task task_id             learner              learner_id         resampling resampling_id iteration              prediction classif.ce
## 1: <TaskClassif[47]>    pima <AutoFSelector[40]> classif.rpart.fselector <ResamplingCV[19]>            cv         1 <PredictionClassif[19]>  0.2265625
## 2: <TaskClassif[47]>    pima <AutoFSelector[40]> classif.rpart.fselector <ResamplingCV[19]>            cv         2 <PredictionClassif[19]>  0.2617188
## 3: <TaskClassif[47]>    pima <AutoFSelector[40]> classif.rpart.fselector <ResamplingCV[19]>            cv         3 <PredictionClassif[19]>  0.2968750
# inner resampling results
extract_inner_fselect_results(rr)
##    iteration   age glucose insulin  mass pedigree pregnant pressure triceps classif.ce                                       features task_id              learner_id resampling_id
## 1:         1  TRUE    TRUE   FALSE  TRUE    FALSE    FALSE    FALSE   FALSE  0.2748538                               age,glucose,mass    pima classif.rpart.fselector            cv
## 2:         2  TRUE    TRUE    TRUE  TRUE     TRUE    FALSE     TRUE    TRUE  0.2397661 age,glucose,insulin,mass,pedigree,pressure,...    pima classif.rpart.fselector            cv
## 3:         3 FALSE    TRUE   FALSE FALSE    FALSE    FALSE    FALSE   FALSE  0.2222222                                        glucose    pima classif.rpart.fselector            cv
# inner resampling archives
extract_inner_fselect_archives(rr)
##     iteration   age glucose insulin  mass pedigree pregnant pressure triceps classif.ce runtime_learners           timestamp batch_nr      resample_result task_id              learner_id
##  1:         1  TRUE    TRUE   FALSE  TRUE     TRUE     TRUE     TRUE   FALSE  0.2807018            0.050 2021-09-02 13:52:52        1 <ResampleResult[20]>    pima classif.rpart.fselector
##  2:         1 FALSE    TRUE   FALSE FALSE     TRUE     TRUE     TRUE    TRUE  0.3216374            0.050 2021-09-02 13:52:52        1 <ResampleResult[20]>    pima classif.rpart.fselector
##  3:         1  TRUE   FALSE    TRUE FALSE     TRUE     TRUE     TRUE    TRUE  0.3976608            0.050 2021-09-02 13:52:52        1 <ResampleResult[20]>    pima classif.rpart.fselector
##  4:         1  TRUE   FALSE   FALSE  TRUE     TRUE     TRUE     TRUE    TRUE  0.3450292            0.049 2021-09-02 13:52:52        1 <ResampleResult[20]>    pima classif.rpart.fselector
##  5:         1  TRUE    TRUE    TRUE  TRUE     TRUE     TRUE     TRUE    TRUE  0.2865497            0.053 2021-09-02 13:52:52        1 <ResampleResult[20]>    pima classif.rpart.fselector
##  6:         1 FALSE   FALSE   FALSE FALSE    FALSE    FALSE     TRUE   FALSE  0.3567251            0.056 2021-09-02 13:52:53        2 <ResampleResult[20]>    pima classif.rpart.fselector
##  7:         1 FALSE   FALSE    TRUE FALSE    FALSE     TRUE    FALSE   FALSE  0.3918129            0.050 2021-09-02 13:52:53        2 <ResampleResult[20]>    pima classif.rpart.fselector
##  8:         1  TRUE   FALSE    TRUE  TRUE     TRUE     TRUE     TRUE    TRUE  0.3742690            0.052 2021-09-02 13:52:53        2 <ResampleResult[20]>    pima classif.rpart.fselector
##  9:         1  TRUE    TRUE   FALSE  TRUE    FALSE    FALSE    FALSE   FALSE  0.2748538            0.052 2021-09-02 13:52:53        2 <ResampleResult[20]>    pima classif.rpart.fselector
## 10:         1 FALSE   FALSE   FALSE FALSE    FALSE    FALSE     TRUE   FALSE  0.3567251            0.050 2021-09-02 13:52:53        2 <ResampleResult[20]>    pima classif.rpart.fselector
## 11:         2  TRUE    TRUE    TRUE  TRUE     TRUE     TRUE     TRUE   FALSE  0.2456140            0.058 2021-09-02 13:52:54        1 <ResampleResult[20]>    pima classif.rpart.fselector
## 12:         2  TRUE   FALSE   FALSE  TRUE    FALSE    FALSE    FALSE   FALSE  0.3274854            0.050 2021-09-02 13:52:54        1 <ResampleResult[20]>    pima classif.rpart.fselector
## 13:         2  TRUE   FALSE    TRUE  TRUE     TRUE     TRUE     TRUE    TRUE  0.3274854            0.075 2021-09-02 13:52:54        1 <ResampleResult[20]>    pima classif.rpart.fselector
## 14:         2 FALSE    TRUE    TRUE  TRUE     TRUE     TRUE     TRUE   FALSE  0.2514620            0.051 2021-09-02 13:52:54        1 <ResampleResult[20]>    pima classif.rpart.fselector
## 15:         2 FALSE   FALSE   FALSE FALSE     TRUE     TRUE    FALSE    TRUE  0.4152047            0.049 2021-09-02 13:52:54        1 <ResampleResult[20]>    pima classif.rpart.fselector
## 16:         2  TRUE   FALSE   FALSE FALSE    FALSE     TRUE    FALSE   FALSE  0.3391813            0.051 2021-09-02 13:52:55        2 <ResampleResult[20]>    pima classif.rpart.fselector
## 17:         2 FALSE   FALSE   FALSE FALSE    FALSE    FALSE     TRUE   FALSE  0.3508772            0.049 2021-09-02 13:52:55        2 <ResampleResult[20]>    pima classif.rpart.fselector
## 18:         2  TRUE    TRUE    TRUE  TRUE     TRUE    FALSE     TRUE    TRUE  0.2397661            0.064 2021-09-02 13:52:55        2 <ResampleResult[20]>    pima classif.rpart.fselector
## 19:         2 FALSE   FALSE   FALSE FALSE    FALSE    FALSE    FALSE    TRUE  0.3742690            0.051 2021-09-02 13:52:55        2 <ResampleResult[20]>    pima classif.rpart.fselector
## 20:         2  TRUE   FALSE    TRUE  TRUE    FALSE    FALSE    FALSE   FALSE  0.2807018            0.050 2021-09-02 13:52:55        2 <ResampleResult[20]>    pima classif.rpart.fselector
## 21:         3 FALSE    TRUE   FALSE FALSE    FALSE    FALSE    FALSE   FALSE  0.2222222            0.071 2021-09-02 13:52:56        1 <ResampleResult[20]>    pima classif.rpart.fselector
## 22:         3  TRUE   FALSE    TRUE FALSE    FALSE     TRUE    FALSE   FALSE  0.3157895            0.078 2021-09-02 13:52:56        1 <ResampleResult[20]>    pima classif.rpart.fselector
## 23:         3  TRUE    TRUE    TRUE  TRUE     TRUE     TRUE     TRUE    TRUE  0.2456140            0.068 2021-09-02 13:52:56        1 <ResampleResult[20]>    pima classif.rpart.fselector
## 24:         3 FALSE   FALSE   FALSE FALSE     TRUE     TRUE    FALSE   FALSE  0.3040936            0.069 2021-09-02 13:52:56        1 <ResampleResult[20]>    pima classif.rpart.fselector
## 25:         3  TRUE    TRUE    TRUE  TRUE     TRUE    FALSE    FALSE    TRUE  0.2339181            0.085 2021-09-02 13:52:56        1 <ResampleResult[20]>    pima classif.rpart.fselector
## 26:         3  TRUE    TRUE    TRUE FALSE     TRUE     TRUE     TRUE    TRUE  0.3274854            0.062 2021-09-02 13:52:57        2 <ResampleResult[20]>    pima classif.rpart.fselector
## 27:         3  TRUE   FALSE   FALSE  TRUE    FALSE    FALSE    FALSE    TRUE  0.3391813            0.050 2021-09-02 13:52:57        2 <ResampleResult[20]>    pima classif.rpart.fselector
## 28:         3 FALSE   FALSE   FALSE FALSE    FALSE     TRUE    FALSE   FALSE  0.2982456            0.050 2021-09-02 13:52:57        2 <ResampleResult[20]>    pima classif.rpart.fselector
## 29:         3 FALSE    TRUE    TRUE FALSE    FALSE    FALSE     TRUE   FALSE  0.2222222            0.061 2021-09-02 13:52:57        2 <ResampleResult[20]>    pima classif.rpart.fselector
## 30:         3 FALSE    TRUE    TRUE  TRUE     TRUE    FALSE    FALSE   FALSE  0.2397661            0.051 2021-09-02 13:52:57        2 <ResampleResult[20]>    pima classif.rpart.fselector
##     iteration   age glucose insulin  mass pedigree pregnant pressure triceps classif.ce runtime_learners           timestamp batch_nr      resample_result task_id              learner_id
##     resampling_id
##  1:            cv
##  2:            cv
##  3:            cv
##  4:            cv
##  5:            cv
##  6:            cv
##  7:            cv
##  8:            cv
##  9:            cv
## 10:            cv
## 11:            cv
## 12:            cv
## 13:            cv
## 14:            cv
## 15:            cv
## 16:            cv
## 17:            cv
## 18:            cv
## 19:            cv
## 20:            cv
## 21:            cv
## 22:            cv
## 23:            cv
## 24:            cv
## 25:            cv
## 26:            cv
## 27:            cv
## 28:            cv
## 29:            cv
## 30:            cv
##     resampling_id