Description
Hi,
There's a glitch in modelStudio when using mlr3 pipelines on data with missing values.
It looks like modelStudio() doesn't know how to impute missing data before crunching the numbers, even when the user has added a pipe operator for missing values to the mlr3 pipeline. In fact, modelStudio() does not even recognize mlr3 learners whose class is anything other than [1] "LearnerClassifRanger" "LearnerClassif" "Learner" "R6" (e.g. try class(learner) for a Random Forest learner). If you have a pipeline, whose class is [1] "GraphLearner" "Learner" "R6", modelStudio() doesn't know how to handle it.
DALEXtra's explain_mlr3() suffers from the same issue, although there it can be worked around by providing custom functions via the predict_function and residual_function arguments.
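For illustration, here's a quick way to see the class difference (assuming mlr3, mlr3learners and mlr3pipelines are loaded):
class(lrn("classif.ranger"))
# [1] "LearnerClassifRanger" "LearnerClassif" "Learner" "R6"
class(GraphLearner$new(po("imputehist") %>>% lrn("classif.ranger")))
# [1] "GraphLearner" "Learner" "R6"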
Below is an example of a pipeline that imputes missing data and then balances classes. Note that it works fine when there are no missing data, but returns an error otherwise.
Example 1: no missing data
library(tidyverse)
library(data.table)
library(tidymodels)
library(paradox)
library(mlr3) # NOTE: install mlr3 packages from GitHub, not CRAN, as they differ in a few things, e.g. on GitHub you tune the pipeline with $optimize(), whereas on CRAN you use $tune()
library(mlr3filters)
library(mlr3learners)
library(mlr3misc)
library(mlr3pipelines)
library(mlr3tuning)
library(DALEXtra)
library(modelStudio)
# Load task and make smaller so code runs faster
task <- tsk('sonar')
task$select(paste0('V', 1:10))
# Ratio values for class-balancing pipe operators
class_counts <- table(task$truth())
upsample_ratio <- class_counts[class_counts == max(class_counts)] /
class_counts[class_counts == min(class_counts)]
downsample_ratio <- 1 / upsample_ratio
# Pipe operators for class-balancing
# 1. Enrich minority class by factor 'ratio'
po_over <- po("classbalancing", id = "up", adjust = "minor",
reference = "minor", shuffle = FALSE, ratio = upsample_ratio)
# 2. Reduce majority class by factor '1/ratio'
po_under <- po("classbalancing", id = "down", adjust = "major",
reference = "major", shuffle = FALSE, ratio = downsample_ratio)
# Handle missing values
features_with_nas <- sort(task$missings() / task$nrow, decreasing = TRUE)
features_with_nas <- features_with_nas[features_with_nas != 0]
# Imputes values based on histogram
hist_imp <- po("imputehist", param_vals =
list(affect_columns = selector_name(names(features_with_nas))))
# Add an indicator column for each feature with missing values
# One-hot encode these new categorical columns, and then remove the categorical versions of them
miss_ind <- po("missind") %>>%
po("encode") %>>%
po("select",
selector = selector_invert(selector_type("factor")),
id = 'dummy_encoding')
impute_data <- po("copy", 2) %>>%
gunion(list(hist_imp, miss_ind)) %>>%
po("featureunion")
impute_data$plot() # This is the Graph we'll add to the pipeline
impute_data$plot(html = TRUE)
# Random Forest learner with up- and down-balancing
rf <- lrn("classif.ranger", predict_type = "prob")
rf_up <- GraphLearner$new(
po_over %>>%
po('learner', rf, id = 'rf'),
predict_type = 'prob'
)
rf_down <- GraphLearner$new(
po_under %>>%
po('learner', rf, id = 'rf'),
predict_type = 'prob')
# All learners (Random Forest with up- and down-balancing)
learners <- list(
rf_up,
rf_down
)
names(learners) <- sapply(learners, function(x) x$id)
# Our pipeline
graph <-
impute_data %>>%
po("branch", names(learners)) %>>%
gunion(unname(learners)) %>>%
po("unbranch")
graph$plot() # Plot pipeline
graph$plot(html = TRUE) # Plot pipeline
pipe <- GraphLearner$new(graph) # Convert pipeline to learner
pipe$predict_type <- 'prob' # We want to predict probabilities and not classes.
param_set <- ParamSetCollection$new(list(
ParamSet$new(list(pipe$param_set$params$branch.selection$clone()))
))
# Set up tuning instance
instance <- TuningInstance$new(
task = task,
learner = pipe,
resampling = rsmp('cv', folds = 2),
measures = msr('classif.bbrier'),
param_set,
terminator = term("evals", n_evals = 3),
store_models = TRUE)
tuner <- TunerRandomSearch$new()
# Tune pipe learner to find best-performing branch
tuner$optimize(instance)
# Take a look at the results
instance$result
print(instance$result$tune_x$branch.selection) # Best model
# Train pipeline
pipe$train(task)
################################################################################################
# DALEXtra and modelStudio stuff
################################################################################################
# First create custom functions for predictions and residuals
# We need custom functions because explain_mlr3() doesn't recognize the GraphLearner class of mlr3
predict_function_custom <- function(model, data) {
pr <- model$
predict_newdata(data)$
data$
prob[, 1]
return(pr)
}
residual_function_custom <- function(model, data, y) {
pr <- model$
predict_newdata(data)
y_hat <- pr$
data$
prob[, 1]
return(as.integer(y == 0) - y_hat)
}
# Run explainer - works fine with the above functions
explainer <- explain_mlr3(model = pipe,
data = task$data()[, -1],
y = as.integer(task$data()[, 1] == 'M'),
predict_function = predict_function_custom,
residual_function = residual_function_custom,
label = "mlr3")
# HOWEVER: we have a classification task, but the explainer thinks it's regression!
explainer$model_info
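# A possible workaround for the task-type detection (an untested sketch):
# DALEX::explain() takes a 'type' argument; if your DALEXtra version
# forwards it, forcing the type may fix the label:
# explainer <- explain_mlr3(..., type = "classification")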
# Let's run modelStudio. You'll need to wait a while
modelStudio(
explainer,
new_observation = task$data()[6, -1]
)
# Ignore the warning about the data format. Argument `new_observation` is a `data.table`, so its class is `[1] "data.table" "data.frame"`,
# which is essentially a data frame. The class vector has two elements, but the condition only checks the first one.
Works just fine.
Example 2: missing data
library(tidyverse)
library(data.table)
library(tidymodels)
library(paradox)
library(mlr3)
library(mlr3filters)
library(mlr3learners)
library(mlr3misc)
library(mlr3pipelines)
library(mlr3tuning)
library(DALEXtra)
library(modelStudio)
# Load task and make smaller so code runs faster
task <- tsk('sonar')
task$select(paste0('V', 1:10))
# Create some missing data
data <- task$data()
data$V1[1:5] <- NA
task <- TaskClassif$new(data, id = 'sonar', target = 'Class')
# Ratio values for class-balancing pipe operators
class_counts <- table(task$truth())
upsample_ratio <- class_counts[class_counts == max(class_counts)] /
class_counts[class_counts == min(class_counts)]
downsample_ratio <- 1 / upsample_ratio
# Pipe operators for class-balancing
# 1. Enrich minority class by factor 'ratio'
po_over <- po("classbalancing", id = "up", adjust = "minor",
reference = "minor", shuffle = FALSE, ratio = upsample_ratio)
# 2. Reduce majority class by factor '1/ratio'
po_under <- po("classbalancing", id = "down", adjust = "major",
reference = "major", shuffle = FALSE, ratio = downsample_ratio)
# Handle missing values
features_with_nas <- sort(task$missings() / task$nrow, decreasing = TRUE)
features_with_nas <- features_with_nas[features_with_nas != 0]
# Imputes values based on histogram
hist_imp <- po("imputehist", param_vals =
list(affect_columns = selector_name(names(features_with_nas))))
# Add an indicator column for each feature with missing values
# One-hot encode these new categorical columns, and then remove the categorical versions of them
miss_ind <- po("missind") %>>%
po("encode") %>>%
po("select",
selector = selector_invert(selector_type("factor")),
id = 'dummy_encoding')
impute_data <- po("copy", 2) %>>%
gunion(list(hist_imp, miss_ind)) %>>%
po("featureunion")
impute_data$plot() # This is the Graph we'll add to the pipeline
impute_data$plot(html = TRUE)
# Random Forest learner with up- and down-balancing
rf <- lrn("classif.ranger", predict_type = "prob")
rf_up <- GraphLearner$new(
po_over %>>%
po('learner', rf, id = 'rf'),
predict_type = 'prob'
)
rf_down <- GraphLearner$new(
po_under %>>%
po('learner', rf, id = 'rf'),
predict_type = 'prob')
# All learners (Random Forest with up- and down-balancing)
learners <- list(
rf_up,
rf_down
)
names(learners) <- sapply(learners, function(x) x$id)
# Our pipeline
graph <-
impute_data %>>%
po("branch", names(learners)) %>>%
gunion(unname(learners)) %>>%
po("unbranch")
graph$plot() # Plot pipeline
graph$plot(html = TRUE) # Plot pipeline
pipe <- GraphLearner$new(graph) # Convert pipeline to learner
pipe$predict_type <- 'prob' # We want to predict probabilities and not classes.
param_set <- ParamSetCollection$new(list(
ParamSet$new(list(pipe$param_set$params$branch.selection$clone()))
))
# Set up tuning instance
instance <- TuningInstance$new(
task = task,
learner = pipe,
resampling = rsmp('cv', folds = 2),
measures = msr('classif.bbrier'),
param_set,
terminator = term("evals", n_evals = 3),
store_models = TRUE)
tuner <- TunerRandomSearch$new()
# Tune pipe learner to find best-performing branch
tuner$optimize(instance)
# Take a look at the results
instance$result
print(instance$result$tune_x$branch.selection) # Best model
# Train pipeline
pipe$train(task)
################################################################################################
# DALEXtra and modelStudio stuff
################################################################################################
# First create custom functions for predictions and residuals
# We need custom functions because explain_mlr3() doesn't recognize the GraphLearner class of mlr3
predict_function_custom <- function(model, data) {
pr <- model$
predict_newdata(data)$
data$
prob[, 1]
return(pr)
}
residual_function_custom <- function(model, data, y) {
pr <- model$
predict_newdata(data)
y_hat <- pr$
data$
prob[, 1]
return(as.integer(y == 0) - y_hat)
}
# Run explainer - works fine with the above functions
explainer <- explain_mlr3(model = pipe,
data = task$data()[, -1],
y = as.integer(task$data()[, 1] == 'M'),
predict_function = predict_function_custom,
residual_function = residual_function_custom,
label = "mlr3")
# HOWEVER: we have a classification task, but the explainer thinks it's regression!
explainer$model_info
# Let's run modelStudio. You'll need to wait a while
modelStudio(
explainer,
new_observation = task$data()[6, -1]
)
# Ignore the warning about the data format. Argument `new_observation` is a `data.table`, so its class is `[1] "data.table" "data.frame"`,
# which is essentially a data frame. The class vector has two elements, but the condition only checks the first one.
We get errors and no plot:
Calculating ...
Calculating ingredients::feature_importance
Calculating ingredients::partial_dependence (numerical)
Calculating ingredients::accumulated_dependence (numerical)
Elapsed time: 00:01:01 ETA...Error in seq.default(min(x[, name]), max(x[, name]), length.out = nbins) :
'from' must be a finite number
In addition: Warning messages:
1: In value[[3L]](cond) :
Error occurred in ingredients::partial_dependence (numerical) function: missing values and NaN's not allowed if 'na.rm' is FALSE
2: In value[[3L]](cond) :
Error occurred in ingredients::accumulated_dependence (numerical) function: missing values and NaN's not allowed if 'na.rm' is FALSE
Presumably the errors occur because modelStudio computes the feature grids from the raw data, which still contains NAs. Is there a way to pass imputed data from explain_mlr3() to modelStudio(), just like you can pass predictions and residuals with the predict_function and residual_function arguments? Any chance of implementing this, please?
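In the meantime, here is a possible interim workaround (a minimal sketch, not tested, assuming median-filled values are acceptable for the feature grids): give the explainer an NA-free copy of the data by filling the gaps manually, while the pipeline still performs its own imputation internally at prediction time.
# Fill NAs only in the explainer's copy of the data; 'pipe' still
# imputes internally via the Graph when predicting
data_for_explainer <- task$data()[, -1]
data_for_explainer[is.na(V1), V1 := median(data_for_explainer$V1, na.rm = TRUE)]
explainer_imputed <- explain_mlr3(model = pipe,
                                  data = data_for_explainer,
                                  y = as.integer(task$data()[, 1] == 'M'),
                                  predict_function = predict_function_custom,
                                  residual_function = residual_function_custom,
                                  label = "mlr3")
modelStudio(explainer_imputed, new_observation = data_for_explainer[6, ])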
Thanks