library(here)
library(tidyverse)
library(tidymodels)
tidymodels_prefer()
ML Ops with Penguin
Thanks to james-h-wade for simplifying the whole process. The original material can be found at:
EDA
# Exploratory plot: bill length against flipper length, coloured by sex and
# sized by body mass, with one panel per species. Rows with a missing `sex`
# are dropped first.
# NOTE(review): `penguins` is not defined in this file; presumably it resolves
# to a dataset attached by one of the loaded packages -- confirm.
penguins |>
  filter(!is.na(sex)) |>
  ggplot(aes(
    x = flipper_length_mm,
    y = bill_length_mm,
    color = sex,
    size = body_mass_g
  )) +
  geom_point(alpha = 0.5) +
  facet_wrap(~species)
Split
# Modelling data: remove rows with missing sex, exclude year and island.
penguins_df <-
  palmerpenguins::penguins |>
  drop_na(sex) |>
  select(-year, -island)
# Set the seed so the random split below is reproducible.
set.seed(1234)

# Split the data into train and test sets, stratified by sex.
# `penguin_split` keeps the split object itself; it is reused later when the
# final model is fitted and evaluated.
penguin_split <- initial_split(penguins_df, strata = sex)
penguin_train <- training(penguin_split)
penguin_test <- testing(penguin_split)
# Create cross-validation folds from the training set only (vfold_cv defaults).
penguin_folds <- vfold_cv(penguin_train)
Recipes
# Preprocessing recipe predicting `sex` from all other columns:
#   1. Yeo-Johnson transform of the numeric predictors,
#   2. dummy-encode `species`,
#   3. normalize the numeric predictors.
# NOTE(review): normalization runs after step_dummy(), so the dummy columns
# are presumably selected by all_numeric_predictors() and normalized as well
# -- confirm this is intended.
penguin_rec <-
  recipe(sex ~ ., data = penguin_train) |>
  step_YeoJohnson(all_numeric_predictors()) |>
  step_dummy(species) |>
  step_normalize(all_numeric_predictors())
Model Spec
# Logistic Regression
# No tune() placeholders here, so this spec has nothing to optimise.
# NOTE(review): the "glm" engine presumably does not use `penalty`; confirm
# against the parsnip engine docs, then either drop the argument or switch to
# a regularised engine such as "glmnet".
glm_spec <-
  logistic_reg(penalty = 1) |>
  set_engine("glm")
# Random Forest
# `min_n` is marked with tune(), so it is the hyperparameter searched later.
tree_spec <-
  rand_forest(min_n = tune()) |>
  set_engine("ranger") |>
  set_mode("classification")
# Neural Network with `{torch}` (Not Done)
Fit Models & Tune Hyperparameters
Use Bayesian optimization for hyperparameter tuning
# Control settings for the Bayesian search: stop after 10 iterations without
# improvement, keep the out-of-sample predictions, and log progress.
# NOTE(review): `time_limit` is presumably in minutes -- confirm against the
# tune::control_bayes() documentation.
bayes_control <- control_bayes(
  no_improve = 10L,
  time_limit = 20,
  save_pred = TRUE,
  verbose = TRUE
)
# Unix and macOS only
library(doMC)
Loading required package: foreach
Attaching package: 'foreach'
The following objects are masked from 'package:purrr':
accumulate, when
Loading required package: iterators
Loading required package: parallel
# Register the doMC parallel backend with 8 cores.
doMC::registerDoMC(cores = 8)
# Combine the recipe with both model specs into a workflow set, then run
# "tune_bayes" on every workflow over the cross-validation folds.
# The result is stored under the name `workflow_set`, the same name as the
# constructor function; later chunks refer to the object by this name.
workflow_set <-
  workflow_set(
    preproc = list(penguin_rec),
    models = list(
      glm = glm_spec,
      tree = tree_spec
    )
  ) |>
  workflow_map(
    "tune_bayes",
    iter = 50L,
    resamples = penguin_folds,
    control = bayes_control
  )
❯ Generating a set of 5 initial parameter results
✓ Initialization complete
i Gaussian process model
✓ Gaussian process model
i Generating 34 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 33 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 32 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 31 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 30 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 29 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 28 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 27 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 26 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 25 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 24 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 23 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
! No improvement for 10 iterations; returning current results.
# Show the class vector of the tuned workflow set.
workflow_set |> class()
[1] "workflow_set" "tbl_df" "tbl" "data.frame"
# Print the tuned workflow set (one row per workflow).
workflow_set
# A workflow set/tibble: 2 × 4
wflow_id info option result
<chr> <list> <list> <list>
1 recipe_glm <tibble [1 × 4]> <opts[3]> <rsmp[+]>
2 recipe_tree <tibble [1 × 4]> <opts[3]> <tune[+]>
Compare Model Results
Tabular view
# Rank the workflows by ROC AUC, keeping only the best configuration of each.
workflow_set |>
  rank_results(rank_metric = "roc_auc", select_best = TRUE)
# A tibble: 4 × 9
wflow_id .config .metric mean std_err n preprocessor model rank
<chr> <chr> <chr> <dbl> <dbl> <int> <chr> <chr> <int>
1 recipe_glm Preprocessor… accura… 0.900 0.0199 10 recipe logi… 1
2 recipe_glm Preprocessor… roc_auc 0.969 0.0123 10 recipe logi… 1
3 recipe_tree Iter2 accura… 0.912 0.0251 10 recipe rand… 2
4 recipe_tree Iter2 roc_auc 0.967 0.0132 10 recipe rand… 2
Plotting performance
# Plot the performance of the workflows in the set.
workflow_set |> autoplot()
Finalize
Select best model
# Workflow chosen as the winner (hard-coded id from the workflow set).
best_model_id <- "recipe_glm"

# Pull that workflow's tuning/resampling result and pick its best
# configuration.
# NOTE(review): selection here uses "accuracy" while the ranking table above
# uses "roc_auc" -- confirm which metric is intended.
best_fit <-
  workflow_set |>
  extract_workflow_set_result(best_model_id) |>
  select_best(metric = "accuracy")

best_fit
# A tibble: 1 × 1
.config
<chr>
1 Preprocessor1_Model1
Final Fit
# Create the workflow for the best model and plug in the selected
# configuration.
final_workflow <-
  workflow_set |>
  extract_workflow(best_model_id) |>
  finalize_workflow(best_fit)
# Fit the final workflow on the training portion of the split and evaluate it
# on the held-out test portion (this is what last_fit() does with a split
# object; it does not refit on all data).
final_fit <-
  final_workflow |>
  last_fit(penguin_split)
Final Metric
# Report the metrics recorded for the final fit.
final_fit |> collect_metrics()
# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy binary 0.905 Preprocessor1_Model1
2 roc_auc binary 0.971 Preprocessor1_Model1
# ROC curve for the final fit: true `sex` against the predicted probability
# of the "female" class.
final_fit |>
  collect_predictions() |>
  roc_curve(truth = sex, .pred_female) |>
  autoplot()