library(here)
library(tidyverse)
library(tidymodels)
tidymodels_prefer()
ML Ops with Penguin
Thanks to james-h-wade for simplifying the whole process. The original material can be found at:
EDA
# Exploratory plot: bill length against flipper length, one panel per species,
# with points coloured by sex and sized by body mass.
# The data set is referenced through its namespace because palmerpenguins is
# never attached with library() in this file; a bare `penguins` would either be
# unresolved or (on recent R versions, which ship their own `penguins` in
# {datasets} with different column names) resolve to the wrong object.
palmerpenguins::penguins |>
  # rows without a recorded sex are dropped before plotting
  filter(!is.na(sex)) |>
  ggplot(aes(
    x = flipper_length_mm,
    y = bill_length_mm,
    color = sex,
    size = body_mass_g
  )) +
  geom_point(alpha = 0.5) +
  facet_wrap(~species)
Split
# Modelling data: keep only penguins whose sex is recorded and leave out the
# `year` and `island` columns.
penguins_df <- palmerpenguins::penguins |>
  filter(!is.na(sex)) |>
  select(!c(year, island))
# Fix the RNG seed so the split and the folds below are reproducible.
set.seed(1234)

# Split the data into train and test sets, stratified by `sex`.
penguin_split <- initial_split(penguins_df, strata = sex)
penguin_train <- training(penguin_split)
penguin_test <- testing(penguin_split)

# Cross-validation folds are built from the training set only.
penguin_folds <- vfold_cv(penguin_train)

# Recipes ----
# Preprocessing recipe: model `sex` from every other column of the training
# data. Steps run in the order listed:
#   1. Yeo-Johnson transform of the numeric predictors,
#   2. dummy-encode `species`,
#   3. normalise the numeric predictors.
# NOTE(review): step 3 comes after step 2, so the dummy columns are presumably
# normalised as well -- confirm that this is intended.
penguin_rec <-
  recipe(sex ~ ., data = penguin_train) |>
  step_YeoJohnson(all_numeric_predictors()) |>
  step_dummy(species) |>
  step_normalize(all_numeric_predictors())

# Model Spec ----
# Logistic Regression
# No `penalty` is set: per the parsnip documentation the "glm" engine fits an
# unpenalised model and has no tuning parameters, so a penalty value would be
# ignored and would only suggest regularisation that never happens.
glm_spec <-
  logistic_reg() |>
  set_engine("glm")
# Random Forest
# Classification forest on the ranger engine; `min_n` is left as a tuning
# placeholder to be filled in during hyperparameter search.
tree_spec <- set_engine(
  rand_forest(mode = "classification", min_n = tune()),
  "ranger"
)
# Neural Network with `{torch}` (Not Done)
Fit Models & Tune Hyperparameters
Use Bayesian optimization for hyperparameter tuning
# Control object for the Bayesian search: stop after 10 iterations without
# improvement, apply a time limit of 20, keep the out-of-sample predictions,
# and log progress while tuning.
bayes_control <- control_bayes(
  verbose = TRUE,
  save_pred = TRUE,
  time_limit = 20,
  no_improve = 10L
)

# Unix and macOS only
library(doMC)
Loading required package: foreach
Attaching package: 'foreach'
The following objects are masked from 'package:purrr':
accumulate, when
Loading required package: iterators
Loading required package: parallel
# Register the doMC parallel backend with 8 cores.
registerDoMC(cores = 8)

# Pair the recipe with both model specifications and run `tune_bayes` for
# every resulting workflow on the cross-validation folds (at most 50 search
# iterations each, governed by `bayes_control`).
# The result intentionally keeps the name `workflow_set` -- the same as the
# constructor function -- because later code refers to it by that name.
workflow_set <-
  workflow_set(
    preproc = list(penguin_rec),
    models = list(
      glm = glm_spec,
      tree = tree_spec
    )
  ) |>
  workflow_map(
    "tune_bayes",
    iter = 50L,
    resamples = penguin_folds,
    control = bayes_control
  )
❯ Generating a set of 5 initial parameter results
✓ Initialization complete
i Gaussian process model
✓ Gaussian process model
i Generating 34 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 33 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 32 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 31 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 30 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 29 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 28 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 27 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 26 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 25 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 24 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 23 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
! No improvement for 10 iterations; returning current results.
# Inspect the tuned workflow set: first its class vector, then its printed
# summary. The recorded console output is kept as comments so that the calls
# themselves stay valid R (previously the output was glued onto the call and
# parsed as an index followed by stray strings).
class(workflow_set)
#> [1] "workflow_set" "tbl_df" "tbl" "data.frame"
workflow_set
# A workflow set/tibble: 2 × 4
wflow_id info option result
<chr> <list> <list> <list>
1 recipe_glm <tibble [1 × 4]> <opts[3]> <rsmp[+]>
2 recipe_tree <tibble [1 × 4]> <opts[3]> <tune[+]>
Compare Model Results
Tabular view
# Rank the workflows by ROC AUC, keeping only the best configuration of each.
workflow_set |>
  rank_results(rank_metric = "roc_auc", select_best = TRUE)
# A tibble: 4 × 9
wflow_id .config .metric mean std_err n preprocessor model rank
<chr> <chr> <chr> <dbl> <dbl> <int> <chr> <chr> <int>
1 recipe_glm Preprocessor… accura… 0.900 0.0199 10 recipe logi… 1
2 recipe_glm Preprocessor… roc_auc 0.969 0.0123 10 recipe logi… 1
3 recipe_tree Iter2 accura… 0.912 0.0251 10 recipe rand… 2
4 recipe_tree Iter2 roc_auc 0.967 0.0132 10 recipe rand… 2
Plotting performance
# Plot the results held in the workflow set.
workflow_set |>
  autoplot()
Finalize
Select best model
# The logistic-regression workflow is the one carried forward.
best_model_id <- "recipe_glm"

# Pick that workflow's best configuration by ROC AUC -- the metric the
# workflows are ranked by in the comparison table -- so that ranking and
# selection use one criterion.
best_fit <-
  workflow_set |>
  extract_workflow_set_result(best_model_id) |>
  select_best(metric = "roc_auc")

best_fit
# A tibble: 1 × 1
.config
<chr>
1 Preprocessor1_Model1
Final Fit
# Rebuild the chosen workflow and plug in the selected configuration.
final_workflow <-
  workflow_set |>
  extract_workflow(best_model_id) |>
  finalize_workflow(best_fit)

# Final fit driven by the initial split: per the tune documentation,
# last_fit() fits on the training portion and evaluates on the held-out test
# portion (it does not refit on all of the data).
final_fit <-
  final_workflow |>
  last_fit(penguin_split)

# Final Metric ----
# Performance metrics stored in the final fit.
final_fit |>
  collect_metrics()
# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy binary 0.905 Preprocessor1_Model1
2 roc_auc binary 0.971 Preprocessor1_Model1
# ROC curve from the final fit's predictions, using `sex` as the truth column
# and `.pred_female` as the class-probability column.
autoplot(
  roc_curve(collect_predictions(final_fit), sex, .pred_female)
)