Best GLM

library(bestglm)
Loading required package: leaps
data(zprostate)

Outcome: lpsa = Prostate-Specific Antigen (psa)

skimr::skim(zprostate)
Data summary
Name zprostate
Number of rows 97
Number of columns 10
_______________________
Column type frequency:
logical 1
numeric 9
________________________
Group variables None

Variable type: logical

skim_variable n_missing complete_rate mean count
train 0 1 0.69 TRU: 67, FAL: 30

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
lcavol 0 1 0.00 1.00 -2.29 -0.71 0.08 0.66 2.10 ▃▅▇▆▃
lweight 0 1 0.00 1.00 -2.93 -0.59 -0.01 0.58 2.69 ▁▃▇▆▁
age 0 1 0.00 1.00 -3.07 -0.52 0.15 0.56 2.03 ▁▁▅▇▂
lbph 0 1 0.00 1.00 -1.02 -1.02 0.14 1.00 1.53 ▇▁▂▃▅
svi 0 1 0.00 1.00 -0.52 -0.52 -0.52 -0.52 1.89 ▇▁▁▁▂
lcp 0 1 0.00 1.00 -0.86 -0.86 -0.44 0.97 2.21 ▇▂▂▂▂
gleason 0 1 0.00 1.00 -1.04 -1.04 0.34 0.34 3.11 ▅▇▁▁▁
pgg45 0 1 0.00 1.00 -0.86 -0.86 -0.33 0.55 2.68 ▇▃▂▂▁
lpsa 0 1 2.48 1.15 -0.43 1.73 2.59 3.06 5.58 ▁▅▇▃▁

Regsubset

train <- (zprostate[zprostate[, 10], ])[, -10]
X <- train[, 1:8]
y <- train[, 9]
out <- summary(regsubsets(x = X, y = y, nvmax = ncol(X)))
Subsets <- out$which
RSS <- out$rss


zprostate_regsub_tbl <-
  cbind(as.data.frame(Subsets), RSS = RSS) |> 
  relocate(RSS) |> 
  mutate(across(where(is.logical), ~ifelse(.x, "*", NA)))

zprostate_regsub_tbl
       RSS (Intercept) lcavol lweight  age lbph  svi  lcp gleason pgg45
1 44.52858           *      *    <NA> <NA> <NA> <NA> <NA>    <NA>  <NA>
2 37.09185           *      *       * <NA> <NA> <NA> <NA>    <NA>  <NA>
3 34.90775           *      *       * <NA> <NA>    * <NA>    <NA>  <NA>
4 32.81499           *      *       * <NA>    *    * <NA>    <NA>  <NA>
5 32.06945           *      *       * <NA>    *    * <NA>    <NA>     *
6 30.53978           *      *       * <NA>    *    *    *    <NA>     *
7 29.43730           *      *       *    *    *    *    *    <NA>     *
8 29.42638           *      *       *    *    *    *    *       *     *

Best GLM

Linear Reg

# `y` must be the last column
Xy <- cbind(as.data.frame(X), lpsa = y)

zprostate_bestglm <- bestglm(Xy)
names(zprostate_bestglm)
[1] "BestModel"   "BestModels"  "Bestq"       "qTable"      "Subsets"    
[6] "Title"       "ModelReport"

BestModel: lm or glm object giving the best model

zprostate_bestglm$BestModel

Call:
lm(formula = y ~ ., data = data.frame(Xy[, c(bestset[-1], FALSE), 
    drop = FALSE], y = y))

Coefficients:
(Intercept)       lcavol      lweight  
     2.4774       0.7397       0.3163  

BestModels: a 𝑇 × 𝑝 logical matrix showing which variables are included in the top 𝑇 models

zprostate_bestglm$BestModels
  lcavol lweight   age  lbph   svi   lcp gleason pgg45 Criterion
1   TRUE    TRUE FALSE FALSE FALSE FALSE   FALSE FALSE -31.20741
2   TRUE    TRUE FALSE FALSE  TRUE FALSE   FALSE FALSE -31.06884
3   TRUE    TRUE FALSE  TRUE  TRUE FALSE   FALSE FALSE -31.00630
4   TRUE    TRUE FALSE FALSE FALSE FALSE   FALSE  TRUE -30.06625
5   TRUE   FALSE FALSE  TRUE  TRUE FALSE   FALSE FALSE -29.06138

Subsets: a (𝑝 + 1) × 𝑝 logical matrix showing which variables are included for subset sizes 𝑘 = 0, . . . , 𝑝 have the smallest deviance

zprostate_bestglm$Subsets
   (Intercept) lcavol lweight   age  lbph   svi   lcp gleason pgg45
0         TRUE  FALSE   FALSE FALSE FALSE FALSE FALSE   FALSE FALSE
1         TRUE   TRUE   FALSE FALSE FALSE FALSE FALSE   FALSE FALSE
2*        TRUE   TRUE    TRUE FALSE FALSE FALSE FALSE   FALSE FALSE
3         TRUE   TRUE    TRUE FALSE FALSE  TRUE FALSE   FALSE FALSE
4         TRUE   TRUE    TRUE FALSE  TRUE  TRUE FALSE   FALSE FALSE
5         TRUE   TRUE    TRUE FALSE  TRUE  TRUE FALSE   FALSE  TRUE
6         TRUE   TRUE    TRUE FALSE  TRUE  TRUE  TRUE   FALSE  TRUE
7         TRUE   TRUE    TRUE  TRUE  TRUE  TRUE  TRUE   FALSE  TRUE
8         TRUE   TRUE    TRUE  TRUE  TRUE  TRUE  TRUE    TRUE  TRUE
   logLikelihood       BIC
0      -12.14653  24.29306
1       13.68680 -23.16892
2*      19.80840 -31.20741
3       21.84146 -31.06884
4       23.91254 -31.00630
5       24.68243 -28.34139
6       26.31970 -27.41124
7       27.55141 -25.66996
8       27.56383 -21.49012

qTable: a table showing all possible model choices for different intervals of 𝑞.

zprostate_bestglm$qTable
          LogL           q1           q2 k
[1,] -12.14653 0.000000e+00 4.940404e-11 0
[2,]  13.68680 4.940404e-11 1.764939e-02 1
[3,]  19.80840 1.764939e-02 5.125667e-01 2
[4,]  23.91254 5.125667e-01 7.087642e-01 4
[5,]  27.55141 7.087642e-01 8.899197e-01 7
[6,]  27.56383 8.899197e-01 1.000000e+00 8

Log Reg

library(ISLR2)
data("Default")
Default_Xy <- Default |> relocate(default, .after = last_col())
Default_b.glm <- bestglm(Default_Xy, family = binomial(), 
                         IC = "BIC")
Morgan-Tatar search since family is non-gaussian.

BestModel: lm or glm object giving the best model

Default_b.glm$BestModel

Call:  glm(formula = y ~ ., family = family, data = Xi, weights = weights)

Coefficients:
(Intercept)   studentYes      balance  
 -10.749496    -0.714878     0.005738  

Degrees of Freedom: 9999 Total (i.e. Null);  9997 Residual
Null Deviance:      2921 
Residual Deviance: 1572     AIC: 1578

BestModels: a 𝑇 × 𝑝 logical matrix showing which variables are included in the top 𝑇 models

Default_b.glm$BestModels
  student balance income Criterion
1    TRUE    TRUE  FALSE  1590.102
2   FALSE    TRUE   TRUE  1597.387
3    TRUE    TRUE   TRUE  1599.176
4   FALSE    TRUE  FALSE  1605.662
5    TRUE   FALSE  FALSE  2917.893

Log Reg (Penguin)

library(palmerpenguins)
data(penguins)
# Only work with this
penguins_Xy <- penguins |> 
  relocate(sex, .after = last_col()) |> 
  select(ends_with("_mm"), body_mass_g, sex) |> 
  mutate(across(where(is.integer), as.double)) |> 
  filter(!if_any(everything(), ~is.na(.x))) |> 
  as.data.frame() # Must

head(penguins_Xy)
  bill_length_mm bill_depth_mm flipper_length_mm body_mass_g    sex
1           39.1          18.7               181        3750   male
2           39.5          17.4               186        3800 female
3           40.3          18.0               195        3250 female
4           36.7          19.3               193        3450 female
5           39.3          20.6               190        3650   male
6           38.9          17.8               181        3625 female
penguins_bestglm <- bestglm::bestglm(penguins_Xy, 
                                     family = binomial())
Morgan-Tatar search since family is non-gaussian.
penguins_bestglm$BestModels
  bill_length_mm bill_depth_mm flipper_length_mm body_mass_g Criterion
1          FALSE          TRUE             FALSE        TRUE  175.9541
2           TRUE          TRUE             FALSE        TRUE  177.3122
3          FALSE          TRUE              TRUE        TRUE  181.7523
4           TRUE          TRUE              TRUE        TRUE  182.2360
5           TRUE          TRUE              TRUE       FALSE  268.3962