Best GLM

library(bestglm)

Loading required package: leaps

data(zprostate)

Outcome: lpsa = Prostate-Specific Antigen (psa)

skimr::skim(zprostate)

Data summary
Name	zprostate
Number of rows	97
Number of columns	10
_______________________
Column type frequency:
logical	1
numeric	9
________________________
Group variables	None

Variable type: logical

skim_variable	n_missing	complete_rate	mean	count
train	0	1	0.69	TRU: 67, FAL: 30

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
lcavol	1	0.00	1.00	-2.29	-0.71	0.08	0.66	2.10	▃▅▇▆▃
lweight	1	0.00	1.00	-2.93	-0.59	-0.01	0.58	2.69	▁▃▇▆▁
age	1	0.00	1.00	-3.07	-0.52	0.15	0.56	2.03	▁▁▅▇▂
lbph	1	0.00	1.00	-1.02	-1.02	0.14	1.00	1.53	▇▁▂▃▅
svi	1	0.00	1.00	-0.52	-0.52	-0.52	-0.52	1.89	▇▁▁▁▂
lcp	1	0.00	1.00	-0.86	-0.86	-0.44	0.97	2.21	▇▂▂▂▂
gleason	1	0.00	1.00	-1.04	-1.04	0.34	0.34	3.11	▅▇▁▁▁
pgg45	1	0.00	1.00	-0.86	-0.86	-0.33	0.55	2.68	▇▃▂▂▁
lpsa	1	2.48	1.15	-0.43	1.73	2.59	3.06	5.58	▁▅▇▃▁

Regsubset

train <- (zprostate[zprostate[, 10], ])[, -10]
X <- train[, 1:8]
y <- train[, 9]

out <- summary(regsubsets(x = X, y = y, nvmax = ncol(X)))
Subsets <- out$which
RSS <- out$rss


zprostate_regsub_tbl <-
  cbind(as.data.frame(Subsets), RSS = RSS) |> 
  relocate(RSS) |> 
  mutate(across(where(is.logical), ~ifelse(.x, "*", NA)))

zprostate_regsub_tbl

       RSS (Intercept) lcavol lweight  age lbph  svi  lcp gleason pgg45
1 44.52858           *      *    <NA> <NA> <NA> <NA> <NA>    <NA>  <NA>
2 37.09185           *      *       * <NA> <NA> <NA> <NA>    <NA>  <NA>
3 34.90775           *      *       * <NA> <NA>    * <NA>    <NA>  <NA>
4 32.81499           *      *       * <NA>    *    * <NA>    <NA>  <NA>
5 32.06945           *      *       * <NA>    *    * <NA>    <NA>     *
6 30.53978           *      *       * <NA>    *    *    *    <NA>     *
7 29.43730           *      *       *    *    *    *    *    <NA>     *
8 29.42638           *      *       *    *    *    *    *       *     *

Best GLM

Linear Reg

# `y` must be the last column
Xy <- cbind(as.data.frame(X), lpsa = y)

zprostate_bestglm <- bestglm(Xy)
names(zprostate_bestglm)

[1] "BestModel"   "BestModels"  "Bestq"       "qTable"      "Subsets"    
[6] "Title"       "ModelReport"

BestModel: lm or glm object giving the best model

zprostate_bestglm$BestModel


Call:
lm(formula = y ~ ., data = data.frame(Xy[, c(bestset[-1], FALSE), 
    drop = FALSE], y = y))

Coefficients:
(Intercept)       lcavol      lweight  
     2.4774       0.7397       0.3163

BestModels: a 𝑇 × 𝑝 logical matrix showing which variables are included in the top 𝑇 models

zprostate_bestglm$BestModels

  lcavol lweight   age  lbph   svi   lcp gleason pgg45 Criterion
1   TRUE    TRUE FALSE FALSE FALSE FALSE   FALSE FALSE -31.20741
2   TRUE    TRUE FALSE FALSE  TRUE FALSE   FALSE FALSE -31.06884
3   TRUE    TRUE FALSE  TRUE  TRUE FALSE   FALSE FALSE -31.00630
4   TRUE    TRUE FALSE FALSE FALSE FALSE   FALSE  TRUE -30.06625
5   TRUE   FALSE FALSE  TRUE  TRUE FALSE   FALSE FALSE -29.06138

Subsets: a (𝑝 + 1) × 𝑝 logical matrix showing which variables are included for subset sizes 𝑘 = 0, . . . , 𝑝 have the smallest deviance

zprostate_bestglm$Subsets

   (Intercept) lcavol lweight   age  lbph   svi   lcp gleason pgg45
0         TRUE  FALSE   FALSE FALSE FALSE FALSE FALSE   FALSE FALSE
1         TRUE   TRUE   FALSE FALSE FALSE FALSE FALSE   FALSE FALSE
2*        TRUE   TRUE    TRUE FALSE FALSE FALSE FALSE   FALSE FALSE
3         TRUE   TRUE    TRUE FALSE FALSE  TRUE FALSE   FALSE FALSE
4         TRUE   TRUE    TRUE FALSE  TRUE  TRUE FALSE   FALSE FALSE
5         TRUE   TRUE    TRUE FALSE  TRUE  TRUE FALSE   FALSE  TRUE
6         TRUE   TRUE    TRUE FALSE  TRUE  TRUE  TRUE   FALSE  TRUE
7         TRUE   TRUE    TRUE  TRUE  TRUE  TRUE  TRUE   FALSE  TRUE
8         TRUE   TRUE    TRUE  TRUE  TRUE  TRUE  TRUE    TRUE  TRUE
   logLikelihood       BIC
0      -12.14653  24.29306
1       13.68680 -23.16892
2*      19.80840 -31.20741
3       21.84146 -31.06884
4       23.91254 -31.00630
5       24.68243 -28.34139
6       26.31970 -27.41124
7       27.55141 -25.66996
8       27.56383 -21.49012

qTable: a table showing all possible model choices for different intervals of 𝑞.

zprostate_bestglm$qTable

          LogL           q1           q2 k
[1,] -12.14653 0.000000e+00 4.940404e-11 0
[2,]  13.68680 4.940404e-11 1.764939e-02 1
[3,]  19.80840 1.764939e-02 5.125667e-01 2
[4,]  23.91254 5.125667e-01 7.087642e-01 4
[5,]  27.55141 7.087642e-01 8.899197e-01 7
[6,]  27.56383 8.899197e-01 1.000000e+00 8

Log Reg

library(ISLR2)
data("Default")

Default_Xy <- Default |> relocate(default, .after = last_col())

Default_b.glm <- bestglm(Default_Xy, family = binomial(), 
                         IC = "BIC")

Morgan-Tatar search since family is non-gaussian.

BestModel: lm or glm object giving the best model

Default_b.glm$BestModel


Call:  glm(formula = y ~ ., family = family, data = Xi, weights = weights)

Coefficients:
(Intercept)   studentYes      balance  
 -10.749496    -0.714878     0.005738  

Degrees of Freedom: 9999 Total (i.e. Null);  9997 Residual
Null Deviance:      2921 
Residual Deviance: 1572     AIC: 1578

BestModels: a 𝑇 × 𝑝 logical matrix showing which variables are included in the top 𝑇 models

Default_b.glm$BestModels

  student balance income Criterion
1    TRUE    TRUE  FALSE  1590.102
2   FALSE    TRUE   TRUE  1597.387
3    TRUE    TRUE   TRUE  1599.176
4   FALSE    TRUE  FALSE  1605.662
5    TRUE   FALSE  FALSE  2917.893

Log Reg (Penguin)

library(palmerpenguins)
data(penguins)
# Only work with this
penguins_Xy <- penguins |> 
  relocate(sex, .after = last_col()) |> 
  select(ends_with("_mm"), body_mass_g, sex) |> 
  mutate(across(where(is.integer), as.double)) |> 
  filter(!if_any(everything(), ~is.na(.x))) |> 
  as.data.frame() # Must

head(penguins_Xy)

  bill_length_mm bill_depth_mm flipper_length_mm body_mass_g    sex
1           39.1          18.7               181        3750   male
2           39.5          17.4               186        3800 female
3           40.3          18.0               195        3250 female
4           36.7          19.3               193        3450 female
5           39.3          20.6               190        3650   male
6           38.9          17.8               181        3625 female

penguins_bestglm <- bestglm::bestglm(penguins_Xy, 
                                     family = binomial())

Morgan-Tatar search since family is non-gaussian.

penguins_bestglm$BestModels

  bill_length_mm bill_depth_mm flipper_length_mm body_mass_g Criterion
1          FALSE          TRUE             FALSE        TRUE  175.9541
2           TRUE          TRUE             FALSE        TRUE  177.3122
3          FALSE          TRUE              TRUE        TRUE  181.7523
4           TRUE          TRUE              TRUE        TRUE  182.2360
5           TRUE          TRUE              TRUE       FALSE  268.3962