library(bestglm)
Loading required package: leaps
data(zprostate)
library(bestglm)
Loading required package: leaps
data(zprostate)
Outcome: lpsa
= Prostate-Specific Antigen (psa)
::skim(zprostate) skimr
Name | zprostate |
Number of rows | 97 |
Number of columns | 10 |
_______________________ | |
Column type frequency: | |
logical | 1 |
numeric | 9 |
________________________ | |
Group variables | None |
Variable type: logical
skim_variable | n_missing | complete_rate | mean | count |
---|---|---|---|---|
train | 0 | 1 | 0.69 | TRU: 67, FAL: 30 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
lcavol | 0 | 1 | 0.00 | 1.00 | -2.29 | -0.71 | 0.08 | 0.66 | 2.10 | ▃▅▇▆▃ |
lweight | 0 | 1 | 0.00 | 1.00 | -2.93 | -0.59 | -0.01 | 0.58 | 2.69 | ▁▃▇▆▁ |
age | 0 | 1 | 0.00 | 1.00 | -3.07 | -0.52 | 0.15 | 0.56 | 2.03 | ▁▁▅▇▂ |
lbph | 0 | 1 | 0.00 | 1.00 | -1.02 | -1.02 | 0.14 | 1.00 | 1.53 | ▇▁▂▃▅ |
svi | 0 | 1 | 0.00 | 1.00 | -0.52 | -0.52 | -0.52 | -0.52 | 1.89 | ▇▁▁▁▂ |
lcp | 0 | 1 | 0.00 | 1.00 | -0.86 | -0.86 | -0.44 | 0.97 | 2.21 | ▇▂▂▂▂ |
gleason | 0 | 1 | 0.00 | 1.00 | -1.04 | -1.04 | 0.34 | 0.34 | 3.11 | ▅▇▁▁▁ |
pgg45 | 0 | 1 | 0.00 | 1.00 | -0.86 | -0.86 | -0.33 | 0.55 | 2.68 | ▇▃▂▂▁ |
lpsa | 0 | 1 | 2.48 | 1.15 | -0.43 | 1.73 | 2.59 | 3.06 | 5.58 | ▁▅▇▃▁ |
<- (zprostate[zprostate[, 10], ])[, -10]
train <- train[, 1:8]
X <- train[, 9] y
<- summary(regsubsets(x = X, y = y, nvmax = ncol(X)))
out <- out$which
Subsets <- out$rss
RSS
<-
zprostate_regsub_tbl cbind(as.data.frame(Subsets), RSS = RSS) |>
relocate(RSS) |>
mutate(across(where(is.logical), ~ifelse(.x, "*", NA)))
zprostate_regsub_tbl
RSS (Intercept) lcavol lweight age lbph svi lcp gleason pgg45
1 44.52858 * * <NA> <NA> <NA> <NA> <NA> <NA> <NA>
2 37.09185 * * * <NA> <NA> <NA> <NA> <NA> <NA>
3 34.90775 * * * <NA> <NA> * <NA> <NA> <NA>
4 32.81499 * * * <NA> * * <NA> <NA> <NA>
5 32.06945 * * * <NA> * * <NA> <NA> *
6 30.53978 * * * <NA> * * * <NA> *
7 29.43730 * * * * * * * <NA> *
8 29.42638 * * * * * * * * *
# `y` must be the last column
<- cbind(as.data.frame(X), lpsa = y)
Xy
<- bestglm(Xy)
zprostate_bestglm names(zprostate_bestglm)
[1] "BestModel" "BestModels" "Bestq" "qTable" "Subsets"
[6] "Title" "ModelReport"
BestModel
: lm or glm object giving the best model
$BestModel zprostate_bestglm
Call:
lm(formula = y ~ ., data = data.frame(Xy[, c(bestset[-1], FALSE),
drop = FALSE], y = y))
Coefficients:
(Intercept) lcavol lweight
2.4774 0.7397 0.3163
BestModels
: a 𝑇 × 𝑝 logical matrix showing which variables are included in the top 𝑇 models
$BestModels zprostate_bestglm
lcavol lweight age lbph svi lcp gleason pgg45 Criterion
1 TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE -31.20741
2 TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE -31.06884
3 TRUE TRUE FALSE TRUE TRUE FALSE FALSE FALSE -31.00630
4 TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE -30.06625
5 TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE -29.06138
Subsets
: a (𝑝 + 1) × 𝑝 logical matrix showing which variables are included for subset sizes 𝑘 = 0, . . . , 𝑝 have the smallest deviance
$Subsets zprostate_bestglm
(Intercept) lcavol lweight age lbph svi lcp gleason pgg45
0 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
1 TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
2* TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
3 TRUE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE
4 TRUE TRUE TRUE FALSE TRUE TRUE FALSE FALSE FALSE
5 TRUE TRUE TRUE FALSE TRUE TRUE FALSE FALSE TRUE
6 TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE
7 TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
8 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
logLikelihood BIC
0 -12.14653 24.29306
1 13.68680 -23.16892
2* 19.80840 -31.20741
3 21.84146 -31.06884
4 23.91254 -31.00630
5 24.68243 -28.34139
6 26.31970 -27.41124
7 27.55141 -25.66996
8 27.56383 -21.49012
qTable
: a table showing all possible model choices for different intervals of 𝑞.
$qTable zprostate_bestglm
LogL q1 q2 k
[1,] -12.14653 0.000000e+00 4.940404e-11 0
[2,] 13.68680 4.940404e-11 1.764939e-02 1
[3,] 19.80840 1.764939e-02 5.125667e-01 2
[4,] 23.91254 5.125667e-01 7.087642e-01 4
[5,] 27.55141 7.087642e-01 8.899197e-01 7
[6,] 27.56383 8.899197e-01 1.000000e+00 8
library(ISLR2)
data("Default")
<- Default |> relocate(default, .after = last_col()) Default_Xy
<- bestglm(Default_Xy, family = binomial(),
Default_b.glm IC = "BIC")
Morgan-Tatar search since family is non-gaussian.
BestModel
: lm or glm object giving the best model
$BestModel Default_b.glm
Call: glm(formula = y ~ ., family = family, data = Xi, weights = weights)
Coefficients:
(Intercept) studentYes balance
-10.749496 -0.714878 0.005738
Degrees of Freedom: 9999 Total (i.e. Null); 9997 Residual
Null Deviance: 2921
Residual Deviance: 1572 AIC: 1578
BestModels
: a 𝑇 × 𝑝 logical matrix showing which variables are included in the top 𝑇 models
$BestModels Default_b.glm
student balance income Criterion
1 TRUE TRUE FALSE 1590.102
2 FALSE TRUE TRUE 1597.387
3 TRUE TRUE TRUE 1599.176
4 FALSE TRUE FALSE 1605.662
5 TRUE FALSE FALSE 2917.893
library(palmerpenguins)
data(penguins)
# Only work with this
<- penguins |>
penguins_Xy relocate(sex, .after = last_col()) |>
select(ends_with("_mm"), body_mass_g, sex) |>
mutate(across(where(is.integer), as.double)) |>
filter(!if_any(everything(), ~is.na(.x))) |>
as.data.frame() # Must
head(penguins_Xy)
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
1 39.1 18.7 181 3750 male
2 39.5 17.4 186 3800 female
3 40.3 18.0 195 3250 female
4 36.7 19.3 193 3450 female
5 39.3 20.6 190 3650 male
6 38.9 17.8 181 3625 female
<- bestglm::bestglm(penguins_Xy,
penguins_bestglm family = binomial())
Morgan-Tatar search since family is non-gaussian.
$BestModels penguins_bestglm
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g Criterion
1 FALSE TRUE FALSE TRUE 175.9541
2 TRUE TRUE FALSE TRUE 177.3122
3 FALSE TRUE TRUE TRUE 181.7523
4 TRUE TRUE TRUE TRUE 182.2360
5 TRUE TRUE TRUE FALSE 268.3962