Stat 410/510: Backwards, Forwards, and Stepwise variable selection
> birds <- read.table("http://users.humboldt.edu//rizzardi//Data.dir/bird.txt",
+ header=T, skip=15)
> attach( birds )
>
> # Use age class (age, adult=1, juvenile=2), total length (tl), wing span (ae),
> # femur length (thigh bone, fl), and humerus (arm bone, hl) to predict weight (wt)
> fit.all <- lm( wt ~ ae + ag + fl + hl + tl )
> summary( fit.all )
Call:
lm(formula = wt ~ ae + ag + fl + hl + tl)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -18.055780 7.640757 -2.363 0.020520 *
ae 0.006812 0.050789 0.134 0.893635
ag -0.167180 0.273993 -0.610 0.543463
fl -5.795307 11.288314 -0.513 0.609076
hl 24.462067 12.245164 1.998 0.049107 *
tl 0.177890 0.051351 3.464 0.000853 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.162 on 81 degrees of freedom
Multiple R-squared: 0.3718, Adjusted R-squared: 0.3331
F-statistic: 9.589 on 5 and 81 DF, p-value: 3.245e-07
> anova(fit.all)
Analysis of Variance Table
Response: wt
Df Sum Sq Mean Sq F value Pr(>F)
ae 1 43.345 43.345 32.1142 2.156e-07 ***
ag 1 0.085 0.085 0.0629 0.8026650
fl 1 2.720 2.720 2.0151 0.1595742
hl 1 2.367 2.367 1.7534 0.1891771
tl 1 16.197 16.197 12.0008 0.0008527 ***
Residuals 81 109.326 1.350
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
> # can define a formula object
> formula.all <- formula(wt~ae+ag+fl+hl+tl)
> formula.all
wt ~ ae + ag + fl + hl + tl
> lm(formula.all) # same as fit.all
Call:
lm(formula = formula.all)
Coefficients:
(Intercept) ae ag fl hl tl
-18.055780 0.006812 -0.167180 -5.795307 24.462067 0.177890
>
> drop1(fit.all) # uses AIC statistic as criteria
Single term deletions
Model:
wt ~ ae + ag + fl + hl + tl
Df Sum of Sq RSS AIC
<none> 109.33 31.873
ae 1 0.0243 109.35 29.892
ag 1 0.5025 109.83 30.272
fl 1 0.3557 109.68 30.156
hl 1 5.3864 114.71 34.057
tl 1 16.1975 125.52 41.893
> # AIC = n*log(RSS/n)+2p, where p=number of parameters
> # Goal: minimize AIC
> # confirming AIC for fit.all
> 87 * log( sum(residuals(fit.all)^2) / 87) + 2*6
[1] 31.87317
> extractAIC(fit.all) # same thing
[1] 6.00000 31.87317
>
> drop1(fit.all,test="F") # uses so called "type II" ANOVA - decrease in RSS.
Single term deletions
Model:
wt ~ ae + ag + fl + hl + tl
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 109.33 31.873
ae 1 0.0243 109.35 29.892 0.0180 0.8936349
ag 1 0.5025 109.83 30.272 0.3723 0.5434627
fl 1 0.3557 109.68 30.156 0.2636 0.6090761
hl 1 5.3864 114.71 34.057 3.9908 0.0491067 *
tl 1 16.1975 125.52 41.893 12.0008 0.0008527 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
> step( fit.all, direction="backward" ) # repeated uses drop1()
Start: AIC=31.87
wt ~ ae + ag + fl + hl + tl
Df Sum of Sq RSS AIC
- ae 1 0.0243 109.35 29.892
- fl 1 0.3557 109.68 30.156
- ag 1 0.5025 109.83 30.272
<none> 109.33 31.873
- hl 1 5.3864 114.71 34.057
- tl 1 16.1975 125.52 41.893
Step: AIC=29.89
wt ~ ag + fl + hl + tl
Df Sum of Sq RSS AIC
- fl 1 0.3429 109.69 28.165
- ag 1 0.5708 109.92 28.345
<none> 109.35 29.892
- hl 1 6.7828 116.13 33.128
- tl 1 23.3115 132.66 44.705
Step: AIC=28.16
wt ~ ag + hl + tl
Df Sum of Sq RSS AIC
- ag 1 0.5303 110.22 26.584
<none> 109.69 28.165
- hl 1 16.1467 125.84 38.112
- tl 1 23.2088 132.90 42.862
Step: AIC=26.58
wt ~ hl + tl
Df Sum of Sq RSS AIC
<none> 110.22 26.584
- hl 1 15.811 126.03 36.246
- tl 1 22.948 133.17 41.038
Call:
lm(formula = wt ~ hl + tl)
Coefficients:
(Intercept) hl tl
-17.0972 20.0488 0.1755
> step( fit.all, direction="backward", test="F", trace=F ) # selects same model
Call:
lm(formula = wt ~ hl + tl)
Coefficients:
(Intercept) hl tl
-17.0972 20.0488 0.1755
>
> # can define a formula object
> formula.all <- formula(wt~ae+ag+fl+hl+tl)
> formula.all
wt ~ ae + ag + fl + hl + tl
> lm(formula.all) # same as fit.all
Call:
lm(formula = formula.all)
Coefficients:
(Intercept) ae ag fl hl tl
-18.055780 0.006812 -0.167180 -5.795307 24.462067 0.177890
>
> fit.null <- lm( wt ~ 1 )
> add1( fit.null, scope=formula.all ) # formula.all sets range of choices
Single term additions
Model:
wt ~ 1
Df Sum of Sq RSS AIC
<none> 174.04 62.324
ae 1 43.345 130.69 39.405
ag 1 0.010 174.03 64.319
fl 1 34.197 139.84 45.291
hl 1 40.868 133.17 41.038
tl 1 48.005 126.03 36.246
>
> # "forward" direction repeated uses add1()
> step( fit.null, direction="forward", scope=formula.all ) # same model again
Start: AIC=62.32
wt ~ 1
Df Sum of Sq RSS AIC
+ tl 1 48.005 126.03 36.246
+ ae 1 43.345 130.69 39.405
+ hl 1 40.868 133.17 41.038
+ fl 1 34.197 139.84 45.291
<none> 174.04 62.324
+ ag 1 0.010 174.03 64.319
Step: AIC=36.25
wt ~ tl
Df Sum of Sq RSS AIC
+ hl 1 15.8111 110.22 26.584
+ fl 1 9.5958 116.44 31.357
+ ae 1 8.4463 117.59 32.212
<none> 126.03 36.246
+ ag 1 0.1947 125.84 38.112
Step: AIC=26.58
wt ~ tl + hl
Df Sum of Sq RSS AIC
<none> 110.22 26.584
+ ag 1 0.53026 109.69 28.165
+ fl 1 0.30238 109.92 28.345
+ ae 1 0.06409 110.16 28.534
Call:
lm(formula = wt ~ tl + hl)
Coefficients:
(Intercept) tl hl
-17.0972 0.1755 20.0488
>
> # Use best of "forward" and "backward"
> fit.middle <- lm(wt~ ae+ag+tl)
> step( fit.middle, direction="both", scope=formula.all ) # same model again
Start: AIC=34.19
wt ~ ae + ag + tl
Df Sum of Sq RSS AIC
+ hl 1 7.8830 109.68 30.156
- ag 1 0.0236 117.59 32.212
+ fl 1 2.8523 114.71 34.057
<none> 117.56 34.194
- ae 1 8.2751 125.84 38.112
- tl 1 13.0452 130.61 41.349
Step: AIC=30.16
wt ~ ae + ag + tl + hl
Df Sum of Sq RSS AIC
- ae 1 0.0114 109.69 28.165
- ag 1 0.4776 110.16 28.534
<none> 109.68 30.156
+ fl 1 0.3557 109.33 31.873
- hl 1 7.8830 117.56 34.194
- tl 1 15.8422 125.52 39.893
Step: AIC=28.16
wt ~ ag + tl + hl
Df Sum of Sq RSS AIC
- ag 1 0.5303 110.22 26.584
<none> 109.69 28.165
+ fl 1 0.3429 109.35 29.892
+ ae 1 0.0114 109.68 30.156
- hl 1 16.1467 125.84 38.112
- tl 1 23.2088 132.90 42.862
Step: AIC=26.58
wt ~ tl + hl
Df Sum of Sq RSS AIC
<none> 110.22 26.584
+ ag 1 0.5303 109.69 28.165
+ fl 1 0.3024 109.92 28.345
+ ae 1 0.0641 110.16 28.534
- hl 1 15.8111 126.03 36.246
- tl 1 22.9480 133.17 41.038
Call:
lm(formula = wt ~ tl + hl)
Coefficients:
(Intercept) tl hl
-17.0972 0.1755 20.0488
pairs(formula.all)
birds <- read.table("http://users.humboldt.edu//rizzardi//Data.dir/bird.txt",
header=T, skip=15)
attach( birds )
# Use age class (age, adult=1, juvenile=2), total length (tl), wing span (ae),
# femur length (thigh bone, fl), and humerus (arm bone, hl) to predict weight (wt)
fit.all <- lm( wt ~ ae + ag + fl + hl + tl )
summary( fit.all )
anova(fit.all)
# can define a formula object
formula.all <- formula(wt~ae+ag+fl+hl+tl)
formula.all
lm(formula.all) # same as fit.all
drop1(fit.all) # uses AIC statistic as criteria
# AIC = n*log(RSS/n)+2p, where p=number of parameters
# Goal: minimize AIC
# confirming AIC for fit.all
87 * log( sum(residuals(fit.all)^2) / 87) + 2*6
extractAIC(fit.all) # same thing
drop1(fit.all,test="F") # uses so called "type II" ANOVA - decrease in RSS.
step( fit.all, direction="backward" ) # repeated uses drop1()
step( fit.all, direction="backward", test="F", trace=F ) # selects same model
# can define a formula object
formula.all <- formula(wt~ae+ag+fl+hl+tl)
formula.all
lm(formula.all) # same as fit.all
fit.null <- lm( wt ~ 1 )
add1( fit.null, scope=formula.all ) # formula.all sets range of choices
# "forward" direction repeated uses add1()
step( fit.null, direction="forward", scope=formula.all ) # same model again
# Use best of "forward" and "backward"
fit.middle <- lm(wt~ ae+ag+tl)
step( fit.middle, direction="both", scope=formula.all ) # same model again
pairs(formula.all)
5