Chapter 8 Model selection
library(AER)
data("Fatalities")
dat.fatalities = Fatalities[,c('fatal','spirits','unemp','income','beertax','baptist','mormon','drinkage','dry','youngdrivers','miles','gsp')]
8.1 Criteria
Reading materials: Slides 166 - 180 in STA108_LinearRegression_S20.pdf.
# Information criteria
fit.full=lm(fatal~spirits+unemp+income+beertax+baptist+mormon+drinkage+dry+youngdrivers+miles+gsp,data=dat.fatalities);
fit.six=lm(fatal~spirits+unemp+income+beertax+baptist+mormon,data=dat.fatalities)
BIC(fit.full)
BIC(fit.six)
AIC(fit.full)
AIC(fit.six)
# cross-validation:
# We will use a package for this:
library(boot)
fit.full.g=glm(fatal~spirits+unemp+income+beertax+baptist+mormon+drinkage+dry+youngdrivers+miles+gsp,data=dat.fatalities);
fit.six.g=glm(fatal~spirits+unemp+income+beertax+baptist+mormon,data=dat.fatalities)
set.seed(1)
cv.full.10=cv.glm(dat.fatalities,fit.full.g,K=10)
cv.full.10$delta
cv.six.10=cv.glm(dat.fatalities,fit.six.g,K=10)
cv.six.10$delta
# From the helpfile, delta is:
# A vector of length two. The first component is the raw cross-validation estimate of prediction error. The second component is the adjusted cross-validation estimate. The adjustment is designed to compensate for the bias introduced by not using leave-one-out cross-validation.
8.2 Selection procedure
Reading materials: Slides 181 - 190 in STA108_LinearRegression_S20.pdf.
# Model selection procedure
# Forward selection & AIC
step(lm(fatal~spirits+unemp+income+beertax+baptist+mormon+drinkage+dry+youngdrivers+miles+gsp,data=dat.fatalities),direction="forward",k=2)
# Backward selection & AIC
step(lm(fatal~spirits+unemp+income+beertax+baptist+mormon+drinkage+dry+youngdrivers+miles+gsp,data=dat.fatalities),direction="backward", k=2)
# Backward selection & BIC
step(lm(fatal~spirits+unemp+income+beertax+baptist+mormon+drinkage+dry+youngdrivers+miles+gsp,data=dat.fatalities),direction="backward", k=log(dim(dat.fatalities)[1]))
# Penalized regression: lasso
library(glmnet)
y=dat.fatalities$fatal; #Number of vehicle fatalities.
X=as.matrix(dat.fatalities[,c('spirits','unemp','income','beertax','baptist','mormon','drinkage','dry','youngdrivers','miles','gsp')]);
fit.cv.glmnet=cv.glmnet(y=y,x=X,nfold=10,family='gaussian');
fit.cv.glmnet$lambda.min # selected tuning parameter
fit.glmnet=glmnet(x=X,y=y,family='gaussian',alpha=1,lambda=fit.cv.glmnet$lambda.min)
fit.glmnet$beta # One of the fitted coefficient is zero