Chapter 8 Model selection

library(AER)
data("Fatalities")
dat.fatalities = Fatalities[,c('fatal','spirits','unemp','income','beertax','baptist','mormon','drinkage','dry','youngdrivers','miles','gsp')]

8.1 Criteria

Reading materials: Slides 166 - 180 in STA108_LinearRegression_S20.pdf.

# Information criteria


fit.full=lm(fatal~spirits+unemp+income+beertax+baptist+mormon+drinkage+dry+youngdrivers+miles+gsp,data=dat.fatalities);
fit.six=lm(fatal~spirits+unemp+income+beertax+baptist+mormon,data=dat.fatalities)

BIC(fit.full)
BIC(fit.six)


AIC(fit.full)
AIC(fit.six) 
# cross-validation:
# We will use a package for this:
library(boot)


fit.full.g=glm(fatal~spirits+unemp+income+beertax+baptist+mormon+drinkage+dry+youngdrivers+miles+gsp,data=dat.fatalities);
fit.six.g=glm(fatal~spirits+unemp+income+beertax+baptist+mormon,data=dat.fatalities)


set.seed(1)
cv.full.10=cv.glm(dat.fatalities,fit.full.g,K=10)
cv.full.10$delta

cv.six.10=cv.glm(dat.fatalities,fit.six.g,K=10)
cv.six.10$delta

# From the helpfile, delta is:

# A vector of length two. The first component is the raw cross-validation estimate of prediction error. The second component is the adjusted cross-validation estimate. The adjustment is designed to compensate for the bias introduced by not using leave-one-out cross-validation.

8.2 Selection procedure

Reading materials: Slides 181 - 190 in STA108_LinearRegression_S20.pdf.

# Model selection procedure 

# Forward selection & AIC
step(lm(fatal~spirits+unemp+income+beertax+baptist+mormon+drinkage+dry+youngdrivers+miles+gsp,data=dat.fatalities),direction="forward",k=2) 


# Backward selection & AIC
step(lm(fatal~spirits+unemp+income+beertax+baptist+mormon+drinkage+dry+youngdrivers+miles+gsp,data=dat.fatalities),direction="backward", k=2)



# Backward selection & BIC
step(lm(fatal~spirits+unemp+income+beertax+baptist+mormon+drinkage+dry+youngdrivers+miles+gsp,data=dat.fatalities),direction="backward", k=log(dim(dat.fatalities)[1]))
# Penalized regression: lasso
library(glmnet)

y=dat.fatalities$fatal; #Number of vehicle fatalities.
X=as.matrix(dat.fatalities[,c('spirits','unemp','income','beertax','baptist','mormon','drinkage','dry','youngdrivers','miles','gsp')]);


fit.cv.glmnet=cv.glmnet(y=y,x=X,nfold=10,family='gaussian');
        
fit.cv.glmnet$lambda.min # selected tuning parameter

fit.glmnet=glmnet(x=X,y=y,family='gaussian',alpha=1,lambda=fit.cv.glmnet$lambda.min)

fit.glmnet$beta # One of the fitted coefficient is zero