# Chapter 2: Estimation
# Reading materials: Slides 12 - 22 in STA108_LinearRegression_S20.pdf
# Consider the advertising dataset
dat.advertising=read.csv('./data/advertising.csv');
plot(x=dat.advertising$TV, y=dat.advertising$sales, pch=16,cex=1.3,col='blue',xlab='x',ylab='y')
# Suppose that we want to draw a line in this scatter plot
beta=c(4,0.03); # beta0=4; beta1=0.03; # Just two arbitrary numbers
# Visualize the line in the scatter plot
# This time we use the plotting functions in base R (i.e., not ggplot2)
# With plot() and abline(), we can reproduce the plot in Slide 13
plot(x=dat.advertising$TV, y=dat.advertising$sales, pch=16,cex=1.3,col='blue',xlab='x',ylab='y')
abline(a=beta[1],b=beta[2],col='red',lwd=3)
# To fully reproduce the plot in Slide 13, we also draw the vertical deviations:
# for each data point, we need the corresponding point on the line
plot(x=dat.advertising$TV, y=dat.advertising$sales, pch=16,cex=1.3,col='black',xlab='x',ylab='y')
abline(a=beta[1],b=beta[2],col='red',lwd=3)
yout=dat.advertising$TV*beta[2]+beta[1];
for(i in seq_len(nrow(dat.advertising))){
  segments(x0=dat.advertising$TV[i], y0=yout[i], y1=dat.advertising$sales[i], col='blue')
}
# What if we want to feed in different numbers?
# Write a function that can calculate x*beta1 + beta0
linear.model<-function(beta,covariate){
  # beta: a 2-vector, where the first entry is the intercept
  # Note: this function only works for the simple linear regression model
  # How would you generalize it? (one sketch follows below)
  yout=covariate*beta[2]+beta[1]
  return(yout)
}
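# One possible generalization (a sketch, not from the slides): treat the covariates
# as a design matrix, so that the same code handles one or many predictors.
# The name linear.model.general is made up for illustration.
linear.model.general<-function(beta,covariates){
  # beta: a (p+1)-vector whose first entry is the intercept
  X=cbind(1,covariates); # a vector is treated as a single column
  yout=X%*%beta;
  return(as.vector(yout))
}
# Sanity check: should agree with linear.model() in the simple case
all.equal(linear.model.general(c(4,0.03),dat.advertising$TV),
          linear.model(beta=c(4,0.03),covariate=dat.advertising$TV))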
fits=linear.model(beta=c(4,0.03),covariate=dat.advertising$TV);
plot(x=dat.advertising$TV, y=dat.advertising$sales, pch=16,cex=1.3,col='blue',xlab='x',ylab='y')
points(fits~dat.advertising$TV,col='red')
# Furthermore, we can wrap up the plotting code into one function as well
plot.scatter.line<-function(beta,covariate,response){
  plot(x=covariate, y=response, pch=16,cex=1.3,col='blue',xlab='x',ylab='y')
  abline(a=beta[1],b=beta[2],col='red',lwd=3)
  yout=linear.model(beta=beta,covariate=covariate);
  # Loop over the supplied data, not the global data set
  for(i in seq_along(covariate)){
    segments(x0=covariate[i], y0=yout[i], y1=response[i], col='blue')
  }
}
# We can now draw lines
plot.scatter.line(beta=c(4,0.03),covariate=dat.advertising$TV,response=dat.advertising$sales)
# We can see that the line is less than ideal
# Measuring the loss:
# We will now look for the minimizer of the squared error loss
sum.of.squares<-function(beta,covariate,outcome){
  yout=linear.model(beta=beta,covariate=covariate);
  res=outcome-yout; # residuals
  sos=sum(res^2);
  return(sos)
}
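# Before minimizing, we can evaluate the loss at a few candidate lines; a smaller
# value means a better fit. Both candidate betas below are just illustrations.
sum.of.squares(beta=c(4,0.03),covariate=dat.advertising$TV,outcome=dat.advertising$sales)
sum.of.squares(beta=c(7,0.05),covariate=dat.advertising$TV,outcome=dat.advertising$sales)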
# There are many ways to minimize the sum of squares
# 1. Use a generic optimizer in R
fit.optim=optim(c(0,0),sum.of.squares,covariate=dat.advertising$TV,outcome=dat.advertising$sales)
beta.hat.optim=fit.optim$par;
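# optim() uses the Nelder-Mead method by default; a convergence code of 0
# indicates that the optimizer finished successfully
fit.optim$convergence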
# 2. Use the analytic form of the minimizer
fit.linear.model<-function(covariate,outcome){
  # I will write down the function for (multiple) linear regression here
  X=cbind(1,covariate); # design matrix with an intercept column
  # Solve the normal equations: beta.hat = (X'X)^{-1} X'y
  beta.fit=solve( t(X)%*%X )%*%t(X)%*%outcome;
  return(beta.fit)
}
beta.hat=fit.linear.model(covariate=dat.advertising$TV,outcome=dat.advertising$sales)
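# For simple linear regression, the normal equations reduce to scalar formulas:
# beta1.hat = Sxy/Sxx and beta0.hat = ybar - beta1.hat*xbar.
# A minimal sketch of this shortcut (fit.simple is a made-up name):
fit.simple<-function(x,y){
  beta1=sum((x-mean(x))*(y-mean(y)))/sum((x-mean(x))^2);
  beta0=mean(y)-beta1*mean(x);
  return(c(beta0,beta1))
}
fit.simple(dat.advertising$TV,dat.advertising$sales) # should match beta.hat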
# 3. Write your own version of the Newton-Raphson method or gradient descent
# (not required or taught in this class)
# We might want to check if our solution matches that from lm()
fit.advertising=lm(sales~TV+1,data=dat.advertising); # the "+1" makes the intercept explicit; lm() includes it by default
fit.advertising$coef
beta.hat
beta.hat.optim
# Summary of lm()
summary(fit.advertising)
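# summary() returns a list, so individual quantities can be extracted directly;
# $coefficients, $sigma, and $r.squared are standard components of a summary.lm object
summary(fit.advertising)$coefficients # estimates, standard errors, t- and p-values
summary(fit.advertising)$sigma        # residual standard error
summary(fit.advertising)$r.squared    # coefficient of determination (computed by hand below)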
# Decomposition of sum of squares
residual.sum.of.squares=sum.of.squares(beta=beta.hat,covariate=dat.advertising$TV,outcome=dat.advertising$sales)
explained.sum.of.squares=sum((linear.model(beta=beta.hat,covariate=dat.advertising$TV)-mean(dat.advertising$sales))^2)
total.sum.of.squares=sum((dat.advertising$sales-mean(dat.advertising$sales))^2)
# Check whether the decomposition holds: total = explained + residual
total.sum.of.squares
explained.sum.of.squares+residual.sum.of.squares
# But if we use identical()...
identical(total.sum.of.squares,explained.sum.of.squares+residual.sum.of.squares)
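# identical() demands exact bitwise equality, so floating-point rounding makes it
# return FALSE even though the decomposition holds mathematically;
# all.equal() instead compares up to a small numerical tolerance
all.equal(total.sum.of.squares,explained.sum.of.squares+residual.sum.of.squares)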
# Calculate the coefficient of determination
R.sq= explained.sum.of.squares/total.sum.of.squares;
# Calculate the Pearson sample correlation
cor.hat=cor(dat.advertising$TV,dat.advertising$sales)
# Verify the identity: R^2 equals the squared sample correlation
R.sq
cor.hat^2
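# As above, compare up to floating-point tolerance rather than exactly
all.equal(R.sq,cor.hat^2)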
# Q: what if these equalities only hold for this one data set? What would you do to reassure yourself?
# One option is to use simulation and check whether the equality holds in every simulated instance
# Simulate N.sim instances
N.sim=10000;
beta=c(2,0.4);n=50;x=runif(n,min=-2,max=7.5);sigma=2; # x is drawn once and held fixed across instances
out=matrix(0,nrow=N.sim,ncol=2)
for(i in 1:N.sim){
  Ey=x*beta[2]+beta[1];
  error.terms=sigma*rnorm(n);
  y=Ey+error.terms;
  beta.hat=fit.linear.model(covariate=x,outcome=y)
  # Copying code from the previous chunk:
  residual.sum.of.squares=sum.of.squares(beta=beta.hat,covariate=x,outcome=y)
  explained.sum.of.squares=sum((linear.model(beta=beta.hat,covariate=x)-mean(y))^2)
  total.sum.of.squares=sum((y-mean(y))^2)
  R.sq=explained.sum.of.squares/total.sum.of.squares;
  cor.hat=cor(x,y)
  out[i,]=c(R.sq,cor.hat)
}
plot(y=out[,1],x=out[,2],xlab='Correlation',ylab='Coef. of determination')
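# A numerical check across all simulated instances: the largest discrepancy
# between R^2 and the squared correlation should be at floating-point level
max(abs(out[,1]-out[,2]^2))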
# It's better to wrap it up into one function:
N.sim=10000;
beta=c(2,0.4);n=50;x=runif(n,min=-2,max=7.5);sigma=2;
simulate.one.instance<-function(x,beta,sigma){
  n=length(x)
  Ey=x*beta[2]+beta[1];
  error.terms=sigma*rnorm(n);
  y=Ey+error.terms;
  beta.hat=fit.linear.model(covariate=x,outcome=y)
  # Copying code from the previous chunk:
  residual.sum.of.squares=sum.of.squares(beta=beta.hat,covariate=x,outcome=y)
  explained.sum.of.squares=sum((linear.model(beta=beta.hat,covariate=x)-mean(y))^2)
  total.sum.of.squares=sum((y-mean(y))^2)
  R.sq=explained.sum.of.squares/total.sum.of.squares;
  cor.hat=cor(x,y)
  out=c(R.sq,cor.hat)
  return(out)
}
set.seed(1)
sim=replicate(N.sim,simulate.one.instance(x,beta,sigma));
plot(y=sim[1,],x=sim[2,],xlab='Correlation',ylab='Coef. of determination')
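# The same numerical check for the wrapped-up version
max(abs(sim[1,]-sim[2,]^2))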
# However, simulation only tells us that the equality holds in the scenarios that we design
# To establish the equality for all possible scenarios, we still have to use a rigorous derivation
# You can derive the property yourself if you are good at algebra
# Or search online to read others' proofs and explanations
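# For reference, an outline of one standard derivation (simple linear regression):
# write Sxy = sum((x-xbar)*(y-ybar)), with Sxx and Syy defined analogously; then
# beta1.hat = Sxy/Sxx, and every fitted value satisfies yhat_i - ybar = beta1.hat*(x_i - xbar),
# so the explained sum of squares is beta1.hat^2*Sxx = Sxy^2/Sxx, and
# R^2 = (Sxy^2/Sxx)/Syy = (Sxy/sqrt(Sxx*Syy))^2 = cor(x,y)^2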
# Another estimation approach: maximum likelihood under normal errors
# Note: the function below returns the NEGATIVE log-likelihood, since optim() minimizes
log.likelihood<-function(beta.sigma,covariate,outcome){
  # beta.sigma: (beta0, beta1, sigma)
  yout=linear.model(beta=beta.sigma[-3],covariate=covariate);
  res=outcome-yout;
  lglklh=-sum(dnorm(x=res,mean=0,sd=beta.sigma[3],log=TRUE)) # log=TRUE is numerically stabler
  return(lglklh)
}
# Again, use the generic optimizer in R (now over beta0, beta1, and sigma)
fit.lglklh.optim=optim(c(0,0,1),log.likelihood,covariate=dat.advertising$TV,outcome=dat.advertising$sales)
fit.lglklh.optim # maximum likelihood estimates of (beta0, beta1, sigma)
fit.optim        # least-squares fit, for comparison
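# Under normal errors, the MLE of beta coincides with the least-squares estimate,
# while the MLE of sigma divides the residual sum of squares by n (not n-2).
# A quick numerical check (rss.advertising is a new name used only here):
rss.advertising=sum.of.squares(beta=fit.advertising$coef,covariate=dat.advertising$TV,outcome=dat.advertising$sales)
sqrt(rss.advertising/nrow(dat.advertising)) # compare with fit.lglklh.optim$par[3]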