# Chapter 2: Estimation
# Reading materials: Slides 12 - 22 in STA108_LinearRegression_S20.pdf
# Consider the advertising dataset
dat.advertising=read.csv('./data/advertising.csv');
plot(x=dat.advertising$TV, y=dat.advertising$sales, pch=16,cex=1.3,col='blue',xlab='x',ylab='y')
# Suppose that we want to draw a line in this scatter plot
beta=c(4,0.03); # beta0=4; beta1=0.03; # Just two arbitrary numbers
# Visualize the line in the scatter plot
# This time we use the plotting functions in base R (i.e., not ggplot2)
# With plot() and abline(), we can reproduce the plot in Slide 13
plot(x=dat.advertising$TV, y=dat.advertising$sales, pch=16,cex=1.3,col='blue',xlab='x',ylab='y')
abline(a=beta[1],b=beta[2],col='red',lwd=3)
# To fully reproduce the plot in Slide 13, we also draw the vertical deviations:
# for each data point, we need the corresponding point on the line
plot(x=dat.advertising$TV, y=dat.advertising$sales, pch=16,cex=1.3,col='black',xlab='x',ylab='y')
abline(a=beta[1],b=beta[2],col='red',lwd=3)
yout=dat.advertising$TV*beta[2]+beta[1];
for(i in seq_len(nrow(dat.advertising))){
  segments(x0=dat.advertising$TV[i], y0=yout[i], y1=dat.advertising$sales[i], col='blue')
}
# What if we want to feed in different numbers?
# Write a function that can calculate x*beta1 + beta0
linear.model<-function(beta,covariate){
  # beta: a 2-vector, where the first entry is the intercept
  # Note: this function only works for the simple linear regression model
  # How would you generalize it? (one sketch follows below)
  yout=covariate*beta[2]+beta[1]
  return(yout)
}
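# One possible generalization (a sketch, not from the slides): treat the covariates
# as a design matrix, so that the same code handles one or many predictors.
# The name linear.model.general is made up for illustration.
linear.model.general<-function(beta,covariates){
  # beta: a (p+1)-vector whose first entry is the intercept
  X=cbind(1,covariates); # a vector is treated as a single column
  yout=X%*%beta;
  return(as.vector(yout))
}
# Sanity check: should agree with linear.model() in the simple case
all.equal(linear.model.general(c(4,0.03),dat.advertising$TV),
          linear.model(beta=c(4,0.03),covariate=dat.advertising$TV))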
fits=linear.model(beta=c(4,0.03),covariate=dat.advertising$TV);
plot(x=dat.advertising$TV, y=dat.advertising$sales, pch=16,cex=1.3,col='blue',xlab='x',ylab='y')
points(fits~dat.advertising$TV,col='red')
# Furthermore, we can wrap up the plotting code into one function as well
plot.scatter.line<-function(beta,covariate,response){
  plot(x=covariate, y=response, pch=16,cex=1.3,col='blue',xlab='x',ylab='y')
  abline(a=beta[1],b=beta[2],col='red',lwd=3)
  yout=linear.model(beta=beta,covariate=covariate);
  # Loop over the supplied data, not the global data set
  for(i in seq_along(covariate)){
    segments(x0=covariate[i], y0=yout[i], y1=response[i], col='blue')
  }
}
# We can now draw lines
plot.scatter.line(beta=c(4,0.03),covariate=dat.advertising$TV,response=dat.advertising$sales)
# We can see that the line is less than ideal
# Measuring the loss:
# We will now look for the minimizer of the squared error loss
sum.of.squares<-function(beta,covariate,outcome){
  yout=linear.model(beta=beta,covariate=covariate);
  res=outcome-yout; # residuals
  sos=sum(res^2);
  return(sos)
}
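# Before minimizing, we can evaluate the loss at a few candidate lines; a smaller
# value means a better fit. Both candidate betas below are just illustrations.
sum.of.squares(beta=c(4,0.03),covariate=dat.advertising$TV,outcome=dat.advertising$sales)
sum.of.squares(beta=c(7,0.05),covariate=dat.advertising$TV,outcome=dat.advertising$sales)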
# There are many ways to minimize the sum of squares
# 1. Use a generic optimizer in R
fit.optim=optim(c(0,0),sum.of.squares,covariate=dat.advertising$TV,outcome=dat.advertising$sales)
beta.hat.optim=fit.optim$par;
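# optim() uses the Nelder-Mead method by default; a convergence code of 0
# indicates that the optimizer finished successfully
fit.optim$convergence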
# 2. Use the analytic form of the minimizer
fit.linear.model<-function(covariate,outcome){
  # I will write down the function for (multiple) linear regression here
  X=cbind(1,covariate); # design matrix with an intercept column
  # Solve the normal equations: beta.hat = (X'X)^{-1} X'y
  beta.fit=solve( t(X)%*%X )%*%t(X)%*%outcome;
  return(beta.fit)
}
beta.hat=fit.linear.model(covariate=dat.advertising$TV,outcome=dat.advertising$sales)
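# For simple linear regression, the normal equations reduce to scalar formulas:
# beta1.hat = Sxy/Sxx and beta0.hat = ybar - beta1.hat*xbar.
# A minimal sketch of this shortcut (fit.simple is a made-up name):
fit.simple<-function(x,y){
  beta1=sum((x-mean(x))*(y-mean(y)))/sum((x-mean(x))^2);
  beta0=mean(y)-beta1*mean(x);
  return(c(beta0,beta1))
}
fit.simple(dat.advertising$TV,dat.advertising$sales) # should match beta.hat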
# 3. Write your own version of the Newton-Raphson method or gradient descent
# (not required or taught in this class)
# We might want to check if our solution matches that from lm()
fit.advertising=lm(sales~TV+1,data=dat.advertising); # the "+1" makes the intercept explicit; lm() includes it by default
fit.advertising$coef
beta.hat
beta.hat.optim
# Summary of lm()
summary(fit.advertising)
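# summary() returns a list, so individual quantities can be extracted directly;
# $coefficients, $sigma, and $r.squared are standard components of a summary.lm object
summary(fit.advertising)$coefficients # estimates, standard errors, t- and p-values
summary(fit.advertising)$sigma        # residual standard error
summary(fit.advertising)$r.squared    # coefficient of determination (computed by hand below)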
# Decomposition of sum of squares
residual.sum.of.squares=sum.of.squares(beta=beta.hat,covariate=dat.advertising$TV,outcome=dat.advertising$sales)
explained.sum.of.squares=sum((linear.model(beta=beta.hat,covariate=dat.advertising$TV)-mean(dat.advertising$sales))^2)
total.sum.of.squares=sum((dat.advertising$sales-mean(dat.advertising$sales))^2)
# Check whether the decomposition holds: total = explained + residual
total.sum.of.squares
explained.sum.of.squares+residual.sum.of.squares
# But if we use identical()...
identical(total.sum.of.squares,explained.sum.of.squares+residual.sum.of.squares)
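# identical() demands exact bitwise equality, so floating-point rounding makes it
# return FALSE even though the decomposition holds mathematically;
# all.equal() instead compares up to a small numerical tolerance
all.equal(total.sum.of.squares,explained.sum.of.squares+residual.sum.of.squares)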
# Calculate the coefficient of determination
R.sq= explained.sum.of.squares/total.sum.of.squares;
# Calculate the Pearson sample correlation
cor.hat=cor(dat.advertising$TV,dat.advertising$sales)
# Verify the identity: R^2 equals the squared sample correlation
R.sq
cor.hat^2
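# As above, compare up to floating-point tolerance rather than exactly
all.equal(R.sq,cor.hat^2)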
# Q: what if these equalities only hold for this one data set? What would you do to reassure yourself?
# One option is to use simulation and check whether the equality holds in every simulated instance
# Simulate N.sim instances
N.sim=10000;
beta=c(2,0.4);n=50;x=runif(n,min=-2,max=7.5);sigma=2; # x is drawn once and held fixed across instances
out=matrix(0,nrow=N.sim,ncol=2)
for(i in 1:N.sim){
  Ey=x*beta[2]+beta[1];
  error.terms=sigma*rnorm(n);
  y=Ey+error.terms;
  beta.hat=fit.linear.model(covariate=x,outcome=y)
  # Copying code from the previous chunk:
  residual.sum.of.squares=sum.of.squares(beta=beta.hat,covariate=x,outcome=y)
  explained.sum.of.squares=sum((linear.model(beta=beta.hat,covariate=x)-mean(y))^2)
  total.sum.of.squares=sum((y-mean(y))^2)
  R.sq=explained.sum.of.squares/total.sum.of.squares;
  cor.hat=cor(x,y)
  out[i,]=c(R.sq,cor.hat)
}
plot(y=out[,1],x=out[,2],xlab='Correlation',ylab='Coef. of determination')
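# A numerical check across all simulated instances: the largest discrepancy
# between R^2 and the squared correlation should be at floating-point level
max(abs(out[,1]-out[,2]^2))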
# It's better to wrap it up into one function:
N.sim=10000;
beta=c(2,0.4);n=50;x=runif(n,min=-2,max=7.5);sigma=2;
simulate.one.instance<-function(x,beta,sigma){
  n=length(x)
  Ey=x*beta[2]+beta[1];
  error.terms=sigma*rnorm(n);
  y=Ey+error.terms;
  beta.hat=fit.linear.model(covariate=x,outcome=y)
  # Copying code from the previous chunk:
  residual.sum.of.squares=sum.of.squares(beta=beta.hat,covariate=x,outcome=y)
  explained.sum.of.squares=sum((linear.model(beta=beta.hat,covariate=x)-mean(y))^2)
  total.sum.of.squares=sum((y-mean(y))^2)
  R.sq=explained.sum.of.squares/total.sum.of.squares;
  cor.hat=cor(x,y)
  out=c(R.sq,cor.hat)
  return(out)
}
set.seed(1)
sim=replicate(N.sim,simulate.one.instance(x,beta,sigma));
plot(y=sim[1,],x=sim[2,],xlab='Correlation',ylab='Coef. of determination')
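# The same numerical check for the wrapped-up version
max(abs(sim[1,]-sim[2,]^2))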
# However, simulation only tells us that the equality holds in the scenarios that we design
# To establish the equality for all possible scenarios, we still have to use a rigorous derivation
# You can derive the property yourself if you are good at algebra
# Or search online to read others' proofs and explanations
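# For reference, an outline of one standard derivation (simple linear regression):
# write Sxy = sum((x-xbar)*(y-ybar)), with Sxx and Syy defined analogously; then
# beta1.hat = Sxy/Sxx, and every fitted value satisfies yhat_i - ybar = beta1.hat*(x_i - xbar),
# so the explained sum of squares is beta1.hat^2*Sxx = Sxy^2/Sxx, and
# R^2 = (Sxy^2/Sxx)/Syy = (Sxy/sqrt(Sxx*Syy))^2 = cor(x,y)^2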
# Another estimation approach: maximum likelihood under normal errors
# Note: the function below returns the NEGATIVE log-likelihood, since optim() minimizes
log.likelihood<-function(beta.sigma,covariate,outcome){
  # beta.sigma: (beta0, beta1, sigma)
  yout=linear.model(beta=beta.sigma[-3],covariate=covariate);
  res=outcome-yout;
  lglklh=-sum(dnorm(x=res,mean=0,sd=beta.sigma[3],log=TRUE)) # log=TRUE is numerically stabler
  return(lglklh)
}
# Again, use the generic optimizer in R (now over beta0, beta1, and sigma)
fit.lglklh.optim=optim(c(0,0,1),log.likelihood,covariate=dat.advertising$TV,outcome=dat.advertising$sales)
fit.lglklh.optim # maximum likelihood estimates of (beta0, beta1, sigma)
fit.optim        # least-squares fit, for comparison
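# Under normal errors, the MLE of beta coincides with the least-squares estimate,
# while the MLE of sigma divides the residual sum of squares by n (not n-2).
# A quick numerical check (rss.advertising is a new name used only here):
rss.advertising=sum.of.squares(beta=fit.advertising$coef,covariate=dat.advertising$TV,outcome=dat.advertising$sales)
sqrt(rss.advertising/nrow(dat.advertising)) # compare with fit.lglklh.optim$par[3]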