Chapter 3 Sampling distribution

3.1 Understanding sampling distribution via simulation

Reading materials: Slides 23 - 32 in STA108_LinearRegression_S20.pdf.

In this section, we assume a true data generating model in order to draw samples from the true population distribution. In particular, we assume that \[ y_i = x_i \beta_1 + \beta_0 + \epsilon_i, \ i=1,\ldots, 50\] where \(\epsilon_i \sim {\mathcal{U}}(-2.5, 2.5)\). We set \(\beta_1=0.15\) and \(\beta_0=20\).
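
The code in this chapter calls two helper functions, fit.linear.model() and sum.of.squares(), which were written in an earlier chapter. In case you need to recreate them, a minimal sketch is below; it assumes fit.linear.model() returns a 2-by-1 matrix with the intercept first, which matches the beta.sim[1,1,] indexing used later in this section.

# Sketch of the least-squares helper from the earlier chapter
fit.linear.model<-function(covariate,outcome){
  x=as.numeric(covariate); y=as.numeric(outcome);
  slope=sum((x-mean(x))*(y-mean(y)))/sum((x-mean(x))^2);
  intercept=mean(y)-slope*mean(x);
  return(matrix(c(intercept,slope),ncol=1)) # intercept first, then slope
}

# Sketch of the residual-sum-of-squares helper
sum.of.squares<-function(beta,covariate,outcome){
  residuals=as.numeric(outcome)-(as.numeric(covariate)*beta[2]+beta[1]);
  return(sum(residuals^2))
}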

### We will simulate synthetic data to understand the concept of sampling distribution

### To match our setup, we will generate the covariates and keep them fixed in the remainder of the simulation. 
set.seed(1)
n=50;
x=matrix(rnorm(n,mean=10,sd=2),ncol=1); # n-by-1 matrix of covariates
beta.true=c(20,0.15)
Ey= x*beta.true[2]+beta.true[1];

error.terms= (runif(n)-0.5)*5;
y=Ey+error.terms;
beta.hat=fit.linear.model(covariate=x,outcome=y);


plot(x=x,y=y,pch=16,cex=2)
abline(b=beta.hat[2],a=beta.hat[1],col='red',lwd=2)
### We can run the simulation by putting the above lines into a large for-loop
### We will wrap them up into one function

simulate.one.instance<-function(x,beta.true){
  n=length(x);
  Ey= x*beta.true[2]+beta.true[1];
  error.terms= (runif(n)-0.5)*5;
  y=Ey+error.terms;
  beta.hat=fit.linear.model(covariate=x,outcome=y);
  return(beta.hat)
}

### Set the number of replicates to be 10000
N.sim=1e4;
set.seed(1)
beta.sim=replicate(N.sim,simulate.one.instance(x,beta.true));


# Draw the histogram for the estimated intercept 
hist(beta.sim[1,1,],xlab='Fitted intercept',main='')
abline(v=beta.true[1],lwd=3,col='red')
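
The slope has a sampling distribution too; the same plot for beta.sim[2,1,], as a quick companion to the intercept histogram:

# Draw the histogram for the estimated slope
hist(beta.sim[2,1,],xlab='Fitted slope',main='')
abline(v=beta.true[2],lwd=3,col='red')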

Recall from the lecture notes that the variance of \(\hat{\beta}_1\) is \[{\rm var}\big(\hat{\beta}_1\big)=\frac{1}{\sum_{i=1}^n (x_i-\bar{x})^2} \sigma^2.\] Similarly, the variance of the intercept is \[{\rm var}\big(\hat{\beta}_0\big)=\frac{\sum_{i=1}^n x_i^2}{n\sum_{i=1}^n (x_i-\bar{x})^2} \sigma^2,\] which is the theoretical value computed in the code below.

## Using the simulation results, we can verify the statement on Slide 26

### Expectation 
mean(beta.sim[1,1,])-beta.true[1] # intercept 
mean(beta.sim[2,1,])-beta.true[2] # slope 

### Variance 
sigma.sq = 5^2/12; # variance of the error: (runif(n)-0.5)*5;

# slope:
sigma.sq/sum(  (x-mean(x))^2  ) # theoretical value 
var(beta.sim[2,1,]) # variance from the simulation 


# intercept:
sigma.sq*sum(x^2)/sum(  (x-mean(x))^2  )/n # theoretical value 
var(beta.sim[1,1,]) # variance from the simulation 

Q: How would you verify the Gauss-Markov theorem in this simulation? Recall that the theorem states that the least squares estimator (LSE) has the smallest variance among all linear unbiased estimators.

# A very simple simulation to compare the LSE with another linear unbiased estimator,
# the two-point estimator that connects the first and the last observations:
simulate.one.instance<-function(x,beta.true){
  n=length(x);
  Ey= x*beta.true[2]+beta.true[1];
  error.terms= (runif(n)-0.5)*5;
  y=Ey+error.terms;
  beta.hat=fit.linear.model(covariate=x,outcome=y);
  slope.hat=beta.hat[2];
  slope.dot= (y[n]-y[1])/(x[n]-x[1])
  return(c(slope.hat,slope.dot))
}

### Set the number of replicates to be 10000
N.sim=1e4;
set.seed(1)
slope.sim=replicate(N.sim,simulate.one.instance(x,beta.true));

apply(slope.sim,1,mean) # both are unbiased
apply(slope.sim,1,var) # variance of LSE is smaller
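
The variance gap can also be seen by overlaying the two simulated sampling distributions; a quick sketch using the slope.sim matrix from above:

d.lse=density(slope.sim[1,]); d.dot=density(slope.sim[2,]);
plot(d.dot,col='blue',lwd=3,main='',xlab='Estimated slope',ylim=c(0,max(d.lse$y,d.dot$y)))
lines(d.lse,col='red',lwd=3) # the LSE is visibly more concentrated
abline(v=beta.true[2],lty=2) # true slope
legend('topright',legend=c('LSE','Two-point estimator'),lwd=3,col=c('red','blue'))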
## Estimate the variance of errors using the residuals 
set.seed(1)
error.terms= (runif(n)-0.5)*5;
y=Ey+error.terms;
beta.hat=fit.linear.model(covariate=x,outcome=y);

residual.sum.of.squares=sum.of.squares(beta=beta.hat,covariate=x,outcome=y)

sigma.sq.hat=residual.sum.of.squares/(n-2) # estimate from one instance

## Run a simulation to verify the claim that sigma.sq.hat is an unbiased estimator of the error variance

## NOTE: we will use the same function name
simulate.one.instance<-function(x,beta.true){
  n=length(x);
  Ey= x*beta.true[2]+beta.true[1];
  error.terms= (runif(n)-0.5)*5;
  y=Ey+error.terms;
  beta.hat=fit.linear.model(covariate=x,outcome=y);
  residual.sum.of.squares=sum.of.squares(beta=beta.hat,covariate=x,outcome=y)
  sigma.sq.hat=residual.sum.of.squares/(n-2) # estimate from one instance
  return(sigma.sq.hat)
}

### Set the number of replicates to be 10000
N.sim=1e4;
set.seed(1)
sigma.sq.hat.sim=replicate(N.sim,simulate.one.instance(x,beta.true));

mean(sigma.sq.hat.sim)
sigma.sq
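
A histogram of the replicates shows the same agreement graphically; a quick sketch:

hist(sigma.sq.hat.sim,xlab='Estimated error variance',main='')
abline(v=sigma.sq,lwd=3,col='red') # true error variance, 5^2/12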
## To wrap this up
## For any new fit, we can estimate the variance and standard errors of the estimators 
set.seed(1)
error.terms= (runif(n)-0.5)*5;
y=Ey+error.terms;
beta.hat=fit.linear.model(covariate=x,outcome=y);

estimate.sigma.sq<-function(beta,covariate,outcome){
   residual.sum.of.squares=sum.of.squares(beta=beta,covariate=covariate,outcome=outcome)
   n=length(outcome)
   sigma.sq.hat=residual.sum.of.squares/(n-2) 
   return(sigma.sq.hat)
}

estimate.coef.var<-function(beta,covariate,outcome){
  sigma.sq.hat=estimate.sigma.sq(beta,covariate,outcome)
  var.hat.beta=beta; # initialize with the same shape as beta
  var.hat.beta[2]=sigma.sq.hat/sum(  (covariate-mean(covariate))^2  ) 
  n=length(outcome)
  var.hat.beta[1]=sigma.sq.hat*sum(covariate^2)/sum((covariate-mean(covariate))^2  )/n
  return( var.hat.beta)
}

estimate.coef.sd<-function(beta,covariate,outcome){
  var.hat.beta=estimate.coef.var(beta,covariate,outcome)
  sd.hat.beta=sqrt(var.hat.beta);
  return(sd.hat.beta)
}
estimate.coef.sd(beta=beta.hat,covariate=x,outcome=y)

## Compare your numbers with the output from lm()
summary(lm(y~x+1))
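
If you only want the standard errors from lm(), they sit in the 'Std. Error' column of the coefficient table returned by summary():

# Should match estimate.coef.sd() up to floating-point error
summary(lm(y~x))$coefficients[,'Std. Error']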

3.2 Shapes of sampling distributions

Reading materials: Slide 33 in STA108_LinearRegression_S20.pdf.

# There can be multiple distributions with the same mean and variance. Consider the three distributions from the lecture notes, each with mean 0 and variance 2.

xgrid = seq(from = -5, to = 5, by=0.01);
norm_density=dnorm(xgrid,mean=0,sd=sqrt(2));
unif_density=dunif(xgrid,min=-sqrt(6),max=sqrt(6));

xbin=c(-sqrt(2),sqrt(2));
ybin=c(0.5,0.5);


plot(norm_density~xgrid,type="l",col='red',lwd=3,ylim=c(0,0.52),xlab="z",ylab="density")
lines(unif_density~xgrid,col='black',lwd=3)

segments(xbin[1],0,xbin[1],ybin[1],col='green',lwd=10)
segments(xbin[2],0,xbin[2],ybin[2],col='green',lwd=10)
legend(x=2.5,y=0.5,legend=c("Uniform","Normal","Binary"),lwd=2,col=c('black','red','green'))

3.2.1 Asymptotic distribution

Reading materials: Slides 34 - 48 in STA108_LinearRegression_S20.pdf.

The Lindeberg-Feller (L-F) central limit theorem implies that the asymptotic distribution of \(\frac{\hat{\beta}_1 - \beta_1}{\hat{\sigma} / S_{xx}^{1/2} }\) is \(\mathcal{N}(0,1)\), where \(S_{xx} = \sum_{i=1}^n (x_i-\bar{x})^2\).

## Edit the simulation code 
### This time with large n (n=500)
### Three different versions of errors
### Unequal variances  
simulate.one.instance<-function(x,beta.true,error.type){
  n=length(x);
  Ey= x*beta.true[2]+beta.true[1];
  vars=0.1+runif(n)*2; # unequal variances across observations
  if(error.type=='uniform'){
    error.terms= (runif(n)-0.5)*sqrt(vars);
  }else if(error.type=='Bernoulli'){
    error.terms=(rbinom(n,size=1,prob=0.5)-0.5)*sqrt(vars);
  }else{
    error.terms= rnorm(n)*sqrt(vars);
  }
  y=Ey+error.terms;
  beta.hat=fit.linear.model(covariate=x,outcome=y);
  residual.sum.of.squares=sum.of.squares(beta=beta.hat,covariate=x,outcome=y)
  sigma.sq.hat=residual.sum.of.squares/(n-2) # estimate from one instance
  beta.hat.std =(beta.hat[2]-beta.true[2])/sqrt(sigma.sq.hat)*sqrt( sum( (x-mean(x))^2 ) ); # standardized slope
  return(beta.hat.std)
}

### Set the number of replicates to be 10000
N.sim=1e4;
set.seed(1)

n=500;
x=matrix(rnorm(n,mean=10,sd=2),ncol=1); # n-by-1 matrix of covariates
beta.hat.std.uniform=replicate(N.sim,simulate.one.instance(x,beta.true,error.type='uniform'));
beta.hat.std.Bernoulli=replicate(N.sim,simulate.one.instance(x,beta.true,error.type='Bernoulli'));
beta.hat.std.normal=replicate(N.sim,simulate.one.instance(x,beta.true,error.type='normal'));
# Draw the density plot

density.uniform=density( beta.hat.std.uniform);
density.Bernoulli=density( beta.hat.std.Bernoulli);
density.normal=density( beta.hat.std.normal);

# Theoretical density:
xgrid = seq(from=-4,to=4,length.out=100);
normal.pdf= dnorm(xgrid)

plot(density.uniform,xlab="Standardized slope",col='red',lwd=3,ylim=c(0,0.5),main='',lty=2)
lines(density.Bernoulli,col='green',lwd=3,lty=2)
lines(density.normal,col='blue',lwd=3,lty=2)
lines(normal.pdf~xgrid,lwd=2,lty=3)
legend(x=2,y=0.46,legend=c("Uniform errors","Bernoulli errors","Normal errors", "N(0,1)"),lwd=3,col=c('red','green','blue','black'),lty=c(2,2,2,3))
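
Overlaid densities can hide tail differences; a normal Q-Q plot is a sharper check. A sketch for the uniform-error case (the other two can be checked the same way):

qqnorm(beta.hat.std.uniform,pch=16,cex=0.5,main='')
abline(a=0,b=1,col='red',lwd=2) # points should follow this line under N(0,1)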
## The L-F CLT allows us to write down the asymptotic distribution in closed form
## The bootstrap method is an alternative when an analytic solution is not available 
## Note: the bootstrap is not that useful for linear regression, but it has proven to be a powerful tool for other methods. 

set.seed(1)
# Generate one set of data: 
n=500;
x=matrix(rnorm(n,mean=10,sd=2),ncol=1); # n-by-1 matrix of covariates
Ey= x*beta.true[2]+beta.true[1];
vars=0.1+runif(n)*2; # unequal variance 
error.terms= (runif(n)-0.5)*sqrt(vars);
y=Ey+error.terms;
beta.hat=fit.linear.model(covariate=x,outcome=y);


# Now write a function for bootstrap 
boot.one.instance<-function(covariate,outcome){
  n=length(outcome);
  sample_indices = sample(1:n,n,replace=TRUE) # sampling with replacement
  covariate.boot= covariate[sample_indices]; outcome.boot= outcome[sample_indices];
   
  beta.hat=fit.linear.model(covariate=covariate.boot,outcome=outcome.boot);
  return(beta.hat[2]) 
}

set.seed(1)

B=1e5;
beta.hat.boot=replicate(B,boot.one.instance(covariate=x,outcome=y));
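
Before plotting, a quick sanity check: the bootstrap standard error of the slope should be in the same ballpark as the analytic standard error from estimate.coef.sd() above (only roughly, since the errors here have unequal variances):

sd(beta.hat.boot) # bootstrap standard error of the slope
estimate.coef.sd(beta=beta.hat,covariate=x,outcome=y)[2] # analytic standard error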
## Compare the bootstrap distribution of the standardized slope against that from the CLT 


density.boot=density((beta.hat.boot-mean(beta.hat.boot))/sd(beta.hat.boot));
# Theoretical density (from CLT)
xgrid = seq(from=-4,to=4,length.out=100);
normal.pdf= dnorm(xgrid)

plot(density.boot,xlab="Standardized slope",col='red',lwd=3,ylim=c(0,0.5),main='',lty=2)
lines(normal.pdf~xgrid,lwd=2,lty=3)
legend(x=2,y=0.46,legend=c("Bootstrap", "N(0,1)"),lwd=3,col=c('red','black'),lty=c(2,3))
## Compare the raw bootstrap distribution against a normal distribution matched to its first two moments


density.boot=density(beta.hat.boot);
# Theoretical density (from CLT)
xgrid = seq(from=min(beta.hat.boot),to=max(beta.hat.boot),length.out=100);
normal.pdf= dnorm(xgrid,mean=mean(beta.hat.boot),sd=sd(beta.hat.boot))

plot(density.boot,xlab="Estimated slope",col='red',lwd=3,main='',lty=2)
lines(normal.pdf~xgrid,lwd=2,lty=3)
legend(x=0.15,y=50,legend=c("Empirical", "Moment-based"),lwd=3,col=c('red','black'),lty=c(2,3))

3.3 Small sample size

Reading materials: Slides 49 - 57 in STA108_LinearRegression_S20.pdf.

When the sample size is small, the asymptotic distribution can be far from the true sampling distribution. Neither the central limit theorem nor the bootstrap provides a good approximation of the true sampling distribution.

# Simulation with a small sample size
# We will demonstrate the case with 10 samples
# You will find the code in this chunk almost identical to the previous code

# Generate one set of data:
set.seed(1)
n=10;
x=matrix(rnorm(n,mean=10,sd=2),ncol=1); # n-by-1 matrix of covariates
beta.true=c(20,0.15)
Ey= x*beta.true[2]+beta.true[1];

error.terms= rnorm(n)*5; # Normal errors!
y=Ey+error.terms;
beta.hat=fit.linear.model(covariate=x,outcome=y);
plot(x=x,y=y,pch=16,cex=2)
abline(b=beta.hat[2],a=beta.hat[1],col='red',lwd=2)
## Simulate the true sampling distribution:
simulate.one.instance<-function(x,beta.true){
  n=length(x);
  Ey= x*beta.true[2]+beta.true[1];
  error.terms= rnorm(n)*5; # Normal errors, matching the data generated above
  y=Ey+error.terms;
  beta.hat=fit.linear.model(covariate=x,outcome=y);
  return(beta.hat)
}

### Set the number of replicates to be 10000
N.sim=1e4;
true.beta.hat=replicate(N.sim,simulate.one.instance(x,beta.true));

### Estimate the error variance from this one sample (used for a CLT-based standard error):
residual.sum.of.squares=sum.of.squares(beta=beta.hat,covariate=x,outcome=y)
sigma.sq.hat=residual.sum.of.squares/(n-2) # estimate from one instance

  
### Obtain the bootstrap distribution 
B=1e5;
boot.beta.hat=replicate(B,boot.one.instance(covariate=x,outcome=y));
density.true=density( (true.beta.hat[2,1,]-mean(true.beta.hat[2,1,]))/sd(true.beta.hat[2,1,]) );

density.boot=density((boot.beta.hat-mean(boot.beta.hat))/sd(boot.beta.hat));
# Theoretical density (from CLT)
xgrid = seq(from=-4,to=4,length.out=100);
normal.pdf= dnorm(xgrid)


plot(density.true,xlab="Standardized slope",lwd=3,ylim=c(0,0.5),xlim=c(-4,4),main='',lty=3)
lines(density.boot,col='green',lwd=3,lty=2)
lines(normal.pdf~xgrid,col='red',lwd=2,lty=2)
legend(x=2,y=0.5,legend=c("True","Bootstrap","CLT"),lwd=3,col=c('black','green','red'),lty=c(3,2,2))
## With very few samples, we have to exploit the parametric assumption
## In this case, we take advantage of the normality assumption on the errors


t.pdf= dt(xgrid,df=n-2)

plot(density.true,xlab="Standardized slope",lwd=3,ylim=c(0,0.5),xlim=c(-4,4),main='',lty=3)
lines(density.boot,col='green',lwd=3,lty=2)
lines(normal.pdf~xgrid,col='red',lwd=2,lty=2)

lines(t.pdf~xgrid,col='blue',lwd=2,lty=2)
legend(x=2,y=0.5,legend=c("True","Bootstrap","CLT",'Student t'),lwd=3,col=c('black','green','red','blue'),lty=c(3,2,2,2))
## The Student t distribution grows more similar to the standard normal as its degrees of freedom increase 

t_dist=dt(xgrid,df=1);
plot(t_dist~xgrid,xlab="z",col=rgb(1,0,0,0.1),lwd=3,type="l",ylim=c(0,0.5),xlim=c(-5,5),main='')

for(i in 2:10){
t_dist=dt(xgrid,df=i);

lines(t_dist~xgrid,col=rgb(1,0,0,i/10),lwd=3)

}

legend(x=2,y=0.46,legend=c("Student t", "N(0,1)"),lwd=3,col=c('red','green'))

lines(normal.pdf~xgrid,col=rgb(0,1,0,0.4),lwd=5)