#
# Written by:
# --
# John L. Weatherwax 2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
#-----

set.seed(0)

# Ex. 1:
#
pnorm( 3, mean=3.1, sd=sqrt(0.3) )

# Ex. 4:
#
X = seq(1,15,length.out=30)
X2 = X^2
cor(X,X2)

m_x_on_x2 = lm( X ~ X2 )
sm_x_on_x2 = summary(m_x_on_x2)
RX = sm_x_on_x2$r.squared # R^2 for X ~ X2
VIF_X = 1/(1-RX)

# The VIF is symmetric in the two-variable case ... here we show this numerically:
m_x2_on_x = lm( X2 ~ X )
sm_x2_on_x = summary(m_x2_on_x)
RX2 = sm_x2_on_x$r.squared # R^2 for X2 ~ X
VIF_X2 = 1/(1-RX2)

# Centering X before squaring greatly reduces the collinearity:
V = X - mean(V <- X - mean(X)) # placeholder
V = X - mean(X)
V2 = V^2
cor(V,V2)

m_v_on_v2 = lm( V ~ V2 )
sm_v_on_v2 = summary(m_v_on_v2)
RV = sm_v_on_v2$r.squared # this is already an R^2 value ...
VIF_V = 1/(1-RV)          # ... so it must not be squared again in the VIF

print( sprintf("cor(X,X^2)= %5.3f", cor(X,X2) ) )
print( sprintf("cor(V,V^2)= %5.3f", cor(V,V2) ) )
print( sprintf("VIF for X and X^2= %5.3f", VIF_X) )
print( sprintf("VIF for (X-barX) and (X-barX)^2= %5.3f", VIF_V) )

# Ex. 6:
#
n = 66
M = 5 # the largest number of predictors considered
total_SS = 48
residual_error_SS = c(12.2,10.1,10.0)
ps = c( 3, 4, 5 ) # the number of predictors in each candidate model

R2 = 1 - residual_error_SS / total_SS
Adjusted_R2 = 1 - ( residual_error_SS / (n-ps-1) ) / ( total_SS / (n-1) )

# estimate sigma^2 using the largest (M predictor) model:
sigma_epsilon_M2 = residual_error_SS[3] / ( n - 1 - M )
Cp = residual_error_SS / sigma_epsilon_M2 - n + 2 * ( ps + 1 )

print( rbind( R2, Adjusted_R2, Cp ) )

# Ex. 9:
#
# the number of data points and the degrees of freedom:
total_df = 15
n = total_df + 1
regression_df = 2
error_df = total_df - regression_df

# the F value whose upper-tail probability is 0.04:
f_value = qf( 1 - 0.04, regression_df, error_df )

error_ss = 5.66
error_ms = error_ss / error_df
sigma2 = error_ms

# Since F = MS / sigma^2 we can solve for MS to get MS = F * sigma^2:
#
regression_ms = sigma2 * f_value
regression_ss = regression_ms * regression_df

total_ss = regression_ss + error_ss

R2 = regression_ss / total_ss

# Verify the above calculations on some other ANOVA tables:
#
set.seed(0)

# Two variable prediction:
#
X1 = rnorm(15)
X2 = rnorm(15)
Y = 1*X1 - 2*X2 + rnorm(15)
m = lm( Y ~ X1 + X2 )
anova(m)

F_X1_value = qf( 1 - 0.0006981, 1, 12 ) # take the first Pr(>F) value from the table
F_X2_value = qf( 1 - 0.0005173, 1, 12 ) # take the second Pr(>F) value from the table

# Simple linear regression:
#
m = lm( Y ~ X1 )
anova(m)

F_X1_value = qf( 1 - 0.01519, 1, 13 )
s2 = 2.0608 # the estimate of the error variance (the Residuals Mean Sq entry)
Mean_Sq_X1 = F_X1_value * s2

# Ex. 10:
#
# Generate some data and see how well we can estimate its parameters:
#
set.seed(0)

npts = 50000
x = runif( npts, -3, +3 )
e = 10 * rt( npts, 6 ) # scaled t errors with 6 degrees of freedom
y = 4 + 2 * x + e # samples from the true regression function 4 + 2 x

# estimate this model using least squares:
#
lmfit = lm( y ~ x )
summary(lmfit)

# estimate this model using an error distribution given by a t-distribution:
#
library(fGarch) # for dstd, the standardized t density

start = c(lmfit$coef, sd(lmfit$resid), 4) # initial values: LS coefficients, residual sd, and nu=4
loglik = function(theta){ # the negative log-likelihood
  -sum(log(dstd(y, mean=theta[1]+theta[2]*x, sd=theta[3], nu=theta[4])))
}
mle = optim(start, loglik, hessian=TRUE)

# the inverse of the observed Fisher information, i.e. the estimated covariance of the MLEs:
FishInfo = solve(mle$hessian)

mle$par
mle$value
mle$convergence

sqrt(diag(FishInfo)) # approximate standard errors of the MLEs
qnorm(0.975)         # the z multiplier for 95% confidence intervals
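
# Extra check of the Ex. 4 VIFs (a sketch, not part of the original exercises):
# the VIFs of a set of predictors are the diagonal entries of the inverse of
# their correlation matrix; with two predictors both entries equal 1/(1-r^2),
# so the two values printed here should match VIF_X and VIF_X2 above.
X_chk = seq(1, 15, length.out=30) # rebuilt locally, since X2 was reassigned in Ex. 9
R_mat = cor( cbind(X_chk, X_chk^2) )
print( diag( solve(R_mat) ) )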
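
# Extra check of the Ex. 6 formulas on simulated data (a sketch; the names
# n_chk, p_chk, etc. are illustrative only): confirm that 1 - SSE/SST and its
# adjusted version reproduce what summary(lm) reports.
set.seed(1)
n_chk = 66; p_chk = 3
X_mat = matrix( rnorm(n_chk*p_chk), n_chk, p_chk )
y_chk = X_mat %*% c(1, -1, 0.5) + rnorm(n_chk)
fit_chk = lm( y_chk ~ X_mat )
sse = sum( resid(fit_chk)^2 )
sst = sum( ( y_chk - mean(y_chk) )^2 )
print( c( 1 - sse/sst, summary(fit_chk)$r.squared ) )                                   # should agree
print( c( 1 - (sse/(n_chk-p_chk-1))/(sst/(n_chk-1)), summary(fit_chk)$adj.r.squared ) ) # should agree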
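
# Extra check of the relation used in Ex. 9 (a sketch): since
# F = (SS_reg/df1)/(SS_err/df2) and R^2 = SS_reg/(SS_reg + SS_err), we have
# R^2 = F*df1/(F*df1 + df2); verify this on the last fitted model m above.
fs = summary(m)$fstatistic # named vector: (value, numdf, dendf)
print( fs[[1]]*fs[[2]] / ( fs[[1]]*fs[[2]] + fs[[3]] ) ) # should equal ...
print( summary(m)$r.squared )                            # ... this value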
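
# Approximate 95% confidence intervals for the Ex. 10 MLEs (a sketch using the
# quantities computed above): estimate +/- qnorm(0.975) * standard error.
z = qnorm(0.975)
se = sqrt( diag(FishInfo) )
print( cbind( lower=mle$par - z*se, estimate=mle$par, upper=mle$par + z*se ) )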