#
# Written by:
# --
# John L. Weatherwax                2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 138
#
# Simulates a sparse linear model with p features, fits best subset
# selection on a training split, and plots training MSE, test MSE and the
# coefficient estimation error as a function of the subset size.
#
#-----

save_plots <- FALSE

# Install any missing package and then ALWAYS attach it.  A bare
# `if (!require(pkg)) install.packages(pkg)` installs the package but leaves
# it unattached, so the first run on a fresh machine fails later on.
for (pkg in c("ISLR", "leaps", "glmnet", "pls")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

set.seed(0)

# The sample size, the number of features and the training-set size:
#
n <- 1000
p <- 20
n_train <- 100

# Create the true value of beta (and zero out half of the entries):
#
beta_truth <- rnorm(p + 1) # add one for the constant beta_0
zero_locations <- c(2, 3, 4, 7, 8, 11, 12, 15, 17, 20)
beta_truth[zero_locations] <- 0

# For debugging lets check that we can recover our coefficients:
# beta_truth = rep(0,p+1); beta_truth[1] = 1.5; beta_truth[10] = 3.5; beta_truth[15] = -3.4

print("True values for beta (beta_0-beta_20):")
print(beta_truth)

# Generate some input features and an output response:
#
X <- c(rep(1, n), rnorm(n * p)) # make leading column of ones
X <- matrix(X, nrow = n, ncol = p + 1, byrow = FALSE)
Y <- X %*% beta_truth + rnorm(n)

# Create a dataframe with this data.  The column names come out as
# Y, X1, ..., Xp (data.frame() derives them from the argument expressions).
#
DF <- data.frame(Y, X[, -1]) # drop the column of ones

train_inds <- sample(seq_len(n), n_train)
test_inds <- seq_len(n)[-train_inds]

#--
# Apply best subset selection using the training data:
#--
regfit.full <- regsubsets(Y ~ ., data = DF[train_inds, ], nvmax = p)
# print( summary( regfit.full ) )
reg.summary <- summary(regfit.full)

# Mean squared error of the best `size`-variable model in `fit`, evaluated on
# the model matrix `design` against the observed `response`.
subset_mse <- function(size, fit, design, response) {
  coefi <- coef(fit, id = size)
  # drop = FALSE keeps a matrix even if only one column is selected.
  pred <- design[, names(coefi), drop = FALSE] %*% coefi
  mean((response - pred)^2)
}

# Plot the in-sample MSE:
#
training.mat <- model.matrix(Y ~ ., data = DF[train_inds, ])
training.errors <- vapply(
  seq_len(p), subset_mse, numeric(1),
  fit = regfit.full, design = training.mat, response = DF$Y[train_inds]
)
print("best subset training MSE")
print(training.errors)

if (save_plots) {
  postscript(
    "../../WriteUp/Graphics/Chapter6/prob_10_train_test_MSE.eps",
    onefile = FALSE, horizontal = FALSE
  )
}
plot(
  seq_len(p), training.errors,
  xlab = "number of predictors", ylab = "training MSE",
  type = "o", col = "red", ylim = c(0, 9)
)

# Test models on the validation set:
#
test.mat <- model.matrix(Y ~ ., data = DF[test_inds, ])
val.errors <- vapply(
  seq_len(p), subset_mse, numeric(1),
  fit = regfit.full, design = test.mat, response = DF$Y[test_inds]
)
print("best subset validation MSE")
print(val.errors)

k <- which.min(val.errors)
print(k)
print(coef(regfit.full, id = k))

# points() adds to the existing plot; axis labels are not valid arguments
# here (they only trigger "not a graphical parameter" warnings).
points(seq_len(p), val.errors, type = "o", col = "green")
grid()
legend(
  11, 9.25, c("Training MSE", "Testing MSE"),
  col = c("red", "green"), lty = c(1, 1)
)

if (save_plots) {
  dev.off()
}

# Part (g):
#
# Name the true coefficients like the fitted ones so they can be matched up.
nms <- colnames(DF)
nms[1] <- "(Intercept)"
names(beta_truth) <- nms

# NOTE(review): the norm below only covers the coefficients present in each
# fitted model (intercept included); true coefficients of variables left out
# of the model do not contribute.  Confirm this is the intended definition.
norm.beta.diff <- vapply(
  seq_len(p),
  function(size) {
    coefi <- coef(regfit.full, id = size)
    sqrt(sum((beta_truth[names(coefi)] - coefi)^2))
  },
  numeric(1)
)

if (save_plots) {
  postscript(
    "../../WriteUp/Graphics/Chapter6/prob_10_beta_norm_plot.eps",
    onefile = FALSE, horizontal = FALSE
  )
}
plot(
  seq_len(p), norm.beta.diff,
  xlab = "number of predictors", ylab = "||beta_truth - beta^r||",
  type = "o", col = "green"
)
grid()

if (save_plots) {
  dev.off()
}