#
# Written by:
# --
# John L. Weatherwax 2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 313
# EPage 63
#
#-----

save_plots = FALSE

# Load (and if necessary install) the required packages:
#
if( ! require("ISLR") ){ install.packages("ISLR"); library(ISLR) }
if( ! require("leaps") ){ install.packages("leaps"); library(leaps) }
if( ! require("glmnet") ){ install.packages("glmnet"); library(glmnet) }
if( ! require("gam") ){ install.packages("gam"); library(gam) }

set.seed(0)

# Part (a):
#
# Divide the dataset into three parts: training==1, validation==2, and test==3
#
dataset_part = sample( 1:3, nrow(College), replace=TRUE, prob=c(0.5,0.25,0.25) )

p = ncol(College) - 1 # the number of predictors (Outstate is the response)

# Fit subsets of various sizes using forward stepwise selection:
#
regfit.forward = regsubsets( Outstate ~ ., data=College[dataset_part==1,], nvmax=p, method="forward" )
print( summary( regfit.forward ) )
reg.summary = summary( regfit.forward )

# Test the trained models on the validation set:
#
validation.mat = model.matrix( Outstate ~ ., data=College[dataset_part==2,] )

val.errors = rep(NA,p)
for( ii in 1:p ){
  coefi = coef( regfit.forward, id=ii )          # coefficients of the best ii-variable model
  pred = validation.mat[,names(coefi)] %*% coefi # predictions on the validation set
  val.errors[ii] = mean( ( College$Outstate[dataset_part==2] - pred )^2 )
}
print( "forward selection validation errors" )
print( val.errors )

k = which.min( val.errors )
print( sprintf("smallest validation error for index= %d, with coefficients given by", k) )
print( coef( regfit.forward, id=k ) )

if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter7/prob_10_forward_summary_plots.eps", onefile=FALSE, horizontal=FALSE) }
plot( val.errors, xlab="Number of variables", ylab="Validation MSE", pch=19, type='b' )
abline( v=k, col='red' )
grid()
if( save_plots ){ dev.off() }

# Evaluate the best model found (size k) on the test set:
#
test.mat = model.matrix( Outstate ~ ., data=College[dataset_part==3,] )

coefi = coef( regfit.forward, id=k )
pred = test.mat[,names(coefi)] %*% coefi
test.error = mean( ( College$Outstate[dataset_part==3] - pred )^2 )
print( "test error on the optimal subset" )
print( test.error )

# For comparison, also evaluate the smaller k=3 variable subset on the test set:
#
k = 3
coefi = coef( regfit.forward, id=k )
pred = test.mat[,names(coefi)] %*% coefi
test.error = mean( ( College$Outstate[dataset_part==3] - pred )^2 )
print( "test error on the k=3 subset" )
print( test.error )

# Part (b):
#
# Combine the training and validation sets into one "training" dataset
# (the old test set, coded 3, becomes the new test set, coded 2):
#
dataset_part[dataset_part==2] = 1
dataset_part[dataset_part==3] = 2

# Fit a GAM with smoothing splines (df=4) for Expend and Room.Board and a linear term for Private:
#
gam.model = gam( Outstate ~ s(Expend,4) + s(Room.Board,4) + Private, data=College[dataset_part==1,] )

if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter7/prob_10_gam_plot.eps", onefile=FALSE, horizontal=FALSE) }
par(mfrow=c(1,3))
plot( gam.model, se=TRUE, col="blue" )
par(mfrow=c(1,1))
if( save_plots ){ dev.off() }

# Evaluate the GAM's performance on the test dataset:
#
y_hat = predict( gam.model, newdata=College[dataset_part==2,] )
MSE = mean( ( College[dataset_part==2,]$Outstate - y_hat )^2 )
print( "gam test set (MSE) error" )
print( MSE )
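
# Side note (not part of the original script): the model.matrix/%*% prediction
# code above is written out twice, once for the validation set and once for the
# test set.  A minimal sketch of a predict() method for regsubsets objects, in
# the spirit of the helper from the ISLR Chapter 6 lab, could wrap that logic;
# the function name and its use of object$call are assumptions for illustration.
#
# predict.regsubsets = function( object, newdata, id, ... ){
#   form = as.formula( object$call[[2]] )  # the formula used in the regsubsets() call
#   mat = model.matrix( form, newdata )    # design matrix for the new data
#   coefi = coef( object, id=id )          # coefficients of the best id-variable model
#   mat[, names(coefi)] %*% coefi          # predictions
# }
#
# With such a helper the validation loop body would reduce to, e.g.:
#   pred = predict( regfit.forward, College[dataset_part==2,], id=ii )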