# # Written by: # -- # John L. Weatherwax 2009-04-21 # # email: wax@alum.mit.edu # # Please send comments and especially bug reports to the # above email address. # # EPage 314 # #----- if( ! require("ISLR") ){ install.packages("ISLR") } if( ! require("boot") ){ install.packages("boot") } set.seed(0) # Plot the data to see what it looks like: # with( Wage, plot( age, wage ) ) # Part (a): # # Perform polynomial regression for various polynomial degrees: # cv.error = rep(0,10) for( i in 1:10 ){ # fit polynomial models of various degrees (based on EPage 208 in the book) glm.fit = glm( wage ~ poly(age,i), data=Wage ) cv.error[i] = cv.glm( Wage, glm.fit, K=10 )$delta[1] } #postscript("../../WriteUp/Graphics/Chapter7/prob_6_part_a_CV_plot.eps", onefile=FALSE, horizontal=FALSE) plot( 1:10, cv.error, pch=19, type='b', xlab='degree of polynomial', ylab='CV estimate of the prediction error' ) grid() #dev.off() # Using the minimal value for the CV error gives the value 10 which seems like too much polynomial i.e. too wiggly # From the plot 4 is the point where the curve stops decreasing and starts increasing so we will consider polynomials # of this degree. # me = which.min( cv.error ) me = 4 m = glm( wage ~ poly(age,me), data=Wage ) #postscript("../../WriteUp/Graphics/Chapter7/prob_6_part_a_data_N_model_plot.eps", onefile=FALSE, horizontal=FALSE) plot( Wage$age, Wage$wage ) aRng = range(Wage$age) a_predict = seq( from=aRng[1], to=aRng[2], length.out=100 ) w_predict = predict( m, newdata=list( age=a_predict ) ) lines( a_predict, w_predict, col='red' ) #dev.off() # Lets consider the ANOVA approach (i.e. a sequence of nested linear models): # m0 = lm( wage ~ 1, data=Wage ) m1 = lm( wage ~ poly(age,1), data=Wage ) m2 = lm( wage ~ poly(age,2), data=Wage ) m3 = lm( wage ~ poly(age,3), data=Wage ) m4 = lm( wage ~ poly(age,4), data=Wage ) m5 = lm( wage ~ poly(age,5), data=Wage ) anova(m0,m1,m2,m3,m4,m5) # Part (b): # # Lets do the same thing with the cut function for fitting a piecewise constant model: # # For some reason the command cv.glm does not work when we use the "cut" command. I think it was # how I formed the cut factors the first time i.e. not taking bins when some of the testing data # was outside of the training data. # # To debug this and understand what is going on we will do cross-validation by hand. # See EPage 265 in the book on some more information on how to do cross-validation in R. # number_of_bins = c( 2, 3, 4, 5, 10 ) nc = length(number_of_bins) k = 10 folds = sample( 1:k, nrow(Wage), replace=TRUE ) cv.errors = matrix( NA, k, nc ) # Prepare for the type of factors you might obtain (extend the age range a bit): # age_range = range( Wage$age ) age_range[1] = age_range[1]-1 age_range[2] = age_range[2]+1 for( ci in 1:nc ){ # for each number of cuts to test nob = number_of_bins[ci] # n(umber) o(f) c(uts) 2, 3, 4 ... for( fi in 1:k ){ # for each fold # In this ugly command we: # # break the "age" variable in the subset of data Wage[folds!=fi,] into "nob" bins that # span between the smallest and largest values of age observed over the entire dataset. # # This allows us to be able to use the function "predict" on age values not seen in # the training subset. # # If we try to "cut" the age variable into bins that are too small they may not contain any ages # in them. I'm not sure that lm/glm would be doing something reasonable in that case. Thus I only # do cross-validation on a smallish number of bins. # fit = glm( wage ~ cut( age, breaks=seq( from=age_range[1], to=age_range[2], length.out=(nob+1) ) ), data=Wage[folds!=fi,] ) y_hat = predict( fit, newdata=Wage[folds==fi,] ) cv.errors[fi,ci] = mean( ( Wage[folds==fi,]$wage - y_hat )^2 ) } } cv.errors.mean = apply(cv.errors,2,mean) cv.errors.stderr = apply(cv.errors,2,sd)/sqrt(k) min.cv.index = which.min( cv.errors.mean ) one_se_up_value = ( cv.errors.mean+cv.errors.stderr )[min.cv.index] # Set up the x-y limits for plotting: min_lim=min( one_se_up_value, cv.errors.mean, cv.errors.mean-cv.errors.stderr, cv.errors.mean+cv.errors.stderr ) * 0.9 max_lim=max( one_se_up_value, cv.errors.mean, cv.errors.mean-cv.errors.stderr, cv.errors.mean+cv.errors.stderr ) * 1.1 #postscript("../../WriteUp/Graphics/Chapter7/prob_6_part_b_CV_plot.eps", onefile=FALSE, horizontal=FALSE) plot( number_of_bins, cv.errors.mean, ylim=c(min_lim,max_lim), pch=19, type='b', xlab='number of cut bins', ylab='CV estimate of the prediction error' ) lines( number_of_bins, cv.errors.mean-cv.errors.stderr, lty='dashed' ) lines( number_of_bins, cv.errors.mean+cv.errors.stderr, lty='dashed' ) abline( h=one_se_up_value, col='red' ) grid() #dev.off() # Fit the optimal model using all data: # nob = 3 fit = glm( wage ~ cut( age, breaks=seq( from=age_range[1], to=age_range[2], length.out=(nob+1) ) ), data=Wage ) #postscript("../../WriteUp/Graphics/Chapter7/prob_6_part_b_data_N_model_plot.eps", onefile=FALSE, horizontal=FALSE) plot( Wage$age, Wage$wage ) aRng = range(Wage$age) a_predict = seq( from=aRng[1], to=aRng[2], length.out=100 ) w_predict = predict( fit, newdata=list( age=a_predict ) ) lines( a_predict, w_predict, col='red', lw=4 ) #dev.off()