# # Duplicate part of Figure 3.7 (the all possible subset models) from Chapter 3 of the book ESLII # # Written by: # -- # John L. Weatherwax 2009-04-21 # # email: wax@alum.mit.edu # # Please send comments and especially bug reports to the # above email address. # #----- source("load_prostate_data.R") source("cv_all_subsets.R") source("one_standard_error_rule.R") PD = load_prostate_data(globalScale=FALSE,trainingScale=FALSE,responseScale=FALSE) # read in unscaled data XTraining = PD[[1]] XTesting = PD[[2]] p = dim(XTraining)[2]-1 # the last column is the response nSamples = dim(XTraining)[1] # Do best-subset cross validation: # for( k in 0:p ){ ##print(k) res = cv_all_subsets(k,XTraining,numberOfCV=10) if( k==0 ){ complexityParam = k cvResults = res[[1]] bestPredictorFormula = res[[3]] }else{ complexityParam = cbind(complexityParam,k) cvResults = cbind( cvResults, res[[1]] ) ## each column is a different value of the complexity parameter ... ## each row is a point sample of (y - y_hat)^2 where y_hat is estimated using the ## data in the fold that y is not in. bestPredictorFormula = c( bestPredictorFormula, res[[3]] ) } } # Group all cvResults by k (the subset size) and compute statistics: # OSE = one_standard_error_rule( complexityParam, cvResults ) complexVValue = OSE[[1]] complexIndex = OSE[[2]] means = OSE[[4]] stds = OSE[[5]] oseHValue = OSE[[3]] library(gplots) # plotCI, plotmeans found here ##postscript("../../WriteUp/Graphics/Chapter3/dup_fig_3_7_all_subsets.eps", onefile=FALSE, horizontal=FALSE) plotCI( x=0:p, y=means, uiw=0.5*stds, xlim=c(0,8), ylim=c(0.3,1.8), col="black", barcol="blue", lwd=1, type="l", xlab="subset size (p)", ylab="expected squared prediction error (ESPE)" ) abline( h=oseHValue, lty=2 ) abline( v=complexVValue, lty=2 ) grid() ##dev.off() # Pick the best model, retrain over the entire data set, and predict on the testing data set: # bestModelFormula = bestPredictorFormula[ complexIndex ] DCV = XTraining # # lpsa = DCV[,p+1] # get the response and delete it from the data frame DCV$lpsa = NULL # Standardize the predictors and demean the response # # Note that one can get the centering value (the mean) with the command attr(DCV,'scaled:center') # and the scaling value (the sample standard deviation) with the command attr(DCV,'scaled:scale') # DCV = scale( DCV ) lpsaMean = mean( lpsa ) lpsa = lpsa - lpsaMean DCVb = cbind( DCV, lpsa ) # append back on the response DCVf = data.frame( DCVb ) # a data frame containing all scaled variables of interest # extract the centering and scaling information: # means = attr(DCV,"scaled:center") stds = attr(DCV,"scaled:scale") # apply the computed scaling based on the training data to the testing data: # DCVTest = XTesting lpsaTest = DCVTest[,p+1] # in physical units (not mean adjusted) NTest = length(lpsaTest) DCVTest$lpsa = NULL DCVTest = t( apply( DCVTest, 1, '-', means ) ) DCVTest = t( apply( DCVTest, 1, '/', stds ) ) DCVTestb = cbind( DCVTest, lpsaTest - lpsaMean ) # append back on the response DCVTestf = data.frame( DCVTestb ) # a data frame containing all scaled variables of interest names(DCVTestf)[p+1] = "lpsa" # fix the name of the response # fit this linear model and compute the expected prediction error (EPE) using these features (this just estimates the mean in this case): # mk = lm( formula = bestModelFormula, data=DCVf ) # print the coefficients of this model: # print( lpsaMean, digits=4 ) # add back in the mean print( coef( mk ), digits=4 ) pdt = predict( mk, newdata=DCVTestf, interval="prediction" )[,1] # get predictions on the test set pdt = pdt + lpsaMean # add back in the mean. Now in physical units (not mean adjusted) mErr = mean( (lpsaTest - pdt)^2 ) print( mErr ) sErr = sqrt( var( (lpsaTest - pdt)^2 )/NTest ) print( sErr )