#
# Duplicate Figure 3.5 (all possible subset models) from the book.
#
X = read.table("../../Data/prostate.data")

XTraining = subset( X, train )
XTesting  = subset( X, train==FALSE )

D = XTraining[,1:8]  # get the predictor data
n = dim( D )[1]      # number of training samples (avoid masking the base function nrow)
p = dim( D )[2]      # number of predictors

# Standardize the predictors.
#
# Note that one can get the centering values (the means) with the command attr(D,'scaled:center')
# and the scaling values (the sample standard deviations) with the command attr(D,'scaled:scale').
#
D = scale( D )

lpsa = XTraining[,9]
Df   = data.frame( cbind( D, lpsa ) )  # a data frame containing all variables of interest

# Do the k=0 (intercept-only, no features) subset first:
#
mk  = lm( lpsa ~ 1, data=Df )
rss = sum( mk$residuals^2 )
xPlot = c(0); yPlot = c(rss)

# Do all remaining k>0 subsets next:
#
for( k in 1:p ){  # k=0 was done outside of this loop
  allPosSubsetsSizeK = combn( p, k )  # each column is one possible subset of size k
  numOfSubsets       = dim( allPosSubsetsSizeK )[2]
  for( si in 1:numOfSubsets ){
    featIndices = allPosSubsetsSizeK[,si]
    featNames   = names( Df )[featIndices]

    # Construct the formula needed for this linear regression, e.g. "lpsa ~ lcavol+lweight":
    #
    form = paste( "lpsa ~", paste( featNames, collapse="+" ) )

    # Fit this linear model and compute the RSS using these features:
    #
    mk  = lm( as.formula(form), data=Df )
    rss = sum( mk$residuals^2 )
    xPlot = c( xPlot, k )
    yPlot = c( yPlot, rss )
  }
}

##postscript("../../WriteUp/Graphics/Chapter3/fig_3_5_dup.eps", onefile=FALSE, horizontal=FALSE)
plot( xPlot, yPlot, xlab="Subset Size k", ylab="Residual Sum-of-Squares", ylim=c(0,100), xlim=c(0,8) )

# Trace the lower envelope, i.e. the best (smallest-RSS) model of each subset size:
#
xMinPlot = xPlot[1]; yMinPlot = yPlot[1]
for( ki in 1:p ){
  inds = xPlot==ki
  rmin = min( yPlot[inds] )
  xMinPlot = c( xMinPlot, ki ); yMinPlot = c( yMinPlot, rmin )
}
lines( xMinPlot, yMinPlot )
##dev.off()
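
# Optional cross-check (a sketch, not part of the original figure): the "leaps"
# package implements exhaustive best-subset search directly, and summary()$rss
# returns the residual sum-of-squares of the best model of each size.  This
# assumes the leaps package is installed; if it is, the red points below should
# land on the minima traced by lines() above.
#
if( requireNamespace( "leaps", quietly=TRUE ) ){
  bestSubsets = leaps::regsubsets( lpsa ~ ., data=Df, nvmax=p, method="exhaustive" )
  points( 1:p, summary( bestSubsets )$rss, pch=19, col="red" )
}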