# # Runs ridge regression on the SPAM data set # # Written by: # -- # John L. Weatherwax 2009-04-21 # # email: wax@alum.mit.edu # # Please send comments and especially bug reports to the # above email address. # #----- source("cv_Ridge.R") source("load_spam_data.R") source("one_standard_error_rule.R") source("df_ridge.R") source("opt_lambda_ridge.R") PD = load_spam_data(trainingScale=FALSE,responseScale=FALSE) # read in unscaled data XTraining = PD[[1]] XTesting = PD[[2]] p = dim(XTraining)[2]-1 # the last column is the response nSamples = dim(XTraining)[1] # Do ridge-regression cross validation (large lambda values => small degrees of freedom): # allLambdas = opt_lambda_ridge( XTraining[,1:p], multiple=2 ) numberOfLambdas = length( allLambdas ) for( li in 1:numberOfLambdas ){ res = cv_Ridge(allLambdas[li],XTraining,numberOfCV=10) # To get a *single* value for the degrees of freedom use the call on the entire training data set # We could also take the mean/median of this computed from the cross validation samples # dof = df_ridge(allLambdas[li],as.matrix(XTraining[,1:p])) if( li==1 ){ complexityParam = dof cvResults = res$cvraw }else{ complexityParam = cbind( complexityParam, dof ) # will be sorted from largest value of dof to smallest valued dof cvResults = cbind( cvResults, res$cvraw ) # each column is a different value of the complexity parameter ... # each row is a different cross validation sample ... } } # flip the order of the results: inds = rev( 1:numberOfLambdas ) allLambdas = allLambdas[ inds ] complexityParam = complexityParam[ inds ] cvResults = cvResults[ , inds ] # group all cvResults by lambda (the complexity parameter) and compute statistics: # OSE = one_standard_error_rule( complexityParam, cvResults ) complexVValue = OSE[[1]] complexIndex = OSE[[2]] oseHValue = OSE[[3]] means = OSE[[4]] # as a function of model complexity stds = OSE[[5]] # as a function of model complexity library(gplots) # plotCI, plotmeans found here, xlim=c(0,8), ylim=c(0.3,1.8) savePlot = TRUE if( savePlot ) postscript("../../WriteUp/Graphics/Chapter3/spam_ridge_regression.eps", onefile=FALSE, horizontal=FALSE) plotCI( x=complexityParam, y=means, uiw=0.5*stds, col="black", barcol="blue", lwd=1, type="l", xlab="degrees of freedom", ylab="expected squared prediction error (ESPE)" ) abline( h=oseHValue, lty=2 ) abline( v=complexVValue, lty=2 ) if( savePlot ) dev.off() # pick the best model, retrain over the entire data set, and predict on the testing data set: # bestLambda = allLambdas[ complexIndex ] DCV = XTraining # # responseName = names(DCV)[p+1] # get the response and delete it from the data frame response = DCV[,p+1] DCV[[responseName]] = NULL # Standardize the predictors and demean the response # # Note that one can get the centering value (the mean) with the command attr(DCV,'scaled:center') # and the scaling value (the sample standard deviation) with the command attr(DCV,'scaled:scale') # DCV = scale( DCV ) responseMean = mean( response ) response = response - responseMean DCVb = cbind( DCV, response ) # append back on the response DCVf = data.frame( DCVb ) # a data frame containing all scaled variables of interest # extract the centering and scaling information: # means = attr(DCV,"scaled:center") stds = attr(DCV,"scaled:scale") # apply the computed scaling based on the training data to the testing data: # DCVTest = XTesting responseTest = DCVTest[,p+1] - responseMean # in demeaned units (not mean adjusted) DCVTest[[responseName]] = NULL DCVTest = t( apply( DCVTest, 1, '-', means ) ) DCVTest = t( apply( DCVTest, 1, '/', stds ) ) DCVTestb = cbind( DCVTest, responseTest ) # append back on the response DCVTestf = data.frame( DCVTestb ) # a data frame containing all scaled variables of interest names(DCVTestf)[p+1] = responseName # fix the name of the response # fit this linear model and compute the expected prediction error (EPE) using these features (this just estimates the mean in this case): # DM = as.matrix(DCV) M = ginv( t(DM) %*% DM + bestLambda*diag(p) ) %*% t(DM) betaHat = M %*% as.matrix(DCVf[,p+1]) print( responseMean, digits=4 ) print( betaHat, digits=3 ) DTest = as.matrix(DCVTest) pdt = DTest %*% betaHat # get predictions on the test set #pdt = pdt + responseMean # add back in the mean. Now in physical units (not mean adjusted) mErr = mean( (responseTest - pdt)^2 ) print( mErr ) NTest = length(responseTest) sErr = sqrt( var( (responseTest - pdt)^2 )/NTest ) print( sErr )