cv_Ridge <- function( lambda, D, numberOfCV ){
  #
  # Does cross validation of a ridge regression linear model fit to the data frame D
  #
  # Input:
  #   lambda     = multiplier of the penalty term
  #   D          = training data frame where the last column is the response variable
  #   numberOfCV = number of cross validations to do
  #
  # Output:
  #   a list with components
  #     cvraw = squared prediction error for each held-out sample
  #     cvm   = mean squared prediction error over all cross-validation folds
  #     cvsd  = standard error of the mean squared prediction error
  #     name  = "Mean Squared Error"
  #
  # Written by:
  # --
  # John L. Weatherwax                2009-04-21
  #
  # email: wax@alum.mit.edu
  #
  # Please send comments and especially bug reports to the
  # above email address.
  #
  #-----

  library(MASS) # needed for ginv

  nSamples = dim( D )[1]
  p        = dim( D )[2] - 1 # the last column is the response

  # needed for the cross validation (CV) loops:
  #
  nCVTest  = round( nSamples*(1/numberOfCV) ) # each CV run will have this many test points
  nCVTrain = nSamples - nCVTest               # each CV run will have this many training points

  for (cvi in 1:numberOfCV) {

    testInds = (cvi-1)*nCVTest + 1:nCVTest       # this may attempt to access sample indexes > nSamples
    testInds = intersect( testInds, 1:nSamples ) # so we restrict this list here
    DCVTest  = D[testInds,1:(p+1)]               # get the predictor + response testing data

    # select this cross validation's section of data from all the training data:
    #
    trainInds = setdiff( 1:nSamples, testInds )
    DCV       = D[trainInds,1:(p+1)] # get the predictor + response training data

    # Get the response and delete it from the data frame:
    #
    responseName = names(DCV)[p+1] # get the actual string name from the data frame
    response     = DCV[,p+1]
    DCV[[responseName]] = NULL

    # For ridge regression we begin by standardizing the predictors and demeaning the response.
    #
    # Note that one can get the centering values (the means) with the command attr(DCV,'scaled:center')
    # and the scaling values (the sample standard deviations) with the command attr(DCV,'scaled:scale')
    #
    DCV          = scale( DCV )
    responseMean = mean( response )
    response     = response - responseMean
    DCVb         = cbind( DCV, response ) # append back on the response
    DCVf         = data.frame( DCVb )     # a data frame containing all scaled variables of interest

    # extract the centering and scaling information:
    #
    means = attr(DCV,"scaled:center")
    stds  = attr(DCV,"scaled:scale")

    # apply the scaling computed on the training data to the testing data:
    #
    responseTest = DCVTest[,p+1] - responseMean # mean adjust
    DCVTest[[responseName]] = NULL
    DCVTest  = t( apply( DCVTest, 1, '-', means ) )
    DCVTest  = t( apply( DCVTest, 1, '/', stds ) )
    DCVTestb = cbind( DCVTest, responseTest )   # append back on the response
    DCVTestf = data.frame( DCVTestb )           # a data frame containing all scaled variables of interest
    names(DCVTestf)[p+1] = responseName         # fix the name of the response

    # fit the ridge regression model on the scaled training data via the closed-form
    # solution betaHat = (X^T X + lambda I)^{-1} X^T y and predict on the test set:
    #
    DM      = as.matrix(DCV)
    M       = ginv( t(DM) %*% DM + lambda*diag(p) ) %*% t(DM)
    betaHat = M %*% as.matrix(DCVf[,p+1])

    DTest = as.matrix(DCVTest)
    pdt   = DTest %*% betaHat  # get demeaned predictions on the test set
    #pdt  = pdt + responseMean # add back in the mean if we want the response in physical units (not mean adjusted)

    if( cvi==1 ){
      predmat = pdt
      y       = responseTest
    }else{
      predmat = c( predmat, pdt )
      y       = c( y, responseTest )
    }

  } # endfor cvi loop

  # this code is modeled after the code in "glmnet/cvelnet.R":
  #
  N     = length(y)
  cvraw = ( y - predmat )^2
  cvm   = mean(cvraw)
  cvsd  = sqrt( var(cvraw)/N )

  l = list( cvraw=cvraw, cvm=cvm, cvsd=cvsd, name="Mean Squared Error" )
  return(l)
}
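
# A minimal usage sketch (not part of the original routine): it assumes a data frame
# whose last column is the response, here built from synthetic data, and sweeps a small
# grid of lambda values to compare their cross-validated mean squared errors.  The names
# below (XY, lambdas, cv_results, etc.) are illustrative only.
#
if( FALSE ){
  set.seed(0)
  n = 100; p = 5
  X = matrix( rnorm(n*p), n, p )
  beta_true = c( 2, -1, 0, 0, 1 )
  y = as.numeric( X %*% beta_true + rnorm(n) )
  XY = data.frame( X, y=y ) # last column is the response, as cv_Ridge expects

  lambdas    = c( 0.01, 0.1, 1, 10 )
  cv_results = sapply( lambdas, function(l){ cv_Ridge( l, XY, numberOfCV=5 )$cvm } )
  print( cbind( lambda=lambdas, cvm=cv_results ) )
}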