cv_PCR_N_PLSR <- function( D, numberOfCV=10, choice=1 ){ # # Does Cross Validation of the Principal Component Regression (PCR) or # Partial Least Squares Linear Models depending on the value of choice # # Note that CV can be done implicitly in the pls package but I was unable to # find out how to extract the standard errors from these results # # Input: # D = training data frame where the last column is the response variable # numberOfCV = number of cross validations to do # # Output: # MSPE = matrix of size [ numberOfCV \times ncomp ] of mean square prediction errors # extracted from each of the numberOfCV test data sets for each of the ncomp # # Written by: # -- # John L. Weatherwax 2009-04-21 # # email: wax@alum.mit.edu # # Please send comments and especially bug reports to the # above email address. # #----- library(pls) # needed for pcr and plsr nSamples = dim( D )[1] p = dim( D )[2] - 1 # the last column is the response responseName = names(D)[p+1] # the the name of the response # needed for the cross vaildation (CV) loops: # nCVTest = round( nSamples*(1/numberOfCV) ) # each CV run will have this many test points nCVTrain = nSamples-nCVTest # each CV run will have this many training points for (cvi in 1:numberOfCV) { testInds = (cvi-1)*nCVTest + 1:nCVTest # this may attempt to access sample indexes > nSamples testInds = intersect( testInds, 1:nSamples ) # so we restrict this list here nCVTest = length(testInds) DCVTest = D[testInds,1:(p+1)] # get the predictor + response testing data # select this cross validation's section of data from all the training data: # trainInds = setdiff( 1:nSamples, testInds ) DCV = D[trainInds,1:(p+1)] # get the predictor + response training data response = DCV[,p+1] # get the response and delete it from the data frame DCV[[responseName]] = NULL # We begin by standardizing the predictors and demeaning the response # # Note that one can get the centering value (the mean) with the command attr(DCV,'scaled:center') # and the scaling value (the sample standard deviation) with the command attr(DCV,'scaled:scale') # DCV = scale( DCV ) responseMean = mean( response ) response = response - responseMean DCVb = cbind( DCV, response ) # append back on the response DCVf = data.frame( DCVb ) # a data frame containing all scaled variables of interest names(DCVf)[p+1] = responseName # fix the name of the response # extract the centering and scaling information: # means = attr(DCV,"scaled:center") stds = attr(DCV,"scaled:scale") # apply the computed scaling based on the training data to the testing data: # responseTest = DCVTest[,p+1] - responseMean # mean adjusted DCVTest[[responseName]] = NULL DCVTest = t( apply( DCVTest, 1, '-', means ) ) DCVTest = t( apply( DCVTest, 1, '/', stds ) ) DCVTestb = cbind( DCVTest, responseTest ) # append back on the response DCVTestf = data.frame( DCVTestb ) # a data frame containing all scaled variables of interest names(DCVTestf)[p+1] = responseName # fix the name of the response ncomp=p # fit a model over all components #ncomp=4 # construct a formula needed for the calls to pcr and plsr: # form = paste( responseName, " ~ " ) for ( ki in 1:p ){ if( ki==1 ){ form = paste( form, names(D)[ki], sep=" " ) }else{ form = paste( form, names(D)[ki], sep="+" ) } } form = as.formula( form ) # convert to a formula object # Fit a model and compute the expected prediction error (EPE) for all remaining modes k=1:p # if( choice==1 ){ # principal component regression model m0 = pcr( form, ncomp=ncomp, data=DCVf, validation="none" ) }else{ # partial least squares regression model m0 = plsr( form, ncomp=ncomp, data=DCVf, validation="none" ) } pdt = predict( m0, newdata=DCVTestf ) pdt = pdt[,1,] # create a matrix of these results of size [ nCVTestSamples, ncomp ] pdt = cbind( mat.or.vec(nCVTest,1), pdt ) # append the demeaned prediction (zero vector) #pdt = pdt + responseMean # add back in the mean if we want the response in physical units (not mean adjusted) # create a matrix from the testing response of size [ nCVTestSamples, ncomp+1 ] responseTestAsMatrix = matrix( data=rep( responseTest, ncomp+1 ), ncol=ncomp+1 ) if( cvi==1 ){ predmat = pdt y = responseTestAsMatrix }else{ predmat = rbind( predmat, pdt ) y = rbind( y, responseTestAsMatrix ) } } # endfor cvi loop # this code is modeled after the code in "glmnet/cvelnet.R" # N = dim(y)[1] cvraw = ( y - predmat )^2 cvm = apply(cvraw,2,mean) cvsd = sqrt( apply(cvraw,2,var)/N ) l = list( cvraw=cvraw, cvm=cvm, cvsd=cvsd, name="Mean Squared Error" ) return( l ) }