repmat <- function(a, n, m){
  #
  # Duplicates Matlab's "repmat" functionality:
  #
  kronecker( matrix(1,n,m), a )
}
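# A quick illustration (added for exposition; not part of the original code):
# repmat tiles its matrix argument n times vertically and m times horizontally,
# just like Matlab's repmat.  For example,
#
#   repmat( matrix(1:2,nrow=1,ncol=2), 2, 3 )
#
# returns the 2 x 6 matrix
#
#   1 2 1 2 1 2
#   1 2 1 2 1 2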
pcr_wwx <- function( DTrain, DTest ){
  #
  # Implements "Principal Component Regression" using the algorithm discussed on
  # page 79 (epage 97) of the book:
  #
  #   Elements of Statistical Learning by
  #   Hastie, Tibshirani, and Friedman.
  #
  # Note that generally we must specify an index M that determines how close the PCR
  # estimates will be to the ordinary least squares estimates.  This routine produces
  # estimated responses (\hat{y}) for {\em all} values of M.
  #
  # In addition, the function pcr in the R package "pls" will also do principal
  # component regression and is probably more suitable for general use.
  #
  # Input:
  #   DTrain = training data frame where the last column is the response variable,
  #            of dimension [ N, p+1 ]
  #   DTest  = testing data frame on which we will return all possible predictions
  #
  # Output:
  #   res[[1]] = beta_hat estimated coefficients for a linear model, of dimension [ p, p ].
  #              The first column corresponds to M=1 (using only one principal direction),
  #              the second column corresponds to M=2 (using two principal directions),
  #              ...,
  #              the pth column corresponds to M=p (using all principal directions = the OLS estimate of beta).
  #              Betas of all zeros correspond to predictions with the mean vector \bar{y}.
  #
  #   res[[2]] = y_hat estimated responses for each of the p+1 models on the training data,
  #              of dimension [ N, p+1 ].
  #              The first column corresponds to M=0 (using a constant predictor),
  #              the second column corresponds to M=1 (using a single principal direction),
  #              ...,
  #              the (p+1)th column corresponds to using all principal directions.
  #
  #   res[[3]] = y_hat columns representing the predictions made by all models on the TESTING data
  #
  #   res[[4]] = the mean response of the training data (needed for prediction using these models)
  #
  #   res[[5]] = the mean value of each predictor from the training data
  #
  #   res[[6]] = the standard deviation of each predictor from the training data
  #
  # Note 1:
  #   A cross validation procedure should be used to select the ultimate approximation degree
  #   (which would then be fixed) and used in an environment where one does not have access
  #   to the true response.
  #
  # Note 2:
  #   One might want to have the ability to pass in an empty DTest for when one just wants
  #   to train and has no testing to be done.
  #
  # Written by:
  # --
  # John L. Weatherwax                2010-03-02
  #
  # email: wax@alum.mit.edu
  #
  # Please send comments and especially bug reports to the
  # above email address.
  #
  #-----

  nTrain = dim( DTrain )[1]
  p = dim( DTrain )[2] - 1 # the last column is the response
  responseName = names( DTrain )[p+1] # the name of the response

  nTest = dim( DTest )[1]

  # Get the response and delete it from the data frame:
  response = DTrain[,p+1]
  DTrain[[responseName]] = NULL

  # We begin by standardizing the predictors and demeaning the response.
  #
  # Note that one can get the centering value (the mean) with the command attr(DTrain,'scaled:center')
  # and the scaling value (the sample standard deviation) with the command attr(DTrain,'scaled:scale')
  #
  DTrain = scale( DTrain )

  responseMean = mean( response )
  response = response - responseMean

  # Extract the centering and scaling information:
  #
  means = attr(DTrain,"scaled:center")
  stds = attr(DTrain,"scaled:scale")

  # Apply the scaling computed from the training data to the testing data:
  #
  responseTest = DTest[,p+1] - responseMean # mean adjusted (not used further below)
  DTest[[responseName]] = NULL
  DTest = t( apply( DTest, 1, '-', means ) )
  DTest = t( apply( DTest, 1, '/', stds ) )

  # Fit a model and compute predictions for all numbers of retained principal directions M=1:p.
  #
  # Compute the product X^T X:
  XTX = t(DTrain) %*% DTrain

  # Compute the eigen-decomposition of X^T X:
  sm = eigen( XTX )
  V = sm$vectors # the columns of V are the vectors v_m
  d_m2 = sm$values # these are the values of d_m (squared)

  # Compute the vectors z_m = X v_m:
  Z = DTrain %*% V # the columns of Z are the vectors z_m

  # Compute the regression coefficients theta_m = <z_m,y> / <z_m,z_m>:
  theta_m = t(response) %*% Z / d_m2

  # Compute the various PCR predictions (for M=0:p):
  bigTheta = repmat( theta_m, nTrain, 1 )
  thetaZ = bigTheta * Z # the pointwise product of theta_m and z_m
  thetaZ = cbind( repmat(responseMean,nTrain,1), thetaZ ) # if you want the prediction to have the mean included
  pTrain = t( apply( thetaZ, 1, cumsum ) ) # the training set predictions

  # Compute the values of beta_hat:
  bigTheta = repmat( theta_m, p, 1 )
  thetaV = bigTheta * V
  betaHat = t( apply( thetaV, 1, cumsum ) ) # the estimated beta values

  # Compute the predictions on the testing data, a matrix of size [ nTest, p ]:
  pTest = as.matrix( DTest[,1:p] ) %*% betaHat
  pTest = cbind( mat.or.vec(nTest,1), pTest ) # prepend the demeaned M=0 prediction (a zero vector)
  pTest = pTest + responseMean # add back in the mean if we want the response in physical units (not mean adjusted)

  res = list(betaHat,pTrain,pTest,responseMean,means,stds)
  return(res)
}
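# A minimal usage sketch (added for exposition; the synthetic data and the
# variable names below are invented and not part of the original code).  We
# build training and testing data frames whose last column is the response,
# call pcr_wwx, and read off the predictions that use M principal directions.
# Wrapped in if(FALSE) so that sourcing this file does not run it.
#
if( FALSE ){
  set.seed(0)
  N = 100; p = 3
  X = matrix( rnorm(N*p), N, p )
  y = 2*X[,1] - X[,2] + 0.1*rnorm(N)
  D = data.frame( X, y=y )
  DTrain = D[1:80,]
  DTest = D[81:N,]

  res = pcr_wwx( DTrain, DTest )

  M = 2 # the number of principal directions to keep
  yhatTrain = res[[2]][,M+1] # column M+1 holds the M direction predictions
  yhatTest = res[[3]][,M+1]
}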
regression_coefficients_given_pcr = function(y_train, pcr_res, M=1){
  #
  # Returns the coefficients of the model:
  #
  #   Y = beta_0 + \sum_{i=1}^p \beta_i X_i
  #
  # and
  #
  #   Y = beta_{c0} + \sum_{i=1}^p \beta_{ci} X_{ci}
  #
  # when doing PCR and keeping M principal components.
  #
  # The first is the model we would use with the original variables X_i
  # and the second is the model we would use with centered variables X_{ci}
  # (the same variables as X_i but after subtracting their means).
  #
  # The PCA regression gives the coefficients beta_pc_i in the model
  #
  #   Y = mu_y + \sum_i beta_pc_i ( ( X_i - mu_X_i ) / sigma_X_i )
  #
  # If we let c_i equal
  #
  #   c_i = sigma_y beta_pc_i / sigma_X_i
  #
  # then the coefficients we want are given by the models
  #
  #   Y = ( mu_y - \sum_i c_i mu_X_i ) + \sum_i c_i X_i
  #
  # and
  #
  #   Y = mu_y + \sum_i ( beta_pc_i / sigma_X_i ) ( X_i - mu_X_i )
  #
  stopifnot((M >= 1) & (M <= length(pcr_res[[6]]))) # the columns of pcr_res[[1]] correspond to M=1:p

  y_mean = mean(y_train)
  y_std = 1 # the standard deviation of the response is not scaled out when using pcr_wwx

  x_means = pcr_res[[5]]
  x_stds = pcr_res[[6]]

  beta_pc_z = pcr_res[[1]][, M] # the coefficients of the standardized predictors
  c_i = y_std * ( beta_pc_z / x_stds ) # the coefficients of the centered predictors

  coeffs = c( y_mean - sum( c_i * x_means ), c_i )
  centered_coeffs = c( y_mean, c_i )

  R = rbind( coeffs, centered_coeffs )
  return(R)
}
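# A minimal check of the back-transformation above (added for exposition; the
# synthetic data below is invented and not part of the original code).  The
# first row of the returned matrix, applied to the raw (unstandardized)
# predictors, should reproduce the training predictions in res[[2]][,M+1].
# Wrapped in if(FALSE) so that sourcing this file does not run it.
#
if( FALSE ){
  set.seed(1)
  N = 50
  X = matrix( rnorm(N*2), N, 2 )
  y = X[,1] + 3*X[,2] + 0.1*rnorm(N)
  D = data.frame( X, y=y )

  res = pcr_wwx( D, D ) # train and "test" on the same data for this check

  M = 2
  R = regression_coefficients_given_pcr( y, res, M )

  yhat = R[1,1] + as.matrix( D[,1:2] ) %*% R[1,-1] # intercept plus raw variable coefficients
  print( max( abs( yhat - res[[2]][,M+1] ) ) ) # should be near zero
}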