# # Here we duplicate some of the computations on the body fat example in the book using R tools. # This verifies that we understand everything. # library(car) source('../../Data/data_loaders.R') DF = load_appendix_bodyfat_data() m = 3 # number of predictors N = dim(DF)[1] # number of samples print(summary(DF)) CXY = cor(DF) # Duplicates the numbers in Table 5.1 print(CXY) #pairs(DF) scatterplotMatrix(DF) # Form the standardized predictors: # DFs = scale(DF) xy_means = attr(DFs, 'scaled:center') xy_stds = attr(DFs, 'scaled:scale') DFs = data.frame(DFs) full_lm = lm(FAT ~ ., data=DFs) print(summary(full_lm)) vif(full_lm) # duplicates the numbers on Page 168 print(lm(SKIN ~ THIGH + ARM, data=DF)) # duplicates the regressions on Page 168 print(lm(THIGH ~ SKIN + ARM, data=DF)) print(lm(ARM ~ SKIN + THIGH, data=DF)) ES = eigen(CXY[ 1:m, 1:m ]) print('Eigenvalues') print(round(ES$values, m)) # matches top of page 171 ES$vectors[, 1] = -ES$vectors[, 1] # change the sign of all components of the first eigenvector print('Eigenvectors') print(round(ES$vectors, 3)) # matches the matrix P on the top of page 171 # Compute the principal components (and a scatter plot of them): # Z = as.matrix(DFs[ , 1:m ]) %*% ES$vectors colnames(Z) = c('Z1', 'ZZ', 'Z3') pairs(Z, xlim=c(-1, +1), ylim=c(-1, +1)) # duplicates Figure 5.3 on page 172 # Drop the eigenvector with the sma1.est eigenvalue and form PC regression: # #source('../../../Hastie/Code/Chapter3/pcr_wwx.R') source('http://www.waxworksmath.com/Authors/G_M/Hastie/Code/Chapter3/pcr_wwx.R') res = pcr_wwx(DF, DF) y_mean = res[[4]] y_std = sd(DF$FAT) x_means = res[[5]] x_stds = res[[6]] beta_pc_z = print(res[[1]][, m-1]) # the coefficients of the standardized predictors (dropping the eigenvector with the smallest eigenvalue) beta_pc_c = beta_pc_z / x_stds # the coefficients of the centered predictors c_names = paste(names(beta_pc_c), 'C', sep='') # centered names i.e. X1C pt_1 = sprintf('PC model: FAT_PC = %+.3f', y_mean) pt_2 = paste(sprintf('%+.3f %s', beta_pc_c, c_names), sep='', collapse=' ') print(sprintf('%s %s', pt_1, pt_2)) # duplicates the coefficients given on page 183 # Using a routine to compute the coefficients we have: # print(sprintf('Coefficients of a linear model with 1 principle component:')) R = regression_coefficients_given_pcr(DF$FAT, res, M=1) print(R[1, ]) print(sprintf('Coefficients (centered) of a linear model with 2 principle components:')) R = regression_coefficients_given_pcr(DF$FAT, res, M=2) print(R[2, ]) # the row with the label 'centered_coeffs' in R can be used to duplicate the above PC model: FAT_PC = result print(sprintf('Coefficients of a linear model with 3=m (all variables) principle components:')) R = regression_coefficients_given_pcr(DF$FAT, res, M=m) print(R[1, ]) # matches the coefficients in the call lm(FAT ~ ., data=DF) # The predictions with this model: # y_hat = res[[2]][, m-1] RMS = sqrt( sum( (DF$FAT - y_hat)^2 )/(N-m-1) ) SS_Total = sum( ( DF$FAT - y_mean )^2 ) SS_Regression = sum( ( y_hat - y_mean )^2 ) SS_Residual = sum( ( DF$FAT - y_hat )^2 ) R2 = 1 - SS_Residual / SS_Total print(sprintf('PC model: RMS: %.3f; R2 = %.3f', RMS, R2)) # the RMS numbers are different than in the book (not sure why) # Ridge Regression: # XTX = CXY[1:m, 1:m] XTy = CXY[1:m, m+1] for( cv in c(0.05, 0.1) ){ beta_ridge_z = solve(XTX + cv*diag(m), XTy) # the coefficients of the standardized predictors (duplicates the coefficients on page 186 which is strange) beta_ridge_c = beta_ridge_z / x_stds # the coefficients of the centered predictors beta_ridge_c = beta_ridge_c * y_std # put the multiple of y_std in place pt_1 = sprintf('Ridge model (c=%.2f): FAT_R = %+.3f', cv, y_mean) c_names = paste(names(beta_ridge_c), 'C', sep='') # centered names i.e. X1C pt_2 = paste(sprintf('%+.3f %s', beta_ridge_c, c_names), sep='', collapse=' ') print(sprintf('%s %s', pt_1, pt_2)) # The predictions with this model: # y_hat = ( as.matrix(DFs[, 1:m]) %*% as.vector(beta_ridge_z) ) * xy_stds[m+1] + y_mean RMS = sqrt( sum( (DF$FAT - y_hat)^2 )/(N-m-1) ) # the RMS numbers are different than in the book (not sure why) SS_Total = sum( ( DF$FAT - y_mean )^2 ) SS_Regression = sum( ( y_hat - y_mean )^2 ) SS_Residual = sum( ( DF$FAT - y_hat )^2 ) R2 = 1 - SS_Residual / SS_Total print(sprintf('Ridge model (c=%.2f): RMS= %.3f; R2 = %.3f', cv, RMS, R2)) }