#
# Here we duplicate some of the computations on the body fat example in the book using R tools.
# This verifies that we understand everything.
#
library(car)

source('../../Data/data_loaders.R')
DF = load_appendix_bodyfat_data()
m = 3 # number of predictors
N = dim(DF)[1] # number of samples

print(summary(DF))

CXY = cor(DF) # Duplicates the numbers in Table 5.1
print(CXY)

#pairs(DF)
scatterplotMatrix(DF)

# Form the standardized predictors:
#
DFs = scale(DF)
xy_means = attr(DFs, 'scaled:center')
xy_stds = attr(DFs, 'scaled:scale')
DFs = data.frame(DFs)

full_lm = lm(FAT ~ ., data=DFs)
print(summary(full_lm))

vif(full_lm) # duplicates the numbers on Page 168

print(lm(SKIN ~ THIGH + ARM, data=DF)) # duplicates the regressions on Page 168
print(lm(THIGH ~ SKIN + ARM, data=DF))
print(lm(ARM ~ SKIN + THIGH, data=DF))


ES = eigen(CXY[ 1:m, 1:m ])
print('Eigenvalues')
print(round(ES$values, m)) # matches top of page 171
ES$vectors[, 1] = -ES$vectors[, 1] # change the sign of all components of the first eigenvector
print('Eigenvectors')
print(round(ES$vectors, 3)) # matches the matrix P on the top of page 171


# Compute the principal components (and a scatter plot of them):
#
Z = as.matrix(DFs[ , 1:m ]) %*% ES$vectors
colnames(Z) = c('Z1', 'ZZ', 'Z3')
pairs(Z, xlim=c(-1, +1), ylim=c(-1, +1)) # duplicates Figure 5.3 on page 172

# Drop the eigenvector with the smallest eigenvalue and form PC regression:
#
#source('../../../Hastie/Code/Chapter3/pcr_wwx.R')
source('http://www.waxworksmath.com/Authors/G_M/Hastie/Code/Chapter3/pcr_wwx.R')
res = pcr_wwx(DF, DF)
y_mean = res[[4]]
y_std = sd(DF$FAT)
x_means = res[[5]]
x_stds = res[[6]]
beta_pc_z = print(res[[1]][, m-1]) # the coefficients of the standardized predictors (dropping the eigenvector with the smallest eigenvalue)
beta_pc_c = beta_pc_z / x_stds # the coefficients of the centered predictors
c_names = paste(names(beta_pc_c), 'C', sep='') # centered names i.e. x1c

pt_1 = sprintf('PC model: FAT_PC = %+.3f', y_mean)
pt_2 = paste(sprintf('%+.3f %s', beta_pc_c, c_names), sep='', collapse=' ')
print(sprintf('%s %s', pt_1, pt_2)) # duplicates the coefficients given on page 183

# Using a routine to compute the coefficients we have:
#
print(sprintf('Coefficients of a linear model with 1 principle component:'))
R = regression_coefficients_given_pcr(DF$FAT, res, M=1)
print(R[1, ])

print(sprintf('Coefficients (centered) of a linear model with 2 principle components:'))
R = regression_coefficients_given_pcr(DF$FAT, res, M=2)
print(R[2, ]) # the row with the label 'centered_coeffs' in R can be used to duplicate the above PC model: FAT_PC = result

print(sprintf('Coefficients of a linear model with 3=m (all variables) principle components:'))
R = regression_coefficients_given_pcr(DF$FAT, res, M=m)
print(R[1, ]) # matches the coefficients in the call lm(FAT ~ ., data=DF)


# The predictions with this model:
#
y_hat = res[[2]][, m-1]

RMS = sqrt( sum( (DF$FAT - y_hat)^2 )/(N-m-1) )
SS_Total = sum( ( DF$FAT - y_mean )^2 )
SS_Regression = sum( ( y_hat - y_mean )^2 )
SS_Residual = sum( ( DF$FAT - y_hat )^2 )
R2 = 1 - SS_Residual / SS_Total
print(sprintf('PC model: RMS: %.3f; R2 = %.3f', RMS, R2)) # the RMS numbers are different than in the book (not sure why)

# Ridge Regression:
#
XTX = CXY[1:m, 1:m]
XTy = CXY[1:m, m+1]

for( cv in c(0.05, 0.1) ){
    beta_ridge_z = solve(XTX + cv*diag(m), XTy) # the coefficients of the standardized predictors (duplicates the coefficients on page 186 which is strange)
    beta_ridge_c = beta_ridge_z / x_stds # the coefficients of the centered predictors
    beta_ridge_c = beta_ridge_c * y_std # put the multiple of y_std in place

    pt_1 = sprintf('Ridge model (c=%.2f): FAT_R = %+.3f', cv, y_mean)
    c_names = paste(names(beta_ridge_c), 'C', sep='') # centered names i.e. X1C
    pt_2 = paste(sprintf('%+.3f %s', beta_ridge_c, c_names), sep='', collapse=' ')
    print(sprintf('%s %s', pt_1, pt_2))

    # The predictions with this model:
    #
    y_hat = ( as.matrix(DFs[, 1:m]) %*% as.vector(beta_ridge_z) ) * xy_stds[m+1] + y_mean
    RMS = sqrt( sum( (DF$FAT - y_hat)^2 )/(N-m-1) ) # the RMS numbers are different than in the book (not sure why)
    SS_Total = sum( ( DF$FAT - y_mean )^2 )
    SS_Regression = sum( ( y_hat - y_mean )^2 )
    SS_Residual = sum( ( DF$FAT - y_hat )^2 )
    R2 = 1 - SS_Residual / SS_Total
    print(sprintf('Ridge model (c=%.2f): RMS= %.3f; R2 = %.3f', cv, RMS, R2))
}