#
# Duplicates Table 3.1 and 3.2 from the book
#
# Written by:
# -- 
# John L. Weatherwax                2009-04-21
# 
# email: wax@alum.mit.edu
# 
# Please send comments and especially bug reports to the
# above email address.
# 
#-----

## Note: to get numbers that match the book we first scale all the predictors AND then fit our model:
## 
source('load_prostate_data.R')
res = load_prostate_data(globalScale=TRUE, trainingScale=FALSE, responseScale=FALSE)
XTraining = res[[1]]
XTesting = res[[2]] 

nrow = dim( XTraining )[1]
p = dim( XTraining )[2] - 1 # the last column is the response 

D = XTraining[,1:p] # get the predictor data

# This gives the raw data for Table 3.1 from the book:
# 
print(round(cor(D), 3))

library(xtable)
xtable( cor(D), caption="Duplication of the values from Table 3.1 from the book", digits=3 )

#
# Duplicate Table 3.2 from the book 
#

# Append a column of ones:
# 
Dp = cbind( matrix(1,nrow,1), as.matrix( D ) )

lpsa = XTraining[,p+1]

library(MASS)

betaHat = ginv( t(Dp) %*% Dp ) %*% t(Dp) %*% as.matrix(lpsa)

print('Table 3.2: first column: beta estimates')
print(round(betaHat, 2))

# make predictions based on these estimated beta coefficients:
#
yhat = Dp %*% betaHat 

# estimate the variance:
# 
sigmaHat = sum( ( lpsa - yhat )^2 ) / ( nrow - p - 1 )

# calculate the covariance of betaHat:
#
covarBetaHat = sigmaHat * ginv( t(Dp) %*% Dp ) 

# calulate the standard deviations of betahat:
#
stdBetaHat = sqrt(diag(covarBetaHat))

print('Table 3.2: second column: beta standard errors')
print(round(as.matrix(stdBetaHat), 2))

# compute the z-scores:
#
z = betaHat / stdBetaHat 

print('Table 3.2: third column: beta z-scores')
print(round(z, 2))

# reproduce/verify the above above results using R's "lm" function:
# 
Db = cbind( D, lpsa ) 
Df = data.frame( Db )
m0 = lm( lpsa ~ lcavol+lweight+age+lbph+svi+lcp+gleason+pgg45, data=Df ) 
print('using the R lm function')
print( summary(m0), digits=2 )

# display the results we get : 
F = data.frame( Term=names(coefficients(m0)), Coefficients=betaHat, Std_Error=stdBetaHat, Z_Score=z )
xtable( F, caption="Duplicated results for the books Table 3.2", digits=2 ) 

# Run this full linear model on the Testing data so that we can verify the comments at the end of section 3.2.1: Example: Prostate Cancer
#
pdt  = predict( m0, newdata=XTesting, interval="prediction" )[,1] # get predictions on the test set
lpsaTest = XTesting$lpsa
NTest = length(lpsaTest)
mErr = mean( (lpsaTest - pdt)^2 )
sErr = sqrt( var( (lpsaTest - pdt)^2 )/NTest ) 
print(sprintf('test data mean prediction error= %.3f; stdErr(mean)= %.3f', mErr, sErr))

mConstant = mean(Df$lpsa) ## the prediction we would make assuming a constant model
mErrConstant = mean( (XTesting$lpsa - mConstant)^2 )
print(sprintf('test data mean prediction error= %.3f using the constant= %.3f', mErrConstant, mConstant))