#
# Duplicate part of Figure 3.7 (the ridge regression results) from Chapter 3 of the book ESLII
#
# Written by:
# -- 
# John L. Weatherwax                2009-04-21
# 
# email: wax@alum.mit.edu
# 
# Please send comments and especially bug reports to the
# above email address.
#
#-----

source("cv_Ridge.R")
source("load_prostate_data.R")
source("one_standard_error_rule.R")
source("df_ridge.R")
source("opt_lambda_ridge.R")

PD        = load_prostate_data(globalScale=FALSE,trainingScale=FALSE,responseScale=FALSE) # read in unscaled data 
XTraining = PD[[1]]
XTesting  = PD[[2]]

p         = dim(XTraining)[2]-1 # the last column is the response 
nSamples  = dim(XTraining)[1] 

# Do ridge-regression cross validation:
#
allLambdas      = opt_lambda_ridge(XTraining[,1:p],multiple=2)
numberOfLambdas = length( allLambdas ) 

for( li in 1:numberOfLambdas ){
  
  res = cv_Ridge(allLambdas[li],XTraining,numberOfCV=10)
  
  # To get a *single* value for the degrees of freedom use the call on the entire training data set
  # We could also take the mean/median of this computed from the cross validation samples 
  # 
  dof = df_ridge(allLambdas[li],as.matrix(XTraining[,1:p]))
    
  if( li==1 ){
    complexityParam = dof 
    cvResults       = res$cvraw
  }else{
    complexityParam = cbind( complexityParam, dof )  # will be sorted from largest value of dof to smallest valued dof 
    cvResults       = cbind( cvResults, res$cvraw )  # each column is a different value of the complexity parameter ...
                                                     # each row    is a different cross validation sample ...
  }
}

# flip the order of the results: 
inds = rev( 1:numberOfLambdas )
allLambdas      = allLambdas[ inds ] 
complexityParam = complexityParam[ inds ] 
cvResults       = cvResults[ , inds ]

# group all cvResults by lambda (the complexity parameter) and compute statistics:
#
OSE = one_standard_error_rule( complexityParam, cvResults )
complexVValue = OSE[[1]]
complexIndex  = OSE[[2]] 
oseHValue = OSE[[3]]
means = OSE[[4]] # as a function of model complexity  
stds = OSE[[5]] # as a function of model complexity


library(gplots) # plotCI, plotmeans found here

saveFig = TRUE
if( saveFig ) postscript("../../WriteUp/Graphics/Chapter3/dup_fig_3_7_ridge_regression.eps", onefile=FALSE, horizontal=FALSE)
plotCI( x=complexityParam, y=means, uiw=0.5*stds, xlim=c(0,8), ylim=c(0.3,1.8),
        col="black", barcol="blue", lwd=1, type="l", 
        xlab="degrees of freedom", ylab="expected squared prediction error (ESPE)" )
abline( h=oseHValue, lty=2 ) 
abline( v=complexVValue, lty=2 ) 
if( saveFig ) dev.off() 


# pick the best model, retrain over the entire data set, and predict on the testing data set:
#
bestLambda = allLambdas[ complexIndex ]

DCV  = XTraining
#
#
lpsa = DCV[,p+1] # get the response and delete it from the data frame 
DCV$lpsa = NULL

# Standardize the predictors and demean the response
#
# Note that one can get the centering value (the mean) with the command attr(DCV,'scaled:center')
#                   and the scaling value (the sample standard deviation) with the command attr(DCV,'scaled:scale')
# 
DCV      = scale( DCV )
lpsaMean = mean( lpsa )
lpsa     = lpsa - lpsaMean 
DCVb     = cbind( DCV, lpsa ) # append back on the response 
DCVf     = data.frame( DCVb ) # a data frame containing all scaled variables of interest 
# extract the centering and scaling information:
#
means = attr(DCV,"scaled:center")
stds  = attr(DCV,"scaled:scale")

# apply the computed scaling based on the training data to the testing data:
#
DCVTest  = XTesting
lpsaTest = DCVTest[,p+1] - lpsaMean # in demeaned units (not mean adjusted)
DCVTest$lpsa = NULL 
DCVTest  = t( apply( DCVTest, 1, '-', means ) ) 
DCVTest  = t( apply( DCVTest, 1, '/', stds ) ) 
DCVTestb = cbind( DCVTest, lpsaTest  ) # append back on the response
DCVTestf = data.frame( DCVTestb ) # a data frame containing all scaled variables of interest
names(DCVTestf)[p+1] = "lpsa" # fix the name of the response

# fit this linear model and compute the expected prediction error (EPE) using these features (this just estimates the mean in this case):
#
DM = as.matrix(DCV)
M = ginv( t(DM) %*% DM + bestLambda*diag(p) ) %*% t(DM)
betaHat = M %*% as.matrix(DCVf[,p+1])

print( lpsaMean, digits=4 )
print( betaHat, digits=3 ) 

DTest = as.matrix(DCVTest)
pdt = DTest %*% betaHat # get predictions on the test set
#pdt  = pdt + lpsaMean # add back in the mean.  Now in physical units (not mean adjusted)
mErr = mean( (lpsaTest - pdt)^2 )
print( mErr )
NTest = length(lpsaTest)
sErr = sqrt( var( (lpsaTest - pdt)^2 )/NTest ) 
print( sErr )