#
# Runs ridge regression on the SPAM data set
#
# Written by:
# -- 
# John L. Weatherwax                2009-04-21
# 
# email: wax@alum.mit.edu
# 
# Please send comments and especially bug reports to the
# above email address.
#
#-----

source("cv_Ridge.R")
source("load_spam_data.R")
source("one_standard_error_rule.R")
source("df_ridge.R")
source("opt_lambda_ridge.R")

PD        = load_spam_data(trainingScale=FALSE,responseScale=FALSE) # read in unscaled data 
XTraining = PD[[1]]
XTesting  = PD[[2]]

p        = dim(XTraining)[2]-1 # the last column is the response 
nSamples = dim(XTraining)[1]

# Do ridge-regression cross validation (large lambda values => small degrees of freedom):
#
allLambdas      = opt_lambda_ridge( XTraining[,1:p], multiple=2 ) 
numberOfLambdas = length( allLambdas ) 

for( li in 1:numberOfLambdas ){
  
  res = cv_Ridge(allLambdas[li],XTraining,numberOfCV=10)
  
  # To get a *single* value for the degrees of freedom use the call on the entire training data set
  # We could also take the mean/median of this computed from the cross validation samples 
  # 
  dof = df_ridge(allLambdas[li],as.matrix(XTraining[,1:p]))
    
  if( li==1 ){
    complexityParam = dof 
    cvResults       = res$cvraw
  }else{
    complexityParam = cbind( complexityParam, dof )  # will be sorted from largest value of dof to smallest valued dof 
    cvResults       = cbind( cvResults, res$cvraw )  # each column is a different value of the complexity parameter ...
                                                     # each row    is a different cross validation sample ...
  }
}

# flip the order of the results: 
inds            = rev( 1:numberOfLambdas )
allLambdas      = allLambdas[ inds ] 
complexityParam = complexityParam[ inds ] 
cvResults       = cvResults[ , inds ]

# group all cvResults by lambda (the complexity parameter) and compute statistics:
#
OSE           = one_standard_error_rule( complexityParam, cvResults )
complexVValue = OSE[[1]]
complexIndex  = OSE[[2]] 
oseHValue     = OSE[[3]]
means         = OSE[[4]] # as a function of model complexity  
stds          = OSE[[5]] # as a function of model complexity

library(gplots) # plotCI, plotmeans found here, xlim=c(0,8), ylim=c(0.3,1.8)

savePlot = TRUE 
if( savePlot ) postscript("../../WriteUp/Graphics/Chapter3/spam_ridge_regression.eps", onefile=FALSE, horizontal=FALSE)
plotCI( x=complexityParam, y=means, uiw=0.5*stds,
        col="black", barcol="blue", lwd=1, type="l", 
        xlab="degrees of freedom", ylab="expected squared prediction error (ESPE)" )
abline( h=oseHValue, lty=2 ) 
abline( v=complexVValue, lty=2 ) 
if( savePlot ) dev.off() 

# pick the best model, retrain over the entire data set, and predict on the testing data set:
#
bestLambda = allLambdas[ complexIndex ]

DCV  = XTraining
#
#
responseName        = names(DCV)[p+1]  # get the response and delete it from the data frame 
response            = DCV[,p+1]
DCV[[responseName]] = NULL

# Standardize the predictors and demean the response
#
# Note that one can get the centering value (the mean) with the command attr(DCV,'scaled:center')
#                   and the scaling value (the sample standard deviation) with the command attr(DCV,'scaled:scale')
# 
DCV          = scale( DCV )
responseMean = mean( response )
response     = response - responseMean 
DCVb         = cbind( DCV, response ) # append back on the response 
DCVf         = data.frame( DCVb ) # a data frame containing all scaled variables of interest 
# extract the centering and scaling information:
#
means = attr(DCV,"scaled:center")
stds  = attr(DCV,"scaled:scale")

# apply the computed scaling based on the training data to the testing data:
#
DCVTest                 = XTesting
responseTest            = DCVTest[,p+1] - responseMean # in demeaned units (not mean adjusted)
DCVTest[[responseName]] = NULL
DCVTest                 = t( apply( DCVTest, 1, '-', means ) ) 
DCVTest                 = t( apply( DCVTest, 1, '/', stds ) ) 
DCVTestb                = cbind( DCVTest, responseTest  ) # append back on the response
DCVTestf                = data.frame( DCVTestb ) # a data frame containing all scaled variables of interest
names(DCVTestf)[p+1]    = responseName # fix the name of the response

# fit this linear model and compute the expected prediction error (EPE) using these features (this just estimates the mean in this case):
#
DM      = as.matrix(DCV)
M       = ginv( t(DM) %*% DM + bestLambda*diag(p) ) %*% t(DM)
betaHat = M %*% as.matrix(DCVf[,p+1])

print( responseMean, digits=4 )
print( betaHat, digits=3 ) 

DTest = as.matrix(DCVTest)
pdt = DTest %*% betaHat # get predictions on the test set
#pdt  = pdt + responseMean # add back in the mean.  Now in physical units (not mean adjusted)
mErr = mean( (responseTest - pdt)^2 )
print( mErr )
NTest = length(responseTest)
sErr = sqrt( var( (responseTest - pdt)^2 )/NTest ) 
print( sErr )