#
# Duplicate part of Figure 3.7 (the all possible subset models) from Chapter 3 of the book ESLII
#
# Written by:
# -- 
# John L. Weatherwax                2009-04-21
# 
# email: wax@alum.mit.edu
# 
# Please send comments and especially bug reports to the
# above email address.
#
#-----

source("load_prostate_data.R")
source("cv_all_subsets.R")
source("one_standard_error_rule.R")

PD = load_prostate_data(globalScale=FALSE,trainingScale=FALSE,responseScale=FALSE) # read in unscaled data 
XTraining = PD[[1]]
XTesting = PD[[2]]

p        = dim(XTraining)[2]-1 # the last column is the response 
nSamples = dim(XTraining)[1] 

# Do best-subset cross validation: 
#
for( k in 0:p ){ 
  ##print(k)

  res = cv_all_subsets(k,XTraining,numberOfCV=10)
  if( k==0 ){
    complexityParam      = k
    cvResults            = res[[1]]
    bestPredictorFormula = res[[3]] 
  }else{
    complexityParam = cbind(complexityParam,k) 
    cvResults       = cbind( cvResults, res[[1]] ) ## each column is a different value of the complexity parameter ...
                                                   ## each row    is a point sample of (y - y_hat)^2 where y_hat is estimated using the 
                                                   ## data in the fold that y is not in.
    bestPredictorFormula = c( bestPredictorFormula, res[[3]] ) 
  }
}

# Group all cvResults by k (the subset size) and compute statistics:
#
OSE = one_standard_error_rule( complexityParam, cvResults )
complexVValue = OSE[[1]]
complexIndex  = OSE[[2]] 
means = OSE[[4]]
stds = OSE[[5]]
oseHValue = OSE[[3]]

library(gplots) # plotCI, plotmeans found here

##postscript("../../WriteUp/Graphics/Chapter3/dup_fig_3_7_all_subsets.eps", onefile=FALSE, horizontal=FALSE)
plotCI( x=0:p, y=means, uiw=0.5*stds, xlim=c(0,8), ylim=c(0.3,1.8),
        col="black", barcol="blue", lwd=1, type="l", 
        xlab="subset size (p)", ylab="expected squared prediction error (ESPE)" )
abline( h=oseHValue, lty=2 ) 
abline( v=complexVValue, lty=2 )
grid()
##dev.off() 

# Pick the best model, retrain over the entire data set, and predict on the testing data set:
#
bestModelFormula = bestPredictorFormula[ complexIndex ]

DCV  = XTraining
#
#
lpsa = DCV[,p+1] # get the response and delete it from the data frame 
DCV$lpsa = NULL

# Standardize the predictors and demean the response
#
# Note that one can get the centering value (the mean) with the command attr(DCV,'scaled:center')
#                   and the scaling value (the sample standard deviation) with the command attr(DCV,'scaled:scale')
# 
DCV      = scale( DCV )
lpsaMean = mean( lpsa )
lpsa     = lpsa - lpsaMean 
DCVb     = cbind( DCV, lpsa ) # append back on the response 
DCVf     = data.frame( DCVb ) # a data frame containing all scaled variables of interest 
# extract the centering and scaling information:
#
means = attr(DCV,"scaled:center")
stds  = attr(DCV,"scaled:scale")

# apply the computed scaling based on the training data to the testing data:
#
DCVTest  = XTesting
lpsaTest = DCVTest[,p+1] # in physical units (not mean adjusted)
NTest    = length(lpsaTest)
DCVTest$lpsa = NULL 
DCVTest  = t( apply( DCVTest, 1, '-', means ) ) 
DCVTest  = t( apply( DCVTest, 1, '/', stds ) ) 
DCVTestb = cbind( DCVTest, lpsaTest - lpsaMean ) # append back on the response
DCVTestf = data.frame( DCVTestb ) # a data frame containing all scaled variables of interest
names(DCVTestf)[p+1] = "lpsa" # fix the name of the response


# fit this linear model and compute the expected prediction error (EPE) using these features (this just estimates the mean in this case):
# 
mk = lm( formula = bestModelFormula, data=DCVf )

# print the coefficients of this model:
#
print( lpsaMean, digits=4 )  # add back in the mean 
print( coef( mk ), digits=4 ) 

pdt  = predict( mk, newdata=DCVTestf, interval="prediction" )[,1] # get predictions on the test set
pdt  = pdt + lpsaMean # add back in the mean.  Now in physical units (not mean adjusted)
mErr = mean( (lpsaTest - pdt)^2 )
print( mErr ) 
sErr = sqrt( var( (lpsaTest - pdt)^2 )/NTest ) 
print( sErr )