#
# Problem EPage 210
#
# Written by:
# --
# John L. Weatherwax 2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
#-----

save_plots = F

library(caret)
library(AppliedPredictiveModeling)
library(rpart)

set.seed(0)

data(permeability)

# Part (b):
#
zero_cols = nearZeroVar( fingerprints )
print( sprintf("Found %d near-zero variance columns from %d", length(zero_cols), dim(fingerprints)[2]) )
fingerprints = fingerprints[,-zero_cols] # drop these near-zero variance columns

# Split this data into training and testing sets:
#
training = createDataPartition( permeability, p=0.8 )

fingerprints_training = fingerprints[training$Resample1,]
permeability_training = permeability[training$Resample1]

fingerprints_testing = fingerprints[-training$Resample1,]
permeability_testing = permeability[-training$Resample1]

# Build various tree-based models and then compare performance:
#
# Note: we use the default "trainControl" bootstrap evaluations for each of the models below.
#
preProc_Arguments = c("center","scale")
#preProc_Arguments = c("pca")

# An rpart model:
#
set.seed(0)
rpartModel = train(x=fingerprints_training, y=permeability_training, method="rpart", preProc=preProc_Arguments, tuneLength=10)

# Predict on the training/testing sets:
rpartPred = predict(rpartModel, newdata=fingerprints_training)
rpartPR = postResample(pred=rpartPred, obs=permeability_training)
rmses_training = c(rpartPR[1])
r2s_training = c(rpartPR[2])
methods = c("RPART")

rpartPred = predict(rpartModel, newdata=fingerprints_testing)
rpartPR = postResample(pred=rpartPred, obs=permeability_testing)
rmses_testing = c(rpartPR[1])
r2s_testing = c(rpartPR[2])

# A random forest model:
#
set.seed(0)
rfModel = train(x=fingerprints_training, y=permeability_training, method="rf", preProc=preProc_Arguments)

rfPred = predict(rfModel, newdata=fingerprints_training)
rfPR = postResample(pred=rfPred, obs=permeability_training)
rmses_training = c(rmses_training,rfPR[1])
r2s_training = c(r2s_training,rfPR[2])
methods = c(methods,"RF")

rfPred = predict(rfModel, newdata=fingerprints_testing)
rfPR = postResample(pred=rfPred, obs=permeability_testing)
rmses_testing = c(rmses_testing,rfPR[1])
r2s_testing = c(r2s_testing,rfPR[2])

# A gradient boosting machine (GBM):
#
# Note: more recent versions of caret also require .n.minobsinnode in the gbm tuning
# grid; we use the gbm default of 10.
gbmGrid = expand.grid( .interaction.depth = seq( 1, 7, by=2 ),
                       .n.trees = seq( 100, 1000, by=100 ),
                       .shrinkage = c(0.01, 0.1),
                       .n.minobsinnode = 10 )
set.seed(0)
gbmModel = train(x=fingerprints_training, y=permeability_training, method="gbm", preProc=preProc_Arguments, tuneGrid=gbmGrid, verbose=FALSE)

gbmPred = predict(gbmModel, newdata=fingerprints_training)
gbmPR = postResample(pred=gbmPred, obs=permeability_training)
rmses_training = c(rmses_training,gbmPR[1])
r2s_training = c(r2s_training,gbmPR[2])
methods = c(methods,"GBM")

gbmPred = predict(gbmModel, newdata=fingerprints_testing)
gbmPR = postResample(pred=gbmPred, obs=permeability_testing)
rmses_testing = c(rmses_testing,gbmPR[1])
r2s_testing = c(r2s_testing,gbmPR[2])

# Let's see what variables are most important in the GBM model:
varImp(gbmModel)

# A Cubist model:
#
set.seed(0)
cubistModel = train(x=fingerprints_training, y=permeability_training, method="cubist", preProc=preProc_Arguments, tuneLength=20)

cubistPred = predict(cubistModel, newdata=fingerprints_training)
cubistPR = postResample(pred=cubistPred, obs=permeability_training)
rmses_training = c(rmses_training,cubistPR[1])
r2s_training = c(r2s_training,cubistPR[2])
methods = c(methods,"CUBIST")

cubistPred = predict(cubistModel, newdata=fingerprints_testing)
cubistPR = postResample(pred=cubistPred, obs=permeability_testing)
rmses_testing = c(rmses_testing,cubistPR[1])
r2s_testing = c(r2s_testing,cubistPR[2])

# Package the results up:
#
res_training = data.frame( rmse=rmses_training, r2=r2s_training )
rownames(res_training) = methods

# Order the dataframe so that the best results are at the bottom:
training_order = order( -res_training$rmse )
res_training = res_training[ training_order, ]
print( "Final Training Results" )
print( res_training )

res_testing = data.frame( rmse=rmses_testing, r2=r2s_testing )
rownames(res_testing) = methods

# Order the dataframe so that the best results for the training set are at the bottom:
res_testing = res_testing[ training_order, ]
print( "Final Testing Results" )
print( res_testing )

# EPage 82
resamp = resamples( list(rpart=rpartModel, cubist=cubistModel, gbm=gbmModel, rf=rfModel) )
print( summary(resamp) )

if( save_plots ){
  postscript("../../WriteUp/Graphics/Chapter8/chap_8_prob_6_resamp_dotplot.eps", onefile=FALSE, horizontal=FALSE)
}
dotplot( resamp, metric="RMSE" )
if( save_plots ){ dev.off() }

print( summary(diff(resamp)) )
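
# Not part of the original solution: a minimal sketch of an observed-versus-predicted plot
# for the Cubist model on the held-out test set (at this point cubistPred holds the test
# set predictions).  The output filename is hypothetical and simply mirrors the naming
# convention of the dotplot above.
if( save_plots ){
  postscript("../../WriteUp/Graphics/Chapter8/chap_8_prob_6_cubist_obs_vs_pred.eps", onefile=FALSE, horizontal=FALSE)
}
plot( cubistPred, permeability_testing, xlab="predicted permeability", ylab="observed permeability" )
abline( a=0, b=1, col="red" ) # predictions from a perfect model would fall on this line
if( save_plots ){ dev.off() }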