#
# Multiple training routines EPage 82
# Problem EPage 162
# 
# Written by:
# -- 
# John L. Weatherwax                2009-04-21
# 
# email: wax@alum.mit.edu
# 
# Please send comments and especially bug reports to the
# above email address.
# 
#-----

save_plots = F

library(caret)
library(AppliedPredictiveModeling)

set.seed(0)

data(ChemicalManufacturingProcess)

processPredictors = ChemicalManufacturingProcess[,2:58]
yield = ChemicalManufacturingProcess[,1]

n_samples = dim(processPredictors)[1]
n_features = dim(processPredictors)[2]

# Fill in missing values where we have NAs with the median over the non-NA values: 
#
replacements = sapply( processPredictors, median, na.rm=TRUE )
for( ci in 1:n_features ){
  bad_inds = is.na( processPredictors[,ci] )
  processPredictors[bad_inds,ci] = replacements[ci]
}

# Look for any features with no variance:
# 
zero_cols = nearZeroVar( processPredictors )
print( sprintf("Found %d zero variance columns from %d",length(zero_cols), dim(processPredictors)[2] ) )
processPredictors = processPredictors[,-zero_cols] # drop these zero variance columns 

# Split this data into training and testing sets:
#
training = createDataPartition( yield, p=0.8 )

processPredictors_training = processPredictors[training$Resample1,]
yield_training = yield[training$Resample1]

processPredictors_testing = processPredictors[-training$Resample1,]
yield_testing = yield[-training$Resample1]

# Build various nonlinear models and then compare performance:
# 
# Note we use the default "trainControl" of bootstrap evaluations for each of the models below: 
#
preProc_Arguments = c("center","scale")

# A K-NN model:
# 
set.seed(0)
knnModel = train(x=processPredictors_training, y=yield_training, method="knn", preProc=preProc_Arguments, tuneLength=10)

# predict on training/testing sets
knnPred = predict(knnModel, newdata=processPredictors_training)
knnPR = postResample(pred=knnPred, obs=yield_training)
rmses_training = c(knnPR[1])
r2s_training = c(knnPR[2])
methods = c("KNN")

knnPred = predict(knnModel, newdata=processPredictors_testing)
knnPR = postResample(pred=knnPred, obs=yield_testing)
rmses_testing = c(knnPR[1])
r2s_testing = c(knnPR[2])


# A Neural Network model:
#
nnGrid = expand.grid( .decay=c(0,0.01,0.1), .size=1:10, .bag=FALSE )
set.seed(0)
nnetModel = train(x=processPredictors_training, y=yield_training, method="nnet", preProc=preProc_Arguments,
                  linout=TRUE,trace=FALSE,MaxNWts=10 * (ncol(processPredictors_training)+1) + 10 + 1, maxit=500)

nnetPred = predict(nnetModel, newdata=processPredictors_training)
nnetPR = postResample(pred=nnetPred, obs=yield_training)
rmses_training = c(rmses_training,nnetPR[1])
r2s_training = c(r2s_training,nnetPR[2])
methods = c(methods,"NN")

nnetPred = predict(nnetModel, newdata=processPredictors_testing)
nnetPR = postResample(pred=nnetPred, obs=yield_testing)
rmses_testing = c(rmses_testing,nnetPR[1])
r2s_testing = c(r2s_testing,nnetPR[2])


# Averaged Neural Network models:
#
set.seed(0)
avNNetModel = train(x=processPredictors_training, y=yield_training, method="avNNet", preProc=preProc_Arguments,
                    linout=TRUE,trace=FALSE,MaxNWts=10 * (ncol(processPredictors_training)+1) + 10 + 1, maxit=500)
 
avNNetPred = predict(avNNetModel, newdata=processPredictors_training)
avNNetPR = postResample(pred=avNNetPred, obs=yield_training)
rmses_training = c(rmses_training,avNNetPR[1])
r2s_training = c(r2s_training,avNNetPR[2])
methods = c(methods,"AvgNN")

avNNetPred = predict(avNNetModel, newdata=processPredictors_testing)
avNNetPR = postResample(pred=avNNetPred, obs=yield_testing)
rmses_testing = c(rmses_testing,avNNetPR[1])
r2s_testing = c(r2s_testing,avNNetPR[2])


# MARS model:
#
marsGrid = expand.grid(.degree=1:2, .nprune=2:38)
set.seed(0)
marsModel = train(x=processPredictors_training, y=yield_training, method="earth", preProc=preProc_Arguments, tuneGrid=marsGrid)
      
marsPred = predict(marsModel, newdata=processPredictors_training)
marsPR = postResample(pred=marsPred, obs=yield_training)
rmses_training = c(rmses_training,marsPR[1])
r2s_training = c(r2s_training,marsPR[2])
methods = c(methods,"MARS")

marsPred = predict(marsModel, newdata=processPredictors_testing)
marsPR = postResample(pred=marsPred, obs=yield_testing)
rmses_testing = c(rmses_testing,marsPR[1])
r2s_testing = c(r2s_testing,marsPR[2])

# Lets see what variables are most important in the MARS model: 
varImp(marsModel)

# A Support Vector Machine (SVM):
#
set.seed(0)
svmModel = train(x=processPredictors_training, y=yield_training, method="svmRadial", preProc=preProc_Arguments, tuneLength=20)

svmPred = predict(svmModel, newdata=processPredictors_training)
svmPR = postResample(pred=svmPred, obs=yield_training) 
rmses_training = c(rmses_training,svmPR[1])
r2s_training = c(r2s_training,svmPR[2])
methods = c(methods,"SVM")

svmPred = predict(svmModel, newdata=processPredictors_testing)
svmPR = postResample(pred=svmPred, obs=yield_testing)
rmses_testing = c(rmses_testing,svmPR[1])
r2s_testing = c(r2s_testing,svmPR[2])

# Package the results up:
# 
res_training = data.frame( rmse=rmses_training, r2=r2s_training )
rownames(res_training) = methods

training_order = order( -res_training$rmse )

res_training = res_training[ training_order, ] # Order the dataframe so that the best results are at the bottom:
print( "Final Training Results" ) 
print( res_training )

res_testing = data.frame( rmse=rmses_testing, r2=r2s_testing )
rownames(res_testing) = methods

res_testing = res_testing[ training_order, ] # Order the dataframe so that the best results for the training set are at the bottom:
print( "Final Testing Results" ) 
print( res_testing )

# EPage 82 
resamp = resamples( list(knn=knnModel,svm=svmModel,mars=marsModel,nnet=nnetModel,avnnet=avNNetModel) )
print( summary(resamp) )

if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter7/chap_7_prob_5_resamp_dotplot.eps", onefile=FALSE, horizontal=FALSE) }
dotplot( resamp, metric="RMSE" )
if( save_plots ){ dev.off() }

print( summary(diff(resamp)) )

# Part (b): the variable importance
#
varImp(svmModel)

# Part (c): Explore yield output as we vary the most important predictors of the SVM model:
#
# We pick a predictor and plot how the responce varies as a function of this value
#
p_range = range( processPredictors$ManufacturingProcess32 )
variation = seq( from=p_range[1], to=p_range[2], length.out=100 )
mean_predictor_values = apply( processPredictors, 2, mean )

# build a dataframe with variation in only one dimension (for this part we pick ManufacturingProcess32)
if( !require(pracma) ){
  install.packages('pracma') # needed for repmat
  library(pracma)
}

newdata = repmat( as.double(mean_predictor_values), length(variation), 1 )
newdata = data.frame( newdata )
colnames( newdata ) = colnames( processPredictors )
newdata$ManufacturingProcess32 = variation

xs = variation
y_hat = predict( svmModel, newdata=as.matrix(newdata) )

if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter7/chap_7_prob_5_variation_of_ManufacturingProcess32.eps", onefile=FALSE, horizontal=FALSE) }
plot( xs, y_hat, xlab='variation in ManufacturingProcess32', ylab='predicted yield' )
grid()
if( save_plots ){ dev.off() }