#
# Problem EPage 210
# 
# Written by:
# -- 
# John L. Weatherwax                2009-04-21
# 
# email: wax@alum.mit.edu
# 
# Please send comments and especially bug reports to the
# above email address.
# 
#-----

save_plots = F

library(caret)
library(AppliedPredictiveModeling)
library(rpart)

set.seed(0)

data(ChemicalManufacturingProcess)

processPredictors = ChemicalManufacturingProcess[,2:58]
yield = ChemicalManufacturingProcess[,1]

n_samples = dim(processPredictors)[1]
n_features = dim(processPredictors)[2]

# Fill in missing values where we have NAs with the median over the non-NA values: 
#
replacements = sapply( processPredictors, median, na.rm=TRUE )
for( ci in 1:n_features ){
  bad_inds = is.na( processPredictors[,ci] )
  processPredictors[bad_inds,ci] = replacements[ci]
}

# Look for any features with no variance:
# 
zero_cols = nearZeroVar( processPredictors )
print( sprintf("Found %d zero variance columns from %d",length(zero_cols), dim(processPredictors)[2] ) )
processPredictors = processPredictors[,-zero_cols] # drop these zero variance columns 

# Split this data into training and testing sets:
#
training = createDataPartition( yield, p=0.8 )

processPredictors_training = processPredictors[training$Resample1,]
yield_training = yield[training$Resample1]

processPredictors_testing = processPredictors[-training$Resample1,]
yield_testing = yield[-training$Resample1]

# Build various tree based models and then compare performance:
# 
# Note we use the default "trainControl" of bootstrap evaluations for each of the models below: 
#
preProc_Arguments = c("center","scale")

# A rpart model:
# 
set.seed(0)
rpartModel = train(x=processPredictors_training, y=yield_training, method="rpart", preProc=preProc_Arguments, tuneLength=10)

# predict on training/testing sets
rpartPred = predict(rpartModel, newdata=processPredictors_training)
rpartPR = postResample(pred=rpartPred, obs=yield_training)
rmses_training = c(rpartPR[1])
r2s_training = c(rpartPR[2])
methods = c("RPART")

rpartPred = predict(rpartModel, newdata=processPredictors_testing)
rpartPR = postResample(pred=rpartPred, obs=yield_testing)
rmses_testing = c(rpartPR[1])
r2s_testing = c(rpartPR[2])


# A random forest model:
#
set.seed(0)
rfModel = train(x=processPredictors_training, y=yield_training, method="rf", preProc=preProc_Arguments, tuneLength=10)

rfPred = predict(rfModel, newdata=processPredictors_training)
rfPR = postResample(pred=rfPred, obs=yield_training)
rmses_training = c(rmses_training,rfPR[1])
r2s_training = c(r2s_training,rfPR[2])
methods = c(methods,"RF")

rfPred = predict(rfModel, newdata=processPredictors_testing)
rfPR = postResample(pred=rfPred, obs=yield_testing)
rmses_testing = c(rmses_testing,rfPR[1])
r2s_testing = c(r2s_testing,rfPR[2])


# gradient boosting machine: 
#
gbmGrid = expand.grid( .interaction.depth = seq( 1, 7, by=2 ),
                       .n.trees = seq( 100, 1000, by=100 ),
                       .shrinkage = c(0.01, 0.1) )

set.seed(0)
gbmModel = train(x=processPredictors_training, y=yield_training, method="gbm",
                 preProc=preProc_Arguments,
                 tuneGrid=gbmGrid,
                 verbose=FALSE)
      
gbmPred = predict(gbmModel, newdata=processPredictors_training)
gbmPR = postResample(pred=gbmPred, obs=yield_training)
rmses_training = c(rmses_training,gbmPR[1])
r2s_training = c(r2s_training,gbmPR[2])
methods = c(methods,"GBM")

gbmPred = predict(gbmModel, newdata=processPredictors_testing)
gbmPR = postResample(pred=gbmPred, obs=yield_testing)
rmses_testing = c(rmses_testing,gbmPR[1])
r2s_testing = c(r2s_testing,gbmPR[2])

# Cubist: 
#
set.seed(0)
cubistModel = train(x=processPredictors_training, y=yield_training, method="cubist", preProc=preProc_Arguments, tuneLength=20)

cubistPred = predict(cubistModel, newdata=processPredictors_training)
cubistPR = postResample(pred=cubistPred, obs=yield_training) 
rmses_training = c(rmses_training,cubistPR[1])
r2s_training = c(r2s_training,cubistPR[2])
methods = c(methods,"CUBIST")

cubistPred = predict(cubistModel, newdata=processPredictors_testing)
cubistPR = postResample(pred=cubistPred, obs=yield_testing)
rmses_testing = c(rmses_testing,cubistPR[1])
r2s_testing = c(r2s_testing,cubistPR[2])

# Package the results up:
# 
res_training = data.frame( rmse=rmses_training, r2=r2s_training )
rownames(res_training) = methods

training_order = order( -res_training$rmse )

res_training = res_training[ training_order, ] # Order the dataframe so that the best results are at the bottom:
print( "Final Training Results" ) 
print( res_training )

res_testing = data.frame( rmse=rmses_testing, r2=r2s_testing )
rownames(res_testing) = methods

res_testing = res_testing[ training_order, ] # Order the dataframe so that the best results for the training set are at the bottom:
print( "Final Testing Results" ) 
print( res_testing )

# EPage 82 
resamp = resamples( list(rpart=rpartModel,rf=rfModel,cubist=cubistModel,gbm=gbmModel) )
print( summary(resamp) )

if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter8/chap_8_prob_7_resamp_dotplot.eps", onefile=FALSE, horizontal=FALSE) }
dotplot( resamp, metric="RMSE" )
if( save_plots ){ dev.off() }

print( summary(diff(resamp)) )

# Part (b): the variable importance
#
# Lets see what variables are most important in the GBM model: 
varImp(gbmModel)
#varImp(cubistModel)

# Explore yield output as we vary the most important predictors of the GBM model:
#
# We pick a predictor and plot how the responce varies as a function of this value
#
p_range = range( processPredictors$ManufacturingProcess32 )
variation = seq( from=p_range[1], to=p_range[2], length.out=100 )
mean_predictor_values = apply( processPredictors, 2, mean )

# build a dataframe with variation in only one dimension (for this part we pick ManufacturingProcess32)
if( !require(pracma) ){
  install.packages('pracma') # needed for repmat
  library(pracma)
}

newdata = repmat( as.double(mean_predictor_values), length(variation), 1 )
newdata = data.frame( newdata )
colnames( newdata ) = colnames( processPredictors )
newdata$ManufacturingProcess32 = variation

xs = variation
y_hat = predict( gbmModel, newdata=as.matrix(newdata) )

if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter8/chap_8_prob_7_variation_of_ManufacturingProcess32.eps", onefile=FALSE, horizontal=FALSE) }
plot( xs, y_hat, xlab='variation in ManufacturingProcess32', ylab='predicted yield' )
grid()
if( save_plots ){ dev.off() }


# Part (c): Plot the optimal single tree using the fitted rPartModel above:
#
# Use the optimal settings found using the train function: 
#
# RMSE was used to select the optimal model using  the smallest value.
# The final value used for the model was cp = 0.07533616.
#

trainData = processPredictors_training
trainData$y = yield_training
rPartModel = rpart( y ~ ., data=trainData, method="anova", control=rpart.control(cp = 0.07533616) )

# Plot the optimal single regression tree:
# 
if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter8/chap_8_prob_7_optimal_tree.eps", onefile=FALSE, horizontal=FALSE) }
plot(rPartModel); text(rPartModel)
if( save_plots ){ dev.off() }