# # Problem EPage 210 # # Written by: # -- # John L. Weatherwax 2009-04-21 # # email: wax@alum.mit.edu # # Please send comments and especially bug reports to the # above email address. # #----- save_plots = F library(caret) library(AppliedPredictiveModeling) library(rpart) set.seed(0) data(ChemicalManufacturingProcess) processPredictors = ChemicalManufacturingProcess[,2:58] yield = ChemicalManufacturingProcess[,1] n_samples = dim(processPredictors)[1] n_features = dim(processPredictors)[2] # Fill in missing values where we have NAs with the median over the non-NA values: # replacements = sapply( processPredictors, median, na.rm=TRUE ) for( ci in 1:n_features ){ bad_inds = is.na( processPredictors[,ci] ) processPredictors[bad_inds,ci] = replacements[ci] } # Look for any features with no variance: # zero_cols = nearZeroVar( processPredictors ) print( sprintf("Found %d zero variance columns from %d",length(zero_cols), dim(processPredictors)[2] ) ) processPredictors = processPredictors[,-zero_cols] # drop these zero variance columns # Split this data into training and testing sets: # training = createDataPartition( yield, p=0.8 ) processPredictors_training = processPredictors[training$Resample1,] yield_training = yield[training$Resample1] processPredictors_testing = processPredictors[-training$Resample1,] yield_testing = yield[-training$Resample1] # Build various tree based models and then compare performance: # # Note we use the default "trainControl" of bootstrap evaluations for each of the models below: # preProc_Arguments = c("center","scale") # A rpart model: # set.seed(0) rpartModel = train(x=processPredictors_training, y=yield_training, method="rpart", preProc=preProc_Arguments, tuneLength=10) # predict on training/testing sets rpartPred = predict(rpartModel, newdata=processPredictors_training) rpartPR = postResample(pred=rpartPred, obs=yield_training) rmses_training = c(rpartPR[1]) r2s_training = c(rpartPR[2]) methods = c("RPART") rpartPred = predict(rpartModel, newdata=processPredictors_testing) rpartPR = postResample(pred=rpartPred, obs=yield_testing) rmses_testing = c(rpartPR[1]) r2s_testing = c(rpartPR[2]) # A random forest model: # set.seed(0) rfModel = train(x=processPredictors_training, y=yield_training, method="rf", preProc=preProc_Arguments, tuneLength=10) rfPred = predict(rfModel, newdata=processPredictors_training) rfPR = postResample(pred=rfPred, obs=yield_training) rmses_training = c(rmses_training,rfPR[1]) r2s_training = c(r2s_training,rfPR[2]) methods = c(methods,"RF") rfPred = predict(rfModel, newdata=processPredictors_testing) rfPR = postResample(pred=rfPred, obs=yield_testing) rmses_testing = c(rmses_testing,rfPR[1]) r2s_testing = c(r2s_testing,rfPR[2]) # gradient boosting machine: # gbmGrid = expand.grid( .interaction.depth = seq( 1, 7, by=2 ), .n.trees = seq( 100, 1000, by=100 ), .shrinkage = c(0.01, 0.1) ) set.seed(0) gbmModel = train(x=processPredictors_training, y=yield_training, method="gbm", preProc=preProc_Arguments, tuneGrid=gbmGrid, verbose=FALSE) gbmPred = predict(gbmModel, newdata=processPredictors_training) gbmPR = postResample(pred=gbmPred, obs=yield_training) rmses_training = c(rmses_training,gbmPR[1]) r2s_training = c(r2s_training,gbmPR[2]) methods = c(methods,"GBM") gbmPred = predict(gbmModel, newdata=processPredictors_testing) gbmPR = postResample(pred=gbmPred, obs=yield_testing) rmses_testing = c(rmses_testing,gbmPR[1]) r2s_testing = c(r2s_testing,gbmPR[2]) # Cubist: # set.seed(0) cubistModel = train(x=processPredictors_training, y=yield_training, method="cubist", preProc=preProc_Arguments, tuneLength=20) cubistPred = predict(cubistModel, newdata=processPredictors_training) cubistPR = postResample(pred=cubistPred, obs=yield_training) rmses_training = c(rmses_training,cubistPR[1]) r2s_training = c(r2s_training,cubistPR[2]) methods = c(methods,"CUBIST") cubistPred = predict(cubistModel, newdata=processPredictors_testing) cubistPR = postResample(pred=cubistPred, obs=yield_testing) rmses_testing = c(rmses_testing,cubistPR[1]) r2s_testing = c(r2s_testing,cubistPR[2]) # Package the results up: # res_training = data.frame( rmse=rmses_training, r2=r2s_training ) rownames(res_training) = methods training_order = order( -res_training$rmse ) res_training = res_training[ training_order, ] # Order the dataframe so that the best results are at the bottom: print( "Final Training Results" ) print( res_training ) res_testing = data.frame( rmse=rmses_testing, r2=r2s_testing ) rownames(res_testing) = methods res_testing = res_testing[ training_order, ] # Order the dataframe so that the best results for the training set are at the bottom: print( "Final Testing Results" ) print( res_testing ) # EPage 82 resamp = resamples( list(rpart=rpartModel,rf=rfModel,cubist=cubistModel,gbm=gbmModel) ) print( summary(resamp) ) if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter8/chap_8_prob_7_resamp_dotplot.eps", onefile=FALSE, horizontal=FALSE) } dotplot( resamp, metric="RMSE" ) if( save_plots ){ dev.off() } print( summary(diff(resamp)) ) # Part (b): the variable importance # # Lets see what variables are most important in the GBM model: varImp(gbmModel) #varImp(cubistModel) # Explore yield output as we vary the most important predictors of the GBM model: # # We pick a predictor and plot how the responce varies as a function of this value # p_range = range( processPredictors$ManufacturingProcess32 ) variation = seq( from=p_range[1], to=p_range[2], length.out=100 ) mean_predictor_values = apply( processPredictors, 2, mean ) # build a dataframe with variation in only one dimension (for this part we pick ManufacturingProcess32) if( !require(pracma) ){ install.packages('pracma') # needed for repmat library(pracma) } newdata = repmat( as.double(mean_predictor_values), length(variation), 1 ) newdata = data.frame( newdata ) colnames( newdata ) = colnames( processPredictors ) newdata$ManufacturingProcess32 = variation xs = variation y_hat = predict( gbmModel, newdata=as.matrix(newdata) ) if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter8/chap_8_prob_7_variation_of_ManufacturingProcess32.eps", onefile=FALSE, horizontal=FALSE) } plot( xs, y_hat, xlab='variation in ManufacturingProcess32', ylab='predicted yield' ) grid() if( save_plots ){ dev.off() } # Part (c): Plot the optimal single tree using the fitted rPartModel above: # # Use the optimal settings found using the train function: # # RMSE was used to select the optimal model using the smallest value. # The final value used for the model was cp = 0.07533616. # trainData = processPredictors_training trainData$y = yield_training rPartModel = rpart( y ~ ., data=trainData, method="anova", control=rpart.control(cp = 0.07533616) ) # Plot the optimal single regression tree: # if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter8/chap_8_prob_7_optimal_tree.eps", onefile=FALSE, horizontal=FALSE) } plot(rPartModel); text(rPartModel) if( save_plots ){ dev.off() }