# # Multiple training routines EPage 82 # Problem EPage 162 # # Written by: # -- # John L. Weatherwax 2009-04-21 # # email: wax@alum.mit.edu # # Please send comments and especially bug reports to the # above email address. # #----- save_plots = F library(caret) library(AppliedPredictiveModeling) set.seed(0) data(ChemicalManufacturingProcess) processPredictors = ChemicalManufacturingProcess[,2:58] yield = ChemicalManufacturingProcess[,1] n_samples = dim(processPredictors)[1] n_features = dim(processPredictors)[2] # Fill in missing values where we have NAs with the median over the non-NA values: # replacements = sapply( processPredictors, median, na.rm=TRUE ) for( ci in 1:n_features ){ bad_inds = is.na( processPredictors[,ci] ) processPredictors[bad_inds,ci] = replacements[ci] } # Look for any features with no variance: # zero_cols = nearZeroVar( processPredictors ) print( sprintf("Found %d zero variance columns from %d",length(zero_cols), dim(processPredictors)[2] ) ) processPredictors = processPredictors[,-zero_cols] # drop these zero variance columns # Split this data into training and testing sets: # training = createDataPartition( yield, p=0.8 ) processPredictors_training = processPredictors[training$Resample1,] yield_training = yield[training$Resample1] processPredictors_testing = processPredictors[-training$Resample1,] yield_testing = yield[-training$Resample1] # Build various nonlinear models and then compare performance: # # Note we use the default "trainControl" of bootstrap evaluations for each of the models below: # preProc_Arguments = c("center","scale") # A K-NN model: # set.seed(0) knnModel = train(x=processPredictors_training, y=yield_training, method="knn", preProc=preProc_Arguments, tuneLength=10) # predict on training/testing sets knnPred = predict(knnModel, newdata=processPredictors_training) knnPR = postResample(pred=knnPred, obs=yield_training) rmses_training = c(knnPR[1]) r2s_training = c(knnPR[2]) methods = c("KNN") knnPred = predict(knnModel, newdata=processPredictors_testing) knnPR = postResample(pred=knnPred, obs=yield_testing) rmses_testing = c(knnPR[1]) r2s_testing = c(knnPR[2]) # A Neural Network model: # nnGrid = expand.grid( .decay=c(0,0.01,0.1), .size=1:10, .bag=FALSE ) set.seed(0) nnetModel = train(x=processPredictors_training, y=yield_training, method="nnet", preProc=preProc_Arguments, linout=TRUE,trace=FALSE,MaxNWts=10 * (ncol(processPredictors_training)+1) + 10 + 1, maxit=500) nnetPred = predict(nnetModel, newdata=processPredictors_training) nnetPR = postResample(pred=nnetPred, obs=yield_training) rmses_training = c(rmses_training,nnetPR[1]) r2s_training = c(r2s_training,nnetPR[2]) methods = c(methods,"NN") nnetPred = predict(nnetModel, newdata=processPredictors_testing) nnetPR = postResample(pred=nnetPred, obs=yield_testing) rmses_testing = c(rmses_testing,nnetPR[1]) r2s_testing = c(r2s_testing,nnetPR[2]) # Averaged Neural Network models: # set.seed(0) avNNetModel = train(x=processPredictors_training, y=yield_training, method="avNNet", preProc=preProc_Arguments, linout=TRUE,trace=FALSE,MaxNWts=10 * (ncol(processPredictors_training)+1) + 10 + 1, maxit=500) avNNetPred = predict(avNNetModel, newdata=processPredictors_training) avNNetPR = postResample(pred=avNNetPred, obs=yield_training) rmses_training = c(rmses_training,avNNetPR[1]) r2s_training = c(r2s_training,avNNetPR[2]) methods = c(methods,"AvgNN") avNNetPred = predict(avNNetModel, newdata=processPredictors_testing) avNNetPR = postResample(pred=avNNetPred, obs=yield_testing) rmses_testing = c(rmses_testing,avNNetPR[1]) r2s_testing = c(r2s_testing,avNNetPR[2]) # MARS model: # marsGrid = expand.grid(.degree=1:2, .nprune=2:38) set.seed(0) marsModel = train(x=processPredictors_training, y=yield_training, method="earth", preProc=preProc_Arguments, tuneGrid=marsGrid) marsPred = predict(marsModel, newdata=processPredictors_training) marsPR = postResample(pred=marsPred, obs=yield_training) rmses_training = c(rmses_training,marsPR[1]) r2s_training = c(r2s_training,marsPR[2]) methods = c(methods,"MARS") marsPred = predict(marsModel, newdata=processPredictors_testing) marsPR = postResample(pred=marsPred, obs=yield_testing) rmses_testing = c(rmses_testing,marsPR[1]) r2s_testing = c(r2s_testing,marsPR[2]) # Lets see what variables are most important in the MARS model: varImp(marsModel) # A Support Vector Machine (SVM): # set.seed(0) svmModel = train(x=processPredictors_training, y=yield_training, method="svmRadial", preProc=preProc_Arguments, tuneLength=20) svmPred = predict(svmModel, newdata=processPredictors_training) svmPR = postResample(pred=svmPred, obs=yield_training) rmses_training = c(rmses_training,svmPR[1]) r2s_training = c(r2s_training,svmPR[2]) methods = c(methods,"SVM") svmPred = predict(svmModel, newdata=processPredictors_testing) svmPR = postResample(pred=svmPred, obs=yield_testing) rmses_testing = c(rmses_testing,svmPR[1]) r2s_testing = c(r2s_testing,svmPR[2]) # Package the results up: # res_training = data.frame( rmse=rmses_training, r2=r2s_training ) rownames(res_training) = methods training_order = order( -res_training$rmse ) res_training = res_training[ training_order, ] # Order the dataframe so that the best results are at the bottom: print( "Final Training Results" ) print( res_training ) res_testing = data.frame( rmse=rmses_testing, r2=r2s_testing ) rownames(res_testing) = methods res_testing = res_testing[ training_order, ] # Order the dataframe so that the best results for the training set are at the bottom: print( "Final Testing Results" ) print( res_testing ) # EPage 82 resamp = resamples( list(knn=knnModel,svm=svmModel,mars=marsModel,nnet=nnetModel,avnnet=avNNetModel) ) print( summary(resamp) ) if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter7/chap_7_prob_5_resamp_dotplot.eps", onefile=FALSE, horizontal=FALSE) } dotplot( resamp, metric="RMSE" ) if( save_plots ){ dev.off() } print( summary(diff(resamp)) ) # Part (b): the variable importance # varImp(svmModel) # Part (c): Explore yield output as we vary the most important predictors of the SVM model: # # We pick a predictor and plot how the responce varies as a function of this value # p_range = range( processPredictors$ManufacturingProcess32 ) variation = seq( from=p_range[1], to=p_range[2], length.out=100 ) mean_predictor_values = apply( processPredictors, 2, mean ) # build a dataframe with variation in only one dimension (for this part we pick ManufacturingProcess32) if( !require(pracma) ){ install.packages('pracma') # needed for repmat library(pracma) } newdata = repmat( as.double(mean_predictor_values), length(variation), 1 ) newdata = data.frame( newdata ) colnames( newdata ) = colnames( processPredictors ) newdata$ManufacturingProcess32 = variation xs = variation y_hat = predict( svmModel, newdata=as.matrix(newdata) ) if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter7/chap_7_prob_5_variation_of_ManufacturingProcess32.eps", onefile=FALSE, horizontal=FALSE) } plot( xs, y_hat, xlab='variation in ManufacturingProcess32', ylab='predicted yield' ) grid() if( save_plots ){ dev.off() }