#
# Problem EPage 210
#
# Written by:
# --
# John L. Weatherwax 2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
#-----

save_plots = F

library(caret)
library(AppliedPredictiveModeling)
library(rpart)

set.seed(0)

data(permeability)

# Part (b):
#
zero_cols = nearZeroVar( fingerprints )
print( sprintf("Found %d near-zero variance columns from %d", length(zero_cols), dim(fingerprints)[2]) )
fingerprints = fingerprints[,-zero_cols] # drop these near-zero variance columns

# Split this data into training and testing sets:
#
training = createDataPartition( permeability, p=0.8 )

fingerprints_training = fingerprints[training$Resample1,]
permeability_training = permeability[training$Resample1]

fingerprints_testing = fingerprints[-training$Resample1,]
permeability_testing = permeability[-training$Resample1]

# Build various tree-based models and then compare performance:
#
# Note: we use the default "trainControl" bootstrap evaluations for each of the models below.
#
preProc_Arguments = c("center","scale")
#preProc_Arguments = c("pca")

# An rpart model:
#
set.seed(0)
rpartModel = train(x=fingerprints_training, y=permeability_training, method="rpart", preProc=preProc_Arguments, tuneLength=10)

# Predict on the training/testing sets:
rpartPred = predict(rpartModel, newdata=fingerprints_training)
rpartPR = postResample(pred=rpartPred, obs=permeability_training)
rmses_training = c(rpartPR[1])
r2s_training = c(rpartPR[2])
methods = c("RPART")

rpartPred = predict(rpartModel, newdata=fingerprints_testing)
rpartPR = postResample(pred=rpartPred, obs=permeability_testing)
rmses_testing = c(rpartPR[1])
r2s_testing = c(rpartPR[2])

# A random forest model:
#
set.seed(0)
rfModel = train(x=fingerprints_training, y=permeability_training, method="rf", preProc=preProc_Arguments)

rfPred = predict(rfModel, newdata=fingerprints_training)
rfPR = postResample(pred=rfPred, obs=permeability_training)
rmses_training = c(rmses_training,rfPR[1])
r2s_training = c(r2s_training,rfPR[2])
methods = c(methods,"RF")

rfPred = predict(rfModel, newdata=fingerprints_testing)
rfPR = postResample(pred=rfPred, obs=permeability_testing)
rmses_testing = c(rmses_testing,rfPR[1])
r2s_testing = c(r2s_testing,rfPR[2])

# A gradient boosting machine (GBM):
#
# Note: more recent versions of caret also require .n.minobsinnode in the gbm tuning
# grid; we use the gbm default of 10.
gbmGrid = expand.grid( .interaction.depth = seq( 1, 7, by=2 ),
                       .n.trees = seq( 100, 1000, by=100 ),
                       .shrinkage = c(0.01, 0.1),
                       .n.minobsinnode = 10 )
set.seed(0)
gbmModel = train(x=fingerprints_training, y=permeability_training, method="gbm", preProc=preProc_Arguments, tuneGrid=gbmGrid, verbose=FALSE)

gbmPred = predict(gbmModel, newdata=fingerprints_training)
gbmPR = postResample(pred=gbmPred, obs=permeability_training)
rmses_training = c(rmses_training,gbmPR[1])
r2s_training = c(r2s_training,gbmPR[2])
methods = c(methods,"GBM")

gbmPred = predict(gbmModel, newdata=fingerprints_testing)
gbmPR = postResample(pred=gbmPred, obs=permeability_testing)
rmses_testing = c(rmses_testing,gbmPR[1])
r2s_testing = c(r2s_testing,gbmPR[2])

# Let's see what variables are most important in the GBM model:
varImp(gbmModel)

# A Cubist model:
#
set.seed(0)
cubistModel = train(x=fingerprints_training, y=permeability_training, method="cubist", preProc=preProc_Arguments, tuneLength=20)

cubistPred = predict(cubistModel, newdata=fingerprints_training)
cubistPR = postResample(pred=cubistPred, obs=permeability_training)
rmses_training = c(rmses_training,cubistPR[1])
r2s_training = c(r2s_training,cubistPR[2])
methods = c(methods,"CUBIST")

cubistPred = predict(cubistModel, newdata=fingerprints_testing)
cubistPR = postResample(pred=cubistPred, obs=permeability_testing)
rmses_testing = c(rmses_testing,cubistPR[1])
r2s_testing = c(r2s_testing,cubistPR[2])

# Package the results up:
#
res_training = data.frame( rmse=rmses_training, r2=r2s_training )
rownames(res_training) = methods

# Order the dataframe so that the best results are at the bottom:
training_order = order( -res_training$rmse )
res_training = res_training[ training_order, ]
print( "Final Training Results" )
print( res_training )

res_testing = data.frame( rmse=rmses_testing, r2=r2s_testing )
rownames(res_testing) = methods

# Order the dataframe so that the best results for the training set are at the bottom:
res_testing = res_testing[ training_order, ]
print( "Final Testing Results" )
print( res_testing )

# EPage 82
resamp = resamples( list(rpart=rpartModel, cubist=cubistModel, gbm=gbmModel, rf=rfModel) )
print( summary(resamp) )

if( save_plots ){
  postscript("../../WriteUp/Graphics/Chapter8/chap_8_prob_6_resamp_dotplot.eps", onefile=FALSE, horizontal=FALSE)
}
dotplot( resamp, metric="RMSE" )
if( save_plots ){ dev.off() }

print( summary(diff(resamp)) )
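
# Not part of the original solution: a minimal sketch of an observed-versus-predicted plot
# for the Cubist model on the held-out test set (at this point cubistPred holds the test
# set predictions).  The output filename is hypothetical and simply mirrors the naming
# convention of the dotplot above.
if( save_plots ){
  postscript("../../WriteUp/Graphics/Chapter8/chap_8_prob_6_cubist_obs_vs_pred.eps", onefile=FALSE, horizontal=FALSE)
}
plot( cubistPred, permeability_testing, xlab="predicted permeability", ylab="observed permeability" )
abline( a=0, b=1, col="red" ) # predictions from a perfect model would fall on this line
if( save_plots ){ dev.off() }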