# # Problem EPage 210 # # Written by: # -- # John L. Weatherwax 2009-04-21 # # email: wax@alum.mit.edu # # Please send comments and especially bug reports to the # above email address. # #----- save_plots = F library(caret) library(AppliedPredictiveModeling) library(rpart) library(randomForest) library(Cubist) set.seed(0) data(solubility) # We have the following variables in this dataset: # ## solTrainX: training set predictors in their natural units. # ## solTrainXtrans: training set predictors after transformations for ## skewness and centering/scaling. # ## solTrainY: a vector of log10 solubility values for the training set. # ## solTestX: test set predictors in their natural units. # ## solTestXtrans: test set predictors after the same transformations used ## on the training set are applied. # ## solTestY: a vector of log10 solubility values for the training set. # # Make sure we don't access the unscaled variables by accident (we want to use the scaled variables): rm(solTrainX) rm(solTestX) # Use solTrainXtrans\$MolWeight as our scalar predictor: trainData = data.frame( x=solTrainXtrans\$MolWeight, y=solTrainY ) # Plot the predictor data vs. the solubility: # if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter8/chap_8_prob_4_predictor_vs_response.eps", onefile=FALSE, horizontal=FALSE) } plot( trainData\$x, trainData\$y, xlab='MolWeight', ylab='log10(solubility)' ) if( save_plots ){ dev.off() } # For a sanity check fit a linear model and look at the predictions it gives: # lmModel = lm( y ~ ., data=trainData ) lm_yhat = predict( lmModel, newdata=data.frame(x=solTestXtrans\$MolWeight) ) plot( solTestY, lm_yhat ) # Part (a): (fit a simple regression tree): # # defaults: for rpart.control are cp=0.01, maxdepth=30 rPartModel = rpart( y ~ ., data=trainData, method="anova", control=rpart.control(cp=0.01,maxdepth=30) ) # decreasing cp makes deeper trees; increasing maxdepth ###plotcp(rPartModel) # Plot the regression tree: # ###plot(rPartModel); text(rPartModel) # predict solubility with this regression tree: rPart_yHat = predict(rPartModel,newdata=data.frame(x=solTestXtrans\$MolWeight)) if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter8/chap_8_prob_4_rpart_predictions.eps", onefile=FALSE, horizontal=FALSE) } plot( solTestXtrans\$MolWeight, rPart_yHat, col='red', xlab='MolWeight', ylab='log10(solubility)', main='rpart test set predictions' ) lines( solTestXtrans\$MolWeight, solTestY, type='p' ) if( save_plots ){ dev.off() } # Part (b): (fit a randomforest): # rfModel = randomForest( y ~ ., data=trainData, ntree=500 ) # ntree=500, mtry=does not matter when we have a scalar feature # predict solubility: rf_yHat = predict(rfModel,newdata=data.frame(x=solTestXtrans\$MolWeight)) if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter8/chap_8_prob_4_rf_predictions.eps", onefile=FALSE, horizontal=FALSE) } plot( solTestXtrans\$MolWeight, rf_yHat, col='red', xlab='MolWeight', ylab='log10(solubility)', main='randomForest test set predictions' ) lines( solTestXtrans\$MolWeight, solTestY, type='p' ) if( save_plots ){ dev.off() } # Part (c): (fit different Cubist models): # cubistModel = cubist( data.frame( x=solTrainXtrans\$MolWeight ), solTrainY, committees=1 ) # committees=1 # predict solubility: cubist_yHat = predict(cubistModel,newdata=data.frame(x=solTestXtrans\$MolWeight)) if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter8/chap_8_prob_4_cubist_predictions.eps", onefile=FALSE, horizontal=FALSE) } plot( solTestXtrans\$MolWeight, cubist_yHat, col='red', xlab='MolWeight', ylab='log10(solubility)', main='cubist test set predictions' ) lines( solTestXtrans\$MolWeight, solTestY, type='p' ) if( save_plots ){ dev.off() }