#
# Multiple training routines EPage 82
# Problem EPage 130
#
# Written by:
# --
# John L. Weatherwax                2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
#-----

# Set to TRUE to write the resampling dotplot to an EPS file instead of the
# active graphics device.
save_plots <- FALSE

library(caret)
library(AppliedPredictiveModeling)

set.seed(0)

# The code below uses both `permeability` (the response) and `fingerprints`
# (the predictors) after this call; presumably both are loaded by it.
data(permeability)

# Print and return the held-out performance of one model.
#
# label     : short model name shown in the printed line.
# predicted : predictions on the test set.
# observed  : true responses for the same test samples.
#
# Returns (invisibly) a named numeric vector with elements `r2` (squared
# Pearson correlation) and `rmse` (root mean squared error).
report_test_metrics <- function(label, predicted, observed) {
  r2 <- cor(predicted, observed, method = "pearson")^2
  rmse <- sqrt(mean((predicted - observed)^2))
  print(sprintf("%-10s: Testing R^2= %10.6f; RMSE= %10.6f", label, r2, rmse))
  invisible(c(r2 = r2, rmse = rmse))
}

# Part (b):
#
zero_cols <- nearZeroVar(fingerprints)
print(sprintf("Found %d zero variance columns from %d",
              length(zero_cols), dim(fingerprints)[2]))
# Drop the near-zero-variance columns. The length guard matters: indexing with
# an empty negative vector (`x[, -integer(0)]`) selects *no* columns rather
# than all of them. `drop = FALSE` keeps the result a matrix even if only one
# column survives.
if (length(zero_cols) > 0) {
  fingerprints <- fingerprints[, -zero_cols, drop = FALSE]
}

# Split this data into training and testing sets:
#
training <- createDataPartition(permeability, p = 0.8)
train_rows <- training$Resample1

fingerprints_training <- fingerprints[train_rows, ]
permeability_training <- permeability[train_rows]

fingerprints_testing <- fingerprints[-train_rows, ]
permeability_testing <- permeability[-train_rows]

# Every model below is tuned with the same resampling scheme. The seed is
# reset before each call to train() so that all models are fit with the same
# random-number state, as required for the resamples() comparison at the end.
cv_control <- trainControl(method = "repeatedcv", repeats = 5)

# Part (c): Build a PLSR model on this data:
#
set.seed(0)
pls_model <- train(
  fingerprints_training, permeability_training,
  method = "pls",
  # the default tuning grid evaluates components 1 ... tuneLength
  tuneLength = 40,
  preProcess = c("center", "scale"),
  trControl = cv_control
)

# Part (d): Predict performance using PLS
#
y_hat <- predict(pls_model, newdata = fingerprints_testing)
pls_metrics <- report_test_metrics("PLS", y_hat, permeability_testing)
r2_pls <- pls_metrics[["r2"]]
rmse_pls <- pls_metrics[["rmse"]]

# Part (e): Build models to predict permeability using other methods:
#
# Let's try an Elastic net (this seems to have performed well in past
# problems) and some other models:
#
enetGrid <- expand.grid(
  .lambda = seq(0, 1, length.out = 20),
  .fraction = seq(0.05, 1.0, length.out = 20)
)
set.seed(0)
enet_model <- train(
  fingerprints_training, permeability_training,
  method = "enet",
  # fit the model over many penalty values
  tuneGrid = enetGrid,
  preProcess = c("center", "scale"),
  trControl = cv_control
)

y_hat <- predict(enet_model, newdata = fingerprints_testing)
enet_metrics <- report_test_metrics("ENET", y_hat, permeability_testing)
r2_enet <- enet_metrics[["r2"]]
rmse_enet <- enet_metrics[["rmse"]]

set.seed(0)
lm_model <- train(
  fingerprints_training, permeability_training,
  method = "lm",
  preProcess = c("center", "scale"),
  trControl = cv_control
)

y_hat <- predict(lm_model, newdata = fingerprints_testing)
lm_metrics <- report_test_metrics("LM", y_hat, permeability_testing)
r2_lm <- lm_metrics[["r2"]]
rmse_lm <- lm_metrics[["rmse"]]

# For rlm we cannot have a singular predictor covariance matrix thus we
# preprocess with PCA:
#
set.seed(0)
rlm_model <- train(
  fingerprints_training, permeability_training,
  method = "rlm",
  preProcess = c("pca"),
  trControl = cv_control
)

y_hat <- predict(rlm_model, newdata = fingerprints_testing)
rlm_metrics <- report_test_metrics("RLM", y_hat, permeability_testing)
r2_rlm <- rlm_metrics[["r2"]]
rmse_rlm <- rlm_metrics[["rmse"]]

# Compare the given models using resamples
#
# examples of using this are on EPage 82
resamp <- resamples(
  list(pls = pls_model, enet = enet_model, lm = lm_model, rlm = rlm_model)
)
print(summary(resamp))

if (save_plots) {
  postscript(
    "../../WriteUp/Graphics/Chapter6/chap_6_prob_2_resamp_dotplot.eps",
    onefile = FALSE,
    horizontal = FALSE
  )
}
# Lattice plots are only drawn when printed; the explicit print() makes the
# plot appear (and the EPS file non-empty) when this script is run through
# source() and not just when it is auto-printed at the console.
print(dotplot(resamp, metric = "Rsquared"))
if (save_plots) {
  dev.off()
}

print(summary(diff(resamp)))