# # Multiple training routines EPage 82 # Problem EPage 131 # # Written by: # -- # John L. Weatherwax 2009-04-21 # # email: wax@alum.mit.edu # # Please send comments and especially bug reports to the # above email address. # #----- save_plots = T library(caret) library(AppliedPredictiveModeling) set.seed(0) # Part (a): # data(ChemicalManufacturingProcess) processPredictors = ChemicalManufacturingProcess[,2:58] yield = ChemicalManufacturingProcess[,1] n_samples = dim(processPredictors)[1] n_features = dim(processPredictors)[2] # Part (b): Fill in missing values where we have NAs with the median over the non-NA values: # replacements = sapply( processPredictors, median, na.rm=TRUE ) for( ci in 1:n_features ){ bad_inds = is.na( processPredictors[,ci] ) processPredictors[bad_inds,ci] = replacements[ci] } # Look for any features with no variance: # zero_cols = nearZeroVar( processPredictors ) print( sprintf("Found %d zero variance columns from %d",length(zero_cols), dim(processPredictors)[2] ) ) processPredictors = processPredictors[,-zero_cols] # drop these zero variance columns # Part (c): Split this data into training and testing sets: # training = createDataPartition( yield, p=0.8 ) processPredictors_training = processPredictors[training$Resample1,] yield_training = yield[training$Resample1] processPredictors_testing = processPredictors[-training$Resample1,] yield_testing = yield[-training$Resample1] # Build some linear models and predict the performance on the testing data set: # set.seed(0) pls_model = train( processPredictors_training, yield_training, method="pls", # the default tuning grid evaluates components 1 ... tuneLength tuneLength=40, preProcess=c("center","scale"), trControl=trainControl(method="repeatedcv",repeats=5) ) y_hat = predict( pls_model, newdata=processPredictors_testing ) r2_pls = cor(y_hat,yield_testing,method="pearson")^2 rmse_pls = sqrt( mean( (y_hat-yield_testing)^2 ) ) print( sprintf( "%-10s: Testing R^2= %10.6f; RMSE= %10.6f", "PLS", r2_pls, rmse_pls ) ) # Lets try an Elastic net (this seems to have performed well in past problems) and some other models: # enetGrid = expand.grid(.lambda=seq(0,1,length=20), .fraction=seq(0.05, 1.0, length=20)) set.seed(0) enet_model = train( processPredictors_training, yield_training, method="enet", # fit the model over many penalty values tuneGrid = enetGrid, preProcess=c("center","scale"), trControl=trainControl(method="repeatedcv",repeats=5) ) y_hat = predict( enet_model, newdata=processPredictors_testing ) r2_enet = cor(y_hat,yield_testing,method="pearson")^2 rmse_enet = sqrt( mean( (y_hat-yield_testing)^2 ) ) print( sprintf( "%-10s: Testing R^2= %10.6f; RMSE= %10.6f", "ENET", r2_enet, rmse_enet ) ) set.seed(0) lm_model = train( processPredictors_training, yield_training, method="lm", preProcess=c("center","scale"), trControl=trainControl(method="repeatedcv",repeats=5) ) y_hat = predict( lm_model, newdata=processPredictors_testing ) r2_lm = cor(y_hat,yield_testing,method="pearson")^2 rmse_lm = sqrt( mean( (y_hat-yield_testing)^2 ) ) print( sprintf( "%-10s: Testing R^2= %10.6f; RMSE= %10.6f", "LM", r2_lm, rmse_lm ) ) # For rlm we cannot have a singular predictor covariance matrix thus we preprocess with PCA: # set.seed(0) rlm_model = train( processPredictors_training, yield_training, method="rlm", preProcess=c("pca"), trControl=trainControl(method="repeatedcv",repeats=5) ) y_hat = predict( rlm_model, newdata=processPredictors_testing ) r2_rlm = cor(y_hat,yield_testing,method="pearson")^2 rmse_rlm = sqrt( mean( (y_hat-yield_testing)^2 ) ) print( sprintf( "%-10s: Testing R^2= %10.6f; RMSE= %10.6f", "RLM", r2_rlm, rmse_rlm ) ) # Compare the given models using resamples # resamp = resamples( list(pls=pls_model,enet=enet_model,lm=lm_model,rlm=rlm_model) ) # examples of using this are on EPage 82 print( summary(resamp) ) if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter6/chap_6_prob_3_resamp_dotplot.eps", onefile=FALSE, horizontal=FALSE) } dotplot( resamp, metric="RMSE" ) if( save_plots ){ dev.off() } print( summary(diff(resamp)) ) # Part (e): Lets look at the coefficients choosen selected by the optimal model: # enet_base_model = enet( x=as.matrix(processPredictors_training), y=yield_training, lambda=0.5263158, normalize=TRUE ) enet_coefficients = predict( enet_base_model, newx=as.matrix(processPredictors_testing), s=0.35, mode="fraction", type="coefficients" ) non_zero = enet_coefficients$coefficients != 0 enet_coefficients$coefficients[ non_zero ] # Part (f): Explore the relationships between the top predictors and the response: # # Pick a predictor and plot how the responce varies as a function of this value # p_range = range( processPredictors$ManufacturingProcess32 ) variation = seq( from=p_range[1], to=p_range[2], length.out=100 ) mean_predictor_values = apply( processPredictors, 2, mean ) # build a dataframe with variation in only one dimension (for this part we pick ManufacturingProcess32) if( !require(pracma) ){ install.packages('pracma') # needed for repmat library(pracma) } newdata = repmat( as.double(mean_predictor_values), length(variation), 1 ) newdata = data.frame( newdata ) colnames( newdata ) = colnames( processPredictors ) newdata$ManufacturingProcess32 = variation xs = variation y_hat = predict( enet_base_model, newx=as.matrix(newdata), s=0.35, mode="fraction", type="fit" ) if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter6/chap_6_prob_3_variation_of_ManufacturingProcess32.eps", onefile=FALSE, horizontal=FALSE) } plot( xs, y_hat$fit, xlab='variation in ManufacturingProcess32', ylab='predicted yield' ) grid() if( save_plots ){ dev.off() }