#
# Written by:
# --
# John L. Weatherwax 2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 351
#
#-----

save_plots <- FALSE

library(caret)
library(AppliedPredictiveModeling) # needed for the hepatic dataset
library(pROC)

source('build_AUC_nonlinear_models.R')

data(hepatic)

# Use the same pre-processing that we did for Chapter 12 Problem 1:
#
# Lump all compounds that cause injury into a "Yes" category:
#
any_damage <- as.character(injury)
any_damage[any_damage == "Mild"] <- "Yes"
any_damage[any_damage == "Severe"] <- "Yes"
any_damage[any_damage == "None"] <- "No"

# Convert our response to a factor (make the first factor level correspond to
# the event of interest, as caret/pROC treat the first level as the "event"):
#
any_damage <- factor(any_damage, levels = c("Yes", "No"))

# Part a:
#
#------------------------------------------------------------------------
# Use the biological predictors:
#------------------------------------------------------------------------
#
zv_cols <- nearZeroVar(bio)
print(sprintf("Dropping %d zero variance columns from %d (fraction=%10.6f)",
              length(zv_cols), dim(bio)[2], length(zv_cols) / dim(bio)[2]))
# Guard the subset: bio[, -integer(0)] would drop EVERY column, so only
# index when nearZeroVar actually flagged something.
if (length(zv_cols) > 0) {
  X <- bio[, -zv_cols]
} else {
  X <- bio
}

# There are no linearly dependent columns remaining (or to start with):
print(findLinearCombos(X))

# Build nonlinear models with this data:
#
bio_nonlinear_models <- build_AUC_nonlinear_models(X, any_damage)

# Present the sampled ROC estimate (AUC) for each model:
#
df <- rbind(
  data.frame(name = "MDA",  auc = bio_nonlinear_models$mda$auc),
  data.frame(name = "NNET", auc = bio_nonlinear_models$nnet$auc),
  data.frame(name = "SVM",  auc = bio_nonlinear_models$svm$auc),
  data.frame(name = "KNN",  auc = bio_nonlinear_models$knn$auc),
  data.frame(name = "NB",   auc = bio_nonlinear_models$nb$auc)
)

# Order our dataframe by performance (worst to best AUC):
#
df <- df[order(df$auc), ]
print("AUC Performance using biological predictors")
print(df)

# For the best model (Mixture Discriminant Analysis) what are the most important predictors:
#
varImp(bio_nonlinear_models$mda$classifier)

#------------------------------------------------------------------------
# Use the chemical predictors:
#------------------------------------------------------------------------
#
zv_cols <- nearZeroVar(chem)
print(sprintf("Dropping %d zero variance columns from %d (fraction=%10.6f)",
              length(zv_cols), dim(chem)[2], length(zv_cols) / dim(chem)[2]))
# Guard the subset: chem[, -integer(0)] would drop EVERY column.
if (length(zv_cols) > 0) {
  X <- chem[, -zv_cols]
} else {
  X <- chem
}

# Remove the linearly dependent columns (if any — X[, -NULL] would
# otherwise empty the matrix when findLinearCombos finds nothing):
fLC <- findLinearCombos(X)
if (length(fLC$remove) > 0) {
  X <- X[, -fLC$remove]
}

# Build nonlinear models with this data:
#
chem_nonlinear_models <- build_AUC_nonlinear_models(X, any_damage)

# Present the sampled ROC estimate (AUC) for each model:
#
df <- rbind(
  data.frame(name = "MDA",  auc = chem_nonlinear_models$mda$auc),
  data.frame(name = "NNET", auc = chem_nonlinear_models$nnet$auc),
  data.frame(name = "SVM",  auc = chem_nonlinear_models$svm$auc),
  data.frame(name = "KNN",  auc = chem_nonlinear_models$knn$auc),
  data.frame(name = "NB",   auc = chem_nonlinear_models$nb$auc)
)

# Order our dataframe by performance (worst to best AUC):
#
df <- df[order(df$auc), ]
print("AUC Performance using chemical predictors")
print(df)

# For the best model (neural networks) what are the most important predictors:
#
varImp(chem_nonlinear_models$nnet$classifier)

# Plot the two best ROC curves so far (MDA on biological predictors,
# NNET on chemical predictors):
#
plot(bio_nonlinear_models$mda$roc, legacy.axes = TRUE, add = FALSE, col = "gray")
plot(chem_nonlinear_models$nnet$roc, legacy.axes = TRUE, add = TRUE)
legend(0.6, 0.2, c("MDA (biological)", "NNET (chemical)"),
       col = c("gray", "black"), lty = c(1, 1))

#------------------------------------------------------------------------
# Combine ALL predictors into one set of predictors:
#------------------------------------------------------------------------
#
# Named all_preds (not `all`) so we don't mask base::all.
all_preds <- cbind(bio, chem)
zv_cols <- nearZeroVar(all_preds)
print(sprintf("Dropping %d zero variance columns from %d (fraction=%10.6f)",
              length(zv_cols), dim(all_preds)[2], length(zv_cols) / dim(all_preds)[2]))
if (length(zv_cols) > 0) {
  X <- all_preds[, -zv_cols]
} else {
  X <- all_preds
}

# Remove any linearly dependent columns.  Report BEFORE dropping so the
# column count / fraction in the message use the pre-drop matrix (the
# original printed dim(X)[2] after the columns were already removed).
fLC <- findLinearCombos(X)
print(sprintf("Dropping %d columns due to linear combinations %d (fraction=%10.6f)",
              length(fLC$remove), dim(X)[2], length(fLC$remove) / dim(X)[2]))
if (length(fLC$remove) > 0) {
  X <- X[, -fLC$remove]
}

# Build nonlinear models with this data (MDA is skipped for the combined set):
#
all_nonlinear_models <- build_AUC_nonlinear_models(X, any_damage, build_mda_model = FALSE)

# Present the sampled ROC estimate (AUC) for each model:
#
df <- rbind(
  data.frame(name = "NNET", auc = all_nonlinear_models$nnet$auc),
  data.frame(name = "SVM",  auc = all_nonlinear_models$svm$auc),
  data.frame(name = "KNN",  auc = all_nonlinear_models$knn$auc),
  data.frame(name = "NB",   auc = all_nonlinear_models$nb$auc)
)

# Order our dataframe by performance (worst to best AUC):
#
df <- df[order(df$auc), ]
print("AUC Performance using all predictors")
print(df)

# Report the top five predictors:
#
varImp(all_nonlinear_models$knn$classifier)

# Plot the ROC curves using the bio and chem predictors alone, plus the
# best curve obtained using both bio and chem predictors together:
#
if (save_plots) {
  postscript("../../WriteUp/Graphics/Chapter13/chap_13_prob_1_ROC_curves.eps",
             onefile = FALSE, horizontal = FALSE)
}
plot(bio_nonlinear_models$mda$roc, legacy.axes = TRUE, add = FALSE,
     col = "gray", lty = 2, ylim = c(0, 1.05))
plot(chem_nonlinear_models$nnet$roc, legacy.axes = TRUE, add = TRUE, col = "black")
plot(all_nonlinear_models$svm$roc, legacy.axes = TRUE, add = TRUE, col = "darkgray")
legend(0.6, 0.2, c("MDA (biological)", "NNET (chemical)", "SVM (all)"),
       col = c("gray", "black", "darkgray"), lty = c(2, 1, 1))
if (save_plots) {
  dev.off()
}