# # Written by: # -- # John L. Weatherwax 2009-04-21 # # email: wax@alum.mit.edu # # Please send comments and especially bug reports to the # above email address. # # EPage 311 # #----- save_plots = F if( ! require("caret") ){ install.packages("caret") } if( ! require("AppliedPredictiveModeling") ){ install.packages("AppliedPredictiveModeling") } # needed for the hepatic dataset if( ! require("pROC") ){ install.packages("pROC") } source('build_AUC_linear_models.R') data(hepatic) table(injury) # Lump all compounds that cause injury into a "Yes" category: # any_damage = as.character( injury ) any_damage[ any_damage=="Mild" ] = "Yes" any_damage[ any_damage=="Severe" ] = "Yes" any_damage[ any_damage=="None" ] = "No" table( any_damage ) # Here we just verify that using the function "createDataPartition" does not # change the proportion of Yes and No's from the population proportions. # This is especially important when one's problem has non equal class proportions and we # don't use the "t" variable in what follows below. The function we do use (carets "train" function) # calls createDataPartition as a subprocess. # t = createDataPartition( any_damage, p=0.8, list=FALSE, times=1 ) table( any_damage ) / sum( table( any_damage ) ) # the population proportions table( any_damage[t] ) / sum( table( any_damage[t] ) ) # the proprtions from one resampling t = createDataPartition( any_damage, p=0.8, list=FALSE, times=10 ) # check again with times=10 table( any_damage[t[,1]] ) / sum( table( any_damage[t[,1]] ) ) # Convert our response to a factor (make the first factor correspond to the event of interest): # any_damage = factor( any_damage, levels=c("Yes","No") ) # Part c: # #------------------------------------------------------------------------ # Use the biological predictors: #------------------------------------------------------------------------ zv_cols = nearZeroVar(bio) print( sprintf("Dropping %d zero variance columns from %d (fraction=%10.6f)", length(zv_cols), dim(bio)[2], length(zv_cols)/dim(bio)[2]) ); X = bio[,-zv_cols] # There are no linearly dependent columns remaining (or to start with) print( findLinearCombos(X) ) # Build linear models with this data: # bio_linear_models = build_AUC_linear_models( X, any_damage ) # Present the sampled ROC estimate for each model: # df = rbind( data.frame(name="LR", auc=bio_linear_models$glm$auc), data.frame(name="LDA", auc=bio_linear_models$lda$auc), data.frame(name="PLSDA", auc=bio_linear_models$plsda$auc), data.frame(name="GLMNET", auc=bio_linear_models$glmnet$auc), data.frame(name="NSC", auc=bio_linear_models$nsc$auc) ) # Order our dataframe by performance: # df = df[ with( df, order(auc) ), ] print( "AUC Performance using biological predictors" ) print( df ) # For the best model (logistic regression) what are the most important predictors: # varImp(bio_linear_models$glm$classifier) #------------------------------------------------------------------------ # Use the chemical predictors: #------------------------------------------------------------------------ # zv_cols = nearZeroVar(chem) print( sprintf("Dropping %d zero variance columns from %d (fraction=%10.6f)", length(zv_cols), dim(chem)[2], length(zv_cols)/dim(chem)[2]) ); X = chem[,-zv_cols] # Remove the linearly dependent columns fLC = findLinearCombos(X) X = X[,-fLC$remove] # Build linear models with this data: # chem_linear_models = build_AUC_linear_models( X, any_damage ) # Present the sampled ROC estimate for each model: # df = rbind( data.frame(name="LR", auc=chem_linear_models$glm$auc), data.frame(name="LDA", auc=chem_linear_models$lda$auc), data.frame(name="PLSDA", auc=chem_linear_models$plsda$auc), data.frame(name="GLMNET", auc=chem_linear_models$glmnet$auc), data.frame(name="NSC", auc=chem_linear_models$nsc$auc) ) # Order our dataframe by performance: # df = df[ with( df, order(auc) ), ] print( "AUC Performance using chemical predictors" ) print( df ) # For the best model (logistic regression again) what are the most important predictors: # varImp(chem_linear_models$glm$classifier) # Plot the best two logistic regression rocCurves: # plot( bio_linear_models$glm$roc, legacy.axes=T, add=F, col="gray" ) plot( chem_linear_models$glm$roc, legacy.axes=T, add=T ) legend( 0.6, 0.2, c("LR (biological)","LR (chemical)"), col=c("gray","black"), lty=c(1,1) ) #------------------------------------------------------------------------ # Use ALL predictors into one set of predictors: #------------------------------------------------------------------------ # all = cbind( bio, chem ) zv_cols = nearZeroVar(all) print( sprintf("Dropping %d zero variance columns from %d (fraction=%10.6f)", length(zv_cols), dim(all)[2], length(zv_cols)/dim(all)[2]) ); X = all[,-zv_cols] # There are no linearly dependent columns remaining (or to start with) fLC = findLinearCombos(X) X = X[,-fLC$remove] print( sprintf("Dropping %d columns due to linear combinations %d (fraction=%10.6f)", length(fLC$remove), dim(X)[2], length(fLC$remove)/dim(X)[2]) ); # Build linear models with this data: # all_linear_models = build_AUC_linear_models( X, any_damage ) # Present the sampled ROC estimate for each model: # df = rbind( data.frame(name="LR", auc=all_linear_models$glm$auc), data.frame(name="LDA", auc=all_linear_models$lda$auc), data.frame(name="PLSDA", auc=all_linear_models$plsda$auc), data.frame(name="GLMNET", auc=all_linear_models$glmnet$auc), data.frame(name="NSC", auc=all_linear_models$nsc$auc) ) # Order our dataframe by performance: # df = df[ with( df, order(auc) ), ] print( "AUC Performance using all predictors" ) print( df ) # Report the top five predictors (for the LDA model): # varImp(all_linear_models$lda$classifier) # Plot the logistic regression rocCurves using the bio and chem predictors and the best roc curve using both bio and chem predictors: # if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter12/chap_12_prob_1_ROC_curves.eps", onefile=FALSE, horizontal=FALSE) } plot( bio_linear_models$glm$roc, legacy.axes=T, add=F, col="gray", lty=2, ylim=c(0,1.05) ) plot( chem_linear_models$glm$roc, legacy.axes=T, add=T, col="darkgray" ) plot( all_linear_models$lda$roc, legacy.axes=T, add=T ) legend( 0.6, 0.2, c("LR (biological)","LR (chemical)", "LDA (all)"), col=c("gray","darkgray","black"), lty=c(2,1,1) ) if( save_plots ){ dev.off() }