#
# Written by:
# --
# John L. Weatherwax                2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 311
#
#-----

# Set to TRUE to write the figures to .eps files instead of the active device.
save_plots <- FALSE

library(caret)
library(AppliedPredictiveModeling)
library(pROC)
library(C50) # needed for the churn dataset

source('build_AUC_linear_models.R')

data(churn) # loads churnTrain & churnTest
str(churnTrain)
table(churnTrain$churn)

# Drop all factor predictors (the same columns are removed from both the
# training and the testing sets so that the two stay aligned):
#
factor_cols <- c(1, 3, 4, 5)
churnTrain <- churnTrain[, -factor_cols]
churnTest <- churnTest[, -factor_cols]

# Build various linear models:
#
# The response is selected by name rather than by position so that this does
# not silently break if the column layout changes.
X <- churnTrain[, setdiff(names(churnTrain), "churn"), drop = FALSE]
y <- churnTrain[["churn"]]

# Look for (and drop) zero variance columns.  The guard matters: when
# nearZeroVar() finds nothing it returns integer(0), and X[, -integer(0)]
# would select *no* columns rather than all of them.
zv_cols <- nearZeroVar(X)
if (length(zv_cols) > 0) {
  X <- X[, -zv_cols, drop = FALSE]
}

# Check for linearly dependent columns among the remaining predictors:
print(findLinearCombos(X))

# Get a high level view of which predictors might be most predictive:
#
y_numeric <- rep(+1, length(y)) # let +1 represent a churn ("yes") ...
y_numeric[y == "no"] <- 0       # ... and 0 represent a non churn ("no")

cor_with_response <- cor(cbind(X, y_numeric))

# Pull out the column of correlations with the response by name (the
# position of y_numeric depends on how many predictors survived above).
response_cor <- cor_with_response[, "y_numeric"]
predictor_names <- names(response_cor)
cor_values <- as.double(response_cor)

dfv <- data.frame(cor_values)
rownames(dfv) <- predictor_names
colnames(dfv) <- c("correlation")

# Weakest to strongest absolute correlation (the last row is y_numeric
# correlated with itself):
print(dfv[order(abs(dfv$correlation)), , drop = FALSE])

# Part b (build some linear models ... running PLS takes too much time and is
# not run):
#
linear_models <- build_AUC_linear_models(X, y, build_pls_model = FALSE)

# Present the sampled ROC AUC estimate for each model:
#
df <- rbind(
  data.frame(name = "LR", auc = linear_models$glm$auc),
  data.frame(name = "LDA", auc = linear_models$lda$auc),
  data.frame(name = "GLMNET", auc = linear_models$glmnet$auc),
  data.frame(name = "NSC", auc = linear_models$nsc$auc)
)

# Order our dataframe by performance (ascending AUC):
#
df <- df[with(df, order(auc)), ]
print("AUC Performance")
print(df)

# For the best model (LDA) what are the most important predictors:
#
varImp(linear_models$lda$classifier)

# Plot the best rocCurve:
#
if (save_plots) {
  postscript("../../WriteUp/Graphics/Chapter12/chap_12_prob_3_ROC_curves.eps", onefile = FALSE, horizontal = FALSE)
}
plot(linear_models$lda$roc, legacy.axes = TRUE, add = FALSE, col = "black")
if (save_plots) {
  dev.off()
}

# Plot the lift curve for this data:
#
# NOTE(review): `y` in the formula is resolved from the calling environment
# unless `predictions` itself carries a column named `y` -- confirm that the
# rows of `predictions` line up with `y`.
lp <- lift(y ~ yes, data = linear_models$lda$predictions, class = "yes")

if (save_plots) {
  postscript("../../WriteUp/Graphics/Chapter12/chap_12_prob_3_lift_plot.eps", onefile = FALSE, horizontal = FALSE)
}
plot(lp)
if (save_plots) {
  dev.off()
}