library(pROC) build_AUC_linear_models = function(X, y, seed_value=456, build_pls_model=TRUE){ # # Builds (using caret's train function) several of the linear models discussed in # this chapter of the book optimizing the area-under-the-curve metric (AUC) # # X [n samples, n features] needs to have zero variance columns removed (use nearZeroVar to remove them) and # linear dependent columns removed (use findLinearCombos to remove them) # # y [n samples, 1] is a two factor vector of predictions with the FIRST factor corresponding to the event of interest # # Written by: # -- # John L. Weatherwax 2009-04-21 # # email: wax@alum.mit.edu # # Please send comments and especially bug reports to the # above email address. # #----- # Set up the train control arguments so that we can compute the area-under-the-curve: # ctrl = trainControl( summaryFunction=twoClassSummary, classProbs=TRUE ) # Logistic Regression Model: # set.seed(seed_value) glm.classifier = train( X, y, method="glm", metric="ROC", trControl=ctrl ) glm.predictions = predict( glm.classifier, X, type="prob" ) glm.rocCurve = pROC::roc( response=y, predictor=glm.predictions[,1] ) glm.auc = glm.rocCurve$auc[1] glm=list( classifier=glm.classifier, predictions=glm.predictions, roc=glm.rocCurve, auc=glm.auc ) # Linear Discriminant Analysis: # set.seed(seed_value) lda.classifier = train( X, y, method="lda", preProc=c("center","scale"), metric="ROC", trControl=ctrl ) lda.predictions = predict( lda.classifier, X, type="prob" ) # <~ returns probability of "Yes" (event of interest) & "No" lda.rocCurve = pROC::roc( response=y, predictor=lda.predictions[,1] ) lda.auc = lda.rocCurve$auc[1] lda=list( classifier=lda.classifier, predictions=lda.predictions, roc=lda.rocCurve, auc=lda.auc ) # Partial Least Squares Discriminant Analysis (this can take a very long time on the churn data set): # if( build_pls_model ){ set.seed(seed_value) plsda.classifier = train( X, y, method="pls", tuneGrid=expand.grid(.ncomp=1:10), preProc=c("center","scale"), metric="ROC", trControl=ctrl ) plsda.predictions = predict( plsda.classifier, X, type="prob" ) plsda.rocCurve = pROC::roc( response=y, predictor=plsda.predictions[,1] ) plsda.auc = plsda.rocCurve$auc[1] plsda=list( classifier=plsda.classifier, predictions=plsda.predictions, roc=plsda.rocCurve, auc=plsda.auc ) } # Penalized Methods: # glmnGrid = expand.grid(.alpha=c(0, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0), .lambda=seq( 0.01, 0.2, length=40)) set.seed(seed_value) glmnet.classifier = train( X, y, method="glmnet", tuneGrid=glmnGrid, preProc=c("center","scale"), metric="ROC", trControl=ctrl ) glmnet.predictions = predict( glmnet.classifier, X, type="prob" ) glmnet.rocCurve = pROC::roc( response=y, predictor=glmnet.predictions[,1] ) glmnet.auc = glmnet.rocCurve$auc[1] glmnet=list( classifier=glmnet.classifier, predictions=glmnet.predictions, roc=glmnet.rocCurve, auc=glmnet.auc ) # Nearest shrunken Centroids: # nscGrid = expand.grid(.threshold=0:25) set.seed(seed_value) nsc.classifier = train( X, y, method="pam", tuneGrid=nscGrid, preProc=c("center","scale"), metric="ROC", trControl=ctrl ) nsc.predictions = predict( nsc.classifier, X, type="prob" ) nsc.rocCurve = pROC::roc( response=y, predictor=nsc.predictions[,1] ) nsc.auc = nsc.rocCurve$auc[1] nsc=list( classifier=nsc.classifier, predictions=nsc.predictions, roc=nsc.rocCurve, auc=nsc.auc ) result = list( glm=glm, lda=lda, glmnet=glmnet, nsc=nsc ) if( build_pls_model ){ result = c(result, list(plsda=plsda)) } return( result ) }