#
# Written by:
# --
# John L. Weatherwax 2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 425-426
#
#-----
#
# http://www.dataminingconsultant.com/DMMM.htm
#
# Get the data from here:
#
# http://www.dataminingconsultant.com/data/Clothing_Store
#

save_plots = FALSE

library(caret)

DF = read.csv("../../Data/Clothing_Store.csv", header=TRUE)

# Drop any samples that don't have complete records:
DF = DF[complete.cases(DF),]

# Let's look at what "type" of variable each one is:
#
## The variables are:
##
## > colnames(DF)
##  [1] "HHKEY"        "ZIP_CODE"     "REC"          "FRE"          "MON"
##  [6] "CC_CARD"      "AVRG"         "PC_CALC20"    "PSWEATERS"    "PKNIT_TOPS"
## [11] "PKNIT_DRES"   "PBLOUSES"     "PJACKETS"     "PCAR_PNTS"    "PCAS_PNTS"
## [16] "PSHIRTS"      "PDRESSES"     "PSUITS"       "POUTERWEAR"   "PJEWELRY"
## [21] "PFASHION"     "PLEGWEAR"     "PCOLLSPND"    "AMSPEND"      "PSSPEND"
## [26] "CCSPEND"      "AXSPEND"      "TMONSPEND"    "OMONSPEND"    "SMONSPEND"
## [31] "PREVPD"       "GMP"          "PROMOS"       "DAYS"         "FREDAYS"
## [36] "MARKDOWN"     "CLASSES"      "COUPONS"      "STYLES"       "STORES"
## [41] "STORELOY"     "VALPHON"      "WEB"          "MAILED"       "RESPONDED"
## [46] "RESPONSERATE" "HI"           "LTFREDAY"     "CLUSTYPE"     "PERCRET"
## [51] "RESP"
##
## Only one variable is not numeric (VALPHON):
##
for( k in colnames(DF) ){
  cls = class( DF[,k] )
  if( (cls != "numeric") && (cls != "integer") ){
    print( k )
  }
}

# Recode VALPHON ("Y"/"N") as a numeric 0/1 indicator:
valphon = rep( 0.0, dim(DF)[1] )
valphon[DF$VALPHON=="Y"] = 1.0
DF$VALPHON = valphon

# Make sure that "true/yes" is the first level of the response factor:
inds_true = DF$RESP == 1
inds_false = DF$RESP == 0
DF$RESP[inds_true] = "Yes"
DF$RESP[inds_false] = "No"
DF$RESP = factor( DF$RESP, levels=c("Yes","No") )

# How imbalanced are our classes?
print( table( DF$RESP ) )
resp_table = table( DF$RESP )
print( resp_table / sum(resp_table) )

# Let's look if we have any near-zero-variance predictors among the features:
#
# This is a property of each feature itself.
#
zero_cols = nearZeroVar( DF[,-51] )
cn = colnames(DF)
print( sprintf("Dropping %5d features (due to zero variance) from %5d (%8.4f fraction)",
               length(zero_cols), length(cn), length(zero_cols)/length(cn)) )
print( "Features dropped would be:" )
print( cn[zero_cols] )

# Let's look at these features with a kernel density estimate (KDE):
plot( density( DF[,"PKNIT_DRES"] ) )
plot( density( DF[,"PJACKETS"] ) )

# There are no linearly dependent columns remaining (or to start with):
print( findLinearCombos(DF[,-51]) )

# Break our dataset into training/evaluation/testing datasets:
#
set.seed(156)
split1 = createDataPartition( DF$VALPHON, p=0.7 )[[1]]

# Simple verification that we have the same fraction of samples in each of these sets:
#
T_split1 = table( DF$VALPHON[split1] )
print( T_split1 / sum(T_split1) )

T_not_split1 = table( DF$VALPHON[-split1] )
print( T_not_split1 / sum(T_not_split1) )

other    = DF[-split1,]
training = DF[ split1,]

# Split "other" into an evaluation and a test set:
#
split2 = createDataPartition( other$VALPHON, p=1./3 )[[1]]
evaluation = other[ split2, ]
testing    = other[-split2, ]
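# The fraction checks above are on VALPHON; since RESP is the (imbalanced)
# response we ultimately model, it may also be worth confirming that its class
# fractions survive the three-way split.  A minimal sketch using only the data
# frames defined above (the helper name resp_fraction is ours):
resp_fraction = function(d){ tbl = table(d$RESP); tbl / sum(tbl) }
print( resp_fraction(training) )
print( resp_fraction(evaluation) )
print( resp_fraction(testing) )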
# Part (c): Build several classification models (we follow the book's examples from this chapter closely here):
#
# Wrapper functions for performance measures:
#
fiveStats = function(...) c(twoClassSummary(...), defaultSummary(...))

fourStats = function( data, lev=levels(data$obs), model=NULL ){
  accKapp = postResample( data[, "pred"], data[, "obs"] )
  out = c(accKapp,
          sensitivity(data[, "pred"], data[, "obs"], lev[1]),
          specificity(data[, "pred"], data[, "obs"], lev[2]))
  names(out)[3:4] = c("Sens", "Spec")
  out
}

ctrl = trainControl( method="cv", number=5, classProbs=TRUE, summaryFunction=fiveStats, verboseIter=TRUE )

ctrlNoProb = ctrl
ctrlNoProb$classProbs = FALSE
ctrlNoProb$summaryFunction = fourStats

# Random forest:
set.seed(1410)
rfFit = train( RESP ~ ., data=training, method="rf", trControl=ctrl, ntree=500, tuneLength=5, metric="ROC" )

# Logistic regression:
set.seed(1410)
lrFit = train( RESP ~ ., data=training, method="glm", trControl=ctrl, metric="ROC" )

# Flexible discriminant analysis (MARS basis):
set.seed(1410)
fdaFit = train( RESP ~ ., data=training, method="fda", tuneLength=10, trControl=ctrl, metric="ROC" )

# Compare the cross-validated AUC, specificity, and sensitivity for these methods on the training data:
#
res = matrix( data=c( mean( rfFit$resample$ROC ),  mean( rfFit$resample$Spec ),  mean( rfFit$resample$Sens ),
                      mean( lrFit$resample$ROC ),  mean( lrFit$resample$Spec ),  mean( lrFit$resample$Sens ),
                      mean( fdaFit$resample$ROC ), mean( fdaFit$resample$Spec ), mean( fdaFit$resample$Sens ) ),
              nrow=3, ncol=3, byrow=TRUE )
res = data.frame( res )
rownames(res) = c("RF", "LR", "FDA")
colnames(res) = c("AUC", "Spec", "Sens")
print(res)

# What are our default sensitivity and specificity for these methods on the evaluation set?
#
evalResults = data.frame( RESP = evaluation$RESP ) # put in the truth
evalResults$RF     = predict( rfFit,  newdata=evaluation, type="prob" )[,1]
evalResults$LogReg = predict( lrFit,  newdata=evaluation, type="prob" )[,1]
evalResults$FDA    = predict( fdaFit, newdata=evaluation, type="prob" )[,1]

# Part (c): Construct ROC and lift plots for the above classifiers
#
library(pROC)

rfROC     = roc( evalResults$RESP, evalResults$RF,     levels=rev( levels(evalResults$RESP) ) )
logRegROC = roc( evalResults$RESP, evalResults$LogReg, levels=rev( levels(evalResults$RESP) ) )
FDAROC    = roc( evalResults$RESP, evalResults$FDA,    levels=rev( levels(evalResults$RESP) ) )

if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter16/chap_16_prob_2_roc_curves.eps", onefile=FALSE, horizontal=FALSE) }

plot(rfROC, legacy.axes=TRUE, col="red")
plot(logRegROC, legacy.axes=TRUE, add=TRUE, col="blue")
plot(FDAROC, legacy.axes=TRUE, add=TRUE, col="green")
legend( x="bottomright",
        c(sprintf("RF (%f)", rfROC$auc), sprintf("LR (%f)", logRegROC$auc), sprintf("FDA (%f)", FDAROC$auc)),
        col=c("red","blue","green"), lty=c(1,1,1) )

if( save_plots ){ dev.off() }

# The lift plots:
#
if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter16/chap_16_prob_2_lift_curves.eps", onefile=FALSE, horizontal=FALSE) }

labs = c(RF = "Random Forest", LogReg = "Logistic Regression", FDA = "FDA (MARS)")
lift1 = lift( RESP ~ RF + LogReg + FDA, data=evalResults, labels=labs )
xyplot(lift1, ylab="%Events Found", xlab="%Customers Evaluated", lwd=2, type="l", col=c("red", "blue", "green"))
grid()

if( save_plots ){ dev.off() }

# Part (d): Using sampling methods on several models
#
# Up-sample the minority class so the training data is balanced:
set.seed(1103)
upSampleTraining = upSample( x=training[,-51], y=training$RESP, yname="RESP" )
print( table( upSampleTraining$RESP ) )
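# Part (d) asks about sampling methods in general; besides the up-sampling used
# above, caret also provides down-sampling of the majority class via
# downSample(), which has the same interface as upSample().  A minimal sketch
# (the name downSampleTraining is ours); the train() calls below could be rerun
# with data=downSampleTraining to compare the two sampling strategies:
set.seed(1103)
downSampleTraining = downSample( x=training[,-51], y=training$RESP, yname="RESP" )
print( table( downSampleTraining$RESP ) )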
"ROC" ) set.seed(1410) fdaFit_us = train( RESP ~ ., data=upSampleTraining, method="fda", tuneLength = 10, trControl = ctrl, metric = "ROC" ) # Compare our estimated sensitivity and specificity for these methods on the training data: # res = matrix( data=c( mean( rfFit_us$resample$ROC ), mean( rfFit_us$resample$Spec ), mean( rfFit_us$Eesample$Sens ), mean( lrFit_us$resample$ROC ), mean( lrFit_us$resample$Spec ), mean( lrFit_us$resample$Sens ), mean( fdaFit_us$resample$ROC ), mean( fdaFit_us$resample$Spec ), mean( fdaFit_us$resample$Sens ) ), nrow=3, ncol=3, byrow=T ) res = data.frame( res ) rownames(res) = c("RF", "LR", "FDA" ) colnames(res) = c("AUC", "Spec", "Sens" ) print(res) # What are our default sensitivity and specificity for these upsampled methods on the evaluation set? # evalResults = data.frame( RESP = evaluation$RESP ) # put in the truth evalResults$RF = predict( rfFit_us, newdata=evaluation, type="prob" )[,1] evalResultssLogReg = predict( lrFit_us, newdata=evaluation,type="prob" )[,1] evalResults$FDA = predict( fdaFit_us, newdata=evaluation, type="prob" )[,1] library(pROC) rfROC = roc( evalResults$RESP, evalResults$RF, levels=rev( levels(evalResults$RESP) ) ) logRegROC = roc( evalResults$RESP, evalResults$LogReg, levels=rev(levels(evalResults$RESP) ) ) FDAROC = roc( evalResults$RESP, evalResults$FDA, levels=rev( levels(evalResults$RESP) ) ) plot(rfROC, legacy.axes=TRUE, col="red") plot(logRegROC, legacy.axes=TRUE, add=T, col="blue") plot(FDAROC, legacy.axes=TRUE, add=T, col="green") legend( x="bottomright", c(sprintf("RF (%f)", rfROC$auc), sprintf("LR (%f)", logRegROC$auc), sprintf("FDA (%f)", FDAROC$auc)), col=c("red","blue", "green"), lty=c(1,1,1) ) baseline = coords( logRegROC, X=0.5, input="threshold" ) logRegThresh_ctopleft = coords( logRegROC, x="best", best.method="closest.topleft" ) logRegThresh_youden = coords( logRegROC, x="best", best.method="youden" ) print( rbind( baseline, logRegThresh_ctopleft, logRegThresh_youden ) ) # Lets create a lift plot with the random forest results: # if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter16/chap_16_prob_2_upsample_lift_curves.eps", onefile=FALSE, horizontal=FALSE) } labs = c(RF = "Random Forest", LogReg = "Logistic Regression", FDA = "FDA (MARS)") lift1 = lift( RESP ~ RF + LogReg + FDA, data=evalResults, labels=labs ) xyplot(lift1, ylab="%Events Found", xlab="%Customers Evaluated", lwd=2, type="l", col=c("red", "blue", "green")) grid() if( save_plots ){ dev.off() }