#
# Written by:
# -- 
# John L. Weatherwax                2009-04-21
# 
# email: wax@alum.mit.edu
# 
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 425-426
#
#-----

#
# http://www.dataminingconsultant.com/DMMM.htm
#
# Get data from here:
#
# http://www.dataminingconsultant.com/data/Clothing_Store
#

save_plots = F

library(caret)

DF = read.csv("../../Data/Clothing_Store.csv", header=TRUE)

# Drop any samples that don't have complete records:
DF = DF[complete.cases(DF),]

# Lets look at what "type" of variable each one is:
#
## The variables are:
##
## > colnames(DF)
##  [1] "HHKEY"        "ZIP_CODE"     "REC"          "FRE"          "MON"         
##  [6] "CC_CARD"      "AVRG"         "PC_CALC20"    "PSWEATERS"    "PKNIT_TOPS"  
## [11] "PKNIT_DRES"   "PBLOUSES"     "PJACKETS"     "PCAR_PNTS"    "PCAS_PNTS"   
## [16] "PSHIRTS"      "PDRESSES"     "PSUITS"       "POUTERWEAR"   "PJEWELRY"    
## [21] "PFASHION"     "PLEGWEAR"     "PCOLLSPND"    "AMSPEND"      "PSSPEND"     
## [26] "CCSPEND"      "AXSPEND"      "TMONSPEND"    "OMONSPEND"    "SMONSPEND"   
## [31] "PREVPD"       "GMP"          "PROMOS"       "DAYS"         "FREDAYS"     
## [36] "MARKDOWN"     "CLASSES"      "COUPONS"      "STYLES"       "STORES"      
## [41] "STORELOY"     "VALPHON"      "WEB"          "MAILED"       "RESPONDED"   
## [46] "RESPONSERATE" "HI"           "LTFREDAY"     "CLUSTYPE"     "PERCRET"     
## [51] "RESP"        

## Only one variable is not numeric (VALPHON):
##
for( k in colnames(DF) ){
    c = class( DF[,k] )
    if( (c != "numeric") && (c != "integer") ){
        print( k )
    }
}

valphon = rep( 0.0, dim(DF)[1] )
valphon[DF$VALPHON=="Y"] = 1.0
DF$VALPHON = valphon

# Make sure that "true/yes" is the first level of the factor:
inds_true = DF$RESP == 1
inds_false = DF$RESP == 0
DF$RESP[inds_true] = "Yes"
DF$RESP[inds_false] = "No" 
DF$RESP = factor( DF$RESP, levels=c("Yes","No") )

# How imbalanced are our classes:
print( table( DF$RESP ) )
T = table( DF$RESP )
print( T / sum(T) )

# Lets look if we have any zero variance predictors in the features themselves:
#
# This is a property of each feature itself
#
zero_cols = nearZeroVar( DF[,-51] )
cn = colnames(DF)
print( sprintf("Dropping %5d features (due to zero variance) from %5d (%8.4f fraction)", length(zero_cols), length(cn), length(zero_cols)/length(cn)) )
print( "Features dropped would be:" )
print( cn[zero_cols] )

# Lets look at these features with KDE:
plot( density( DF[,"PKNIT_DRES"] ) )
plot( density( DF[,"PJACKETS"] ) )

# There are no linearly dependent columns remaining (or to start with)
print( findLinearCombos(DF[,-51]) )

# Break our dataset into a training/evaluation/testing dataset:
#
set.seed(156)
split1 = createDataPartition( DF$VALPHON, p=0.7 )[[1]]

# Simple verification that we have the same fraction of samples in each of these sets:
#
T_split1 = table( DF$VALPHON[split1] )
print( T_split1 / sum(T_split1) )

T_not_split1 = table( DF$VALPHON[-split1] )
print( T_not_split1 / sum(T_not_split1) )

other = DF[-split1,]
training = DF[ split1,]

# Split "other" into an evaluation and a test set:
#
split2 = createDataPartition( other$VALPHON, p=1./3 )[[1]]
evaluation = other[ split2, ]
testing = other[-split2, ]

# Part (c) Build several classification models (we follow the books examples from this chapter closely here):
#

# Wrapper functions for performance measures:
#
fiveStats = function(...) c(twoClassSummary(...), defaultSummary(...))
fourStats = function( data, lev=levels(data$obs), model=NULL ){
    accKapp = postResample( data[, "pred"], data[, "obs"] )
    out = c(accKapp,
        sensitivity(data[, "pred"], data[, "obs"], lev[1]),
        specificity(data[, "pred"], data[, "obs"], lev[2]))
    names(out)[3:4] = c("Sens", "Spec")
    out
}

ctrl = trainControl( method="cv", number=5, classProbs=TRUE, summaryFunction=fiveStats, verboseIter=TRUE )

ctrlNoProb = ctrl
ctrlNoProb$classProbs = FALSE
ctrlNoProb$summaryFunction = fourStats

set.seed(1410)
rfFit = train( RESP ~ ., data=training, method="rf", trControl=ctrl, ntree=500, tuneLength=5, metric="ROC" )

set.seed(1410)
lrFit = train( RESP ~ ., data=training, method="glm", trControl=ctrl, metric="ROC" )

set.seed(1410)
fdaFit = train( RESP ~ ., data=training, method="fda", tuneLength=10, trControl=ctrl, metric="ROC" )

# Compare our estimated sensitivity and specificity for these methods on the training data:
# 
res = matrix( data=c( mean( rfFit$resample$ROC ), mean( rfFit$resample$Spec ), mean( rfFit$resample$Sens ),
                      mean( lrFit$resample$ROC ), mean( lrFit$resample$Spec), mean( lrFit$resample$Sens ),
                      mean( fdaFit$resample$ROC ), mean( fdaFit$resample$Spec ), mean( fdaFit$resample$Sens ) ),
    nrow=3, ncol=3, byrow=T )
res = data.frame( res )
rownames(res) = c("RF", "LR", "FDA" )
colnames(res) = c("AUC", "Spec", "Sens" )
print(res)

# What are our default sensitivity and specificity for these methods on the evaluation set?
#
evalResults = data.frame( RESP = evaluation$RESP ) # put in the truth
evalResults$RF = predict( rfFit, newdata=evaluation, type="prob" )[,1]
evalResults$LogReg = predict( lrFit, newdata=evaluation, type="prob" )[,1]
evalResults$FDA = predict( fdaFit, newdata=evaluation, type="prob" )[,1]

# Part (c): Construct lift plots on the above classifiers
#
library(pROC)
rfROC = roc( evalResults$RESP, evalResults$RF, levels=rev( levels(evalResults$RESP) ) )
logRegROC = roc( evalResults$RESP, evalResults$LogReg, levels=rev( levels(evalResults$RESP) ) )
FDAROC = roc( evalResults$RESP, evalResults$FDA, levels=rev( levels(evalResults$RESP) ) )

if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter16/chap_16_prob_2_roc_curves.eps", onefile=FALSE, horizontal=FALSE) } 
plot(rfROC, legacy.axes=TRUE, col="red")
plot(logRegROC, legacy.axes=TRUE, add=T, col="blue")
plot(FDAROC, legacy.axes=TRUE, add=T, col="green")
legend( x="bottomright",c(sprintf("RF (%f)", rfROC$auc), sprintf("LR (%f)", logRegROC$auc), sprintf("FDA (%f)", FDAROC$auc)), col=c("red","blue", "green"), lty=c(1,1,1) )
if( save_plots ){ dev.off() }

# The lift plots:
#
if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter16/chap_16_prob_2_lift_curves.eps", onefile=FALSE, horizontal=FALSE) }
labs = c(RF = "Random Forest", LogReg = "Logistic Regression", FDA = "FDA (MARS)")
lift1 = lift( RESP ~ RF + LogReg + FDA, data=evalResults, labels=labs )
xyplot(lift1, ylab="%Events Found", xlab ="%Customers Evaluated", lwd=2, type="l", col=c("red", "blue", "green"))
grid()
if( save_plots ){ dev.off() }

# Part (d): Using sampling methods on several models
#
#
set.seed(1103)
upSampleTraining = upSample( x=training[,-51] , y=training$RESP, yname="RESP" )
print( table( upSampleTraining$RESP ) )

# Lets build the same models as above but using this new dataset:
#
set.seed(1410)
rfFit_us = train( RESP ~ ., data=upSampleTraining, method="rf", trControl = ctrl, ntree = 500, tuneLength = 5, metric = "ROC" )

set.seed(1410)
lrFit_us = train( RESP ~ ., data=upSampleTraining, method="glm", trControl = ctrl, metric = "ROC" )

set.seed(1410)
fdaFit_us = train( RESP ~ ., data=upSampleTraining, method="fda", tuneLength = 10, trControl = ctrl, metric = "ROC" )

# Compare our estimated sensitivity and specificity for these methods on the training data:
#
res = matrix( data=c( mean( rfFit_us$resample$ROC ), mean( rfFit_us$resample$Spec ), mean( rfFit_us$Eesample$Sens ), 
                      mean( lrFit_us$resample$ROC ), mean( lrFit_us$resample$Spec ), mean( lrFit_us$resample$Sens ), 
                      mean( fdaFit_us$resample$ROC ), mean( fdaFit_us$resample$Spec ), mean( fdaFit_us$resample$Sens ) ),
               nrow=3, ncol=3, byrow=T )
res = data.frame( res )
rownames(res) = c("RF", "LR", "FDA" )
colnames(res) = c("AUC", "Spec", "Sens" )
print(res)

# What are our default sensitivity and specificity for these upsampled methods on the evaluation set?
#
evalResults = data.frame( RESP = evaluation$RESP ) # put in the truth
evalResults$RF = predict( rfFit_us, newdata=evaluation, type="prob" )[,1]
evalResultssLogReg = predict( lrFit_us, newdata=evaluation,type="prob" )[,1] 
evalResults$FDA = predict( fdaFit_us, newdata=evaluation, type="prob" )[,1]

library(pROC)
rfROC = roc( evalResults$RESP, evalResults$RF, levels=rev( levels(evalResults$RESP) ) )
logRegROC = roc( evalResults$RESP, evalResults$LogReg, levels=rev(levels(evalResults$RESP) ) )
FDAROC = roc( evalResults$RESP, evalResults$FDA, levels=rev( levels(evalResults$RESP) ) )

plot(rfROC, legacy.axes=TRUE, col="red")
plot(logRegROC, legacy.axes=TRUE, add=T, col="blue")
plot(FDAROC, legacy.axes=TRUE, add=T, col="green")
legend( x="bottomright", c(sprintf("RF (%f)", rfROC$auc), sprintf("LR (%f)", logRegROC$auc), sprintf("FDA (%f)", FDAROC$auc)), col=c("red","blue", "green"), lty=c(1,1,1) )

baseline = coords( logRegROC, X=0.5, input="threshold" )
logRegThresh_ctopleft = coords( logRegROC, x="best", best.method="closest.topleft" )
logRegThresh_youden = coords( logRegROC, x="best", best.method="youden" )
print( rbind( baseline, logRegThresh_ctopleft, logRegThresh_youden ) )

# Lets create a lift plot with the random forest results:
#
if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter16/chap_16_prob_2_upsample_lift_curves.eps", onefile=FALSE, horizontal=FALSE) }
labs = c(RF = "Random Forest", LogReg = "Logistic Regression", FDA = "FDA (MARS)")
lift1 = lift( RESP ~ RF + LogReg + FDA, data=evalResults, labels=labs )
xyplot(lift1, ylab="%Events Found", xlab="%Customers Evaluated", lwd=2, type="l", col=c("red", "blue", "green"))
grid()
if( save_plots ){ dev.off() }