#
# Written by:
# -- 
# John L. Weatherwax                2009-04-21
# 
# email: wax@alum.mit.edu
# 
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 311
# 
#-----

# Set to TRUE to write the figures to EPS files rather than the active device.
# (Spelled out as FALSE because the shorthand F is an ordinary, reassignable variable.)
save_plots = FALSE

library(caret)
library(AppliedPredictiveModeling)
library(pROC)

library(C50) # needed for the churn dataset

source('build_AUC_linear_models.R')

# Load the churn data; this defines the data frames churnTrain & churnTest:
data(churn)

str(churnTrain)

table(churnTrain$churn)

# Drop all factor predictors (columns 1, 3, 4 and 5).  The SAME negative index
# is applied to both data frames so that the test set keeps exactly the columns
# the training set keeps (previously the test set was indexed without the minus
# sign, which kept only the factor columns instead of removing them):
#
factor_predictor_cols = c(1, 3, 4, 5)
churnTrain = churnTrain[, -factor_predictor_cols]
churnTest = churnTest[, -factor_predictor_cols]

# Build various linear models:
#
# Split the training data into the predictors (X) and the churn response (y).
# The response is located by name rather than by a hard-coded position:
#
response_col = which( names(churnTrain) == "churn" )
stopifnot( length(response_col) == 1 )
X = churnTrain[, -response_col, drop=FALSE] # drop the churn response
y = churnTrain[, response_col]

# Look for (and drop) zero and near-zero variance columns.  The guard matters:
# when nothing is flagged nearZeroVar returns integer(0), and indexing with
# -integer(0) would select NO columns rather than all of them:
zv_cols = nearZeroVar(X)
if( length(zv_cols) > 0 ){
  X = X[, -zv_cols, drop=FALSE]
}

# Report any linearly dependent columns that remain (none are expected):
print( findLinearCombos(X) )

# Get a high level view of which predictors might be most predictive:
#
# Encode the response numerically: 1 represents a churn, 0 a non churn.
y_numeric = rep( 1, length(y) )
y_numeric[ y == "no" ] = 0

# Correlation matrix of the predictors together with the numeric response.
# cbind names the appended column "y_numeric", so the response column is picked
# by that name instead of by a position that depends on how many predictors
# survived the earlier filtering:
cor_with_response = cor( cbind( X, y_numeric ) )
dfv = data.frame( correlation = as.double( cor_with_response[, "y_numeric"] ),
                  row.names = rownames(cor_with_response) )

# Print from the weakest to the strongest absolute correlation (the last row is
# the response correlated with itself):
print( dfv[ order(abs(dfv$correlation)), , drop=FALSE ] )

# Part b: fit the collection of linear classifiers.  PLS is skipped because it
# takes too much time to run:
#
linear_models = build_AUC_linear_models( X, y, build_pls_model=FALSE )

# Collect the resampled ROC AUC estimate of each model, one row per model:
#
auc_lr = data.frame( name="LR", auc=linear_models$glm$auc )
auc_lda = data.frame( name="LDA", auc=linear_models$lda$auc )
auc_glmnet = data.frame( name="GLMNET", auc=linear_models$glmnet$auc )
auc_nsc = data.frame( name="NSC", auc=linear_models$nsc$auc )
df = rbind( auc_lr, auc_lda, auc_glmnet, auc_nsc )

# Sort the table so the models appear in increasing order of AUC:
#
df = df[ order(df$auc), ]
print( "AUC Performance" )
print( df )

# For the best model (LDA) what are the most important predictors:
#
varImp(linear_models$lda$classifier)

# Plot the ROC curve of the best model.  When save_plots is set the figure is
# written to an EPS file and the device is closed afterwards; otherwise it is
# drawn on the active device:
#
if( save_plots ){
  postscript("../../WriteUp/Graphics/Chapter12/chap_12_prob_3_ROC_curves.eps", onefile=FALSE, horizontal=FALSE)
}
plot( linear_models$lda$roc, legacy.axes=TRUE, add=FALSE, col="black" )
if( save_plots ){
  dev.off()
}

# Plot the lift curve for this data, built from the LDA predictions with "yes"
# as the class of interest:
#
lp = lift( y ~ yes, data=linear_models$lda$predictions, class="yes" )
if( save_plots ){
  postscript("../../WriteUp/Graphics/Chapter12/chap_12_prob_3_lift_plot.eps", onefile=FALSE, horizontal=FALSE)
}
plot(lp)
if( save_plots ){
  dev.off()
}