#
# Written by:
# --
# John L. Weatherwax 2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 351
#
#-----

save_plots <- FALSE

library(caret)
library(AppliedPredictiveModeling) # needed for the hepatic dataset
library(pROC)

source('build_AUC_nonlinear_models.R')

data(hepatic)

# Use the same pre-processing that we did for Chapter 12 Problem 1:
#
# Lump all compounds that cause injury into a "Yes" category:
#
any_damage <- as.character(injury)
any_damage[any_damage == "Mild"] <- "Yes"
any_damage[any_damage == "Severe"] <- "Yes"
any_damage[any_damage == "None"] <- "No"

# Convert our response to a factor (make the first factor level correspond to
# the event of interest, as caret/pROC treat the first level as the "event"):
#
any_damage <- factor(any_damage, levels = c("Yes", "No"))

# Part a:
#
#------------------------------------------------------------------------
# Use the biological predictors:
#------------------------------------------------------------------------
#
zv_cols <- nearZeroVar(bio)
print(sprintf("Dropping %d zero variance columns from %d (fraction=%10.6f)",
              length(zv_cols), dim(bio)[2], length(zv_cols) / dim(bio)[2]))
# Guard the subset: bio[, -integer(0)] would drop EVERY column, so only
# index when nearZeroVar actually flagged something.
if (length(zv_cols) > 0) {
  X <- bio[, -zv_cols]
} else {
  X <- bio
}

# There are no linearly dependent columns remaining (or to start with):
print(findLinearCombos(X))

# Build nonlinear models with this data:
#
bio_nonlinear_models <- build_AUC_nonlinear_models(X, any_damage)

# Present the sampled ROC estimate (AUC) for each model:
#
df <- rbind(
  data.frame(name = "MDA",  auc = bio_nonlinear_models$mda$auc),
  data.frame(name = "NNET", auc = bio_nonlinear_models$nnet$auc),
  data.frame(name = "SVM",  auc = bio_nonlinear_models$svm$auc),
  data.frame(name = "KNN",  auc = bio_nonlinear_models$knn$auc),
  data.frame(name = "NB",   auc = bio_nonlinear_models$nb$auc)
)

# Order our dataframe by performance (worst to best AUC):
#
df <- df[order(df$auc), ]
print("AUC Performance using biological predictors")
print(df)

# For the best model (Mixture Discriminant Analysis) what are the most important predictors:
#
varImp(bio_nonlinear_models$mda$classifier)

#------------------------------------------------------------------------
# Use the chemical predictors:
#------------------------------------------------------------------------
#
zv_cols <- nearZeroVar(chem)
print(sprintf("Dropping %d zero variance columns from %d (fraction=%10.6f)",
              length(zv_cols), dim(chem)[2], length(zv_cols) / dim(chem)[2]))
# Guard the subset: chem[, -integer(0)] would drop EVERY column.
if (length(zv_cols) > 0) {
  X <- chem[, -zv_cols]
} else {
  X <- chem
}

# Remove the linearly dependent columns (if any — X[, -NULL] would
# otherwise empty the matrix when findLinearCombos finds nothing):
fLC <- findLinearCombos(X)
if (length(fLC$remove) > 0) {
  X <- X[, -fLC$remove]
}

# Build nonlinear models with this data:
#
chem_nonlinear_models <- build_AUC_nonlinear_models(X, any_damage)

# Present the sampled ROC estimate (AUC) for each model:
#
df <- rbind(
  data.frame(name = "MDA",  auc = chem_nonlinear_models$mda$auc),
  data.frame(name = "NNET", auc = chem_nonlinear_models$nnet$auc),
  data.frame(name = "SVM",  auc = chem_nonlinear_models$svm$auc),
  data.frame(name = "KNN",  auc = chem_nonlinear_models$knn$auc),
  data.frame(name = "NB",   auc = chem_nonlinear_models$nb$auc)
)

# Order our dataframe by performance (worst to best AUC):
#
df <- df[order(df$auc), ]
print("AUC Performance using chemical predictors")
print(df)

# For the best model (neural networks) what are the most important predictors:
#
varImp(chem_nonlinear_models$nnet$classifier)

# Plot the two best ROC curves so far (MDA on biological predictors,
# NNET on chemical predictors):
#
plot(bio_nonlinear_models$mda$roc, legacy.axes = TRUE, add = FALSE, col = "gray")
plot(chem_nonlinear_models$nnet$roc, legacy.axes = TRUE, add = TRUE)
legend(0.6, 0.2, c("MDA (biological)", "NNET (chemical)"),
       col = c("gray", "black"), lty = c(1, 1))

#------------------------------------------------------------------------
# Combine ALL predictors into one set of predictors:
#------------------------------------------------------------------------
#
# Named all_preds (not `all`) so we don't mask base::all.
all_preds <- cbind(bio, chem)
zv_cols <- nearZeroVar(all_preds)
print(sprintf("Dropping %d zero variance columns from %d (fraction=%10.6f)",
              length(zv_cols), dim(all_preds)[2], length(zv_cols) / dim(all_preds)[2]))
if (length(zv_cols) > 0) {
  X <- all_preds[, -zv_cols]
} else {
  X <- all_preds
}

# Remove any linearly dependent columns.  Report BEFORE dropping so the
# column count / fraction in the message use the pre-drop matrix (the
# original printed dim(X)[2] after the columns were already removed).
fLC <- findLinearCombos(X)
print(sprintf("Dropping %d columns due to linear combinations %d (fraction=%10.6f)",
              length(fLC$remove), dim(X)[2], length(fLC$remove) / dim(X)[2]))
if (length(fLC$remove) > 0) {
  X <- X[, -fLC$remove]
}

# Build nonlinear models with this data (MDA is skipped for the combined set):
#
all_nonlinear_models <- build_AUC_nonlinear_models(X, any_damage, build_mda_model = FALSE)

# Present the sampled ROC estimate (AUC) for each model:
#
df <- rbind(
  data.frame(name = "NNET", auc = all_nonlinear_models$nnet$auc),
  data.frame(name = "SVM",  auc = all_nonlinear_models$svm$auc),
  data.frame(name = "KNN",  auc = all_nonlinear_models$knn$auc),
  data.frame(name = "NB",   auc = all_nonlinear_models$nb$auc)
)

# Order our dataframe by performance (worst to best AUC):
#
df <- df[order(df$auc), ]
print("AUC Performance using all predictors")
print(df)

# Report the top five predictors:
#
varImp(all_nonlinear_models$knn$classifier)

# Plot the ROC curves using the bio and chem predictors alone, plus the
# best curve obtained using both bio and chem predictors together:
#
if (save_plots) {
  postscript("../../WriteUp/Graphics/Chapter13/chap_13_prob_1_ROC_curves.eps",
             onefile = FALSE, horizontal = FALSE)
}
plot(bio_nonlinear_models$mda$roc, legacy.axes = TRUE, add = FALSE,
     col = "gray", lty = 2, ylim = c(0, 1.05))
plot(chem_nonlinear_models$nnet$roc, legacy.axes = TRUE, add = TRUE, col = "black")
plot(all_nonlinear_models$svm$roc, legacy.axes = TRUE, add = TRUE, col = "darkgray")
legend(0.6, 0.2, c("MDA (biological)", "NNET (chemical)", "SVM (all)"),
       col = c("gray", "black", "darkgray"), lty = c(2, 1, 1))
if (save_plots) {
  dev.off()
}