#
# Written by:
# -- 
# John L. Weatherwax                2009-04-21
# 
# email: wax@alum.mit.edu
# 
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 311
# 
#-----

save_plots = F

if( ! require("caret") ){ install.packages("caret") }
if( ! require("AppliedPredictiveModeling") ){ install.packages("AppliedPredictiveModeling") } # needed for the hepatic dataset
if( ! require("pROC") ){ install.packages("pROC") }

source('build_AUC_linear_models.R')

data(hepatic)

table(injury)

# Lump all compounds that cause injury into a "Yes" category:
#
any_damage = as.character( injury )
any_damage[ any_damage=="Mild" ] = "Yes"
any_damage[ any_damage=="Severe" ] = "Yes"
any_damage[ any_damage=="None" ] = "No"
table( any_damage )

# Here we just verify that using the function "createDataPartition" does not
# change the proportion of Yes and No's from the population proportions.
# This is especially important when one's problem has non equal class proportions and we
# don't use the "t" variable in what follows below.  The function we do use (carets "train" function)
# calls createDataPartition as a subprocess.
#
t = createDataPartition( any_damage, p=0.8, list=FALSE, times=1 )
table( any_damage ) / sum( table( any_damage ) ) # the population proportions 
table( any_damage[t] ) / sum( table( any_damage[t] ) ) # the proprtions from one resampling 

t = createDataPartition( any_damage, p=0.8, list=FALSE, times=10 ) # check again with times=10 
table( any_damage[t[,1]] ) / sum( table( any_damage[t[,1]] ) )

# Convert our response to a factor (make the first factor correspond to the event of interest):
#
any_damage = factor( any_damage, levels=c("Yes","No") )

# Part c:
#

#------------------------------------------------------------------------
# Use the biological predictors:
#------------------------------------------------------------------------
zv_cols = nearZeroVar(bio)
print( sprintf("Dropping %d zero variance columns from %d (fraction=%10.6f)", length(zv_cols), dim(bio)[2], length(zv_cols)/dim(bio)[2]) );
X = bio[,-zv_cols]

# There are no linearly dependent columns remaining (or to start with)
print( findLinearCombos(X) )

# Build linear models with this data:
#
bio_linear_models = build_AUC_linear_models( X, any_damage )

# Present the sampled ROC estimate for each model:
#
df = rbind( data.frame(name="LR", auc=bio_linear_models$glm$auc), data.frame(name="LDA", auc=bio_linear_models$lda$auc),
            data.frame(name="PLSDA", auc=bio_linear_models$plsda$auc), data.frame(name="GLMNET", auc=bio_linear_models$glmnet$auc),
            data.frame(name="NSC", auc=bio_linear_models$nsc$auc) )

# Order our dataframe by performance:
#
df = df[ with( df, order(auc) ), ]
print( "AUC Performance using biological predictors" )
print( df )

# For the best model (logistic regression) what are the most important predictors:
#
varImp(bio_linear_models$glm$classifier)


#------------------------------------------------------------------------
# Use the chemical predictors:
#------------------------------------------------------------------------
#
zv_cols = nearZeroVar(chem)

print( sprintf("Dropping %d zero variance columns from %d (fraction=%10.6f)", length(zv_cols), dim(chem)[2], length(zv_cols)/dim(chem)[2]) );
X = chem[,-zv_cols]

# Remove the linearly dependent columns
fLC = findLinearCombos(X)
X = X[,-fLC$remove]

# Build linear models with this data:
#
chem_linear_models = build_AUC_linear_models( X, any_damage )

# Present the sampled ROC estimate for each model:
#
df = rbind( data.frame(name="LR", auc=chem_linear_models$glm$auc), data.frame(name="LDA", auc=chem_linear_models$lda$auc),
            data.frame(name="PLSDA", auc=chem_linear_models$plsda$auc), data.frame(name="GLMNET", auc=chem_linear_models$glmnet$auc),
            data.frame(name="NSC", auc=chem_linear_models$nsc$auc) )

# Order our dataframe by performance:
#
df = df[ with( df, order(auc) ), ]
print( "AUC Performance using chemical predictors" )
print( df )

# For the best model (logistic regression again) what are the most important predictors:
#
varImp(chem_linear_models$glm$classifier)

# Plot the best two logistic regression rocCurves:
#
plot( bio_linear_models$glm$roc, legacy.axes=T, add=F, col="gray" )
plot( chem_linear_models$glm$roc, legacy.axes=T, add=T )
legend( 0.6, 0.2, c("LR (biological)","LR (chemical)"), col=c("gray","black"), lty=c(1,1) )

#------------------------------------------------------------------------
# Use ALL predictors into one set of predictors:
#------------------------------------------------------------------------
#
all = cbind( bio, chem )
zv_cols = nearZeroVar(all)
print( sprintf("Dropping %d zero variance columns from %d (fraction=%10.6f)", length(zv_cols), dim(all)[2], length(zv_cols)/dim(all)[2]) );
X = all[,-zv_cols]

# There are no linearly dependent columns remaining (or to start with)
fLC = findLinearCombos(X)
X = X[,-fLC$remove]
print( sprintf("Dropping %d columns due to linear combinations %d (fraction=%10.6f)", length(fLC$remove), dim(X)[2], length(fLC$remove)/dim(X)[2]) );

# Build linear models with this data:
#
all_linear_models = build_AUC_linear_models( X, any_damage )

# Present the sampled ROC estimate for each model:
#
df = rbind( data.frame(name="LR", auc=all_linear_models$glm$auc), data.frame(name="LDA", auc=all_linear_models$lda$auc),
            data.frame(name="PLSDA", auc=all_linear_models$plsda$auc), data.frame(name="GLMNET", auc=all_linear_models$glmnet$auc),
            data.frame(name="NSC", auc=all_linear_models$nsc$auc) )

# Order our dataframe by performance:
#
df = df[ with( df, order(auc) ), ]
print( "AUC Performance using all predictors" )
print( df )

# Report the top five predictors (for the LDA model):
#
varImp(all_linear_models$lda$classifier)

# Plot the logistic regression rocCurves using the bio and chem predictors and the best roc curve using both bio and chem predictors:
#
if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter12/chap_12_prob_1_ROC_curves.eps", onefile=FALSE, horizontal=FALSE) }
plot( bio_linear_models$glm$roc, legacy.axes=T, add=F, col="gray", lty=2, ylim=c(0,1.05) )
plot( chem_linear_models$glm$roc, legacy.axes=T, add=T, col="darkgray" )
plot( all_linear_models$lda$roc, legacy.axes=T, add=T )
legend( 0.6, 0.2, c("LR (biological)","LR (chemical)", "LDA (all)"), col=c("gray","darkgray","black"), lty=c(2,1,1) )
if( save_plots ){ dev.off() }