#
# Written by:
# -- 
# John L. Weatherwax                2009-04-21
# 
# email: wax@alum.mit.edu
# 
# Please send comments and especially bug reports to the
# above email address.
# 
# EPage 311
#
#-----

# Flag for saving plots to disk; no code visible in this file reads it yet.
# Spelled as FALSE because the shorthand `F` is an ordinary, reassignable variable.
save_plots <- FALSE

library(caret) # needed for the oil dataset
library(AppliedPredictiveModeling)
library(pROC)

source('build_PCC_linear_models.R')

# Load the oil data set. The rest of this script expects it to provide
# `fattyAcids` (the predictors) and `oilType` (the class labels).
data(oil)

# Class counts, printed explicitly so the output also appears when this
# script is run through source(), where top-level values are not auto-printed.
oil_type_counts <- table(oilType)
print(oil_type_counts)

# The same counts expressed as a fraction of all samples.
print(prop.table(oil_type_counts))

# Part (c):
#
# Flag predictors with nearZeroVar() and report how many were found relative
# to the total number of predictor columns.
zv_cols <- nearZeroVar(fattyAcids)
n_predictors <- ncol(fattyAcids)
print(sprintf("Dropping %d zero variance columns from %d (fraction=%10.6f)",
              length(zv_cols), n_predictors, length(zv_cols) / n_predictors))

# Actually remove the flagged columns so that X agrees with the message above.
# The length guard matters: when nothing is flagged, fattyAcids[, -integer(0)]
# would select zero columns rather than all of them.
X <- fattyAcids
if (length(zv_cols) > 0) {
  X <- X[, -zv_cols, drop = FALSE]
}

# Print any linearly dependent column sets found in X (the original author
# notes that there are none for this data, before or after filtering).
print(findLinearCombos(X))

# Fit the linear classifiers on the predictors X with oilType as the response.
# build_PCC_linear_models is not defined in this file (presumably it comes from
# the sourced 'build_PCC_linear_models.R'); the result is read below as a list
# with `lda`, `glmnet` and `nsc` entries, each holding a `confusionMatrix`.
linear_models <- build_PCC_linear_models(X, oilType)

# Pull the first entry of each confusion matrix's `overall` vector, which this
# script reports as the model's accuracy.
lda_accuracy <- linear_models$lda$confusionMatrix$overall[1]
glmnet_accuracy <- linear_models$glmnet$confusionMatrix$overall[1]
nsc_accuracy <- linear_models$nsc$confusionMatrix$overall[1]

# One row per model; the row names inherited from the extracted values are
# discarded so the table prints with plain row numbers.
df <- rbind(
  data.frame(name = "LDA", Accuracy = lda_accuracy),
  data.frame(name = "GLMNET", Accuracy = glmnet_accuracy),
  data.frame(name = "NSC", Accuracy = nsc_accuracy)
)
rownames(df) <- NULL

# Sort ascending by accuracy, so the best-performing model is listed last.
df <- df[order(df$Accuracy), ]
print("ACCURACY Performance on the oil dataset")
print(df)

# Full confusion matrix for the NSC model, to see where it makes its errors.
print(linear_models$nsc$confusionMatrix)