#
# Written by:
# -- 
# John L. Weatherwax                2009-04-21
# 
# email: wax@alum.mit.edu
# 
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 497
#
#-----

save_plots = F
num_processors_to_use =  2

library(caret)
library(AppliedPredictiveModeling)
data(AlzheimerDisease)

# Lets look at the code for this chapter using the command:
#
# scriptLocation()
#
# We will use some of the code there for this problem
#--

# We start by loaing in the data exactly as is done in the .R file for this chapter:
#

## The baseline set of predictors
bl <- c("Genotype", "age", "tau", "p_tau", "Ab_42", "male")

## The set of new assays
newAssays <- colnames(predictors)
newAssays <- newAssays[!(newAssays %in% c("Class", bl))]

## Decompose the genotype factor into binary dummy variables

predictors$E2 <- predictors$E3 <- predictors$E4 <- 0
predictors$E2[grepl("2", predictors$Genotype)] <- 1
predictors$E3[grepl("3", predictors$Genotype)] <- 1
predictors$E4[grepl("4", predictors$Genotype)] <- 1
genotype <- predictors$Genotype

## Partition the data
library(caret)
set.seed(730)
split <- createDataPartition(diagnosis, p = .8, list = FALSE)

adData <- predictors
adData$Class <- diagnosis

training <- adData[ split, ]
testing <- adData[-split, ]

predVars <- names(adData)[!(names(adData) %in% c("Class","Genotype"))]

## This summary function is used to evaluate the models.
fiveStats <- function(...) c(twoClassSummary(...), defaultSummary(...))

## We create the cross-validation files as a list to use with different
## functions

set.seed(104)
index <- createMultiFolds(training$Class, times = 5)

## The candidate set of the number of predictors to evaluate
varSeq <- seq(1, length(predVars)-1, by = 2)

## We can also use parallel processing to run each resampled RFE
## iteration (or resampled model with train()) using different
## workers.

library(doMC)
registerDoMC(num_processors_to_use)

## The rfe() function in the caret package is used for recursive feature
## elimination. We setup control functions for this and train() that use
## the same cross-validation folds. The 'ctrl' object will be modifed several 
## times as we try different models

ctrl <- rfeControl(method = "repeatedcv", repeats = 5,
                   saveDetails = TRUE,
                   index = index,
                   returnResamp = "final")

fullCtrl <- trainControl(method = "repeatedcv",
                         repeats = 5,
                         summaryFunction = fiveStats,
                         classProbs = TRUE,
                         index = index)

## The original correlation matrix of the new data without dropping highly correlated features:
predCor <- cor(training[, newAssays])

if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter19/chap_19_prob_1_orig_corrplot.eps", onefile=FALSE, horizontal=FALSE) }
library(RColorBrewer)
cols <- c(rev(brewer.pal(7, "Blues")), brewer.pal(7, "Reds"))
library(corrplot)
corrplot(predCor, order = "hclust", tl.pos = "n", addgrid.col = rgb(1,1,1,.01), col = colorRampPalette(cols)(51))
if( save_plots ){ dev.off() }

#
# Part (a):
#
predictors_to_drop = findCorrelation( predCor, cutoff=0.75 )
print( "Predictors to drop ... as they are too correlated" )
print( newAssays[ predictors_to_drop ] )

#--
# Drop these correlated predictors:
#--
newAssays = newAssays[ -predictors_to_drop ]
training = training[, -predictors_to_drop ]
testing = testing[, -predictors_to_drop ]
predVars = newAssays

# Replot the correlation plot with these highly correlated predictors dropped:
#
if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter19/chap_19_prob_1_new_corrplot.eps", onefile=FALSE, horizontal=FALSE) }
cols <- c(rev(brewer.pal(7, "Blues")), brewer.pal(7, "Reds"))
library(corrplot)
corrplot(cor(training[,newAssays]), order = "hclust", tl.pos = "n",addgrid.col = rgb(1,1,1,.01), col = colorRampPalette(cols)(51))
if( save_plots ){ dev.off() }

#
# Part (b): With this reduced features selection we now try to perform recursive feature selection:
#
# Again borrowed from the code location above:
#
## Now fit the RFE versions. To do this, the 'functions' argument of the rfe()
## object is modified to the approproate functions. For model details about
## these functions and their arguments, see
## 
## http://caret.r-forge.r-project.org/featureSelection.html
##
## for more information.
#

ctrl$functions <- rfFuncs
ctrl$functions$summary <- fiveStats
set.seed(721)
rfRFE <- rfe(training[, predVars],
             training$Class,
             sizes = varSeq,
             metric = "ROC",
             ntree = 1000,
             rfeControl = ctrl)
rfRFE

ctrl$functions <- ldaFuncs
ctrl$functions$summary <- fiveStats
set.seed(721)
ldaRFE <- rfe(training[, predVars],
              training$Class,
              sizes = varSeq,
              metric = "ROC",
              tol = 1.0e-12,
              rfeControl = ctrl)
ldaRFE

ctrl$functions <- nbFuncs
ctrl$functions$summary <- fiveStats
set.seed(721)
nbRFE <- rfe(training[, predVars],
            training$Class,
            sizes = varSeq,
            metric = "ROC",
            rfeControl = ctrl)
nbRFE

## This options tells train() to run it's model tuning
## sequentially. Otherwise, there would be parallel processing at two
## levels, which is possible but requires W^2 workers. On our machine,
## it was more efficient to only run the RFE process in parallel.

cvCtrl <- trainControl(method = "cv",
                       verboseIter = FALSE,
                       classProbs = TRUE,
                       allowParallel = FALSE)

## Here, the caretFuncs list allows for a model to be tuned at each iteration
## of feature seleciton.
##
ctrl$functions <- caretFuncs
ctrl$functions$summary <- fiveStats
set.seed(721)
smvRFE <- rfe(training[, predVars],
              training$Class, 
              sizes = varSeq,
              rfeControl = ctrl,
              metric = "ROC", 
              ## Now arguments to train() are used.
              method = "svmRadial",
              tuneLength = 12, 
              preProc = c("center", "scale"),
              trControl = cvCtrl)
smvRFE

ctrl$functions <- lrFuncs
ctrl$functions$summary <- fiveStats
set.seed(721)
lrRFE <- rfe(training[, predVars],
             training$Class,
             sizes = varSeq,
             metric = "ROC",
             rfeControl = ctrl)
lrRFE

ctrl$functions <- caretFuncs
ctrl$functions$summary <- fiveStats
set.seed(721)
knnRFE <- rfe(training[, predVars],
              training$Class,
              sizes = varSeq,
              metric = "ROC",
              method = "knn",
              tuneLength = 20,
              preProc = c("center", "scale"),
              trControl = cvCtrl,
              rfeControl = ctrl)
knnRFE

# Here we want to plot the same profiles as in Figure 19.5 (Page 506) we could use:
# 
# http://www.cookbook-r.com/Graphs/Facets_%28ggplot2%29/
#

# Extract the (x,y) points from the "plot" command:
#
p_lda = plot(ldaRFE)
p_rf = plot(rfRFE)
p_nb = plot(nbRFE)
p_lr = plot(lrRFE)
p_knn = plot(knnRFE)
p_svm = plot(smvRFE)

# Plot these in the same "format" as the figure in the book:
#
min_y = min( p_lda$panel.args[[1]]$y, p_rf$panel.args[[1]]$y, p_nb$panel.args[[1]]$y, p_lr$panel.args[[1]]$y, p_knn$panel.args[[1]]$y, p_svm$panel.args[[1]]$y )
max_y = max( p_lda$panel.args[[1]]$y, p_rf$panel.args[[1]]$y, p_nb$panel.args[[1]]$y, p_lr$panel.args[[1]]$y, p_knn$panel.args[[1]]$y, p_svm$panel.args[[1]]$y )

if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter19/chap_19_prob_1_RFE_profiles.eps", onefile=FALSE, horizontal=FALSE) }
par(mfrow=c(3,2))
plot( p_lda$panel.args[[1]]$x, p_lda$panel.args[[1]]$y, ylim=c(min_y,max_y), xlab="", ylab="", main="LDA" )
plot( p_rf$panel.args[[1]]$x, p_rf$panel.args[[1]]$y, ylim=c(min_y,max_y), xlab="", ylab="", main="RF" )
plot( p_nb$panel.args[[1]]$x, p_nb$panel.args[[1]]$y, ylim=c(min_y,max_y), xlab="", ylab="ROC", main="NB" )
plot( p_lr$panel.args[[1]]$x, p_lr$panel.args[[1]]$y, ylim=c(min_y,max_y), xlab="", ylab="ROC", main="LR" )
plot( p_knn$panel.args[[1]]$x, p_knn$panel.args[[1]]$y, ylim=c(min_y,max_y), xlab="Number of Predictors", ylab="", main="KNN" )
plot( p_svm$panel.args[[1]]$x, p_svm$panel.args[[1]]$y, ylim=c(min_y,max_y), xlab="Number of Predictors", ylab="", main="SVM" )
grid()
par(mfrow=c(1,1))
if( save_plots ){ dev.off() }