#
# Written by:
# -- 
# John L. Weatherwax                2009-04-21
# 
# email: wax@alum.mit.edu
# 
# Please send comments and especially bug reports to the
# above email address.
#
# WARNING: This code can take a long time (~ two days) to run.
#
# EPage 497
#
#-----

save_plots = F
num_processors_to_use = 2

library(caret)
library(AppliedPredictiveModeling)
data(AlzheimerDisease)

# Lets look at the code for this chapter from the book.  The location of that code can be found using the command:
#
# scriptLocation()
#

# We start by loading in the data exactly as is done in the .R file for this chapter:
#

## The baseline set of predictors
bl <- c("Genotype", "age", "tau", "p_tau", "Ab_42", "male")

## The set of new assays
newAssays <- colnames(predictors)
newAssays <- newAssays[!(newAssays %in% c("Class", bl))]

## Decompose the genotype factor into binary dummy variables

predictors$E2 <- predictors$E3 <- predictors$E4 <- 0
predictors$E2[grepl("2", predictors$Genotype)] <- 1
predictors$E3[grepl("3", predictors$Genotype)] <- 1
predictors$E4[grepl("4", predictors$Genotype)] <- 1
genotype <- predictors$Genotype

## Partition the data
library(caret)
set.seed(730)
split <- createDataPartition(diagnosis, p = .8, list = FALSE)

adData <- predictors
adData$Class <- diagnosis

training <- adData[ split, ]
testing <- adData[-split, ]

predVars <- names(adData)[!(names(adData) %in% c("Class","Genotype"))]

## This summary function is used to evaluate the models.
fiveStats <- function(...) c(twoClassSummary(...), defaultSummary(...))

## We create the cross-validation files as a list to use with different 
## functions

set.seed(104)
index <- createMultiFolds(training$Class, times = 5)

## The candidate set of the number of predictors to evaluate
varSeq <- seq(1, length(predVars)-1, by = 2)

## We can also use parallel processing to run each resampled RFE
## iteration (or resampled model with train()) using different
## workers.
library(doMC)
registerDoMC(num_processors_to_use)

## The rfe() function in the caret package is used for recursive feature
## elimiation. We setup control functions for this and train() that use
## the same cross-validation folds. The 'Ctrl' object will be modifed several 
## times as we try different models

ctrl <- rfeControl(method = "repeatedcv", repeats = 5,
                   saveDetails = TRUE,
                   index = index,
                   returnResamp = "final")

fullCtrl <- trainControl(method = "repeatedcv", repeats = 5,
                         summaryFunction = fiveStats,
                         classProbs = TRUE,
                         index = index)

## This options tells train() to run it's model tuning
## sequentially. Otherwise, there would be parallel processing at two
## levels, which is possible but requires W^2 workers. On our machine,
## it was more efficient to only run the REF process in parallel.

cvCtrl <- trainControl(method = "cv",
                       verboseIter = FALSE,
                       classProbs = TRUE,
                       allowParallel = FALSE)

#--
# Fit the full model (using all predictors) under RFE using train with method="sparseLDA"
#--
ctrl$functions <- caretFuncs
ctrl$functions$summary <- fiveStats
set.seed(721)
sparseLDA_full <- rfe(training[, predVars],
                      training$Class,
                      sizes = varSeq,
                      rfeControl = ctrl,
                      metric = "ROC",
                      ## Now arguments to train() are used.
                      method = "sparseLDA",
                      tuneLength = 5,
                      trControl = cvCtrl)
sparseLDA_full


#--
# Fit the reduced model (using smaller predictor set) under RFE using train with method="sparseLDA"
#--

predCor <- cor(training[, newAssays])
predictors_to_drop = findCorrelation( predCor, cutoff=0.75 )
print( "Predictors to drop ... as they are too correlated" )
print( newAssays[ predictors_to_drop ] )

newAssays = newAssays[ -predictors_to_drop ]
training = training[, -predictors_to_drop ]
testing = testing[, -predictors_to_drop ]
predVars = newAssays

ctrl$functions <- caretFuncs
ctrl$functions$summary <- fiveStats
set.seed(721)
sparseLDA_filter <- rfe(training[, predVars],
                        training$Class,
                        sizes = varSeq,
                        rfeControl = ctrl,
                        metric = "ROC",
                        ## Now arguments to train() are used.
                        method = "sparseLDA",
                        tuneLength = 5,
                        trControl = cvCtrl)
sparseLDA_filter


# Lets plot the performance profiles for each of these two

# Extract the (x,y) points from the "plot" command:
#
p_full = plot(sparseLDA_full)
p_filter = plot(sparseLDA_filter)

# Plot these in the same "format" as the figure in the book:
#
min_y = min( p_full$panel.args[[1]]$y, p_filter$panel.args[[1]]$y )
max_y = max( p_full$panel.args[[1]]$y, p_filter$panel.args[[1]]$y )

if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter19/chap_19_prob_2_RFE_profiles.eps", onefile=FALSE, horizontal=FALSE) }
par(mfrow=c(1,2))
plot( p_full$panel.args[[1]]$x, p_full$panel.args[[1]]$y, ylim=c(min_y,max_y), xlab="", ylab="", main="All predictors" ); grid()
plot( p_filter$panel.args[[1]]$x, p_filter$panel.args[[1]]$y, ylim=c(min_y,max_y), xlab="", ylab="", main="Filtered prediCtors" ); grid()
par(mfrow=c(1,1))
if( save_plots ){ dev.off() }