#
# Written by:
# --
# John L. Weatherwax 2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 497
#
#-----

#
# The cell segmentation data is discussed on EPage 50
#

save_plots = F
num_processors_to_use = 2

library(caret)
library(AppliedPredictiveModeling)

data(segmentationOriginal)

# Examples of reading this data set can be found in the file: O3_Data_Pre_Processing.R

## Retain the original training set
segTrain <- subset(segmentationOriginal, Case == "Train")

## Remove the first three columns (identifier columns)
segTrainX <- segTrain[, -(1:3)]
segTrainClass <- segTrain$Class

#--
# Part (a)
#--

# Drop predictors with near-zero variance:
zero_cols = nearZeroVar( segTrainX )
print( colnames( segTrainX )[zero_cols] )
segTrainX = segTrainX[, -zero_cols]

# What pairs of predictors have the largest correlations between them:
source("../Chapter18/predictor_importance_utils.R")
LC = largest_cors( segTrainX )
print( head( LC ) )

# Remove highly correlated features:
#
highCorr = findCorrelation( cor( segTrainX ), cutoff=0.75 )
segTrainX = segTrainX[, -highCorr]

#--
# Apply wrapper methods:
#--

# Using the "step" function:
#
df = segTrainX; df$Class = segTrainClass
null = glm( Class ~ 1, data=df, family=binomial )
full = glm( Class ~ ., data=df, family=binomial )

# Forward selection:
step( null, scope=list(lower=null, upper=full), direction="forward", data=df )

# Backward selection:
step( full, direction="backward", data=df )

# Both directions:
step( null, scope=list(upper=full), direction="both", data=df )

# Using the "fastbw" function (fast backward elimination on an rms model fit):
#
## library(rms)
## fastbw( lrm( Class ~ ., data=df ) )

# Using the "regsubsets" function (best-subset search for linear models, so
# Class would need a numeric 0/1 coding):
#
## library(leaps)
## regsubsets( Class ~ ., data=df, nbest=1, really.big=T, method="forward" )
## regsubsets( Class ~ ., data=df, nbest=1, really.big=T, method="backward" )

# Using the "stepclass" function:
#
library(klaR)
stepclass( Class ~ ., data=df, method="lda", direction="forward" )
stepclass( Class ~ ., data=df, method="lda", direction="backward" )
stepclass( Class ~ ., data=df, method="lda", direction="both" )

#--
# Apply filter methods:
#--

# Evaluate each predictor separately (ROC-based variable importance); the top five are of most interest:
#
VI = filterVarImp( df[, names(df) != "Class"], df$Class )
print( VI[ order(VI$PS, decreasing=TRUE), , drop=FALSE ] )

# Evaluate them together using ReliefF:
#
library(CORElearn)
reliefValues = attrEval( Class ~ ., data=df, estimator="Relief" )
print( sort( reliefValues, decreasing=TRUE ) )

# If we use recursive feature elimination we find that the models with many features do better:
#
library(doMC)
registerDoMC(num_processors_to_use)

# Resampled ROC, sensitivity, and specificity plus accuracy and kappa:
fiveStats <- function(...) c(twoClassSummary(...), defaultSummary(...))

set.seed(104)
index <- createMultiFolds(df$Class, times = 5)

# The candidate subset sizes to evaluate:
varSeq <- seq(1, dim(df)[2]-1, by = 2)

ctrl <- rfeControl(method = "repeatedcv", repeats = 5,
                   saveDetails = TRUE,
                   index = index,
                   returnResamp = "final")

fullCtrl <- trainControl(method = "repeatedcv",
                         repeats = 5,
                         summaryFunction = fiveStats,
                         classProbs = TRUE,
                         index = index)

# Use the random forest helper functions for RFE, with the five-statistic summary:
ctrl$functions <- rfFuncs
ctrl$functions$summary <- fiveStats

set.seed(721)
rfRFE <- rfe(df[, names(df) != "Class"], df$Class,
             sizes = varSeq,
             metric = "ROC",
             ntree = 1000,
             rfeControl = ctrl)
rfRFE
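
#
# A minimal sketch of inspecting the RFE results using standard caret
# accessors (the exact output depends on the run):
#
print( predictors( rfRFE ) )        # the predictors retained in the optimal subset
print( rfRFE$results )              # resampled performance as a function of subset size
plot( rfRFE, type = c("g", "o") )   # ROC vs. the number of predictors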
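
#
# The control object "fullCtrl" above is set up for fitting a model on all of
# the retained predictors; a minimal sketch of doing that (assuming the
# randomForest package is installed) so its resampled ROC can be compared with
# rfRFE.  Because both use the same resampling indices ("index"), the two sets
# of resampled statistics are directly comparable, e.g. via
# summary( resamples( list(RFE = rfRFE, Full = rfFull) ) ).
#
set.seed(721)
rfFull <- train( df[, names(df) != "Class"], df$Class,
                 method = "rf",
                 metric = "ROC",
                 ntree = 1000,
                 trControl = fullCtrl )
rfFull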