# # Written by: # -- # John L. Weatherwax 2009-04-21 # # email: wax@alum.mit.edu # # Please send comments and especially bug reports to the # above email address. # # WARNING: This code can take a long time (~ two days) to run. # # EPage 497 # #----- save_plots = F num_processors_to_use = 2 library(caret) library(AppliedPredictiveModeling) data(AlzheimerDisease) # Lets look at the code for this chapter from the book. The location of that code can be found using the command: # # scriptLocation() # # We start by loading in the data exactly as is done in the .R file for this chapter: # ## The baseline set of predictors bl <- c("Genotype", "age", "tau", "p_tau", "Ab_42", "male") ## The set of new assays newAssays <- colnames(predictors) newAssays <- newAssays[!(newAssays %in% c("Class", bl))] ## Decompose the genotype factor into binary dummy variables predictors$E2 <- predictors$E3 <- predictors$E4 <- 0 predictors$E2[grepl("2", predictors$Genotype)] <- 1 predictors$E3[grepl("3", predictors$Genotype)] <- 1 predictors$E4[grepl("4", predictors$Genotype)] <- 1 genotype <- predictors$Genotype ## Partition the data library(caret) set.seed(730) split <- createDataPartition(diagnosis, p = .8, list = FALSE) adData <- predictors adData$Class <- diagnosis training <- adData[ split, ] testing <- adData[-split, ] predVars <- names(adData)[!(names(adData) %in% c("Class","Genotype"))] ## This summary function is used to evaluate the models. fiveStats <- function(...) c(twoClassSummary(...), defaultSummary(...)) ## We create the cross-validation files as a list to use with different ## functions set.seed(104) index <- createMultiFolds(training$Class, times = 5) ## The candidate set of the number of predictors to evaluate varSeq <- seq(1, length(predVars)-1, by = 2) ## We can also use parallel processing to run each resampled RFE ## iteration (or resampled model with train()) using different ## workers. library(doMC) registerDoMC(num_processors_to_use) ## The rfe() function in the caret package is used for recursive feature ## elimiation. We setup control functions for this and train() that use ## the same cross-validation folds. The 'Ctrl' object will be modifed several ## times as we try different models ctrl <- rfeControl(method = "repeatedcv", repeats = 5, saveDetails = TRUE, index = index, returnResamp = "final") fullCtrl <- trainControl(method = "repeatedcv", repeats = 5, summaryFunction = fiveStats, classProbs = TRUE, index = index) ## This options tells train() to run it's model tuning ## sequentially. Otherwise, there would be parallel processing at two ## levels, which is possible but requires W^2 workers. On our machine, ## it was more efficient to only run the REF process in parallel. cvCtrl <- trainControl(method = "cv", verboseIter = FALSE, classProbs = TRUE, allowParallel = FALSE) #-- # Fit the full model (using all predictors) under RFE using train with method="sparseLDA" #-- ctrl$functions <- caretFuncs ctrl$functions$summary <- fiveStats set.seed(721) sparseLDA_full <- rfe(training[, predVars], training$Class, sizes = varSeq, rfeControl = ctrl, metric = "ROC", ## Now arguments to train() are used. method = "sparseLDA", tuneLength = 5, trControl = cvCtrl) sparseLDA_full #-- # Fit the reduced model (using smaller predictor set) under RFE using train with method="sparseLDA" #-- predCor <- cor(training[, newAssays]) predictors_to_drop = findCorrelation( predCor, cutoff=0.75 ) print( "Predictors to drop ... as they are too correlated" ) print( newAssays[ predictors_to_drop ] ) newAssays = newAssays[ -predictors_to_drop ] training = training[, -predictors_to_drop ] testing = testing[, -predictors_to_drop ] predVars = newAssays ctrl$functions <- caretFuncs ctrl$functions$summary <- fiveStats set.seed(721) sparseLDA_filter <- rfe(training[, predVars], training$Class, sizes = varSeq, rfeControl = ctrl, metric = "ROC", ## Now arguments to train() are used. method = "sparseLDA", tuneLength = 5, trControl = cvCtrl) sparseLDA_filter # Lets plot the performance profiles for each of these two # Extract the (x,y) points from the "plot" command: # p_full = plot(sparseLDA_full) p_filter = plot(sparseLDA_filter) # Plot these in the same "format" as the figure in the book: # min_y = min( p_full$panel.args[[1]]$y, p_filter$panel.args[[1]]$y ) max_y = max( p_full$panel.args[[1]]$y, p_filter$panel.args[[1]]$y ) if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter19/chap_19_prob_2_RFE_profiles.eps", onefile=FALSE, horizontal=FALSE) } par(mfrow=c(1,2)) plot( p_full$panel.args[[1]]$x, p_full$panel.args[[1]]$y, ylim=c(min_y,max_y), xlab="", ylab="", main="All predictors" ); grid() plot( p_filter$panel.args[[1]]$x, p_filter$panel.args[[1]]$y, ylim=c(min_y,max_y), xlab="", ylab="", main="Filtered prediCtors" ); grid() par(mfrow=c(1,1)) if( save_plots ){ dev.off() }