# # Written by: # -- # John L. Weatherwax 2009-04-21 # # email: wax@alum.mit.edu # # Please send comments and especially bug reports to the # above email address. # # EPage 497 # #----- save_plots = F num_processors_to_use = 2 library(caret) library(AppliedPredictiveModeling) data(AlzheimerDisease) # Lets look at the code for this chapter using the command: # # scriptLocation() # # We will use some of the code there for this problem #-- # We start by loaing in the data exactly as is done in the .R file for this chapter: # ## The baseline set of predictors bl <- c("Genotype", "age", "tau", "p_tau", "Ab_42", "male") ## The set of new assays newAssays <- colnames(predictors) newAssays <- newAssays[!(newAssays %in% c("Class", bl))] ## Decompose the genotype factor into binary dummy variables predictors$E2 <- predictors$E3 <- predictors$E4 <- 0 predictors$E2[grepl("2", predictors$Genotype)] <- 1 predictors$E3[grepl("3", predictors$Genotype)] <- 1 predictors$E4[grepl("4", predictors$Genotype)] <- 1 genotype <- predictors$Genotype ## Partition the data library(caret) set.seed(730) split <- createDataPartition(diagnosis, p = .8, list = FALSE) adData <- predictors adData$Class <- diagnosis training <- adData[ split, ] testing <- adData[-split, ] predVars <- names(adData)[!(names(adData) %in% c("Class","Genotype"))] ## This summary function is used to evaluate the models. fiveStats <- function(...) c(twoClassSummary(...), defaultSummary(...)) ## We create the cross-validation files as a list to use with different ## functions set.seed(104) index <- createMultiFolds(training$Class, times = 5) ## The candidate set of the number of predictors to evaluate varSeq <- seq(1, length(predVars)-1, by = 2) ## We can also use parallel processing to run each resampled RFE ## iteration (or resampled model with train()) using different ## workers. library(doMC) registerDoMC(num_processors_to_use) ## The rfe() function in the caret package is used for recursive feature ## elimination. We setup control functions for this and train() that use ## the same cross-validation folds. The 'ctrl' object will be modifed several ## times as we try different models ctrl <- rfeControl(method = "repeatedcv", repeats = 5, saveDetails = TRUE, index = index, returnResamp = "final") fullCtrl <- trainControl(method = "repeatedcv", repeats = 5, summaryFunction = fiveStats, classProbs = TRUE, index = index) ## The original correlation matrix of the new data without dropping highly correlated features: predCor <- cor(training[, newAssays]) if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter19/chap_19_prob_1_orig_corrplot.eps", onefile=FALSE, horizontal=FALSE) } library(RColorBrewer) cols <- c(rev(brewer.pal(7, "Blues")), brewer.pal(7, "Reds")) library(corrplot) corrplot(predCor, order = "hclust", tl.pos = "n", addgrid.col = rgb(1,1,1,.01), col = colorRampPalette(cols)(51)) if( save_plots ){ dev.off() } # # Part (a): # predictors_to_drop = findCorrelation( predCor, cutoff=0.75 ) print( "Predictors to drop ... as they are too correlated" ) print( newAssays[ predictors_to_drop ] ) #-- # Drop these correlated predictors: #-- newAssays = newAssays[ -predictors_to_drop ] training = training[, -predictors_to_drop ] testing = testing[, -predictors_to_drop ] predVars = newAssays # Replot the correlation plot with these highly correlated predictors dropped: # if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter19/chap_19_prob_1_new_corrplot.eps", onefile=FALSE, horizontal=FALSE) } cols <- c(rev(brewer.pal(7, "Blues")), brewer.pal(7, "Reds")) library(corrplot) corrplot(cor(training[,newAssays]), order = "hclust", tl.pos = "n",addgrid.col = rgb(1,1,1,.01), col = colorRampPalette(cols)(51)) if( save_plots ){ dev.off() } # # Part (b): With this reduced features selection we now try to perform recursive feature selection: # # Again borrowed from the code location above: # ## Now fit the RFE versions. To do this, the 'functions' argument of the rfe() ## object is modified to the approproate functions. For model details about ## these functions and their arguments, see ## ## http://caret.r-forge.r-project.org/featureSelection.html ## ## for more information. # ctrl$functions <- rfFuncs ctrl$functions$summary <- fiveStats set.seed(721) rfRFE <- rfe(training[, predVars], training$Class, sizes = varSeq, metric = "ROC", ntree = 1000, rfeControl = ctrl) rfRFE ctrl$functions <- ldaFuncs ctrl$functions$summary <- fiveStats set.seed(721) ldaRFE <- rfe(training[, predVars], training$Class, sizes = varSeq, metric = "ROC", tol = 1.0e-12, rfeControl = ctrl) ldaRFE ctrl$functions <- nbFuncs ctrl$functions$summary <- fiveStats set.seed(721) nbRFE <- rfe(training[, predVars], training$Class, sizes = varSeq, metric = "ROC", rfeControl = ctrl) nbRFE ## This options tells train() to run it's model tuning ## sequentially. Otherwise, there would be parallel processing at two ## levels, which is possible but requires W^2 workers. On our machine, ## it was more efficient to only run the RFE process in parallel. cvCtrl <- trainControl(method = "cv", verboseIter = FALSE, classProbs = TRUE, allowParallel = FALSE) ## Here, the caretFuncs list allows for a model to be tuned at each iteration ## of feature seleciton. ## ctrl$functions <- caretFuncs ctrl$functions$summary <- fiveStats set.seed(721) smvRFE <- rfe(training[, predVars], training$Class, sizes = varSeq, rfeControl = ctrl, metric = "ROC", ## Now arguments to train() are used. method = "svmRadial", tuneLength = 12, preProc = c("center", "scale"), trControl = cvCtrl) smvRFE ctrl$functions <- lrFuncs ctrl$functions$summary <- fiveStats set.seed(721) lrRFE <- rfe(training[, predVars], training$Class, sizes = varSeq, metric = "ROC", rfeControl = ctrl) lrRFE ctrl$functions <- caretFuncs ctrl$functions$summary <- fiveStats set.seed(721) knnRFE <- rfe(training[, predVars], training$Class, sizes = varSeq, metric = "ROC", method = "knn", tuneLength = 20, preProc = c("center", "scale"), trControl = cvCtrl, rfeControl = ctrl) knnRFE # Here we want to plot the same profiles as in Figure 19.5 (Page 506) we could use: # # http://www.cookbook-r.com/Graphs/Facets_%28ggplot2%29/ # # Extract the (x,y) points from the "plot" command: # p_lda = plot(ldaRFE) p_rf = plot(rfRFE) p_nb = plot(nbRFE) p_lr = plot(lrRFE) p_knn = plot(knnRFE) p_svm = plot(smvRFE) # Plot these in the same "format" as the figure in the book: # min_y = min( p_lda$panel.args[[1]]$y, p_rf$panel.args[[1]]$y, p_nb$panel.args[[1]]$y, p_lr$panel.args[[1]]$y, p_knn$panel.args[[1]]$y, p_svm$panel.args[[1]]$y ) max_y = max( p_lda$panel.args[[1]]$y, p_rf$panel.args[[1]]$y, p_nb$panel.args[[1]]$y, p_lr$panel.args[[1]]$y, p_knn$panel.args[[1]]$y, p_svm$panel.args[[1]]$y ) if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter19/chap_19_prob_1_RFE_profiles.eps", onefile=FALSE, horizontal=FALSE) } par(mfrow=c(3,2)) plot( p_lda$panel.args[[1]]$x, p_lda$panel.args[[1]]$y, ylim=c(min_y,max_y), xlab="", ylab="", main="LDA" ) plot( p_rf$panel.args[[1]]$x, p_rf$panel.args[[1]]$y, ylim=c(min_y,max_y), xlab="", ylab="", main="RF" ) plot( p_nb$panel.args[[1]]$x, p_nb$panel.args[[1]]$y, ylim=c(min_y,max_y), xlab="", ylab="ROC", main="NB" ) plot( p_lr$panel.args[[1]]$x, p_lr$panel.args[[1]]$y, ylim=c(min_y,max_y), xlab="", ylab="ROC", main="LR" ) plot( p_knn$panel.args[[1]]$x, p_knn$panel.args[[1]]$y, ylim=c(min_y,max_y), xlab="Number of Predictors", ylab="", main="KNN" ) plot( p_svm$panel.args[[1]]$x, p_svm$panel.args[[1]]$y, ylim=c(min_y,max_y), xlab="Number of Predictors", ylab="", main="SVM" ) grid() par(mfrow=c(1,1)) if( save_plots ){ dev.off() }