#
# Written by:
# --
# John L. Weatherwax                2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
# WARNING: This code can take a long time (~ two days) to run.
#
# EPage 497
#
#-----

save_plots = F

num_processors_to_use = 2

library(caret)
library(AppliedPredictiveModeling)
library(mlbench) # has the function mlbench.friedman1

#
# The simulated data here is that of Exercise 7.2 (EPage 160):
#
set.seed(200)
trainingData = mlbench.friedman1( 500, sd=1 )
trainingData$x = data.frame(trainingData$x)

testingData = mlbench.friedman1( 500, sd=1 )
testingData$x = data.frame(testingData$x)

# Visualize the predictors and their correlation structure:
#
if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter19/chap_19_prob_4_featurePlot.eps", onefile=FALSE, horizontal=FALSE) }
featurePlot( trainingData$x, trainingData$y )
if( save_plots ){ dev.off() }

if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter19/chap_19_prob_4_pairs.eps", onefile=FALSE, horizontal=FALSE) }
pairs( trainingData$x )
if( save_plots ){ dev.off() }

if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter19/chap_19_prob_4_corrplot.eps", onefile=FALSE, horizontal=FALSE) }
library(RColorBrewer)
cols <- c(rev(brewer.pal(7, "Blues")), brewer.pal(7, "Reds"))
library(corrplot)
corrplot( cor(trainingData$x), order="hclust", tl.pos="n",
          addgrid.col=rgb(1, 1, 1, 0.01), col=colorRampPalette(cols)(51) )
if( save_plots ){ dev.off() }

#
# Part (b): Stepwise selection with a linear model:
#
df = trainingData$x; df$Y = trainingData$y

null = lm( Y ~ 1, data=df )
full = lm( Y ~ ., data=df )

# Forward selection:
step( null, scope=list(lower=null, upper=full), direction="forward", data=df )

# Backward selection:
step( full, direction="backward", data=df )

# Both directions:
step( null, scope=list(upper=full), direction="both", data=df )

#
# Part (c): Use recursive feature elimination with a couple of different model types:
#
library(doMC)
registerDoMC(num_processors_to_use)

set.seed(104)
index <- createMultiFolds(trainingData$y, times = 5)

## The candidate set of the number of predictors to evaluate:
varSeq <- seq(1, dim(trainingData$x)[2], by=1)

## This is the resampling control used for all of the rfe() calls:
ctrl <- rfeControl( method = "repeatedcv", repeats = 5, saveDetails = TRUE,
                    index = index, returnResamp = "final" )

# Random forest:
ctrl$functions <- rfFuncs
set.seed(721)
rfRFE <- rfe( trainingData$x, trainingData$y, sizes = varSeq,
              ntree = 1000, rfeControl = ctrl )
rfRFE

# Linear regression:
ctrl$functions <- lmFuncs
set.seed(721)
lmRFE <- rfe( trainingData$x, trainingData$y, sizes = varSeq, tol = 1.0e-12,
              rfeControl = ctrl, preProc = c("center", "scale") )
lmRFE
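# Not part of the original exercise text: the held-out `testingData` built
# above is never scored in this script, so as a quick sanity check we can
# evaluate the random forest RFE winner on it (predict() on an rfe object
# applies the final fitted model to the selected predictors).  This is a
# sketch; the RFE fits that used preProc may need the same preprocessing
# reapplied by hand before being scored this way.
print( postResample( predict( rfRFE, testingData$x ), testingData$y ) )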
method = "svmRadial", tuneLength = 10, preProc = c("center", "scale"), trControl = cvCtrl) svmRFE ctrl$functions <- caretFuncs set.seed(721) knnRFE <- rfe(trainingData$x, trainingData$y, sizes = varSeq, method = "knn", tuneLength = 20, preProc = c("center", "scale"), trControl = cvCtrl, rfeControl = ctrl) knnRFE # Package the results on what variables were selected at each subset size: WWX: Here # paste0( "RF: ", paste( sprintf( "%s", rownames(rfRFE$fit[[7]]) ), collapse=", " ) ) paste0( "LM: ", paste( sprintf( "%s", names(lmRFE$fit$coefficients) ), collapse=", " ) ) cn = colnames(svmRFE$fit$trainingData[-length(svmRFE$fit$trainingData)]) paste0( "SVM: ", paste( cn, collapse=", " ) ) paste0( "KNN: ", paste( sprintf( "%s", colnames(knnRFE$fit$finalModel$learn$X) ), collapse=", " ) ) # # Part (d): Apply filter methods # # Evaluate each predictor separately and take the top five: # VI = filterVarImp( trainingData$x, trainingData$y ) print( VI[ order(VI$Overall, decreasing=T), , drop=F ] ) # Evaluate them together using ReliefF and take the top five: # library(CORElearn) df = trainingData$x; df$Y = trainingData$y reliefValues = attrEval( Y ~ ., data=df, estimator="RReliefFequalK" ) print( sort( reliefValues, decreasing=TRUE ) ) # # Part (e): # # Take 100 data points and build model with step and observe how it performs: # set.seed(201) trainingData = mlbench.friedman1( 100, sd=1 ) trainingData$x = data.frame(trainingData$x) df = trainingData$x; df$Y = trainingData$y null = lm( Y ~ 1, data=df ) full = lm( Y ~ ., data=df ) # Forward selection: step( null, scope=list(lower=null, upper=full), direction="forward", data=df ) # Backward selection: step( full, direction="backward", data=df ) # Both directions: step( null, scope=list(upper=full), direction="both", data=df )