#
# Written by:
# --
# John L. Weatherwax 2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 497
#
#-----

#
# The cell segmentation data is discussed on EPage 50
#

save_plots = F
num_processors_to_use = 2

library(caret)
library(AppliedPredictiveModeling)

data(segmentationOriginal)

# Examples of reading this data set can be found in the file: O3_Data_Pre_Processing.R

## Retain the original training set
segTrain <- subset(segmentationOriginal, Case == "Train")

## Remove the first three columns (identifier columns)
segTrainX <- segTrain[, -(1:3)]
segTrainClass <- segTrain$Class

#--
# Part (a)
#--

# Drop predictors with near-zero variance:
zero_cols = nearZeroVar( segTrainX )
print( colnames( segTrainX )[zero_cols] )
segTrainX = segTrainX[, -zero_cols]

# What pairs of predictors have the largest correlations between them:
source("../Chapter18/predictor_importance_utils.R")
LC = largest_cors( segTrainX )
print( head( LC ) )

# Remove highly correlated features:
#
highCorr = findCorrelation( cor( segTrainX ), cutoff=0.75 )
segTrainX = segTrainX[, -highCorr]

#--
# Apply wrapper methods:
#--

# Using the "step" function:
#
df = segTrainX; df$Class = segTrainClass
null = glm( Class ~ 1, data=df, family=binomial )
full = glm( Class ~ ., data=df, family=binomial )

# Forward selection:
step( null, scope=list(lower=null, upper=full), direction="forward", data=df )

# Backward selection:
step( full, direction="backward", data=df )

# Both directions:
step( null, scope=list(upper=full), direction="both", data=df )

# Using the "fastbw" function (fast backward elimination on an rms model fit):
#
## library(rms)
## fastbw( lrm( Class ~ ., data=df ) )

# Using the "regsubsets" function (best-subset search for linear models, so
# Class would need a numeric 0/1 coding):
#
## library(leaps)
## regsubsets( Class ~ ., data=df, nbest=1, really.big=T, method="forward" )
## regsubsets( Class ~ ., data=df, nbest=1, really.big=T, method="backward" )

# Using the "stepclass" function:
#
library(klaR)
stepclass( Class ~ ., data=df, method="lda", direction="forward" )
stepclass( Class ~ ., data=df, method="lda", direction="backward" )
stepclass( Class ~ ., data=df, method="lda", direction="both" )

#--
# Apply filter methods:
#--

# Evaluate each predictor separately (ROC-based variable importance); the top five are of most interest:
#
VI = filterVarImp( df[, names(df) != "Class"], df$Class )
print( VI[ order(VI$PS, decreasing=TRUE), , drop=FALSE ] )

# Evaluate them together using ReliefF:
#
library(CORElearn)
reliefValues = attrEval( Class ~ ., data=df, estimator="Relief" )
print( sort( reliefValues, decreasing=TRUE ) )

# If we use recursive feature elimination we find that the models with many features do better:
#
library(doMC)
registerDoMC(num_processors_to_use)

# Resampled ROC, sensitivity, and specificity plus accuracy and kappa:
fiveStats <- function(...) c(twoClassSummary(...), defaultSummary(...))

set.seed(104)
index <- createMultiFolds(df$Class, times = 5)

# The candidate subset sizes to evaluate:
varSeq <- seq(1, dim(df)[2]-1, by = 2)

ctrl <- rfeControl(method = "repeatedcv", repeats = 5,
                   saveDetails = TRUE,
                   index = index,
                   returnResamp = "final")

fullCtrl <- trainControl(method = "repeatedcv",
                         repeats = 5,
                         summaryFunction = fiveStats,
                         classProbs = TRUE,
                         index = index)

# Use the random forest helper functions for RFE, with the five-statistic summary:
ctrl$functions <- rfFuncs
ctrl$functions$summary <- fiveStats

set.seed(721)
rfRFE <- rfe(df[, names(df) != "Class"], df$Class,
             sizes = varSeq,
             metric = "ROC",
             ntree = 1000,
             rfeControl = ctrl)
rfRFE
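
#
# A minimal sketch of inspecting the RFE results using standard caret
# accessors (the exact output depends on the run):
#
print( predictors( rfRFE ) )        # the predictors retained in the optimal subset
print( rfRFE$results )              # resampled performance as a function of subset size
plot( rfRFE, type = c("g", "o") )   # ROC vs. the number of predictors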
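
#
# The control object "fullCtrl" above is set up for fitting a model on all of
# the retained predictors; a minimal sketch of doing that (assuming the
# randomForest package is installed) so its resampled ROC can be compared with
# rfRFE.  Because both use the same resampling indices ("index"), the two sets
# of resampled statistics are directly comparable, e.g. via
# summary( resamples( list(RFE = rfRFE, Full = rfFull) ) ).
#
set.seed(721)
rfFull <- train( df[, names(df) != "Class"], df$Class,
                 method = "rf",
                 metric = "ROC",
                 ntree = 1000,
                 trControl = fullCtrl )
rfFull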