#
# Written by:
# --
# John L. Weatherwax                2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
# Problem on EPage 523
#
# Use R Code on Pages 541-542 (EPage 519-520)
#
# Outline of this script:
#   1. Tune a random forest regression on the transformed solubility
#      predictors and rank the predictors by the first importance column.
#   2. Keep the 20 highest-ranked predictors and build a two-class data set:
#      the real training rows ("Training") stacked on a copy whose columns
#      have each been independently permuted ("Random").
#   3. Fit a random forest classifier to separate the two classes and use its
#      class probabilities to score how "training-like" each test row is.
#

save_plots <- TRUE

library(caret)
library(AppliedPredictiveModeling)

data(solubility)

# --- 1. Regression forest used only to rank the predictors -----------------

# Fix the cross-validation folds so the resampling is repeatable.
set.seed(100)
indx <- createFolds(solTrainY, returnTrain = TRUE)
ctrl <- trainControl(method = "cv", index = indx)

set.seed(100)
# Ten candidate mtry values spread evenly from 10 up to the predictor count.
mtryVals <- floor(seq(10, ncol(solTrainXtrans), length.out = 10))
mtryGrid <- data.frame(.mtry = mtryVals)

rfTune <- train(
  x = solTrainXtrans,
  y = solTrainY,
  method = "rf",
  tuneGrid = mtryGrid,
  ntree = 200,
  importance = TRUE,
  trControl = ctrl
)

# Rank predictors by the first column of the fitted forest's importance
# matrix (presumably %IncMSE for a regression forest fit with
# importance = TRUE -- confirm against the randomForest documentation).
ImportanceOrder <- order(rfTune$finalModel$importance[, 1], decreasing = TRUE)
top20 <- rownames(rfTune$finalModel$importance[ImportanceOrder, ])[1:20]

# The ranking came from the transformed predictors; the same column names are
# selected here from the untransformed training and test sets.
solTrainXimp <- subset(solTrainX, select = top20)
solTestXimp <- subset(solTestX, select = top20)

# --- 2. Build the "Training" versus "Random" data set ----------------------

# Shuffle every column on its own: each predictor keeps its values, but the
# rows no longer carry the relationships between predictors.
permutesolTrainXimp <- apply(solTrainXimp, 2, function(x) sample(x))
solSimX <- rbind(solTrainXimp, permutesolTrainXimp)

# Labels follow the row order of solSimX: real rows first, permuted rows next.
groupVals <- c("Training", "Random")
groups <- factor(rep(groupVals, each = nrow(solTrainX)))

# --- 3. Classifier separating real rows from permuted rows -----------------

# The resampling specification must be passed as `trControl`.  The original
# code used `control = ...`, which train() does not recognise; it was
# forwarded through `...` to the underlying model fit, so leave-group-out
# cross-validation was never actually used.
rfSolClass <- train(
  x = solSimX,
  y = groups,
  method = "rf",
  tuneLength = 5,
  ntree = 1000,
  trControl = trainControl(method = "LGOCV")
)

solTestGroupProbs <- predict(rfSolClass, solTestXimp, type = "prob")

# Extract the test set's probability of membership in the training set:
#
if (save_plots) {
  postscript("../../WriteUp/Graphics/Chapter20/chap_20_prob_3_probability_from_training_set.eps", onefile=FALSE, horizontal=FALSE)
}
hist(solTestGroupProbs$Training)
if (save_plots) {
  dev.off()
}

# How many test set samples have a probability of at most 0.5 of being in the
# training set:
#
print(sum(solTestGroupProbs$Training <= 0.5))