#
# Written by:
# -- 
# John L. Weatherwax                2009-04-21
# 
# email: wax@alum.mit.edu
# 
# Please send comments and especially bug reports to the
# above email address.
#
# Problem on EPage 523
#
# Use R Code on Pages 541-542 (EPage 519-520)
#
                 
# Set to FALSE to draw the histogram on the current graphics device instead of
# writing it to an EPS file.
save_plots = TRUE

library(caret)
library(AppliedPredictiveModeling)
data(solubility)

# ---------------------------------------------------------------------------
# Step 1: tune a random forest regression model so that we can rank the
# predictors by importance.
# ---------------------------------------------------------------------------

# Fix the cross-validation folds so the resampling is repeatable:
set.seed(100)
indx = createFolds(solTrainY, returnTrain=TRUE)
ctrl = trainControl(method="cv", index=indx)

# Ten candidate mtry values spread from 10 up to the number of predictors:
set.seed(100)
mtryVals = floor(seq(10, ncol(solTrainXtrans), length=10))
mtryGrid = data.frame(.mtry=mtryVals)
rfTune = train(x=solTrainXtrans, y=solTrainY, method="rf",
               tuneGrid=mtryGrid, ntree=200, importance=TRUE,
               trControl=ctrl)

# ---------------------------------------------------------------------------
# Step 2: keep the 20 predictors ranked highest by the first column of the
# final model's importance matrix.
# ---------------------------------------------------------------------------
ImportanceOrder = order( rfTune$finalModel$importance[,1], decreasing=TRUE )
top20 = rownames( rfTune$finalModel$importance[ImportanceOrder,] )[1:20]

solTrainXimp = subset( solTrainX, select=top20 )
solTestXimp = subset( solTestX, select=top20 )

# ---------------------------------------------------------------------------
# Step 3: build a "random" counterpart of the training set by independently
# permuting every column, then stack it underneath the real training data.
# ---------------------------------------------------------------------------

# Seed the generator so that the permutation and the classifier fit below
# (and hence the histogram and the final count) are repeatable from run to run:
set.seed(100)
permutesolTrainXimp = apply( solTrainXimp, 2, function(x) sample(x))
solSimX = rbind( solTrainXimp, permutesolTrainXimp )

# The first nrow(solTrainX) rows are real ("Training"); the rest are permuted ("Random"):
groupVals = c("Training", "Random")
groups = factor( rep(groupVals, each=nrow(solTrainX)) )

# ---------------------------------------------------------------------------
# Step 4: train a classifier to separate real from permuted samples and apply
# it to the test set.
# ---------------------------------------------------------------------------

# NOTE: the resampling specification must be passed to train() as `trControl`.
# Passing it as `control` does not match any train() argument, so it is
# forwarded through `...` to the model fit and train() silently falls back to
# its default resampling scheme instead of LGOCV.
rfSolClass = train( x=solSimX, y=groups, method="rf", tuneLength=5, ntree=1000,
                    trControl=trainControl(method="LGOCV") )
solTestGroupProbs = predict(rfSolClass, solTestXimp, type="prob")

# Plot the test set samples' predicted probability of membership in the training set:
#
if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter20/chap_20_prob_3_probability_from_training_set.eps", onefile=FALSE, horizontal=FALSE) }
hist( solTestGroupProbs$Training )
if( save_plots ){ dev.off() }

# How many test set samples have a probability of at most 0.5 of being in the training set:
#
print( sum( solTestGroupProbs$Training <= 0.5 ) )