#
# Written by:
# -- 
# John L. Weatherwax                2009-04-21
# 
# email: wax@alum.mit.edu
# 
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 523-524
#
#-----

library(caret)
library(AppliedPredictiveModeling)
# Load the oil data set; the objects fattyAcids and oilType used below are
# expected to be provided by this call.
data(oil)
set.seed(314)

# Draw half of the row indices at random for training; every row that was
# not drawn goes to the test set. The same indices are applied to the
# predictors (fattyAcids) and to the labels (oilType) so they stay aligned.
sampleRows <- sample.int(nrow(fattyAcids), size = 0.5 * nrow(fattyAcids))

fattyAcidsTrain <- fattyAcids[sampleRows, ]
fattyAcidsTest <- fattyAcids[-sampleRows, ]

oilTypeTrain <- oilType[sampleRows]
oilTypeTest <- oilType[-sampleRows]

# Part (a):
#
# Build cross-validation folds from the training labels (returnTrain = TRUE
# requests the rows used for fitting in each fold) and wrap them in a
# resampling control object.
# NOTE(review): neither indx nor ctrl is referenced in the visible remainder
# of this script -- confirm whether a later part relies on them.
set.seed(100)
indx <- createFolds(oilTypeTrain, returnTrain = TRUE)
ctrl <- trainControl(method = "cv", index = indx)

# Add permuted data, train a model, and learn which test samples are not
# likely to be part of the training dataset.
#
# Every predictor column of the training set is shuffled independently, so
# the permuted rows keep each column's values but not the relationships
# between columns. Real rows are labelled "Training" and shuffled rows
# "Random"; the label order matches the row order produced by rbind().
permutefattyAcidsTrain <- apply(fattyAcidsTrain, 2, function(x) sample(x))
solSimX <- rbind(fattyAcidsTrain, permutefattyAcidsTrain)
groupVals <- c("Training", "Random")
groupY <- factor(rep(groupVals, each = nrow(fattyAcidsTrain)))

# Random forest that separates real rows from permuted rows.
# The resampling specification has to be supplied as `trControl`: train()
# has no `control` argument, so under that name the value was swallowed by
# `...` and the requested leave-group-out CV was never applied.
rfSolClass <- train(
  x = solSimX, y = groupY,
  method = "rf", tuneLength = 5, ntree = 1000,
  trControl = trainControl(method = "LGOCV")
)

# Class probabilities for the held-out samples; one column per class label.
solTestGroupProbs <- predict(rfSolClass, fattyAcidsTest, type = "prob")

# Flag test rows whose probability of belonging to the "Training" class is
# below one half, then report them.
notTraining <- (solTestGroupProbs$Training < 0.5)
print("The test samples that are not likely to be in the training set are:")
print(fattyAcidsTest[notTraining, ])

# Overall minimum and maximum over all training predictor values.
print(range(fattyAcidsTrain))