#
# Written by:
# --
# John L. Weatherwax                2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 523-524
#
#-----

library(caret)
library(AppliedPredictiveModeling)

# The objects `fattyAcids` (predictors) and `oilType` (labels) used below are
# expected to be made available by this data() call.
data(oil)

# Split the samples 50/50 into a training half and a test half.
# floor() makes the (otherwise implicitly truncated) sample size explicit.
set.seed(314)
sampleRows = sample.int( nrow(fattyAcids), size=floor(0.5*nrow(fattyAcids)) )
fattyAcidsTrain = fattyAcids[sampleRows,]
oilTypeTrain = oilType[sampleRows]
fattyAcidsTest = fattyAcids[-sampleRows,]
oilTypeTest = oilType[-sampleRows]

# Part (a):
#
set.seed(100)
# Cross-validation folds over the training labels.
# NOTE(review): `indx` and `ctrl` are not referenced again in this section;
# they are kept so the random-number stream (and therefore the permutation
# below) is unchanged, and in case later parts of the file use them.
indx = createFolds(oilTypeTrain, returnTrain=TRUE)
ctrl = trainControl(method="cv", index=indx)

# Add permuted data, train a model, and learn which test samples are not
# likely to be part of the training dataset:
#
# Each column is shuffled independently, which keeps every predictor's
# marginal distribution but breaks the relationships between predictors.
permutefattyAcidsTrain = apply( fattyAcidsTrain, 2, function(x) sample(x) )
solSimX = rbind( fattyAcidsTrain, permutefattyAcidsTrain )

# Label the first half of the stacked rows "Training" and the second half
# (the permuted copies) "Random".
groupVals = c("Training", "Random")
groupY = factor( rep(groupVals, each=nrow(fattyAcidsTrain)) )

# Random forest that tries to tell real training rows from permuted rows.
# train() receives its resampling specification through `trControl`; an
# argument named `control` would fall into `...`, be forwarded to the
# underlying model fit and leave the resampling scheme at its default.
rfSolClass = train( x=solSimX, y=groupY,
                    method="rf",
                    tuneLength=5,
                    ntree=1000,
                    trControl=trainControl(method="LGOCV") )

# Class probabilities for the held-out samples; a test sample whose
# "Training" probability is below 0.5 is flagged as unlike the training data.
solTestGroupProbs = predict(rfSolClass, fattyAcidsTest, type="prob")
notTraining = ( solTestGroupProbs$Training < 0.5 )

print( "The test samples that are not likely to be in the training set are:" )
print( fattyAcidsTest[notTraining,] )

# Overall (min, max) of the training predictors, for comparison with the
# flagged rows printed above.
print( range(fattyAcidsTrain) )