# Problem EPage 208
#
# Written by:
# --
# John L. Weatherwax 2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
#-----

# Whether to request conditional permutation importance from varimp() in the
# cforest experiments below.
use_conditional_true <- TRUE

library(caret)
library(mlbench)
library(gbm)
library(randomForest)
library(party)

set.seed(200)

# Simulate the Friedman (1991) benchmark regression data: 10 uniform
# predictors (only V1-V5 enter the true model) plus a continuous response y.
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"

# Helper: sort a one-column importance data frame in decreasing order of
# importance, keeping the data-frame shape (drop = FALSE preserves rownames).
sort_importance <- function(imp) {
  imp[order(-imp[, 1]), , drop = FALSE]
}

# Part (a): variable importance from a random forest fit with no correlated
# predictors present.
model1 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp1 <- sort_importance(varImp(model1, scale = FALSE))
print("randomForest (no correlated predictor)")
print(rfImp1)

# Part (b): add a predictor highly correlated with V1 and refit; the
# importance credited to V1 should be diluted across the duplicates.
simulated$duplicate1 <- simulated$V1 + rnorm(200) * 0.1
cor(simulated$duplicate1, simulated$V1)

model2 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp2 <- sort_importance(varImp(model2, scale = FALSE))
print("randomForest (one correlated predictor)")
print(rfImp2)

# Add a second predictor correlated with V1.
simulated$duplicate2 <- simulated$V1 + rnorm(200) * 0.1
cor(simulated$duplicate2, simulated$V1)

model3 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp3 <- sort_importance(varImp(model3, scale = FALSE))
print("randomForest (two correlated predictors)")
print(rfImp3)

# Part (c): repeat the same experiment with conditional inference forests.
# Start from the original predictor set (drop the duplicates).
simulated$duplicate1 <- NULL
simulated$duplicate2 <- NULL

model1 <- cforest(y ~ ., data = simulated)
# BUG FIX: 'conditional' is an argument of varimp(), not of as.data.frame().
# The original code passed it to as.data.frame(), where it was silently
# swallowed by '...', so conditional importance was never actually computed.
cfImp1 <- as.data.frame(varimp(model1, conditional = use_conditional_true))
cfImp1 <- sort_importance(cfImp1)
print(sprintf("cforest (no correlated predictor); varimp(*,conditional=%s)",
              use_conditional_true))
print(cfImp1)

# Now we add correlated predictors one at a time.
simulated$duplicate1 <- simulated$V1 + rnorm(200) * 0.1
model2 <- cforest(y ~ ., data = simulated)
cfImp2 <- as.data.frame(varimp(model2, conditional = use_conditional_true))
cfImp2 <- sort_importance(cfImp2)
print(sprintf("cforest (one correlated predictor); varimp(*,conditional=%s)",
              use_conditional_true))
print(cfImp2)

simulated$duplicate2 <- simulated$V1 + rnorm(200) * 0.1
model3 <- cforest(y ~ ., data = simulated)
cfImp3 <- as.data.frame(varimp(model3, conditional = use_conditional_true))
cfImp3 <- sort_importance(cfImp3)
print(sprintf("cforest (two correlated predictors); varimp(*,conditional=%s)",
              use_conditional_true))
print(cfImp3)

# Finally, the same experiment using boosted trees (gbm); summary() reports
# the relative influence of each predictor.
simulated$duplicate1 <- NULL
simulated$duplicate2 <- NULL

model1 <- gbm(y ~ ., data = simulated, distribution = "gaussian", n.trees = 1000)
print("gbm (no correlated predictor)")
print(summary(model1, plotit = FALSE))

# Now we add correlated predictors one at a time.
simulated$duplicate1 <- simulated$V1 + rnorm(200) * 0.1
model2 <- gbm(y ~ ., data = simulated, distribution = "gaussian", n.trees = 1000)
print("gbm (one correlated predictor)")
print(summary(model2, plotit = FALSE))

simulated$duplicate2 <- simulated$V1 + rnorm(200) * 0.1
model3 <- gbm(y ~ ., data = simulated, distribution = "gaussian", n.trees = 1000)
print("gbm (two correlated predictors)")
print(summary(model3, plotit = FALSE))