# Problem EPage 208
#
# Written by:
# --
# John L. Weatherwax 2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
#-----

# Whether to request conditional permutation importance from varimp() in the
# cforest experiments below.
use_conditional_true <- TRUE

library(caret)
library(mlbench)
library(gbm)
library(randomForest)
library(party)

set.seed(200)

# Simulate the Friedman (1991) benchmark regression data: 10 uniform
# predictors (only V1-V5 enter the true model) plus a continuous response y.
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"

# Helper: sort a one-column importance data frame in decreasing order of
# importance, keeping the data-frame shape (drop = FALSE preserves rownames).
sort_importance <- function(imp) {
  imp[order(-imp[, 1]), , drop = FALSE]
}

# Part (a): variable importance from a random forest fit with no correlated
# predictors present.
model1 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp1 <- sort_importance(varImp(model1, scale = FALSE))
print("randomForest (no correlated predictor)")
print(rfImp1)

# Part (b): add a predictor highly correlated with V1 and refit; the
# importance credited to V1 should be diluted across the duplicates.
simulated$duplicate1 <- simulated$V1 + rnorm(200) * 0.1
cor(simulated$duplicate1, simulated$V1)

model2 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp2 <- sort_importance(varImp(model2, scale = FALSE))
print("randomForest (one correlated predictor)")
print(rfImp2)

# Add a second predictor correlated with V1.
simulated$duplicate2 <- simulated$V1 + rnorm(200) * 0.1
cor(simulated$duplicate2, simulated$V1)

model3 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp3 <- sort_importance(varImp(model3, scale = FALSE))
print("randomForest (two correlated predictors)")
print(rfImp3)

# Part (c): repeat the same experiment with conditional inference forests.
# Start from the original predictor set (drop the duplicates).
simulated$duplicate1 <- NULL
simulated$duplicate2 <- NULL

model1 <- cforest(y ~ ., data = simulated)
# BUG FIX: 'conditional' is an argument of varimp(), not of as.data.frame().
# The original code passed it to as.data.frame(), where it was silently
# swallowed by '...', so conditional importance was never actually computed.
cfImp1 <- as.data.frame(varimp(model1, conditional = use_conditional_true))
cfImp1 <- sort_importance(cfImp1)
print(sprintf("cforest (no correlated predictor); varimp(*,conditional=%s)",
              use_conditional_true))
print(cfImp1)

# Now we add correlated predictors one at a time.
simulated$duplicate1 <- simulated$V1 + rnorm(200) * 0.1
model2 <- cforest(y ~ ., data = simulated)
cfImp2 <- as.data.frame(varimp(model2, conditional = use_conditional_true))
cfImp2 <- sort_importance(cfImp2)
print(sprintf("cforest (one correlated predictor); varimp(*,conditional=%s)",
              use_conditional_true))
print(cfImp2)

simulated$duplicate2 <- simulated$V1 + rnorm(200) * 0.1
model3 <- cforest(y ~ ., data = simulated)
cfImp3 <- as.data.frame(varimp(model3, conditional = use_conditional_true))
cfImp3 <- sort_importance(cfImp3)
print(sprintf("cforest (two correlated predictors); varimp(*,conditional=%s)",
              use_conditional_true))
print(cfImp3)

# Finally, the same experiment using boosted trees (gbm); summary() reports
# the relative influence of each predictor.
simulated$duplicate1 <- NULL
simulated$duplicate2 <- NULL

model1 <- gbm(y ~ ., data = simulated, distribution = "gaussian", n.trees = 1000)
print("gbm (no correlated predictor)")
print(summary(model1, plotit = FALSE))

# Now we add correlated predictors one at a time.
simulated$duplicate1 <- simulated$V1 + rnorm(200) * 0.1
model2 <- gbm(y ~ ., data = simulated, distribution = "gaussian", n.trees = 1000)
print("gbm (one correlated predictor)")
print(summary(model2, plotit = FALSE))

simulated$duplicate2 <- simulated$V1 + rnorm(200) * 0.1
model3 <- gbm(y ~ ., data = simulated, distribution = "gaussian", n.trees = 1000)
print("gbm (two correlated predictors)")
print(summary(model3, plotit = FALSE))