# 
# Written by:
# -- 
# John L. Weatherwax                2007-07-05
# 
# email: wax@alum.mit.edu
# 
# Please send comments and especially bug reports to the
# above email address.
# 
#-----

###install.packages('gbm')
library(gbm)

set.seed(0)

source('gen_eq_10_2_data.R')

p = 10 # the dimension of the feature vector (a "p" dimensional vector)
N_train = 2000 # number samples to use in training
N_test = 10000 # number of samples to use in testing 

# Extract the data we will to classify: 
D_train = gen_eq_10_2_data(N=N_train,p=p)
D_train[ D_train$Y==-1, p+1 ] = 0 # Map the response "-1" to the value of "0" (required format for the call to gbm): 

# Generate the formula used to fit our model with: 
#
terms = paste( colnames(D_train)[1:p], collapse="+" ) # dont consider the last column (the response variable)
formula = formula( paste( colnames(D_train)[p+1], " ~ ", terms ) )

# Do training with the maximum number of trees: 
#
n_trees = 400
if( T ){
  print( "Running Adaboost ..." )
  m = gbm( formula, data=D_train, distribution='adaboost', n.trees=n_trees, shrinkage=1, verbose=TRUE )
}else{
  print( "Running Bernoulli Boosting ..." )
  m = gbm( formula, data=D_train, distribution='bernoulli', n.trees=n_trees, verbose=TRUE )
}

# Lets plot the training error as a function of the number of trees:
#
# Note that when using the "bernoulli" output to compute the output of predict into a probability one needs to use:
# plogis( predict( m, D_train[,1:p], n.trees=nti ) ) 
#
training_error = matrix( 0, nrow=n_trees, ncol=1 )
for( nti in seq(1,n_trees) ){
  Fhat = predict( m, D_train[,1:p], n.trees=nti )
  pcc = mean( ( ( Fhat <= 0 ) & ( D_train[,p+1] == 0 ) ) | ( ( Fhat > 0 ) & ( D_train[,p+1] == 1 ) ) )
  training_error[nti] = 1 - pcc
}

# Lets plot the testing error as a function of the number of trees:
#
D_test = gen_eq_10_2_data(N=N_test,p=p)
D_test[ D_test$Y==-1, p+1 ] = 0 # Map the response "-1" to the value of "0" (required format for the call to gbm): 

test_error = matrix( 0, nrow=n_trees, ncol=1 )
for( nti in seq(1,n_trees) ){
  Fhat = predict( m, D_test[,1:p], n.trees=nti )
  pcc = mean( ( ( Fhat <= 0 ) & ( D_test[,p+1] == 0 ) ) | ( ( Fhat > 0 ) & ( D_test[,p+1] == 1 ) ) )
  test_error[nti] = 1 - pcc 
}


#postscript("../../WriteUp/Graphics/Chapter10/dup_fig_10_2.eps", onefile=FALSE, horizontal=FALSE)

plot( seq(1,n_trees), training_error, type="l", main="Boosting Probability of Error", col="red", xlab="number of boosting iterations", ylab="classification error" )
lines( seq(1,n_trees), test_error, type="l", col="green" )

legend( 275, 0.45, c("training error", "testing error"), col=c("red", "green"), lty=c(1,1) )

#dev.off()