# 
# Written by:
# -- 
# John L. Weatherwax                2007-07-05
# 
# email: wax@alum.mit.edu
# 
# Please send comments and especially bug reports to the
# above email address.
# 
#-----

library(gbm)

set.seed(0)

source('../Chapter3/load_spam_data.R')

PD         = load_spam_data(trainingScale=FALSE,responseScale=FALSE) # read in unscaled data
p          = dim(PD[[1]])[2]-1 # the last column is the response 1=>Spam; 0=>Ham
XTraining  = PD[[1]]; colnames(XTraining)[p+1] = "Y"
XTesting   = PD[[2]]; colnames(XTesting)[p+1] = "Y"
spam_words = PD[[3]]

# Generate the formula used to fit our model with: 
#
terms = paste( colnames(XTraining)[1:p], collapse="+" ) # dont consider the last column (the response variable)
formula = formula( paste( colnames(XTraining)[p+1], " ~ ", terms ) )

n_trees = 50000
K = 3 # larger values of K can give different performance
m = gbm( formula, data=XTraining, distribution='bernoulli', n.trees=n_trees, shrinkage=0.005, interaction.depth=K, verbose=TRUE, cv.folds=5 )

# This works but I think the default method of gbm.perf(m,method="cv")
# provides a better esimate of performance as a function of boosting iteration : 
# 
if( FALSE ){ 
    # Simple sanity check that our algorithm is working correctly (plot training as a function of number of boosts): 
    #
    training_error = matrix( 0, nrow=n_trees, ncol=1 )
    for( nti in seq(1,n_trees) ){
      Fhat = predict( m, XTraining[,1:p], n.trees=nti )
      pcc = mean( ( ( Fhat <= 0 ) & ( XTraining[,p+1] == 0 ) ) | ( ( Fhat > 0 ) & ( XTraining[,p+1] == 1 ) ) )
      training_error[nti] = 1 - pcc
    }

    # Lets plot the testing error as a function of the number of trees:
    #
    test_error = matrix( 0, nrow=n_trees, ncol=1 )
    for( nti in seq(1,n_trees) ){
      Fhat = predict( m, XTesting[,1:p], n.trees=nti )
      pcc = mean( ( ( Fhat <= 0 ) & ( XTesting[,p+1] == 0 ) ) | ( ( Fhat > 0 ) & ( XTesting[,p+1] == 1 ) ) )
      test_error[nti] = 1 - pcc 
    }

    plot( seq(1,n_trees), training_error, type="l", main="Boosting Probability of Error", col="red", xlab="number of boosting iterations", ylab="classification error" )
    lines( seq(1,n_trees), test_error, type="l", col="green" )

    legend( 700, 0.4, c("training error", "testing error"), col=c("red", "green"), lty=c(1,1) )
}

# Look at the learning curves (estimated via cross validation) and determine the best number of boosting trees to use:
# 
#postscript("../../WriteUp/Graphics/Chapter10/dup_spam_learning_curves_K_3.eps", onefile=FALSE, horizontal=FALSE)
best.iter = gbm.perf(m,method="cv")
#dev.off()

# Plot the relative importance of each word used in the classification:
#
if( FALSE ){
  rel_importance = summary(m,n.trees=best.iter)
  
  # Get the relative importance of each word (normalized to 100 for the most important word): 
  max_value = rel_importance[[2]][1]
  rel_importance_norm_values = ( rel_importance[[2]]/max_value ) * 100.

  # Get the list of words ordered by the most important: 
  word_indx = as.double( sub( "V", "", rel_importance[[1]] ) )
  most_important_words = spam_words[ word_indx ] 

  #postscript("../../WriteUp/Graphics/Chapter10/dup_spam_rel_importance.eps", onefile=FALSE, horizontal=FALSE)
  barplot( rel_importance_norm_values, horiz=TRUE, names.arg=most_important_words )
  #dev.off()
}

# Lets evaluate the testing error given as a function of the optimal number of trees: 
#
Fhat = predict( m, XTesting[,1:p], n.trees=best.iter )
pcc = mean( ( ( Fhat <= 0 ) & ( XTesting[,p+1] == 0 ) ) | ( ( Fhat > 0 ) & ( XTesting[,p+1] == 1 ) ) )
print( 1 - pcc )