# # Written by: # -- # John L. Weatherwax 2007-07-05 # # email: wax@alum.mit.edu # # Please send comments and especially bug reports to the # above email address. # #----- library(gbm) set.seed(0) source('../Chapter3/load_spam_data.R') PD = load_spam_data(trainingScale=FALSE,responseScale=FALSE) # read in unscaled data p = dim(PD[[1]])[2]-1 # the last column is the response 1=>Spam; 0=>Ham XTraining = PD[[1]]; colnames(XTraining)[p+1] = "Y" XTesting = PD[[2]]; colnames(XTesting)[p+1] = "Y" spam_words = PD[[3]] # Generate the formula used to fit our model with: # terms = paste( colnames(XTraining)[1:p], collapse="+" ) # dont consider the last column (the response variable) formula = formula( paste( colnames(XTraining)[p+1], " ~ ", terms ) ) n_trees = 50000 K = 3 # larger values of K can give different performance m = gbm( formula, data=XTraining, distribution='bernoulli', n.trees=n_trees, shrinkage=0.005, interaction.depth=K, verbose=TRUE, cv.folds=5 ) # This works but I think the default method of gbm.perf(m,method="cv") # provides a better esimate of performance as a function of boosting iteration : # if( FALSE ){ # Simple sanity check that our algorithm is working correctly (plot training as a function of number of boosts): # training_error = matrix( 0, nrow=n_trees, ncol=1 ) for( nti in seq(1,n_trees) ){ Fhat = predict( m, XTraining[,1:p], n.trees=nti ) pcc = mean( ( ( Fhat <= 0 ) & ( XTraining[,p+1] == 0 ) ) | ( ( Fhat > 0 ) & ( XTraining[,p+1] == 1 ) ) ) training_error[nti] = 1 - pcc } # Lets plot the testing error as a function of the number of trees: # test_error = matrix( 0, nrow=n_trees, ncol=1 ) for( nti in seq(1,n_trees) ){ Fhat = predict( m, XTesting[,1:p], n.trees=nti ) pcc = mean( ( ( Fhat <= 0 ) & ( XTesting[,p+1] == 0 ) ) | ( ( Fhat > 0 ) & ( XTesting[,p+1] == 1 ) ) ) test_error[nti] = 1 - pcc } plot( seq(1,n_trees), training_error, type="l", main="Boosting Probability of Error", col="red", xlab="number of boosting iterations", ylab="classification error" ) lines( seq(1,n_trees), test_error, type="l", col="green" ) legend( 700, 0.4, c("training error", "testing error"), col=c("red", "green"), lty=c(1,1) ) } # Look at the learning curves (estimated via cross validation) and determine the best number of boosting trees to use: # #postscript("../../WriteUp/Graphics/Chapter10/dup_spam_learning_curves_K_3.eps", onefile=FALSE, horizontal=FALSE) best.iter = gbm.perf(m,method="cv") #dev.off() # Plot the relative importance of each word used in the classification: # if( FALSE ){ rel_importance = summary(m,n.trees=best.iter) # Get the relative importance of each word (normalized to 100 for the most important word): max_value = rel_importance[[2]][1] rel_importance_norm_values = ( rel_importance[[2]]/max_value ) * 100. # Get the list of words ordered by the most important: word_indx = as.double( sub( "V", "", rel_importance[[1]] ) ) most_important_words = spam_words[ word_indx ] #postscript("../../WriteUp/Graphics/Chapter10/dup_spam_rel_importance.eps", onefile=FALSE, horizontal=FALSE) barplot( rel_importance_norm_values, horiz=TRUE, names.arg=most_important_words ) #dev.off() } # Lets evaluate the testing error given as a function of the optimal number of trees: # Fhat = predict( m, XTesting[,1:p], n.trees=best.iter ) pcc = mean( ( ( Fhat <= 0 ) & ( XTesting[,p+1] == 0 ) ) | ( ( Fhat > 0 ) & ( XTesting[,p+1] == 1 ) ) ) print( 1 - pcc )