#
# Written by:
# --
# John L. Weatherwax 2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 788
#
#-----

set.seed(0)

source('../Chapter18/restaurant_problem_gen_data.R')
source('../Chapter18/decision_tree_learning.R')
source('adaboost_w_naive_bayes.R')

# Generate some data for our problem (and show how to use the AdaBoost code):
#
N = 100
res = restaurant_problem_gen_data( N )
X = res$data
y = res$willwait

boosted_bayes = adaboost( X, y, M=10 )
y_hat = adaboost_predict_multiple_samples( boosted_bayes, X )
training_set_err_rate = sum( y_hat != y ) / length( y )
print( training_set_err_rate )
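# ----------------------------------------------------------------------
# For readers without adaboost_w_naive_bayes.R at hand, here is a minimal
# sketch of the kind of routine that file is assumed to provide: discrete
# two-class AdaBoost, with each weak learner fit on a weighted resample of
# the data.  The hooks fit_fn/predict_fn and the *_sketch names are
# hypothetical stand-ins for illustration, not the author's actual API.
# ----------------------------------------------------------------------
adaboost_sketch = function( X, y, M=10, fit_fn, predict_fn ){
  n = nrow(X)
  w = rep( 1/n, n )               # sample weights, initially uniform
  models = vector( 'list', M )
  alphas = rep( 0, M )
  for( m in 1:M ){
    # Fit the m-th weak learner on a resample drawn according to w:
    idx = sample( n, n, replace=TRUE, prob=w )
    models[[m]] = fit_fn( X[idx, , drop=FALSE], y[idx] )
    y_hat = predict_fn( models[[m]], X )
    err = sum( w[ y_hat != y ] )               # weighted training error
    err = min( max( err, 1.e-10 ), 1 - 1.e-10 )  # guard the log below
    alphas[m] = 0.5 * log( (1-err)/err )
    # Upweight the misclassified samples, downweight the rest, renormalize:
    w = w * exp( ifelse( y_hat != y, +alphas[m], -alphas[m] ) )
    w = w / sum(w)
  }
  list( models=models, alphas=alphas, predict_fn=predict_fn )
}

adaboost_predict_sketch = function( fit, X ){
  # Weighted vote of the M weak hypotheses (assumes logical labels):
  score = rep( 0, nrow(X) )
  for( m in seq_along(fit$models) ){
    y_hat = fit$predict_fn( fit$models[[m]], X )
    score = score + fit$alphas[m] * ifelse( y_hat, +1, -1 )
  }
  score > 0
}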
# Loop over various training set sizes (tss):
#
training_set_sizes = seq( 100, 1000, by=100 )

dt_training_err  = c()
dt_testing_err   = c()
nb_training_err  = c()
nb_testing_err   = c()
bnb_training_err = c()   # boosted naive Bayes
bnb_testing_err  = c()

for( tss in training_set_sizes ){
  N = tss

  # For this training set size, average over a number of Monte Carlo trials:
  #
  dt_mc_train_er  = c()
  dt_mc_test_er   = c()
  nb_mc_train_er  = c()
  nb_mc_test_er   = c()
  bnb_mc_train_er = c()
  bnb_mc_test_er  = c()
  for( mci in 1:20 ){
    # Generate some data for our problem:
    #
    res      = restaurant_problem_gen_data( N )
    data     = res$data
    willwait = res$willwait

    #
    # Model #1 (decision tree):
    #

    # Create the initial attributes variable (an R list keyed by attribute name).
    # Note our training set must have seen the possible values for each attribute.
    #
    attributes = list()
    for( cn in colnames(data) ){
      attributes[[cn]] = sort( as.matrix( unique( data[cn] ) ) )
    }

    # Compute the default classification:
    #
    default = majority_vote( willwait )

    # Build a model using this data and then use it to predict on the training data:
    #
    tree = decision_tree_learning( data, willwait, attributes, default )
    willwait_hat = decision_tree_predict_multiple_samples( tree, data )
    training_set_err_rate = sum( willwait_hat != willwait ) / length( willwait )
    dt_mc_train_er = c( dt_mc_train_er, training_set_err_rate )

    #
    # Model #2 (naive Bayes):
    #

    # Build a model using this data and then use it to predict on the training data:
    #
    nb_model = naive_bayes_build_model( data, willwait )
    willwait_hat = naive_bayes_predict( nb_model, data )
    training_set_err_rate = sum( willwait_hat != willwait ) / length( willwait )
    nb_mc_train_er = c( nb_mc_train_er, training_set_err_rate )

    #
    # Model #3 (boosted naive Bayes):
    #

    # Build a model using this data and then use it to predict on the training data:
    #
    bnb_model = adaboost( data, willwait, M=10 )
    willwait_hat = adaboost_predict_multiple_samples( bnb_model, data )
    training_set_err_rate = sum( willwait_hat != willwait ) / length( willwait )
    bnb_mc_train_er = c( bnb_mc_train_er, training_set_err_rate )

    # Generate a new dataset for testing and compute each model's testing error rate:
    #
    res           = restaurant_problem_gen_data( N )
    data_test     = res$data
    willwait_test = res$willwait

    willwait_test_hat = decision_tree_predict_multiple_samples( tree, data_test )
    testing_set_err_rate = sum( willwait_test_hat != willwait_test ) / length( willwait_test )
    dt_mc_test_er = c( dt_mc_test_er, testing_set_err_rate )

    willwait_test_hat = naive_bayes_predict( nb_model, data_test )
    testing_set_err_rate = sum( willwait_test_hat != willwait_test ) / length( willwait_test )
    nb_mc_test_er = c( nb_mc_test_er, testing_set_err_rate )

    willwait_test_hat = adaboost_predict_multiple_samples( bnb_model, data_test )
    testing_set_err_rate = sum( willwait_test_hat != willwait_test ) / length( willwait_test )
    bnb_mc_test_er = c( bnb_mc_test_er, testing_set_err_rate )
  }

  dt_training_set_err_rate  = mean( dt_mc_train_er )
  dt_testing_set_err_rate   = mean( dt_mc_test_er )
  nb_training_set_err_rate  = mean( nb_mc_train_er )
  nb_testing_set_err_rate   = mean( nb_mc_test_er )
  bnb_training_set_err_rate = mean( bnb_mc_train_er )
  bnb_testing_set_err_rate  = mean( bnb_mc_test_er )

  print( sprintf( 'Testing Error Rates: N= %5d; Tree= %10.6f, NaiveBayes= %10.6f, BoostedNaiveBayes= %10.6f',
                  N, dt_testing_set_err_rate, nb_testing_set_err_rate, bnb_testing_set_err_rate ) )

  dt_training_err  = c( dt_training_err,  dt_training_set_err_rate )
  dt_testing_err   = c( dt_testing_err,   dt_testing_set_err_rate )
  nb_training_err  = c( nb_training_err,  nb_training_set_err_rate )
  nb_testing_err   = c( nb_testing_err,   nb_testing_set_err_rate )
  bnb_training_err = c( bnb_training_err, bnb_training_set_err_rate )
  bnb_testing_err  = c( bnb_testing_err,  bnb_testing_set_err_rate )
}

#postscript("../../WriteUp/Graphics/Chapter20/chap_20_prob_5.eps", onefile=FALSE, horizontal=FALSE)
plot( training_set_sizes, 1-dt_testing_err, col='green', type='l',
      xlab='training set size', ylab='probability correct decision', ylim=c(0.8,1.0) )
lines( training_set_sizes, 1-nb_testing_err,  col='black', type='l' )
lines( training_set_sizes, 1-bnb_testing_err, col='blue',  type='l' )
grid()
legend( 70, 0.85, c('decision tree','boosted naive bayes','naive bayes'), lty=1, col=c('green','blue','black') )
#dev.off()
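# ----------------------------------------------------------------------
# The training-error curves collected above (dt_training_err, etc.) are
# never plotted.  If a train-vs-test comparison is wanted, something along
# these lines works for the boosted model (a sketch reusing the vectors
# computed above; this extra figure is not part of the original write-up):
# ----------------------------------------------------------------------
plot( training_set_sizes, 1-bnb_training_err, col='blue', lty=2, type='l',
      xlab='training set size', ylab='probability correct decision', ylim=c(0.8,1.0) )
lines( training_set_sizes, 1-bnb_testing_err, col='blue', lty=1 )
grid()
legend( 'bottomright', c('boosted NB (train)','boosted NB (test)'), lty=c(2,1), col='blue' )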