#
# Written by:
# --
# John L. Weatherwax 2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 747
#
#-----

source('restaurant_problem_gen_data.R')
source('decision_tree_learning.R')
source('adaboost_w_decision_trees.R')

set.seed(0)

# Generate some data for our problem (and show how to use the adaboost code):
#
N = 100
res = restaurant_problem_gen_data( N )
X = res$data
y = res$willwait

boosted_trees = adaboost( X, y, M=10 )
y_hat = adaboost_predict_multiple_samples( boosted_trees, X )
training_set_err_rate = sum( y_hat != y ) / length( y )
print(training_set_err_rate)

# Test each procedure on training and testing sets.
#
# Loop over various training set sizes (tss):
#
training_set_sizes = seq( 5, 100, by=5 )

training_set_err_rate_adaboost = c()
testing_set_err_rate_adaboost = c()
training_set_err_rate_decision_stumps = c()
testing_set_err_rate_decision_stumps = c()
for( tss in training_set_sizes ){
  #print( sprintf("Working a data set of size %10d", tss) )
  N = tss

  # For this training set size, run a number of Monte Carlo trials:
  #
  mc_train_er_adaboost = c()
  mc_test_er_adaboost = c()
  mc_train_er_decision_stumps = c()
  mc_test_er_decision_stumps = c()
  for( mci in 1:20 ){
    # Generate some data for our problem:
    #
    res = restaurant_problem_gen_data( N )
    data = res$data
    willwait = res$willwait

    # Build a model using adaboost:
    #
    boosted_trees = adaboost( data, willwait, M=5 )
    willwait_hat = adaboost_predict_multiple_samples( boosted_trees, data )
    mc_train_ab = sum( willwait_hat != willwait ) / length( willwait )

    # Build a model with a decision stump (max_depth=1):
    #
    attributes = list()
    for( cn in colnames(data) ){
      attributes[[cn]] = sort(as.matrix(unique(data[cn])))
    }
    default = majority_vote(willwait)
    stump = decision_tree_learning( data, willwait, attributes, default, max_depth=1 )
    willwait_hat = decision_tree_predict_multiple_samples( stump, data )
    mc_train_smp = sum( willwait_hat != willwait ) / length( willwait )

    # Generate a new dataset for testing and compute each model's test error rate:
    #
    res = restaurant_problem_gen_data( N )
    data_test = res$data
    willwait_test = res$willwait

    willwait_test_hat = adaboost_predict_multiple_samples( boosted_trees, data_test )
    mc_test_ab = sum( willwait_test_hat != willwait_test ) / length( willwait_test )

    willwait_test_hat = decision_tree_predict_multiple_samples( stump, data_test )
    mc_test_smp = sum( willwait_test_hat != willwait_test ) / length( willwait_test )

    mc_train_er_adaboost = c( mc_train_er_adaboost, mc_train_ab )
    mc_test_er_adaboost = c( mc_test_er_adaboost, mc_test_ab )
    mc_train_er_decision_stumps = c( mc_train_er_decision_stumps, mc_train_smp )
    mc_test_er_decision_stumps = c( mc_test_er_decision_stumps, mc_test_smp )
  }
  training_set_err_rate_adaboost = c( training_set_err_rate_adaboost, mean( mc_train_er_adaboost ) )
  testing_set_err_rate_adaboost = c( testing_set_err_rate_adaboost, mean( mc_test_er_adaboost ) )
  training_set_err_rate_decision_stumps = c( training_set_err_rate_decision_stumps, mean( mc_train_er_decision_stumps ) )
  testing_set_err_rate_decision_stumps = c( testing_set_err_rate_decision_stumps, mean( mc_test_er_decision_stumps ) )

  print( sprintf('AdaBoost Error Rates: N= %5d; training= %10.6f, testing= %10.6f; Stump Error Rates: training= %10.6f, testing= %10.6f',
                 N, mean( mc_train_er_adaboost ), mean( mc_test_er_adaboost ),
                 mean( mc_train_er_decision_stumps ), mean( mc_test_er_decision_stumps )) )
}
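# For reference, a minimal sketch of the per-round weight update that the
# adaboost() routine sourced from 'adaboost_w_decision_trees.R' presumably
# performs (following the ADABOOST pseudocode in AIMA Fig. 18.34).  The
# function name and signature below are illustrative assumptions, not the
# sourced code itself:
adaboost_round_sketch = function( h_pred, y, w ){
  # h_pred : the current weak hypothesis' predictions on the training samples
  # y      : the true labels
  # w      : the current (normalized) sample weights
  err = sum( w[ h_pred != y ] )  # weighted error of this hypothesis (assumed in (0,1))
  w[ h_pred == y ] = w[ h_pred == y ] * err / ( 1 - err )  # down-weight correctly classified samples
  w = w / sum( w )               # renormalize so the weights sum to one
  z = log( ( 1 - err ) / err )   # this hypothesis' weight in the final vote
  list( w=w, z=z )
}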
#postscript("../../WriteUp/Graphics/Chapter18/dup_fig_18_11_a.eps", onefile=FALSE, horizontal=FALSE)
plot( training_set_sizes, 1-testing_set_err_rate_adaboost, col='green', type='l',
      xlab='training set size', ylab='Proportion correct on test set', ylim=c(0.5,1.0) )
lines( training_set_sizes, 1-testing_set_err_rate_decision_stumps, col='cyan', type='l' )
grid()
legend( 40, 0.6, c('AdaBoost test accuracy','Stumps test accuracy'), lty=1, col=c('green','cyan') )
#dev.off()

# Loop over the number of boosting hypotheses M:
#
N = 100
mValues = seq( 1, 200, by=10 )
training_set_err_rate = c()
testing_set_err_rate = c()
for( m in mValues ){
  mc_train = c()
  mc_test = c()
  for( mci in 1:1 ){  # a single Monte Carlo trial here; increase for smoother curves
    # Generate some data for our problem:
    #
    res = restaurant_problem_gen_data( N )
    data = res$data
    willwait = res$willwait

    # Build a model using adaboost:
    #
    boosted_trees = adaboost( data, willwait, M=m )
    willwait_hat = adaboost_predict_multiple_samples( boosted_trees, data )
    train_er = sum( willwait_hat != willwait ) / length( willwait )
    mc_train = c( mc_train, train_er )

    # Generate a new dataset for testing and compute its error rate:
    #
    res = restaurant_problem_gen_data( N )
    data_test = res$data
    willwait_test = res$willwait
    willwait_test_hat = adaboost_predict_multiple_samples( boosted_trees, data_test )
    test_er = sum( willwait_test_hat != willwait_test ) / length( willwait_test )
    mc_test = c( mc_test, test_er )
  }
  training_set_err_rate = c( training_set_err_rate, mean( mc_train ) )
  testing_set_err_rate = c( testing_set_err_rate, mean( mc_test ) )

  print( sprintf( "Boosting with M=%5d stumps; training error= %10.6f, testing error= %10.6f",
                  m, mean( mc_train ), mean( mc_test ) ) )
}

plot( mValues, 1-training_set_err_rate, col='red', type='l',
      xlab='number of hypotheses M', ylab='Training/Test accuracy', ylim=c(0.5,1.0) )
lines( mValues, 1-testing_set_err_rate, col='green', type='l' )
grid()
legend( 40, 0.6, c('AdaBoost train accuracy','AdaBoost test accuracy'), lty=1, col=c('red','green') )
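# For completeness, a minimal sketch of how adaboost_predict_multiple_samples()
# presumably combines the M weak hypotheses: each tree votes for its predicted
# label with weight z, and the label with the largest total vote wins.  The
# field names boosted$trees and boosted$z (and the hard-coded labels) are
# assumptions about the sourced code, used here only for illustration:
adaboost_predict_sketch = function( boosted, X ){
  labels = c( 'Yes', 'No' )  # the willwait labels (assumed)
  vote = matrix( 0, nrow=nrow(X), ncol=length(labels), dimnames=list(NULL, labels) )
  for( m in seq_along(boosted$trees) ){
    pred = decision_tree_predict_multiple_samples( boosted$trees[[m]], X )
    for( l in labels ){
      vote[ pred == l, l ] = vote[ pred == l, l ] + boosted$z[m]  # weighted vote for label l
    }
  }
  labels[ apply( vote, 1, which.max ) ]  # weighted-majority label per sample
}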