#
# Written by:
# --
# John L. Weatherwax 2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 747
#
# Compares decision-tree and naive-Bayes learning curves on the
# restaurant ("will wait") problem: for each training set size we run
# several Monte Carlo replications, measure train/test error rates for
# both models, and plot probability-correct vs training set size.
#
#-----

source('../Chapter18/restaurant_problem_gen_data.R')
source('../Chapter18/decision_tree_learning.R')
source('naive_bayes.R')

# Loop over various training set sizes (tss):
#
training_set_sizes <- seq(10, 100, by = 5)
n_monte_carlo <- 20  # Monte Carlo replications per training set size

# Preallocate the per-size mean error-rate vectors; the original grew
# these (and the inner Monte Carlo accumulators) with c() on every
# iteration, which copies the whole vector each time (O(n^2) appends).
n_sizes <- length(training_set_sizes)
dt_training_err <- numeric(n_sizes)
dt_testing_err  <- numeric(n_sizes)
nb_training_err <- numeric(n_sizes)
nb_testing_err  <- numeric(n_sizes)

for (si in seq_along(training_set_sizes)) {
  N <- training_set_sizes[si]

  # Per-replication error-rate accumulators for this training set size:
  dt_mc_train_er <- numeric(n_monte_carlo)
  dt_mc_test_er  <- numeric(n_monte_carlo)
  nb_mc_train_er <- numeric(n_monte_carlo)
  nb_mc_test_er  <- numeric(n_monte_carlo)

  for (mci in seq_len(n_monte_carlo)) {

    # Generate some data for our problem:
    res <- restaurant_problem_gen_data(N)
    data <- res$data
    willwait <- res$willwait

    #
    # Model #1 (decision tree)
    #
    # Create the initial attributes variable (an R list keyed by column
    # name, holding each attribute's sorted set of observed values).
    # Note our training set must have seen the possible values for each
    # attribute.
    attributes <- list()
    for (cn in colnames(data)) {
      attributes[[cn]] <- sort(as.matrix(unique(data[cn])))
    }

    # Compute the default classification:
    default <- majority_vote(willwait)

    # Build a model using this data and then use it to predict on the
    # training data:
    tree <- decision_tree_learning(data, willwait, attributes, default)
    willwait_hat <- decision_tree_predict_multiple_samples(tree, data)
    dt_mc_train_er[mci] <- sum(willwait_hat != willwait) / length(willwait)

    #
    # Model #2 (naive bayes):
    #
    # Build a model using this data and then use it to predict on the
    # training data:
    nb_model <- naive_bayes_build_model(data, willwait)
    willwait_hat <- naive_bayes_predict(nb_model, data)
    nb_mc_train_er[mci] <- sum(willwait_hat != willwait) / length(willwait)

    # Generate a new dataset for testing and compute each model's
    # test-set error rate:
    res <- restaurant_problem_gen_data(N)
    data_test <- res$data
    willwait_test <- res$willwait

    willwait_test_hat <- decision_tree_predict_multiple_samples(tree, data_test)
    dt_mc_test_er[mci] <-
      sum(willwait_test_hat != willwait_test) / length(willwait_test)

    willwait_test_hat <- naive_bayes_predict(nb_model, data_test)
    nb_mc_test_er[mci] <-
      sum(willwait_test_hat != willwait_test) / length(willwait_test)
  }

  # Average over the Monte Carlo replications for this training set size:
  dt_training_err[si] <- mean(dt_mc_train_er)
  dt_testing_err[si]  <- mean(dt_mc_test_er)
  nb_training_err[si] <- mean(nb_mc_train_er)
  nb_testing_err[si]  <- mean(nb_mc_test_er)

  print(sprintf('Testing Error Rates: N= %5d; Tree= %10.6f, NaiveBayes= %10.6f',
                N, dt_testing_err[si], nb_testing_err[si]))
}

# Plot the learning curves as probability of a correct decision
# (1 - test error rate) vs training set size:
#postscript("../../WriteUp/Graphics/Chapter20/dup_fig_20.eps", onefile=FALSE, horizontal=FALSE)
plot(training_set_sizes, 1 - dt_testing_err, col = 'green', type = 'l',
     xlab = 'training set size', ylab = 'probability correct decision',
     ylim = c(0.4, 1.0))
lines(training_set_sizes, 1 - nb_testing_err, col = 'black', type = 'l')
grid()
legend(70, 0.5, c('decision tree', 'naive bayes'),
       lty = 1, col = c('green', 'black'))
#dev.off()