#
# Written by:
# --
# John L. Weatherwax                2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 747
#
# Learning curves for the restaurant problem.  For each training set size we
# average, over several Monte Carlo draws, the training and testing error of
# (a) a decision tree and (b) a table lookup classifier, and then plot the
# proportion classified correctly as a function of the training set size.
#
#-----

source('restaurant_problem_gen_data.R')
source('table_lookup_learning.R')
source('decision_tree_learning.R')

# Number of Monte Carlo repetitions averaged for each training set size:
#
n_monte_carlo <- 20

# Loop over various training set sizes (tss):
#
training_set_sizes <- seq( 1, 100, by=1 )

# Averaged error rates, one entry per training set size (preallocated rather
# than grown inside the loop):
#
n_sizes <- length( training_set_sizes )
nb_training_err <- numeric( n_sizes )
nb_testing_err <- numeric( n_sizes )
tlup_nb_training_err <- numeric( n_sizes )
tlup_nb_testing_err <- numeric( n_sizes )

for( si in seq_along( training_set_sizes ) ){
  tss <- training_set_sizes[si]
  N <- tss

  # For this training set size lets do some number of monte carlo runs:
  #
  mc_train_er <- numeric( n_monte_carlo )
  mc_test_er <- numeric( n_monte_carlo )
  tlup_mc_train_er <- numeric( n_monte_carlo )
  tlup_mc_test_er <- numeric( n_monte_carlo )

  for( mci in seq_len( n_monte_carlo ) ){

    # Generate some data for our problem:
    #
    res <- restaurant_problem_gen_data( N )
    data <- res$data
    willwait <- res$willwait

    # Create the initial attributes variable: an R list keyed by column name
    # whose values are the sorted unique values seen for that column.
    # Note our training set must have seen the possible values for each attribute.
    #
    attributes <- list()
    for( cn in colnames(data) ){
      attributes[[cn]] <- sort(as.matrix(unique(data[cn])))
    }

    # Compute the default classification:
    #
    default <- majority_vote(willwait)

    # Build a model using this data and then use it to predict on the training data:
    #
    tree <- decision_tree_learning( data, willwait, attributes, default,
                                    use_chi2_pruning=FALSE, use_information_gain_ratio=FALSE )
    willwait_hat <- decision_tree_predict_multiple_samples( tree, data )
    training_set_err_rate <- sum( willwait_hat != willwait ) / length( willwait )

    # Build a table lookup classifier based on the training set:
    #
    lookup_table <- table_lookup_learning( data, willwait )
    # By definition the training error rate of a table lookup classifier is zero.
    # NOTE(review): this presumes no two training samples share identical
    # attributes with different labels -- confirm against the data generator.
    tlup_training_set_err_rate <- 0.

    # Generate a new dataset for testing and compute its error rate:
    #
    res <- restaurant_problem_gen_data( N )
    data_test <- res$data
    willwait_test <- res$willwait

    # Decision tree on the test set:
    #
    willwait_test_hat <- decision_tree_predict_multiple_samples( tree, data_test )
    testing_set_err_rate <- sum( willwait_test_hat != willwait_test ) / length( willwait_test )

    mc_train_er[mci] <- training_set_err_rate
    mc_test_er[mci] <- testing_set_err_rate

    # Table lookup on the same test set.  These results get their own names so
    # they cannot be confused with the decision tree results above.
    #
    tlup_willwait_test_hat <- table_lookup_predict_multiple_samples( lookup_table, data_test )
    tlup_testing_set_err_rate <- sum( tlup_willwait_test_hat != willwait_test ) / length( willwait_test )

    # Record the table lookup training error (previously the decision tree
    # training error was stored here by mistake):
    #
    tlup_mc_train_er[mci] <- tlup_training_set_err_rate
    tlup_mc_test_er[mci] <- tlup_testing_set_err_rate
  }

  # Average over the Monte Carlo runs:
  #
  training_set_err_rate <- mean( mc_train_er )
  testing_set_err_rate <- mean( mc_test_er )
  tlup_training_set_err_rate <- mean( tlup_mc_train_er )
  tlup_testing_set_err_rate <- mean( tlup_mc_test_er )

  print( sprintf('Error Rates: N= %5d; DT: training= %10.6f, testing= %10.6f; TLUP: training= %10.6f, testing= %10.6f',
                 N, training_set_err_rate, testing_set_err_rate,
                 tlup_training_set_err_rate, tlup_testing_set_err_rate) )

  nb_training_err[si] <- training_set_err_rate
  nb_testing_err[si] <- testing_set_err_rate
  tlup_nb_training_err[si] <- tlup_training_set_err_rate
  tlup_nb_testing_err[si] <- tlup_testing_set_err_rate
}

# Plot the learning curves.  Decision tree curves are solid, table lookup
# curves are dashed; red is accuracy on the training data, green on the
# freshly generated test data.
#
#postscript("../../WriteUp/Graphics/Chapter18/dup_fig_18_7.eps", onefile=FALSE, horizontal=FALSE)
plot( training_set_sizes, 1-nb_training_err, col='red', type='l', lty=1,
      xlab='training set size', ylab='Proportion correct on test set', ylim=c(0.4,1.0) )
lines( training_set_sizes, 1-nb_testing_err, col='green', type='l', lty=1 )
lines( training_set_sizes, 1-tlup_nb_training_err, col='red', type='l', lty=2 )
lines( training_set_sizes, 1-tlup_nb_testing_err, col='green', type='l', lty=2 )
legend( 'bottomright',
        legend=c('decision tree: training', 'decision tree: testing',
                 'table lookup: training', 'table lookup: testing'),
        col=c('red', 'green', 'red', 'green'), lty=c(1, 1, 2, 2) )
grid()
#dev.off()