#
# Written by:
# --
# John L. Weatherwax 2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 747
#
#-----

source('restaurant_problem_gen_data.R')
source('decision_tree_learning.R')

# Test each procedure (building a tree with and without chi^2 pruning) on
# training and testing data.
#
# Loop over various training set sizes (tss), averaging error rates over
# several Monte Carlo repetitions at each size.
#
training_set_sizes <- seq(20, 200, by = 5)
n_monte_carlo <- 20  # number of Monte Carlo repetitions per training set size

# Preallocate the per-size result vectors (avoid growing with c() in a loop):
n_sizes <- length(training_set_sizes)
training_set_err_rate <- numeric(n_sizes)
testing_set_err_rate <- numeric(n_sizes)
training_set_err_rate_w_chisq <- numeric(n_sizes)
testing_set_err_rate_w_chisq <- numeric(n_sizes)

for (si in seq_along(training_set_sizes)) {
  N <- training_set_sizes[si]

  # For this training set size, accumulate error rates over the Monte Carlo
  # repetitions (preallocated, indexed by repetition):
  mc_train_er <- numeric(n_monte_carlo)
  mc_test_er <- numeric(n_monte_carlo)
  mc_train_er_w_chisq <- numeric(n_monte_carlo)
  mc_test_er_w_chisq <- numeric(n_monte_carlo)

  for (mci in seq_len(n_monte_carlo)) {

    # Generate some data for our problem:
    #
    res <- restaurant_problem_gen_data(N)
    data <- res$data
    willwait <- res$willwait

    # Create the initial attributes variable (an R list keyed by column name,
    # holding the sorted set of observed values for each attribute).
    # Note: our training set must have seen the possible values for each
    # attribute.
    #
    attributes <- list()
    for (cn in colnames(data)) {
      attributes[[cn]] <- sort(as.matrix(unique(data[cn])))
    }

    # Compute the default classification:
    #
    default <- majority_vote(willwait)

    # Build a model using the complete data and then use it to predict on the
    # training data:
    #
    tree <- decision_tree_learning(data, willwait, attributes, default)
    willwait_hat <- decision_tree_predict_multiple_samples(tree, data)
    training_set_err_rate_data <- sum(willwait_hat != willwait) / length(willwait)

    # Build a model with chi^2 pruning:
    #
    tree_w_chisq <- decision_tree_learning(data, willwait, attributes, default,
                                           use_chi2_pruning = TRUE)
    willwait_hat <- decision_tree_predict_multiple_samples(tree_w_chisq, data)
    training_set_err_rate_w_chisq_data <- sum(willwait_hat != willwait) / length(willwait)

    # Generate a new dataset for testing and compute its error rate:
    #
    res <- restaurant_problem_gen_data(N)
    data_test <- res$data
    willwait_test <- res$willwait

    willwait_test_hat <- decision_tree_predict_multiple_samples(tree, data_test)
    testing_set_err_rate_data <- sum(willwait_test_hat != willwait_test) / length(willwait_test)

    willwait_test_hat <- decision_tree_predict_multiple_samples(tree_w_chisq, data_test)
    testing_set_err_rate_w_chisq_data <- sum(willwait_test_hat != willwait_test) / length(willwait_test)

    mc_train_er[mci] <- training_set_err_rate_data
    mc_test_er[mci] <- testing_set_err_rate_data
    mc_train_er_w_chisq[mci] <- training_set_err_rate_w_chisq_data
    mc_test_er_w_chisq[mci] <- testing_set_err_rate_w_chisq_data
  }

  # Store the Monte Carlo averages for this training set size:
  training_set_err_rate[si] <- mean(mc_train_er)
  testing_set_err_rate[si] <- mean(mc_test_er)
  training_set_err_rate_w_chisq[si] <- mean(mc_train_er_w_chisq)
  testing_set_err_rate_w_chisq[si] <- mean(mc_test_er_w_chisq)

  print(sprintf('Baseline Tree Building: N= %5d; training= %10.6f, testing= %10.6f; With Chi2 Pruning: training= %10.6f, testing= %10.6f',
                N, mean(mc_train_er), mean(mc_test_er),
                mean(mc_train_er_w_chisq), mean(mc_test_er_w_chisq)))
}

#postscript("../../WriteUp/Graphics/Chapter18/chap_18_prob_11.eps", onefile=FALSE, horizontal=FALSE)

# Plot accuracy (1 - error rate) vs. training set size for all four curves:
plot(training_set_sizes, 1 - training_set_err_rate, col = 'red', type = 'l',
     xlab = 'training set size', ylab = 'Proportion correct on test set',
     ylim = c(0.7, 1.0))
lines(training_set_sizes, 1 - testing_set_err_rate, col = 'green', type = 'l')
lines(training_set_sizes, 1 - training_set_err_rate_w_chisq, col = 'magenta', type = 'l')
lines(training_set_sizes, 1 - testing_set_err_rate_w_chisq, col = 'cyan', type = 'l')
grid()
legend(110, 0.8,
       c('BL train accuracy', 'BL test accuracy',
         'Chi2 train accuracy', 'Chi2 test accuracy'),
       lty = 1, col = c('red', 'green', 'magenta', 'cyan'))

#dev.off()