#
# Written by:
# --
# John L. Weatherwax 2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 747
#
#-----

source('restaurant_problem_gen_data.R')
source('decision_tree_learning.R')

# Build the initial attributes list: one entry per column of `data` holding
# the sorted unique values observed for that attribute.  Note the training
# set must have seen every possible value of each attribute.
build_attributes <- function(data) {
  attributes <- list()
  for (cn in colnames(data)) {
    attributes[[cn]] <- sort(as.matrix(unique(data[cn])))
  }
  attributes
}

# Replace some entries of the data frame `df` with NA and return the result.
#
# NOTE(review): `df[row_inds, col_inds] <- NA` blanks the full cross-product
# of the sampled rows and columns (up to n_na^2 cells), which reproduces the
# original code's behavior; if exactly n_na individual cells were intended,
# use matrix indexing instead: df[cbind(row_inds, col_inds)] <- NA.
insert_missing_values <- function(df, n_na = 10) {
  n_na <- min(nrow(df), n_na)
  row_inds <- sample(seq_len(nrow(df)), n_na)
  col_inds <- sample(seq_len(ncol(df)), n_na)
  df[row_inds, col_inds] <- NA
  df
}

N <- 100

# Generate some data for our problem:
res <- restaurant_problem_gen_data(N)
data <- res$data
willwait <- res$willwait

attributes <- build_attributes(data)

# Compute the default (majority-class) classification:
default <- majority_vote(willwait)

# Learn a tree on the complete set of data and report its training error:
tree <- decision_tree_learning(data, willwait, attributes, default)
willwait_hat <- decision_tree_predict_multiple_samples(tree, data)
training_set_err_rate <- sum(willwait_hat != willwait) / length(willwait)
print(training_set_err_rate)

# Learn a tree on the same data after blanking some entries with NA:
data <- insert_missing_values(data, 10)
tree <- decision_tree_learning(data, willwait, attributes, default)
willwait_hat <- decision_tree_predict_multiple_samples(tree, data)
training_set_err_rate <- sum(willwait_hat != willwait) / length(willwait)
print(training_set_err_rate)

# Test each procedure (complete data vs. missing data) on training and
# testing sets, looping over various training set sizes (tss) and averaging
# each error rate over several Monte Carlo repetitions:
training_set_sizes <- seq(20, 200, by = 5)
n_sizes <- length(training_set_sizes)
n_mc <- 20

# Preallocate the per-size averaged error rates (growing with c() in a loop
# copies the vector on every append):
training_set_err_rate_complete <- numeric(n_sizes)
testing_set_err_rate_complete <- numeric(n_sizes)
training_set_err_rate_missing <- numeric(n_sizes)
testing_set_err_rate_missing <- numeric(n_sizes)

for (si in seq_len(n_sizes)) {
  N <- training_set_sizes[si]

  # Monte Carlo accumulators for this training set size:
  mc_train_er_complete <- numeric(n_mc)
  mc_test_er_complete <- numeric(n_mc)
  mc_train_er_missing <- numeric(n_mc)
  mc_test_er_missing <- numeric(n_mc)

  for (mci in seq_len(n_mc)) {
    # Generate training data for our problem:
    res <- restaurant_problem_gen_data(N)
    data <- res$data
    willwait <- res$willwait

    attributes <- build_attributes(data)
    default <- majority_vote(willwait)

    # Build a model using the complete data; compute its training error:
    tree_complete <- decision_tree_learning(data, willwait, attributes, default)
    willwait_hat <- decision_tree_predict_multiple_samples(tree_complete, data)
    mc_train_er_complete[mci] <- sum(willwait_hat != willwait) / length(willwait)

    # Build a model using data with NAs; compute its training error:
    data <- insert_missing_values(data, 10)
    tree_missing <- decision_tree_learning(data, willwait, attributes, default)
    willwait_hat <- decision_tree_predict_multiple_samples(tree_missing, data)
    mc_train_er_missing[mci] <- sum(willwait_hat != willwait) / length(willwait)

    # Generate a fresh dataset for testing; test error of the complete-data model:
    res <- restaurant_problem_gen_data(N)
    data_test <- res$data
    willwait_test <- res$willwait
    willwait_test_hat <- decision_tree_predict_multiple_samples(tree_complete, data_test)
    mc_test_er_complete[mci] <- sum(willwait_test_hat != willwait_test) / length(willwait_test)

    # BUGFIX: the original code inserted the NAs back into the (already
    # consumed) training frame `data` and then predicted on the untouched
    # `data_test`, so the missing-data model was always scored on a complete
    # test set.  Blank entries in the TEST data before scoring:
    data_test <- insert_missing_values(data_test, 10)
    willwait_test_hat <- decision_tree_predict_multiple_samples(tree_missing, data_test)
    mc_test_er_missing[mci] <- sum(willwait_test_hat != willwait_test) / length(willwait_test)
  }

  training_set_err_rate_complete[si] <- mean(mc_train_er_complete)
  testing_set_err_rate_complete[si] <- mean(mc_test_er_complete)
  training_set_err_rate_missing[si] <- mean(mc_train_er_missing)
  testing_set_err_rate_missing[si] <- mean(mc_test_er_missing)

  print(sprintf('Complete Data Error Rates: N= %5d; training= %10.6f, testing= %10.6f; Missing Data Error Rates: training= %10.6f, testing= %10.6f',
                N,
                training_set_err_rate_complete[si], testing_set_err_rate_complete[si],
                training_set_err_rate_missing[si], testing_set_err_rate_missing[si]))
}

#postscript("../../WriteUp/Graphics/Chapter18/chap_18_prob_12.eps", onefile=FALSE, horizontal=FALSE)
# Plot ACCURACY (1 - error rate) vs. training set size for all four curves:
plot(training_set_sizes, 1 - training_set_err_rate_complete, col = 'red', type = 'l',
     xlab = 'training set size', ylab = 'Proportion correct on test set', ylim = c(0.7, 1.0))
lines(training_set_sizes, 1 - testing_set_err_rate_complete, col = 'green', type = 'l')
lines(training_set_sizes, 1 - training_set_err_rate_missing, col = 'magenta', type = 'l')
lines(training_set_sizes, 1 - testing_set_err_rate_missing, col = 'cyan', type = 'l')
grid()
legend(110, 0.8,
       c('CD train accuracy', 'CD test accuracy', 'MD train accuracy', 'MD test accuracy'),
       lty = 1, col = c('red', 'green', 'magenta', 'cyan'))
#dev.off()