#
# Written by:
# --
# John L. Weatherwax                2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 686
#
#-----

majority_vote = function(y){
  if( sum(y==TRUE) > sum(y==FALSE) ){ # more true than false
    return(TRUE)
  }else{
    return(FALSE)
  }
}

choose_attribute = function( attributes, X, y, use_information_gain_ratio=FALSE ){

  n_positive = sum(y==TRUE)
  n_negative = sum(y==FALSE)
  p_positive = n_positive / ( n_positive + n_negative )
  p_negative = n_negative / ( n_positive + n_negative )
  base_info = -p_positive * log( p_positive, base=2 ) - p_negative * log( p_negative, base=2 )

  # Loop over each attribute to find the one that maximizes our splitting criterion:
  #
  max_gain = -Inf
  for( attrib in names(attributes) ){

    # Count (and drop, for this attribute only) the samples that have NA for this attribute:
    #
    good_inds = !is.na( X[[attrib]] )
    n_NAs = dim(X)[1] - sum(good_inds)
    X_A = X[good_inds,]
    y_A = y[good_inds]

    # Compute the information gain / information gain ratio if we split this node on the values of this attribute:
    #
    remainder_A = 0.
    for( av in attributes[[attrib]] ){

      # Count the samples that would go down this branch if we were to split on this attribute:
      #
      inds = X_A[[attrib]]==av

      # extract the counts first ignoring any samples with NAs
      n_positive_Av = sum(y_A[inds]==TRUE)
      n_negative_Av = sum(y_A[inds]==FALSE)
      if( (n_positive_Av==0) | (n_negative_Av==0) ){ next }

      # now add the NA samples, distributed in proportion to the non-NA counts
      n_total_Av = n_positive_Av + n_negative_Av
      n_positive_Av = n_positive_Av + n_NAs * ( n_positive_Av / n_total_Av )
      n_negative_Av = n_negative_Av + n_NAs * ( n_negative_Av / n_total_Av )

      # Compute the information in this node:
      #
      p_positive_Av = n_positive_Av / ( n_positive_Av + n_negative_Av )
      p_negative_Av = n_negative_Av / ( n_positive_Av + n_negative_Av )
      info_Av = -p_positive_Av * log( p_positive_Av, base=2 ) - p_negative_Av * log( p_negative_Av, base=2 )

      remainder_A = remainder_A + (n_positive_Av + n_negative_Av)/(n_positive + n_negative) * info_Av
    }

    gain_A = base_info - remainder_A # this is the information gain
    if( use_information_gain_ratio ){ gain_A = gain_A / base_info } # use the information gain ratio rather than the information gain directly
    if( gain_A > max_gain ){
      max_gain = gain_A
      best_attribute = attrib
    }
  }
  return(best_attribute)
}
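
# A small illustrative check (not part of the original code) of the quantity choose_attribute
# maximizes: with 6 positive and 6 negative samples, a binary attribute that splits them into
# a (2+, 4-) branch and a (4+, 2-) branch gives the information gain computed below.
#
if( FALSE ){
  info_content = function(p, n){ pp = p/(p+n); pn = n/(p+n); -pp*log(pp, base=2) - pn*log(pn, base=2) }
  base_info   = info_content(6, 6)                                      # 1 bit at the parent node
  remainder_A = (6/12)*info_content(2, 4) + (6/12)*info_content(4, 2)   # ~0.918 bits left after the split
  base_info - remainder_A                                               # ~0.082 bits of information gain
}
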
decision_tree_learning = function( X, y, attributes, default,
                                   use_chi2_pruning=FALSE, chi2_pruning_alpha=0.05,
                                   use_information_gain_ratio=FALSE,
                                   max_depth=Inf, current_depth=0 ){
  #
  # This implements the DECISION-TREE-LEARNING code from the book.
  #
  # X: a data frame with each row a case and each column a feature value (attribute value) for that case
  # y: a logical vector with the TRUE/FALSE label for the corresponding row in X (only TRUE/FALSE labels are supported)
  # attributes: an R list with "keys" the set of string attributes (the feature names) and "values"
  #             the possible discrete choices for each attribute
  # default: the default TRUE/FALSE labeling to use
  # max_depth: how deep to build the tree.  Note that max_depth=1 is a decision stump,
  #            in that it will select the single attribute that best splits
  #            the data and classify each of the attribute leaves by majority vote
  #
  # This implementation can build a tree when some of the samples have missing values (represented as NA's in R).
  # This is done by assuming that all samples with NA's are assigned in proportion to the attribute values
  # of the non-NA samples at that node.  See the write up for more details.
  #
  #-----

  n_samples = dim(X)[1] # the number of samples
  if( n_samples==0 ){ return(default) } # no samples ... return the default

  # Check if all examples have the same classification:
  #
  if( sum(y==TRUE)==n_samples ){ return(TRUE) }
  if( sum(y==FALSE)==n_samples ){ return(FALSE) }

  # Check if there are no attributes we can split on:
  #
  if( length(attributes)==0 ){ return(majority_vote(y)) }

  # Check if we have reached the maximum desired tree depth:
  #
  if( current_depth >= max_depth ){ return(majority_vote(y)) }

  # Find the attribute we will split on:
  #
  best = choose_attribute( attributes, X, y, use_information_gain_ratio )

  # Decide whether to split the tree on this attribute based on its chi^2 (X^2) statistic:
  #
  if( use_chi2_pruning ){
    p = sum(y==TRUE)
    n = sum(y==FALSE)
    D = 0.
    n_attribute_values = 0
    for( av in attributes[[best]] ){
      examples_spots = (X[[best]]==av) & !is.na(X[[best]])
      y_examples = y[examples_spots]
      p_i = sum(y_examples==TRUE)
      n_i = sum(y_examples==FALSE)
      if( p_i == 0 & n_i == 0 ){ next }
      # the expected number of positive & negative samples at this node:
      p_hat_i = (p_i + n_i) * ( p / ( p + n ) )
      n_hat_i = (p_i + n_i) * ( n / ( p + n ) )
      D = D + ( p_i - p_hat_i )^2 / p_hat_i + ( n_i - n_hat_i )^2 / n_hat_i
      n_attribute_values = n_attribute_values + 1
    }
    if( D < qchisq( 1-chi2_pruning_alpha, n_attribute_values-1 ) ){ # this split is insignificant ... don't split
      return(majority_vote(y))
    }
  }

  # Create a node in our tree with:
  #
  # 1) the root being the attribute we will split on e.g. "type" (for restaurant type),
  # 2) a list of possible values of this attribute e.g. c( "Burger", "French", "Italian", "Thai" ),
  # 3) the number of training samples with each attribute value that ended up at this node
  #    (needed to classify a sample that has a missing value for this attribute) e.g. a table like
  #
  #    Burger  French Italian    Thai
  #        24      26      25      25
  #
  tree = list( root=best, attribute_values=attributes[[best]], number_of_data_with_attribute=table(X[[best]]) )

  m = majority_vote(y)
  for( av in attributes[[best]] ){
    examples_spots = (X[[best]]==av) & !is.na(X[[best]])
    X_examples = X[examples_spots,]
    y_examples = y[examples_spots]
    attributes_without_best = attributes
    attributes_without_best[[best]] = NULL # remove this attribute from further consideration
    subtree = decision_tree_learning( X_examples, y_examples, attributes_without_best, m,
                                      use_chi2_pruning=use_chi2_pruning, chi2_pruning_alpha=chi2_pruning_alpha,
                                      use_information_gain_ratio=use_information_gain_ratio,
                                      current_depth=current_depth+1, max_depth=max_depth )
    tree[[av]] = subtree
  }
  return(tree)
}
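
# A hedged aside (not part of the original code): the chi^2 pruning test above compares the
# statistic D against the (1 - chi2_pruning_alpha) quantile of a chi-squared distribution
# whose degrees of freedom are one less than the number of non-empty branches, e.g.
#
if( FALSE ){
  qchisq( 1 - 0.05, 4 - 1 ) # ~7.81; a 4-valued split with D below this is collapsed to a majority-vote leaf
}
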
decision_tree_predict_single_sample = function( tree, x ){
  #
  # Predicts the TRUE/FALSE label for a single sample using a decision tree.
  # The feature vector x CAN have missing values (denoted by the R symbol NA).
  # Vectors with missing features are assigned TRUE/FALSE labels according to Ex. 18.12
  #
  if( is.logical(tree) ){
    return(tree)
  }else{
    tsav = x[[ tree$root ]] # get this sample's value for the attribute we split on
    if( is.na(tsav) ){
      # This sample DOES NOT have this measurement: compute the weighted sum assuming
      # that x could have taken every possible value for this attribute.
      result = 0.0
      for( av in tree$attribute_values ){
        wght = tree$number_of_data_with_attribute[av] / sum( tree$number_of_data_with_attribute )
        result = result + wght * decision_tree_predict_single_sample( tree[[av]], x )
      }
      return(as.logical(round(result))) # convert to a bool result
    }else{ # this sample DOES have this measurement value
      matches = tree$attribute_values == tsav
      if( sum(matches)==0 ){
        print(sprintf("WARNING: the input vector x has attribute '%s' with value '%s' that was not seen in our training set ... returning a random choice", tree$root, tsav))
        return(sample(c(TRUE,FALSE),1))
      }
      av = tree$attribute_values[ matches ]
      return(decision_tree_predict_single_sample( tree[[av]], x ))
    }
  }
}

decision_tree_predict_multiple_samples = function( tree, X ){
  #
  # Predicts the TRUE/FALSE label for many samples using a decision tree.
  # Note: I was not sure how to code the prediction algorithm in a vectorized way.
  # If anyone knows how to do such a thing please contact me.
  #
  n_samples = dim(X)[1]
  y = rep( FALSE, n_samples ) # some initial values
  for( si in 1:n_samples ){
    y[si] = decision_tree_predict_single_sample( tree, X[si,] )
  }
  return(y)
}
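
# A minimal end-to-end sketch (not part of the original code): the toy attribute list, data
# frame, and labels below are made up for illustration, loosely following the book's
# restaurant example.  It builds a tree and then predicts, including a sample whose
# "patrons" value is missing (NA).
#
if( FALSE ){
  toy_attributes = list( patrons=c("None","Some","Full"),
                         hungry=c("Yes","No") )
  toy_X = data.frame( patrons=c("Some","Full","Some","Full","None"),
                      hungry =c("Yes","Yes","No","No","No"),
                      stringsAsFactors=FALSE )
  toy_y = c(TRUE, TRUE, TRUE, FALSE, FALSE)

  toy_tree = decision_tree_learning( toy_X, toy_y, toy_attributes, default=majority_vote(toy_y) )
  print( toy_tree$root )                                              # "patrons" is chosen as the root split
  print( decision_tree_predict_multiple_samples( toy_tree, toy_X ) )  # reproduces toy_y on the training data

  new_x = data.frame( patrons=NA, hungry="Yes", stringsAsFactors=FALSE )
  print( decision_tree_predict_single_sample( toy_tree, new_x[1,] ) ) # NA handled by the proportional weighting -> TRUE
}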