#
# Written by:
# --
# John L. Weatherwax                2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 686
#
#-----

majority_vote = function(y){
  if( sum(y==TRUE) > sum(y==FALSE) ){ # more true than false
    return(TRUE)
  }else{
    return(FALSE)
  }
}

choose_attribute = function( attributes, X, y, use_information_gain_ratio=FALSE ){

  n_positive = sum(y==TRUE)
  n_negative = sum(y==FALSE)
  p_positive = n_positive / ( n_positive + n_negative )
  p_negative = n_negative / ( n_positive + n_negative )
  base_info = -p_positive * log( p_positive, base=2 ) - p_negative * log( p_negative, base=2 )

  # Loop over each attribute to find the one that maximizes our splitting criterion:
  #
  max_gain = -Inf
  for( attrib in names(attributes) ){

    # Count (and drop, for this attribute only) the samples that have NA for this attribute:
    #
    good_inds = !is.na( X[[attrib]] )
    n_NAs = dim(X)[1] - sum(good_inds)
    X_A = X[good_inds,]
    y_A = y[good_inds]

    # Compute the information gain / information gain ratio if we split this node on the values of this attribute:
    #
    remainder_A = 0.
    for( av in attributes[[attrib]] ){

      # Count the samples that would go down this branch if we were to split on this attribute:
      #
      inds = X_A[[attrib]]==av

      # extract the counts first ignoring any samples with NAs
      n_positive_Av = sum(y_A[inds]==TRUE)
      n_negative_Av = sum(y_A[inds]==FALSE)
      if( (n_positive_Av==0) | (n_negative_Av==0) ){ next }

      # now add the NA samples, distributed in proportion to the non-NA counts
      n_total_Av = n_positive_Av + n_negative_Av
      n_positive_Av = n_positive_Av + n_NAs * ( n_positive_Av / n_total_Av )
      n_negative_Av = n_negative_Av + n_NAs * ( n_negative_Av / n_total_Av )

      # Compute the information in this node:
      #
      p_positive_Av = n_positive_Av / ( n_positive_Av + n_negative_Av )
      p_negative_Av = n_negative_Av / ( n_positive_Av + n_negative_Av )
      info_Av = -p_positive_Av * log( p_positive_Av, base=2 ) - p_negative_Av * log( p_negative_Av, base=2 )

      remainder_A = remainder_A + (n_positive_Av + n_negative_Av)/(n_positive + n_negative) * info_Av
    }

    gain_A = base_info - remainder_A # this is the information gain
    if( use_information_gain_ratio ){ gain_A = gain_A / base_info } # use the information gain ratio rather than the information gain directly
    if( gain_A > max_gain ){
      max_gain = gain_A
      best_attribute = attrib
    }
  }
  return(best_attribute)
}
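
# A small illustrative check (not part of the original code) of the quantity choose_attribute
# maximizes: with 6 positive and 6 negative samples, a binary attribute that splits them into
# a (2+, 4-) branch and a (4+, 2-) branch gives the information gain computed below.
#
if( FALSE ){
  info_content = function(p, n){ pp = p/(p+n); pn = n/(p+n); -pp*log(pp, base=2) - pn*log(pn, base=2) }
  base_info   = info_content(6, 6)                                      # 1 bit at the parent node
  remainder_A = (6/12)*info_content(2, 4) + (6/12)*info_content(4, 2)   # ~0.918 bits left after the split
  base_info - remainder_A                                               # ~0.082 bits of information gain
}
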
decision_tree_learning = function( X, y, attributes, default,
                                   use_chi2_pruning=FALSE, chi2_pruning_alpha=0.05,
                                   use_information_gain_ratio=FALSE,
                                   max_depth=Inf, current_depth=0 ){
  #
  # This implements the DECISION-TREE-LEARNING code from the book.
  #
  # X: a data frame with each row a case and each column a feature value (attribute value) for that case
  # y: a logical vector with the TRUE/FALSE label for the corresponding row in X (only TRUE/FALSE labels are supported)
  # attributes: an R list with "keys" the set of string attributes (the feature names) and "values"
  #             the possible discrete choices for each attribute
  # default: the default TRUE/FALSE labeling to use
  # max_depth: how deep to build the tree.  Note that max_depth=1 is a decision stump,
  #            in that it will select the single attribute that best splits
  #            the data and classify each of the attribute leaves by majority vote
  #
  # This implementation can build a tree when some of the samples have missing values (represented as NA's in R).
  # This is done by assuming that all samples with NA's are assigned in proportion to the attribute values
  # of the non-NA samples at that node.  See the write up for more details.
  #
  #-----

  n_samples = dim(X)[1] # the number of samples
  if( n_samples==0 ){ return(default) } # no samples ... return the default

  # Check if all examples have the same classification:
  #
  if( sum(y==TRUE)==n_samples ){ return(TRUE) }
  if( sum(y==FALSE)==n_samples ){ return(FALSE) }

  # Check if there are no attributes we can split on:
  #
  if( length(attributes)==0 ){ return(majority_vote(y)) }

  # Check if we have reached the maximum desired tree depth:
  #
  if( current_depth >= max_depth ){ return(majority_vote(y)) }

  # Find the attribute we will split on:
  #
  best = choose_attribute( attributes, X, y, use_information_gain_ratio )

  # Decide whether to split the tree on this attribute based on its chi^2 (X^2) statistic:
  #
  if( use_chi2_pruning ){
    p = sum(y==TRUE)
    n = sum(y==FALSE)
    D = 0.
    n_attribute_values = 0
    for( av in attributes[[best]] ){
      examples_spots = (X[[best]]==av) & !is.na(X[[best]])
      y_examples = y[examples_spots]
      p_i = sum(y_examples==TRUE)
      n_i = sum(y_examples==FALSE)
      if( p_i == 0 & n_i == 0 ){ next }
      # the expected number of positive & negative samples at this node:
      p_hat_i = (p_i + n_i) * ( p / ( p + n ) )
      n_hat_i = (p_i + n_i) * ( n / ( p + n ) )
      D = D + ( p_i - p_hat_i )^2 / p_hat_i + ( n_i - n_hat_i )^2 / n_hat_i
      n_attribute_values = n_attribute_values + 1
    }
    if( D < qchisq( 1-chi2_pruning_alpha, n_attribute_values-1 ) ){ # this split is insignificant ... don't split
      return(majority_vote(y))
    }
  }

  # Create a node in our tree with:
  #
  # 1) the root being the attribute we will split on e.g. "type" (for restaurant type),
  # 2) a list of possible values of this attribute e.g. c( "Burger", "French", "Italian", "Thai" ),
  # 3) the number of training samples with each attribute value that ended up at this node
  #    (needed to classify a sample that has a missing value for this attribute) e.g. a table like
  #
  #    Burger  French Italian    Thai
  #        24      26      25      25
  #
  tree = list( root=best, attribute_values=attributes[[best]], number_of_data_with_attribute=table(X[[best]]) )

  m = majority_vote(y)
  for( av in attributes[[best]] ){
    examples_spots = (X[[best]]==av) & !is.na(X[[best]])
    X_examples = X[examples_spots,]
    y_examples = y[examples_spots]
    attributes_without_best = attributes
    attributes_without_best[[best]] = NULL # remove this attribute from further consideration
    subtree = decision_tree_learning( X_examples, y_examples, attributes_without_best, m,
                                      use_chi2_pruning=use_chi2_pruning, chi2_pruning_alpha=chi2_pruning_alpha,
                                      use_information_gain_ratio=use_information_gain_ratio,
                                      current_depth=current_depth+1, max_depth=max_depth )
    tree[[av]] = subtree
  }
  return(tree)
}
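
# A hedged aside (not part of the original code): the chi^2 pruning test above compares the
# statistic D against the (1 - chi2_pruning_alpha) quantile of a chi-squared distribution
# whose degrees of freedom are one less than the number of non-empty branches, e.g.
#
if( FALSE ){
  qchisq( 1 - 0.05, 4 - 1 ) # ~7.81; a 4-valued split with D below this is collapsed to a majority-vote leaf
}
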
decision_tree_predict_single_sample = function( tree, x ){
  #
  # Predicts the TRUE/FALSE label for a single sample using a decision tree.
  # The feature vector x CAN have missing values (denoted by the R symbol NA).
  # Vectors with missing features are assigned TRUE/FALSE labels according to Ex. 18.12
  #
  if( is.logical(tree) ){
    return(tree)
  }else{
    tsav = x[[ tree$root ]] # get this sample's value for the attribute we split on
    if( is.na(tsav) ){
      # This sample DOES NOT have this measurement: compute the weighted sum assuming
      # that x could have taken every possible value for this attribute.
      result = 0.0
      for( av in tree$attribute_values ){
        wght = tree$number_of_data_with_attribute[av] / sum( tree$number_of_data_with_attribute )
        result = result + wght * decision_tree_predict_single_sample( tree[[av]], x )
      }
      return(as.logical(round(result))) # convert to a bool result
    }else{ # this sample DOES have this measurement value
      matches = tree$attribute_values == tsav
      if( sum(matches)==0 ){
        print(sprintf("WARNING: the input vector x has attribute '%s' with value '%s' that was not seen in our training set ... returning a random choice", tree$root, tsav))
        return(sample(c(TRUE,FALSE),1))
      }
      av = tree$attribute_values[ matches ]
      return(decision_tree_predict_single_sample( tree[[av]], x ))
    }
  }
}

decision_tree_predict_multiple_samples = function( tree, X ){
  #
  # Predicts the TRUE/FALSE label for many samples using a decision tree.
  # Note: I was not sure how to code the prediction algorithm in a vectorized way.
  # If anyone knows how to do such a thing please contact me.
  #
  n_samples = dim(X)[1]
  y = rep( FALSE, n_samples ) # some initial values
  for( si in 1:n_samples ){
    y[si] = decision_tree_predict_single_sample( tree, X[si,] )
  }
  return(y)
}
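
# A minimal end-to-end sketch (not part of the original code): the toy attribute list, data
# frame, and labels below are made up for illustration, loosely following the book's
# restaurant example.  It builds a tree and then predicts, including a sample whose
# "patrons" value is missing (NA).
#
if( FALSE ){
  toy_attributes = list( patrons=c("None","Some","Full"),
                         hungry=c("Yes","No") )
  toy_X = data.frame( patrons=c("Some","Full","Some","Full","None"),
                      hungry =c("Yes","Yes","No","No","No"),
                      stringsAsFactors=FALSE )
  toy_y = c(TRUE, TRUE, TRUE, FALSE, FALSE)

  toy_tree = decision_tree_learning( toy_X, toy_y, toy_attributes, default=majority_vote(toy_y) )
  print( toy_tree$root )                                              # "patrons" is chosen as the root split
  print( decision_tree_predict_multiple_samples( toy_tree, toy_X ) )  # reproduces toy_y on the training data

  new_x = data.frame( patrons=NA, hungry="Yes", stringsAsFactors=FALSE )
  print( decision_tree_predict_single_sample( toy_tree, new_x[1,] ) ) # NA handled by the proportional weighting -> TRUE
}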