#
# Written by:
# -- 
# John L. Weatherwax                2009-04-21
# 
# email: wax@alum.mit.edu
# 
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 464
#
#-----

source('predictor_importance_utils.R')

library(caret)
library(AppliedPredictiveModeling)
library(C50) # needed for the churn dataset
library(CORElearn)

data(churn) # loads churnTrain & churnTest

# NOTE(review): recent C50 releases may no longer bundle the churn data (it
# appears to have moved to the modeldata package) -- confirm for the installed
# version.  Fail early with a clear message if the load did not work.
if( !exists("churnTrain") ){
    stop("churnTrain was not loaded by data(churn)", call.=FALSE)
}

# Find the categorical predictors:
#
# is.factor() is used instead of comparing class() against "factor": class()
# has length two for ordered factors, which makes that comparison unusable as
# an if() condition.  The indices are computed in one vectorised pass rather
# than by growing a vector inside a loop.
#
is_factor_col <- vapply( churnTrain, is.factor, logical(1) )
factor_col_indices <- unname( which( is_factor_col ) )

# Echo each categorical column name on its own line:
for( k in colnames(churnTrain)[factor_col_indices] ){
    print(k)
}

# What are the categorical features:
print( colnames( churnTrain )[factor_col_indices] )

# Correlations among the real valued (non-factor) predictors.
#
# largest_cors() is defined in predictor_importance_utils.R; only the first
# ten entries of what it returns are displayed.
#
LC <- largest_cors( churnTrain[, -factor_col_indices] )
print( head( LC, n=10 ) )

# b:
#
# The importance of the categorical predictors in predicting churn (itself a
# categorical variable).
#
# The response column is removed from the predictor set by name, rather than
# by assuming it is the last factor column found.  drop=FALSE keeps the
# predictors as a data frame even if only one remains.
#
response_index <- match( "churn", colnames(churnTrain) )
categorical_predictor_indices <- setdiff( factor_col_indices, response_index )
VI <- filterVarImp( x=churnTrain[, categorical_predictor_indices, drop=FALSE], y=churnTrain$churn )

# Sort by the value of the variable importance (the "yes" class column):
var_order <- order( VI[["yes"]], decreasing=TRUE )
VI <- as.matrix( VI[var_order,] )
print( VI[,"yes"] )

# ReliefF scores for the categorical predictors; the data passed in holds all
# factor columns, i.e. the categorical predictors together with churn.
# NOTE(review): scores may differ between runs if the 50 iterations sample
# instances at random -- confirm, and consider fixing a seed if so.
reliefalues <- attrEval( churn ~ ., data=churnTrain[, factor_col_indices], estimator="ReliefFequalK", ReliefIterations=50 )
print( sort( reliefalues, decreasing=TRUE ) )

# c:
#
# The importance of the continuous predictors in predicting churn.
#
# Columns are selected with positive indices: negating an empty index vector
# would select no columns at all instead of every column.
#
all_col_indices <- seq_len( ncol(churnTrain) )
continuous_predictor_indices <- setdiff( all_col_indices, factor_col_indices )
VI <- filterVarImp( x=churnTrain[, continuous_predictor_indices, drop=FALSE], y=churnTrain$churn )

# Sort by the value of the variable importance (the "yes" class column):
#
var_order <- order( VI[["yes"]], decreasing=TRUE )
VI <- as.matrix( VI[var_order,] )
print( VI[,"yes"] )

# ReliefF needs the response in the data, so only the categorical predictors
# are removed here.  The response is located by name rather than by assuming
# it is the last factor column.
# NOTE(review): scores may differ between runs if the 50 iterations sample
# instances at random -- confirm, and consider fixing a seed if so.
response_index <- match( "churn", colnames(churnTrain) )
categorical_predictor_indices <- setdiff( factor_col_indices, response_index )
relief_col_indices <- setdiff( all_col_indices, categorical_predictor_indices )
reliefalues <- attrEval( churn ~ ., data=churnTrain[, relief_col_indices, drop=FALSE], estimator="ReliefFequalK", ReliefIterations=50 )
print( sort( reliefalues, decreasing=TRUE ) )