#
# Written by:
# --
# John L. Weatherwax                2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 425
#
#-----

save_plots = F

library(caret)
library(arules) # needed for the AdultUCI data

data(AdultUCI)

AdultUCI$fnlwgt = NULL # we are told that this predictor is not informative and should not be used

# Drop any samples that don't have complete records:
AdultUCI = AdultUCI[complete.cases(AdultUCI),]

# Make sure that "large" is the first level of the factor:
AdultUCI$income = factor( as.character( AdultUCI$income ), levels=c( "large", "small" ) )

# Convert some of the column names so that they are easier to work with (i.e. remove the hyphens):
colnames( AdultUCI ) = as.character( lapply( colnames( AdultUCI ), function(x){ gsub("-","_",x) } ) )

# Let's look at the number of levels associated with each factor:
#
# It seems there are no factors with a huge number of levels.
#
print( levels( AdultUCI$workclass ) )      # 8 levels
print( levels( AdultUCI$education ) )      # 16 levels
print( levels( AdultUCI$marital_status ) ) # 7 levels
print( levels( AdultUCI$occupation ) )     # 14 levels
print( levels( AdultUCI$relationship ) )   # 6 levels
print( levels( AdultUCI$race ) )           # 5 levels
print( levels( AdultUCI$sex ) )            # 2 levels (of course)
print( levels( AdultUCI$native_country ) ) # 41 levels
print( levels( AdultUCI$income ) )         # 2 levels: small & large

print( table( AdultUCI$income ) )
T = table( AdultUCI$income )
print( T / sum(T) )

# Break our dataset into training/evaluation/testing datasets:
#
set.seed(156)
split1 = createDataPartition( AdultUCI$income, p=0.7 )[[1]]

# Simple verification that we have the same fraction of samples in each of these sets:
#
T_split1 = table( AdultUCI$income[split1] )
print( T_split1 / sum(T_split1) )

T_not_split1 = table( AdultUCI$income[-split1] )
print( T_not_split1 / sum(T_not_split1) )

other    = AdultUCI[-split1,]
training = AdultUCI[ split1,]

# Split "other" into an evaluation and a test set:
#
split2     = createDataPartition( other$income, p=1./3 )[[1]]
evaluation = other[ split2, ]
testing    = other[-split2, ]

# Let's check if we have any near zero variance predictors:
zero_cols = nearZeroVar( training[,-14] )
cn = colnames(training)
print( cn[zero_cols] )

# Drop the feature "native_country":
#
training$native_country   = NULL
evaluation$native_country = NULL
testing$native_country    = NULL

plot( density( training$capital_gain ) )
plot( density( training$capital_loss ) )

# There are no linearly dependent columns among the continuous features:
print( findLinearCombos( training[ , c( "age", "education_num", "capital_gain", "capital_loss", "hours_per_week" ) ] ) )

# Let's look at how well some of these continuous predictors separate the two income classes:
if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter16/chap_16_prob_1_boxplots.eps", onefile=FALSE, horizontal=FALSE) }
par(mfrow=c(1,2))
boxplot( age ~ income, data=training, ylab="Age" )
boxplot( education_num ~ income, data=training, ylab="Education Number" )
par(mfrow=c(1,1))
if( save_plots ){ dev.off() }

boxplot( capital_gain ~ income, data=training, main="capital gain as a function of income" )
boxplot( capital_loss ~ income, data=training, main="capital loss as a function of income" )
boxplot( hours_per_week ~ income, data=training, main="hours per week as a function of income" )

# Let's look at the correlation of the predictors that are numeric:
#
cor( training[ , c("age", "education_num", "capital_gain", "capital_loss", "hours_per_week") ] )
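# A minimal extra check (a sketch, not part of the original solution): quantify the heavy
# right skew visible in the capital_gain / capital_loss densities and look for strongly
# correlated numeric predictors.  Assumes the e1071 package (suggested by caret) is installed
# for skewness(); findCorrelation() comes from caret itself.
library(e1071)
numeric_cols = c( "age", "education_num", "capital_gain", "capital_loss", "hours_per_week" )
print( apply( training[ , numeric_cols], 2, skewness ) )                  # per-predictor skewness
print( findCorrelation( cor( training[ , numeric_cols] ), cutoff=0.75 ) ) # column indices to consider dropping (if any)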
# Part (c): Build several classification models (we follow the book heavily here):
#

# Wrapper functions for performance measures:
#
fiveStats = function(...) c(twoClassSummary(...), defaultSummary(...))

fourStats = function( data, lev=levels(data$obs), model=NULL ){
  accKapp = postResample( data[, "pred"], data[, "obs"] )
  out = c( accKapp,
           sensitivity(data[, "pred"], data[, "obs"], lev[1]),
           specificity(data[, "pred"], data[, "obs"], lev[2]) )
  names(out)[3:4] = c("Sens", "Spec")
  out
}

ctrl = trainControl( method="cv", number=5, classProbs=TRUE, summaryFunction=fiveStats, verboseIter=TRUE )

ctrlNoProb = ctrl
ctrlNoProb$classProbs = FALSE
ctrlNoProb$summaryFunction = fourStats

set.seed(1410)
rfFit = train( income ~ ., data=training, method="rf", trControl=ctrl, ntree=500, tuneLength=5, metric="ROC" )

set.seed(1410)
lrFit = train( income ~ ., data=training, method="glm", trControl=ctrl, metric="ROC" )

set.seed(1410)
fdaFit = train( income ~ ., data=training, method="fda", tuneGrid=data.frame(.degree=1, .nprune=1:25), trControl=ctrl, metric="ROC" )

# Compare the cross-validated estimates of AUC, sensitivity, and specificity for these methods:
#
res = matrix( data=c( mean( rfFit$resample$ROC ),  mean( rfFit$resample$Spec ),  mean( rfFit$resample$Sens ),
                      mean( lrFit$resample$ROC ),  mean( lrFit$resample$Spec ),  mean( lrFit$resample$Sens ),
                      mean( fdaFit$resample$ROC ), mean( fdaFit$resample$Spec ), mean( fdaFit$resample$Sens ) ),
              nrow=3, ncol=3, byrow=T )
res = data.frame( res )
rownames(res) = c("RF", "LR", "FDA")
colnames(res) = c("AUC", "Spec", "Sens")
print(res)

# Let's look at the predictions from each model on the evaluation dataset:
#
evalResults = data.frame( income = evaluation$income ) # put in the truth
evalResults$RF     = predict( rfFit,  newdata=evaluation, type="prob" )[,1]
evalResults$LogReg = predict( lrFit,  newdata=evaluation, type="prob" )[,1]
evalResults$FDA    = predict( fdaFit, newdata=evaluation, type="prob" )[,1]

# Part (d): Find a good trade-off between sensitivity and specificity:
#
# The ROC curves (see the note on the event of interest class in the text):
#
library(pROC)

rfROC     = roc( evalResults$income, evalResults$RF,     levels=rev( levels(evalResults$income) ) )
logRegROC = roc( evalResults$income, evalResults$LogReg, levels=rev( levels(evalResults$income) ) )
FDAROC    = roc( evalResults$income, evalResults$FDA,    levels=rev( levels(evalResults$income) ) )

if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter16/chap_16_prob_1_roc_curves.eps", onefile=FALSE, horizontal=FALSE) }
plot(rfROC,     legacy.axes=TRUE,        col="red")
plot(logRegROC, legacy.axes=TRUE, add=T, col="blue")
plot(FDAROC,    legacy.axes=TRUE, add=T, col="green")
legend( x="bottomright",
        c(sprintf("RF (%f)", rfROC$auc), sprintf("LR (%f)", logRegROC$auc), sprintf("FDA (%f)", FDAROC$auc)),
        col=c("red","blue","green"), lty=c(1,1,1) )
if( save_plots ){ dev.off() }

# The lift plots:
#
if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter16/chap_16_prob_1_lift_curves.eps", onefile=FALSE, horizontal=FALSE) }
labs = c(RF = "Random Forest", LogReg = "Logistic Regression", FDA = "FDA (MARS)")
lift1 = lift( income ~ RF + LogReg + FDA, data=evalResults, labels=labs )
xyplot(lift1, ylab="%Events Found", xlab="%Customers Evaluated", lwd=2, type="l", col=c("red", "blue", "green"))
if( save_plots ){ dev.off() }

# Alternate cutoffs (using the logistic regression classifier):
#
baseline              = coords( logRegROC, x=0.5,    input="threshold" )
logRegThresh_ctopleft = coords( logRegROC, x="best", best.method="closest.topleft" )
logRegThresh_youden   = coords( logRegROC, x="best", best.method="youden" )
print( rbind( baseline, logRegThresh_ctopleft, logRegThresh_youden ) )
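# A minimal sketch (not part of the original solution) of applying one of these alternate
# cutoffs: threshold the evaluation-set probabilities at the "closest.topleft" value and
# summarize with caret's confusionMatrix.  Depending on the pROC version, coords() returns a
# named vector or a data frame; the [["threshold"]] extraction below assumes either form.
ctopleft_thresh = logRegThresh_ctopleft[["threshold"]]
lr_class_at_thresh = factor( ifelse( evalResults$LogReg >= ctopleft_thresh, "large", "small" ),
                             levels=levels(evalResults$income) )
print( confusionMatrix( data=lr_class_at_thresh, reference=evalResults$income, positive="large" ) )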
#
# Sampling Methods (EPage 422):
#

# Upsampling:
#
set.seed(1103)
upSampleTraining = upSample( x=training[,-13], y=training$income, yname="income" )
print( table( upSampleTraining$income ) )

# Let's build the same models as above but using this new dataset:
#
set.seed(1410)
rfFit_us = train( income ~ ., data=upSampleTraining, method="rf", trControl=ctrl, ntree=500, tuneLength=5, metric="ROC" )

set.seed(1410)
lrFit_us = train( income ~ ., data=upSampleTraining, method="glm", trControl=ctrl, metric="ROC" )

set.seed(1410)
fdaFit_us = train( income ~ ., data=upSampleTraining, method="fda", tuneLength=10, trControl=ctrl, metric="ROC" )

# Compare the cross-validated estimates of AUC, sensitivity, and specificity for these methods:
#
res = matrix( data=c( mean( rfFit_us$resample$ROC ),  mean( rfFit_us$resample$Spec ),  mean( rfFit_us$resample$Sens ),
                      mean( lrFit_us$resample$ROC ),  mean( lrFit_us$resample$Spec ),  mean( lrFit_us$resample$Sens ),
                      mean( fdaFit_us$resample$ROC ), mean( fdaFit_us$resample$Spec ), mean( fdaFit_us$resample$Sens ) ),
              nrow=3, ncol=3, byrow=T )
res = data.frame( res )
rownames(res) = c("RF", "LR", "FDA")
colnames(res) = c("AUC", "Spec", "Sens")
print(res)

# What are our default sensitivity and specificity for these methods on the evaluation set?
#
evalResults = data.frame( income = evaluation$income ) # put in the truth
evalResults$RF     = predict( rfFit_us,  newdata=evaluation, type="prob" )[,1]
evalResults$LogReg = predict( lrFit_us,  newdata=evaluation, type="prob" )[,1]
evalResults$FDA    = predict( fdaFit_us, newdata=evaluation, type="prob" )[,1]

rfROC     = roc( evalResults$income, evalResults$RF,     levels=rev( levels(evalResults$income) ) )
logRegROC = roc( evalResults$income, evalResults$LogReg, levels=rev( levels(evalResults$income) ) )
FDAROC    = roc( evalResults$income, evalResults$FDA,    levels=rev( levels(evalResults$income) ) )

baseline              = coords( logRegROC, x=0.5,    input="threshold" )
logRegThresh_ctopleft = coords( logRegROC, x="best", best.method="closest.topleft" )
logRegThresh_youden   = coords( logRegROC, x="best", best.method="youden" )
print( rbind( baseline, logRegThresh_ctopleft, logRegThresh_youden ) )

#---
# Part (f): Cost Sensitive Models (EPage 423):
#---

# Cost sensitive SVMs:
#
library(kernlab)

set.seed(1157)
sigma = sigest( income ~ ., data=training, frac=0.75 )
names(sigma) = NULL
svmGrid = data.frame( .sigma=sigma[2], .C=2^seq( -6, +1, length=15 ) )

set.seed(1401)
SVMwts = train( income ~ ., data=training, method="svmRadial",
                tuneGrid=svmGrid, preProc=c("center", "scale"),
                class.weights=c(large=10, small=1),
                metric="Sens", trControl=ctrlNoProb )

SVMwts_predictions = predict( SVMwts, newdata=evaluation )
SVMwts_results = c( sensitivity( data=SVMwts_predictions, reference=evaluation$income, positive="large" ),
                    specificity( data=SVMwts_predictions, reference=evaluation$income, negative="small" ) )

# Let's check that the functions we are using above are correct:
#
sum( SVMwts_predictions=="large" & evaluation$income=="large" ) / sum( evaluation$income=="large" ) # check the calculation of sensitivity
sum( SVMwts_predictions!="large" & evaluation$income!="large" ) / sum( evaluation$income!="large" ) # check the calculation of specificity
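# A minimal cross-check (not in the original): caret's confusionMatrix reports the same
# sensitivity and specificity, along with accuracy and Kappa, in a single call.
print( confusionMatrix( data=SVMwts_predictions, reference=evaluation$income, positive="large" ) )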
# Cost sensitive CART models:
#
library(rpart)

# The loss matrix: a false negative (classifying a true "large" income as "small") costs ten
# times as much as a false positive.
costMatrix = matrix( c(0, 1, 10, 0), ncol=2 )
rownames(costMatrix) = levels(training$income)
colnames(costMatrix) = levels(training$income)

set.seed(1401)
cartCosts = train( x=training[,-13], y=training$income, method="rpart", tuneLength=10,
                   parms=list(loss=costMatrix), metric="Kappa", trControl=ctrlNoProb )

cart_predictions = predict( cartCosts, newdata=evaluation )
cart_results = c( sensitivity( data=cart_predictions, reference=evaluation$income, positive="large" ),
                  specificity( data=cart_predictions, reference=evaluation$income, negative="small" ) )

# Cost sensitive C5.0 models:
#
library(C50)

# The same costs as above, but transposed to match C5.0's cost matrix convention:
c5Matrix = matrix( c(0, 10, 1, 0), ncol=2 )
rownames(c5Matrix) = levels(training$income)
colnames(c5Matrix) = levels(training$income)

set.seed(1401)
C5Costs = train( x=training[,-13], y=training$income, method="C5.0", tuneLength=10,
                 cost=c5Matrix, metric="Kappa", trControl=ctrlNoProb )

C5_predictions = predict( C5Costs, newdata=evaluation )
C5_results = c( sensitivity( data=C5_predictions, reference=evaluation$income, positive="large" ),
                specificity( data=C5_predictions, reference=evaluation$income, negative="small" ) )

# Print all of the results (sorted by sensitivity):
#
M = matrix( c( SVMwts_results, cart_results, C5_results ), nrow=3, ncol=2, byrow=T )
rownames(M) = c("SVM", "CART", "C5.0")
colnames(M) = c("sensitivity", "specificity")
print( M[ order( M[,1] ), ] )
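# A minimal sketch of a final hold-out evaluation (the "testing" split created above is never
# used in the analysis).  Shown here for the cost sensitive C5.0 model as an example; any of
# the other fitted models could be substituted.
C5_test_predictions = predict( C5Costs, newdata=testing )
print( confusionMatrix( data=C5_test_predictions, reference=testing$income, positive="large" ) )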