# # Written by: # -- # John L. Weatherwax 2009-04-21 # # email: wax@alum.mit.edu # # Please send comments and especially bug reports to the # above email address. # # EPage 350 # #----- save_plots = FALSE if( ! require("MASS") ){ install.packages("MASS") } if( ! require("ISLR") ){ install.packages("ISLR") } if( ! require("gbm") ){ install.packages("gbm") } if( ! require("glmnet") ){ install.packages("glmnet") } if( ! require("randomForest") ){ install.packages("randomForest") } set.seed(0) vt100ClearScreen <- function(...) cat("\033[2J") vt100ClearScreen() Caravan = na.omit(Caravan) n = nrow(Caravan) p = ncol(Caravan) - 1 # one column is the response we are trying to model i.e. "Purchase" train = 1:1000 test = 1001:n # Transform the response "Purchase" to be in [0,1] as required by gbm: # PurchaseBinary = rep(0,n) PurchaseBinary[ Caravan$Purchase == 'Yes' ] = 1 Caravan$Purchase = PurchaseBinary # Some variables seem to be very noninformative (have zero variance as reported by gbm): # Caravan$PVRAAUT = NULL Caravan$AVRAAUT = NULL # Train a gbm: # lm = 0.01 boost.caravan = gbm( Purchase ~ ., data=Caravan[train,], distribution="bernoulli", n.trees=1000, interaction.depth=2, shrinkage=lm ) summary( boost.caravan ) # Predict the testing error: # y_hat = predict( boost.caravan, newdata=Caravan[test,], n.trees=1000 ) p_hat = exp( y_hat ) / ( 1 + exp( y_hat ) ) # convert the logodd output into probabilities will_buy = rep( 0, length(test) ) will_buy[ p_hat > 0.2 ] = 1 # Create a confusion matrix: # table( will_buy, Caravan[test,]$Purchase ) # Train a logistic regression: # lr_model = glm( Purchase ~ ., data=Caravan[train,], family="binomial" ) y_hat = predict( lr_model, newdata=Caravan[test,] ) p_hat = exp( y_hat ) / ( 1 + exp( y_hat ) ) # convert the logodd output into probabilities will_buy = rep( 0, length(test) ) will_buy[ p_hat > 0.2 ] = 1 # Create a confusion matrix: # table( will_buy, Caravan[test,]$Purchase ) # Try bagging (not implemented/tested): # if( FALSE ){ bag.hitters = randomForest( Purchase ~ ., data=Caravan, mtry=p, ntree=1000, importance=TRUE, subset=train ) y_hat = predict( bag.hitters, newdata=Caravan[test,] ) mse.bag = mean( ( Caravan[test,]$Purchase - y_hat )^2 ) print( 'randomForest test MSE:' ) print( mse.bag ) }