#
# Written by:
# --
# John L. Weatherwax                2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 520-522
#
# The continuous solubility outcome is binned into three classes
# (Insoluble / MidRange / Soluble) using mean -/+ one standard deviation of
# the training outcome as cut points.  Several classifiers are tuned with
# caret on the training set and scored on the test set.  Part (c) repeats
# the exercise after dropping the MidRange class.
#
#-----

library(caret)
library(AppliedPredictiveModeling)
library(kernlab)

data(solubility)

trainData = solTrainXtrans

# Interior cut points: one standard deviation either side of the training mean.
lowcut = mean(solTrainY) - sd(solTrainY)
highcut = mean(solTrainY) + sd(solTrainY)

# The outer breaks are open ended.  Using min/max of the *training* outcome
# here would turn any test value outside the training range into NA, which
# would then leak into the "MidRange" filtering and the summaries below.
# For the training data the resulting bins are the same as with min/max.
breakpoints = c(-Inf, lowcut, highcut, Inf)
groupNames = c("Insoluble", "MidRange", "Soluble")

solTrainY3bin = cut( solTrainY, breaks=breakpoints, include.lowest=TRUE, labels=groupNames )
solTestY3bin = cut( solTestY, breaks=breakpoints, include.lowest=TRUE, labels=groupNames )

# Helper: Accuracy and Kappa (via caret::defaultSummary) of `model` on held-out data.
#   model - object accepted by predict(), here a caret `train` fit
#   newX  - predictors to score
#   obs   - observed class labels (factor) for newX
testAccKappa = function( model, newX, obs ){
  y_hat = predict( model, newdata=newX )
  defaultSummary( data.frame( pred=y_hat, obs=obs ) )
}

# Helper: one-row data.frame with sensitivity and specificity of `model` on
# held-out data.  No positive/negative level is passed, so caret's defaults apply.
testSensSpec = function( model, newX, obs ){
  y_hat = predict( model, newdata=newX )
  data.frame( Sens=sensitivity( y_hat, obs ), Spec=specificity( y_hat, obs ) )
}

# Resampling folds are created once and shared by every 3-class model so that
# the cross-validation results are computed on the same splits.
set.seed(100)
indx3bin = createFolds( solTrainY3bin, returnTrain=TRUE )
ctrl3bin = trainControl( method="cv", index=indx3bin, classProbs=TRUE, savePredictions=TRUE )

# Part (a): Build several models (linear/nonlinear/tree based):
#
lda3bin = train( x=trainData, y=solTrainY3bin, method="lda",
                 preProcess=c("center","scale"), metric="Kappa",
                 tuneLength=10, trControl=ctrl3bin )

#glmnet3bin = train( x=trainData, y=solTrainY3bin, method="glmnet", preProc=c("center","scale"), metric="Kappa", tuneLength=10, trControl=ctrl3bin )

# Fix the RBF kernel width at the first value returned by sigest() and tune
# only the cost parameter over 2^-4 ... 2^4.
sigmaEst = kernlab::sigest( as.matrix(trainData) )
svmGrid = expand.grid( .sigma=sigmaEst[1], .C=2^seq(-4,+4) )
svm3bin = train( x=trainData, y=solTrainY3bin, method="svmRadial",
                 preProcess=c("center","scale"), metric="Kappa",
                 tuneGrid=svmGrid, trControl=ctrl3bin )

rpart3bin = train( x=trainData, y=solTrainY3bin, method="rpart",
                   metric="Kappa", tuneLength=30, trControl=ctrl3bin )

# Part (b): Predict with each model on the test data:
#
ldaDSummary = testAccKappa( lda3bin, solTestXtrans, solTestY3bin )
# NOTE: the name `svdSummary` is kept from the original script; it holds the
# SVM results.  The printed table is labelled explicitly below.
svdSummary = testAccKappa( svm3bin, solTestXtrans, solTestY3bin )
rpartDSummary = testAccKappa( rpart3bin, solTestXtrans, solTestY3bin )

res = rbind( ldaDSummary, svdSummary, rpartDSummary )
rownames(res) = c("LDA", "SVM", "RPart")
print( res[ order( res[,"Kappa"] ), ] )   # worst to best by test-set Kappa

# Part (c): drop the MidRange samples and repeat as a two-class problem:
#
keepTrain = solTrainY3bin != "MidRange"
trainXfiltered2bin = trainData[keepTrain,]
solTrainY2bin = droplevels( solTrainY3bin[keepTrain] ) # need only two levels

keepTest = solTestY3bin != "MidRange"
testXfiltered2bin = solTestXtrans[keepTest,]
solTestY2bin = droplevels( solTestY3bin[keepTest] )

set.seed(100)
indx2bin = createFolds( solTrainY2bin, returnTrain=TRUE )
ctrl2bin = trainControl( method="cv", index=indx2bin, classProbs=TRUE, savePredictions=TRUE )

lda2bin = train( x=trainXfiltered2bin, y=solTrainY2bin, method="lda",
                 preProcess=c("center","scale"), metric="Kappa",
                 tuneLength=10, trControl=ctrl2bin )

sigmaEst = kernlab::sigest( as.matrix(trainXfiltered2bin) )
svmGrid = expand.grid( .sigma=sigmaEst[1], .C=2^seq(-4,+4) )
svm2bin = train( x=trainXfiltered2bin, y=solTrainY2bin, method="svmRadial",
                 preProcess=c("center","scale"), metric="Kappa",
                 tuneGrid=svmGrid, trControl=ctrl2bin )

rpart2bin = train( x=trainXfiltered2bin, y=solTrainY2bin, method="rpart",
                   metric="Kappa", tuneLength=30, trControl=ctrl2bin )

# Predict using each model on the test data:
#
ldaDSummary = testSensSpec( lda2bin, testXfiltered2bin, solTestY2bin )
svmSummary = testSensSpec( svm2bin, testXfiltered2bin, solTestY2bin )
rpartDSummary = testSensSpec( rpart2bin, testXfiltered2bin, solTestY2bin )

res = rbind( ldaDSummary, svmSummary, rpartDSummary )
rownames(res) = c("LDA", "SVM", "RPart")
print( res[ order( res$Sens ), ] )