load_spam_data <- function(trainingScale=TRUE,responseScale=TRUE){ # # R code to load in the spam data set from the book ESLII # # Output: # # res: list of data frames XT # # Written by: # -- # John L. Weatherwax 2009-04-21 # # email: wax@alum.mit.edu # # Please send comments and especially bug reports to the # above email address. # #----- X = read.table("../../Data/spam.data") tt = read.table("../../Data/spam.traintest") # separate into training/testing sets # (ESLII on epage 319 says that we have 3065 training instances) # XTraining = subset( X, tt==0 ) p = dim(XTraining)[2]-1 XTesting = subset( X, tt==1 ) # (ESLII on epage 319 says that we have 1536 testing instances) # # Sometime data is processed and stored in a certain order. When doing cross validation # on such data sets we don't want to bias our results if we grab the first or the last samples. # Thus we randomize the order of the rows in the Training data frame to make sure that each # cross validation training/testing set is as random as possible. # if( FALSE ){ nSamples = dim(XTraining)[1] inds = sample( 1:nSamples, nSamples ) XTraining = XTraining[inds,] } # # In reality we have to estimate everything based on the training data only # Thus here we estimate the predictor statistics using the training set # and then scale the testing set by the same statistics # if( trainingScale ){ X = XTraining if( responseScale ){ meanV58 = mean(X$V58) v58 = X$V58 - meanV58 }else{ v58 = X$V58 } X$V58 = NULL X = scale(X, TRUE, TRUE) means = attr(X,"scaled:center") stds = attr(X,"scaled:scale") Xf = data.frame(X) Xf$V58 = v58 XTraining = Xf # scale the testing predictors by the same amounts: # DCVTest = XTesting if( responseScale ){ v58Test = DCVTest$V58 - meanV58 }else{ v58Test = DCVTest$V58 # in physical units (not mean adjusted) } DCVTest$V58 = NULL DCVTest = t( apply( DCVTest, 1, '-', means ) ) DCVTest = t( apply( DCVTest, 1, '/', stds ) ) DCVTestb = cbind( DCVTest, v58Test ) # append back on the response DCVTestf = data.frame( DCVTestb ) # a data frame containing all scaled variables of interest names(DCVTestf)[p+1] = "V58" # fix the name of the response XTesting = DCVTestf } # Many algorithms wont do well if the data is presented all of one class and # then all of another class thus we permute our data frames : # XTraining = XTraining[sample(nrow(XTraining)),] XTesting = XTesting[sample(nrow(XTesting)),] # Read in the list of s(pam)words (and delete garbage characters): # spam_words = read.table("../../Data/spambase.names",skip=33,sep=":",comment.char="|",stringsAsFactors=F) spam_words = spam_words[[1]] for( si in 1:length(spam_words) ){ spam_words[si] = sub( "word_freq_", "", spam_words[si] ) spam_words[si] = sub( "char_freq_", "", spam_words[si] ) } return( list( XTraining, XTesting, spam_words ) ) }