#
# Written by:
# -- 
# John L. Weatherwax                2009-04-21
# 
# email: wax@alum.mit.edu
# 
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 187
#
#-----

save_plots = F

if( ! require("ISLR") ){ install.packages("ISLR") }
if( ! require("MASS") ){ install.packages("MASS") }
if( ! require("class") ){ install.packages("class") }

set.seed(0)

vt100ClearScreen <- function(...) cat("\033[2J") 
vt100ClearScreen()

n = dim(Boston)[1]

# Introduce a variable whether or not the crime rate is above=1 / below=0 the median 
Boston$crim01 = rep( 0, n )
Boston$crim01[ Boston$crim >= median( Boston$crim ) ] = 1
Boston$crim = NULL 

# Look to see what features are most strongly correlated with crim01:
#
Boston.cor = cor( Boston )
print( sort( Boston.cor[,'crim01'] ) )

# Split the data set into testing and training parts:
inds.train = sample(1:n,3*n/4)
inds.test = (1:n)[-inds.train]
Boston.train = Boston[inds.train,]
Boston.test = Boston[inds.test,]

# Fit several models to the training data 
#
lr_model = glm( crim01 ~ nox + rad + dis, data=Boston.train, family=binomial )

p_hat = predict( lr_model, newdata=Boston.test, type="response" )
y_hat = rep( 0, length(p_hat) )
y_hat[ p_hat > 0.5 ] = 1
CM = table( predicted=y_hat, truth=Boston.test$crim01 )
print( CM )
print( sprintf( "LR: overall fraction correct= %10.6f", ( CM[1,1] + CM[2,2] ) / sum(CM) ) )

lda.fit = lda( crim01 ~ nox + rad + dis, data=Boston.train )

# Use LDA
#
lda.fit = lda( crim01 ~ nox + rad + dis, data=Boston.train )

lda.predict = predict( lda.fit, newdata=Boston.test )
CM = table( predicted=lda.predict$class, truth=Boston.test$crim01 )
print( CM )
print( sprintf( "LDA: overall fraction correct= %10.6f", ( CM[1,1] + CM[2,2] ) / sum(CM) ) )


#f: Use QDA
#
qda.fit = qda( crim01 ~ nox + rad + dis, data=Boston.train )

qda.predict = predict( qda.fit, newdata=Boston.test ) 
CM = table( predicted=qda.predict$class, truth=Boston.test$crim01 )
print( CM )
print( sprintf( "QDA: overall fraction correct= %10.6f", ( CM[1,1] + CM[2,2] ) / sum(CM) ) )

#g: KNN 
#
X.train = Boston.train; X.train$"crim01" = NULL 
Y.train = Boston.train$"crim01"

X.test = Boston.test; X.test$"crim01" = NULL 

y_hat_k_1 = knn( X.train, X.test, Y.train, k=1 )

CM = table( predicted=y_hat_k_1, truth=Boston.test$crim01 ) 
print( CM )
print( sprintf( "KNN (k=1): overall fraction correct= %10.6f", ( CM[1,1] + CM[2,2] ) / sum(CM) ) )

y_hat_k_3 = knn( X.train, X.test, Y.train, k=3 )
CM = table( predicted=y_hat_k_3, truth=Boston.test$crim01 ) 
print( CM )
print( sprintf( "KNN (k=3): overall fraction correct= %10.6f", ( CM[1,1] + CM[2,2] ) / sum(CM) ) )