#
# Written by:
# -- 
# John L. Weatherwax                2009-04-21
# 
# email: wax@alum.mit.edu
# 
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 464
#
#-----

# When TRUE, each figure below is bracketed by postscript()/dev.off() so that
# it is written to an EPS file.  Spelled FALSE rather than F because F is an
# ordinary variable that can be reassigned.
save_plots = FALSE

source('predictor_importance_utils.R')

library(caret)
library(AppliedPredictiveModeling)
library(corrplot)

# Bring the abalone data set into the workspace.
data(abalone)

# a: Look at how the response depends on the predictors.
#
# First view: the distribution of Rings within each level of Type.
#
if( save_plots ){
    postscript("../../WriteUp/Graphics/Chapter18/chap_18_prob_3_type_boxplot.eps", onefile=FALSE, horizontal=FALSE)
}
boxplot( Rings ~ Type, data=abalone )
if( save_plots ){
    dev.off()
}

# Plot Rings against every predictor other than Type, one plot at a time,
# waiting for the user to press enter between plots.
#
cn = colnames(abalone)

# Select the columns by name instead of by the hard-coded positions 2:8, so the
# loop does not silently plot the wrong columns if the column order changes.
predictor_names = setdiff( cn, c("Type", "Rings") )

for( predictor in predictor_names ){
    plot( abalone[,predictor], abalone$Rings, main=predictor, xlab=predictor, ylab="Rings" )
    cat ("Press [enter] to continue")
    readline() # only used to pause; the typed text is not needed so it is not stored
}

#--
# Save scatter plots that look interesting:
#--

# Height vs. Rings (we have two outliers):
#
# Left panel: all observations with the least-squares line.
# Right panel: the same plot, on the same axes, after dropping the observations
# with a large Cook's distance.
#
if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter18/chap_18_prob_3_height_plots.eps", onefile=FALSE, horizontal=FALSE) }
par(mfrow=c(1,2))
plot( Rings ~ Height, data=abalone )
orig_limits = par("usr") # extremes (x1, x2, y1, y2) of the plotting region of the first panel
m_orig = lm( Rings ~ Height, data=abalone )
abline(m_orig)
grid()

# Drop these outliers and replot the scatter plot:
#
# The common rule of thumb 4 / dim(abalone)[1]
# (see http://en.wikipedia.org/wiki/Cook%27s_distance) seems too stringent for
# this data, so a fixed cutoff is used instead.
#
c_thresh = 0.1
cd_mask = cooks.distance(m_orig) > c_thresh
print( abalone[cd_mask,] )

# Keep every row that was not flagged.  A logical mask is used instead of
# -which(cd_mask): if nothing were flagged, which() would return integer(0) and
# the negative index would then select zero rows rather than all of them.
keep_mask = !( cd_mask %in% TRUE )
abalone_trimmed = abalone[keep_mask,]

# xaxs/yaxs="i" make the axes span exactly the requested limits.  With the
# default style the limits taken from par("usr") would be padded a second time
# and the two panels would not share the same axes.
plot( Rings ~ Height, data=abalone_trimmed, xlim=orig_limits[1:2], ylim=orig_limits[3:4], xaxs="i", yaxs="i" )
m = lm( Rings ~ Height, data=abalone_trimmed )
abline(m)
grid()
par(mfrow=c(1,1))
if( save_plots ){ dev.off() }

# Look at how different the coefficients would be with and without these two points:
#
print( coef(m_orig) )
print( coef(m) )

# Two further scatter plots, drawn side by side: Rings against Diameter and
# Rings against VisceraWeight.
#
if( save_plots ){
    postscript("../../WriteUp/Graphics/Chapter18/chap_18_prob_3_feature_scatter_plots.eps", onefile=FALSE, horizontal=FALSE)
}

par(mfrow=c(1,2)) # one row, two panels
plot( Rings ~ Diameter, data=abalone, main="Diameter" )
plot( Rings ~ VisceraWeight, data=abalone, main="VisceraWeight" )
par(mfrow=c(1,1)) # back to a single panel per figure

if( save_plots ){
    dev.off()
}

# b: Scatter plot matrix of every column of abalone against every other column.
#
if( save_plots ){
    postscript("../../WriteUp/Graphics/Chapter18/chap_18_prob_3_pairs_plot.eps", onefile=FALSE, horizontal=FALSE)
}
pairs(abalone)
if( save_plots ){
    dev.off()
}

# What are the variables that have the largest correlations (don't include the predictor "type" which is a factor):
#
# NOTE(review): largest_cors is not defined in this file; presumably it comes
# from the sourced predictor_importance_utils.R -- confirm.  Its call is left
# as it was because its printing behaviour cannot be seen from here.
largest_cors( abalone[,-1] )
library(corrplot)

# Compute the correlation matrix once and reuse it for the plot and the filter.
cor_mat = cor( abalone[,-1] )

if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter18/chap_18_prob_3_corrplot.eps", onefile=FALSE, horizontal=FALSE) }
corrplot( cor_mat, order="hclust" )
if( save_plots ){ dev.off() }

# Print explicitly, like the other results in this file, so the output also
# appears when the file is run through source() (which does not autoprint).
# The indices returned refer to the columns of abalone[,-1].
print( findCorrelation( cor_mat, cutoff=0.75 ) )

# c:
#

# Try one method at assessing importance:
#
# The last column of abalone is passed as the response and all the other
# columns as the predictors.
response_col = dim(abalone)[2]
VI = filterVarImp( abalone[,1:(response_col-1)], abalone[,response_col] )
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables.
print( VI[ order(VI$Overall, decreasing=TRUE), , drop=FALSE ] )

# Try a second method of assessing importance:
library(CORElearn)

# The first column is dropped before the scores are computed.
reliefvalues = attrEval( Rings ~ ., data=abalone[,-1], estimator="RReliefFequalK" )
print( sort( reliefvalues, decreasing=TRUE ) )

# d (apply PCA to the continuous data):
#
# Every column is centered and scaled before the decomposition.  The argument
# is spelled scale. (its full name) rather than relying on partial matching of
# "scale", and TRUE is used instead of the reassignable T.
#
# NOTE(review): abalone[,-1] drops only the first column, so Rings is still part
# of the PCA input.  If only the predictors are wanted, Rings should be dropped
# as well -- confirm the intent.
pca_out = prcomp( abalone[,-1], center=TRUE, scale.=TRUE )
if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter18/chap_18_prob_3_pca_plot.eps", onefile=FALSE, horizontal=FALSE) }
plot( pca_out )
if( save_plots ){ dev.off() }