# 
# Written by:
# -- 
# John L. Weatherwax                2009-04-21
# 
# email: wax@alum.mit.edu
# 
# Please send comments and especially bug reports to the
# above email address.
# 
#-----

#
# EPage 140
#

# Set to TRUE to send each diagnostic plot below to an EPS file (via
# postscript()) instead of the active graphics device.
# Spelled out as FALSE because `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved constant.
save_plots <- FALSE

# Load the body-fat data set; the first line of the CSV is treated as the
# column names.
DF <- read.csv(file = "../../Data/bodyfat.csv", header = TRUE)

# Let's look for the most significant predictors: regress PBF on every other
# column of DF except Density, then inspect the coefficient table.
#
m <- lm(PBF ~ . - Density, data = DF)
summary(m)

# Look for transformations of the top predictors that will make them more
# normal. For each predictor the density of the raw values is drawn next to
# the density of the log-transformed values. Observations:
#   Abdomen_Circ: skewed to the right; log is slightly better (less skewed)
#   Wrist_Circ:   skewed to the right; log makes not much difference
#
invisible(lapply(c("Abdomen_Circ", "Wrist_Circ"), function(predictor) {
  values <- DF[[predictor]]
  par(mfrow = c(1, 2)) # two panels side by side
  plot(density(values), main = predictor)
  plot(density(log(values)), main = paste0("log(", predictor, ")"))
  par(mfrow = c(1, 1)) # back to a single panel
}))

# Let's use backwards stepwise regression, starting from the full model m,
# to remove predictors:
#
sm <- step(m)
summary(sm)

# Remove Hip_Circ from the model selected by step():
#
sm1 <- update(sm, . ~ . - Hip_Circ)
summary(sm1)

# Can/should we remove Neck_Circ as well?
sm2 <- update(sm1, . ~ . - Neck_Circ)
summary(sm2)

# Can/should we remove Thigh_Circ? Note that this model is derived from sm1
# (not sm2), so the Neck_Circ removal tried above is not applied here.
sm3 <- update(sm1, . ~ . - Thigh_Circ)
summary(sm3)

# Look at Cook's D to find the samples that have undue influence on the
# linear model's coefficients. When save_plots is set, the figure is written
# to an EPS file and the device is closed afterwards.
#
if (save_plots) {
  postscript(
    "../../WriteUp/Graphics/Chapter6/pbf_plot_which_4.eps",
    onefile = FALSE, horizontal = FALSE
  )
}
plot(sm1, which = 4)
if (save_plots) {
  dev.off()
}

# Print the samples that had the most extreme values. The column summary is
# shown first so the individual rows can be compared against it:
#
feats <- c(
  "PBF", "Age", "Weight", "Neck_Circ", "Abdomen_Circ",
  "Thigh_Circ", "Forearm_Circ", "Wrist_Circ"
)
summary(DF[, feats])

# NOTE(review): rows 65, 139 and 193 are presumably the observations labelled
# on the Cook's distance plot of sm1 -- confirm if the data or model changes.
DF[c(65, 139, 193), feats]

# Look for outliers to our model using the first lm diagnostic plot of sm1.
# When save_plots is set, the figure is written to an EPS file and the device
# is closed afterwards.
#
if (save_plots) {
  postscript(
    "../../WriteUp/Graphics/Chapter6/pbf_plot_which_1.eps",
    onefile = FALSE, horizontal = FALSE
  )
}
plot(sm1, which = 1)
if (save_plots) {
  dev.off()
}

# NOTE(review): rows 133, 136 and 201 are presumably the observations labelled
# on the plot above -- confirm if the data or model changes.
DF[c(133, 136, 201), feats]