# 
# Written by:
# -- 
# John L. Weatherwax                2009-04-21
# 
# email: wax@alum.mit.edu
# 
# Please send comments and especially bug reports to the
# above email address.
# 
#-----

# Set to TRUE to send each figure to an .eps file (via postscript()) instead
# of drawing it on the active graphics device.
save_plots <- FALSE

#
# EPage 140
#

DF <- read.csv("../../Data/wine.csv", header = TRUE)

# Fail early with a clear message if a column used by this script is absent;
# otherwise DF$Vintage_Year would be NULL and the year conversion below would
# silently do nothing.
required_cols <- c("Vintage_Year", "Price", "Rating_Points")
missing_cols <- setdiff(required_cols, names(DF))
if (length(missing_cols) > 0) {
    stop("wine.csv is missing expected column(s): ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
}

# The conversion below treats Vintage_Year as a two-digit year.  The column is
# always addressed by name (never by position) so that the update lands on the
# same column the masks were computed from, whatever the column order of the
# file.
if (FALSE) {
    # Alternative (disabled): make 2000 the reference year, so that e.g.
    # 99 -> -1 while 1 stays 1.
    inds_gt50 <- DF$Vintage_Year > 50
    DF$Vintage_Year[inds_gt50] <- DF$Vintage_Year[inds_gt50] - 100
} else {
    # Convert to four-digit years, e.g. 1999, 2001.  Both masks are computed
    # before either update so the freshly converted 20xx values are not
    # matched by the ">= 80" test.  Values strictly between 5 and 80 are left
    # untouched.
    inds_le5 <- DF$Vintage_Year <= 5
    inds_ge80 <- DF$Vintage_Year >= 80
    DF$Vintage_Year[inds_le5] <- DF$Vintage_Year[inds_le5] + 2000
    DF$Vintage_Year[inds_ge80] <- DF$Vintage_Year[inds_ge80] + 1900
}

# Part 1:
#
# Is there a price difference between wines with a vintage of 1995 or earlier
# and more recent wines?  (Prices were quoted in 2005, so 1995 is the ten-year
# cut-off.)
#
older_than_ten_years <- DF$Vintage_Year <= 1995

# 0/1 dummy variable for the regression.  `%in% TRUE` maps a missing
# comparison result to 0 rather than NA.
DF$Older_Than_Ten <- as.numeric(older_than_ten_years %in% TRUE)

# Regress price, and then log price, on the age indicator:
price_fit <- lm(Price ~ Older_Than_Ten, data = DF)
summary(price_fit)

log_price_fit <- lm(log(Price) ~ Older_Than_Ten, data = DF)
summary(log_price_fit)

# Part 2:
#
# Correlation between rating and price for each combination of raw / logged
# variables (order: raw-raw, log rating, log price, log-log).
log_rating <- log(DF$Rating_Points)
log_price <- log(DF$Price)

corr1 <- cor(DF$Rating_Points, DF$Price)
corr2 <- cor(log_rating, DF$Price)
corr3 <- cor(DF$Rating_Points, log_price)
corr4 <- cor(log_rating, log_price)

print("Correlations between Rating_Points and Price (transformed or not)")
print(c(corr1, corr2, corr3, corr4))

# Side-by-side scatter plots (raw scale, then log-log), each with its
# least-squares line drawn on top.
if (save_plots) {
    postscript("../../WriteUp/Graphics/Chapter6/mow_plot_each_model.eps",
               onefile = FALSE, horizontal = FALSE)
}
par(mfrow = c(1, 2))

plot(DF$Rating_Points, DF$Price)
abline(lm(Price ~ Rating_Points, data = DF))

plot(log(DF$Rating_Points), log(DF$Price))
abline(lm(log(Price) ~ log(Rating_Points), data = DF))

par(mfrow = c(1, 1))  # back to a single-panel layout
if (save_plots) {
    dev.off()
}

# Part 3 (let's take the logarithmic transformation of both price and rating
# points):
#
# Store the logged variables as columns so the later models can refer to them
# by name.
DF$LPrice <- log(DF$Price)
DF$LRating_Points <- log(DF$Rating_Points)

m <- lm(LPrice ~ LRating_Points + Vintage_Year, data = DF)
summary(m)

# Residuals-versus-fitted plot for this model:
#
if (save_plots) {
    postscript("../../WriteUp/Graphics/Chapter6/mow_residual_plot.eps",
               onefile = FALSE, horizontal = FALSE)
}
plot(m, which = 1)
if (save_plots) {
    dev.off()
}

# Show rows 89, 85 and 64 of the data:
print(DF[c(89, 85, 64), ])

# Remove row 89 (the $1400 bottle) and fit the same model again:
DF2 <- DF[-89, ]

m2 <- lm(LPrice ~ LRating_Points + Vintage_Year, data = DF2)
summary(m2)

# Part 4 (look for interaction terms)
#
# NOTE(review): these models are fit on DF (all rows), not on the reduced
# data set DF2 -- confirm that this is intended.
m3 <- lm(
    LPrice ~ LRating_Points + Vintage_Year + LRating_Points:Vintage_Year,
    data = DF
)
summary(m3)

# Express the vintage as the number of years before 2005.  This is only a
# linear transformation of Vintage_Year, so it won't affect the significance
# of this predictor (but the predictor might be easier to understand in the
# model).
#
DF$Relative_Vintage_Year <- 2005 - DF$Vintage_Year

m4 <- lm(
    LPrice ~ LRating_Points + Relative_Vintage_Year + LRating_Points:Relative_Vintage_Year,
    data = DF
)
summary(m4)