# 
# Written by:
# -- 
# John L. Weatherwax                2009-04-21
# 
# email: wax@alum.mit.edu
# 
# Please send comments and especially bug reports to the
# above email address.
# 
#-----

library(alr3)
data(MWwords)
attach(MWwords)

# transform ALL the input data (to form the data we will regress using):
LHamilton     = log( Hamilton )
LHamiltonRank = -log( HamiltonRank )
LMadison      = log( Madison )
LMadisonRank  = -log( MadisonRank )
LJay          = log( Jay )
LJayRank      = -log( JayRank ) 

# 2.10.1:
# 
inds = HamiltonRank <= 50            # <- select the list of words we want to study 

# plot everything:
txt_ranks <- c( LHamiltonRank[inds], LMadisonRank[inds], LJayRank[inds] )
txt_freq  <- c( LHamilton[inds], LMadison[inds], LJay[inds] )

postscript("../../WriteUp/Graphics/Chapter2/prob_10_H_50_words.eps", onefile=FALSE, horizontal=FALSE)
plot( txt_ranks, txt_freq, xlab="log(word rank)", ylab="log(word frequency)" )
dev.off()

# fit a linear model: 
m_50 = lm( txt_freq ~ txt_ranks ) 

postscript("../../WriteUp/Graphics/Chapter2/prob_10_H_50_words.eps", onefile=FALSE, horizontal=FALSE)
plot( txt_ranks, txt_freq, xlab="log(word rank)", ylab="log(word frequency)" )
abline( m_50 ) 
dev.off()

# display the summary statistics for this linear model: 
summary(m_50)
anova(m_50)

WWX check this

# 2.10.2:
# 
b_hat    = summary(m_50)$coefficients[2,1]
b_hat_se = summary(m_50)$coefficients[2,2]
t = ( b_hat - 1 )/b_hat_se
n = lenght(m_50$residuals)
2 * pt( -abs(t), n-2 )


# 2.10.3 (for the top 75 and 100 words):
# 
inds = HamiltonRank <= 75            # <- select the list of words we want to study 

# plot everything:
txt_ranks <- c( LHamiltonRank[inds], LMadisonRank[inds], LJayRank[inds] )
txt_freq  <- c( LHamilton[inds], LMadison[inds], LJay[inds] )

postscript("../../WriteUp/Graphics/Chapter2/prob_10_H_75_words.eps", onefile=FALSE, horizontal=FALSE)
plot( txt_ranks, txt_freq, xlab="log(word rank)", ylab="log(word frequency)" )
dev.off()

# fit a linear model: 
m_75 = lm( txt_freq ~ txt_ranks ) 

postscript("../../WriteUp/Graphics/Chapter2/prob_10_H_75_words.eps", onefile=FALSE, horizontal=FALSE)
plot( txt_ranks, txt_freq, xlab="log(word rank)", ylab="log(word frequency)" )
abline( m_75 ) 
dev.off()

# display the summary statistics for this linear model: 
summary(m_75)
anova(m_75)


inds = HamiltonRank <= 100            # <- select the list of words we want to study 

# plot everything:
txt_ranks <- c( LHamiltonRank[inds], LMadisonRank[inds], LJayRank[inds] )
txt_freq  <- c( LHamilton[inds], LMadison[inds], LJay[inds] )

postscript("../../WriteUp/Graphics/Chapter2/prob_10_H_100_words.eps", onefile=FALSE, horizontal=FALSE)
plot( txt_ranks, txt_freq, xlab="log(word rank)", ylab="log(word frequency)" )
dev.off()

# fit a linear model: 
m_100 = lm( txt_freq ~ txt_ranks ) 

postscript("../../WriteUp/Graphics/Chapter2/prob_10_H_100_words.eps", onefile=FALSE, horizontal=FALSE)
plot( txt_ranks, txt_freq, xlab="log(word rank)", ylab="log(word frequency)" )
abline( m_100 ) 
dev.off()

# display the summary statistics for this linear model: 
summary(m_100)
#anova(m_100)


# take ALL terms:
# 

txt_ranks <- c( LHamiltonRank, LMadisonRank, LJayRank )
txt_freq  <- c( LHamilton, LMadison, LJay )

# fit a linear model: 
m_all = lm( txt_freq ~ txt_ranks ) 

postscript("../../WriteUp/Graphics/Chapter2/prob_10_all_words.eps", onefile=FALSE, horizontal=FALSE)
plot( txt_ranks, txt_freq, xlab="log(word rank)", ylab="log(word frequency)" )
abline( m_all ) 
dev.off()

# display the summary statistics for this linear model: 
summary(m_all)
#anova(m_all)