# # Written by: # -- # John L. Weatherwax 2009-04-21 # # email: wax@alum.mit.edu # # Please send comments and especially bug reports to the # above email address. # #----- library(alr3) data(MWwords) attach(MWwords) # transform ALL the input data (to form the data we will regress using): LHamilton = log( Hamilton ) LHamiltonRank = -log( HamiltonRank ) LMadison = log( Madison ) LMadisonRank = -log( MadisonRank ) LJay = log( Jay ) LJayRank = -log( JayRank ) # 2.10.1: # inds = HamiltonRank <= 50 # <- select the list of words we want to study # plot everything: txt_ranks <- c( LHamiltonRank[inds], LMadisonRank[inds], LJayRank[inds] ) txt_freq <- c( LHamilton[inds], LMadison[inds], LJay[inds] ) postscript("../../WriteUp/Graphics/Chapter2/prob_10_H_50_words.eps", onefile=FALSE, horizontal=FALSE) plot( txt_ranks, txt_freq, xlab="log(word rank)", ylab="log(word frequency)" ) dev.off() # fit a linear model: m_50 = lm( txt_freq ~ txt_ranks ) postscript("../../WriteUp/Graphics/Chapter2/prob_10_H_50_words.eps", onefile=FALSE, horizontal=FALSE) plot( txt_ranks, txt_freq, xlab="log(word rank)", ylab="log(word frequency)" ) abline( m_50 ) dev.off() # display the summary statistics for this linear model: summary(m_50) anova(m_50) WWX check this # 2.10.2: # b_hat = summary(m_50)$coefficients[2,1] b_hat_se = summary(m_50)$coefficients[2,2] t = ( b_hat - 1 )/b_hat_se n = lenght(m_50$residuals) 2 * pt( -abs(t), n-2 ) # 2.10.3 (for the top 75 and 100 words): # inds = HamiltonRank <= 75 # <- select the list of words we want to study # plot everything: txt_ranks <- c( LHamiltonRank[inds], LMadisonRank[inds], LJayRank[inds] ) txt_freq <- c( LHamilton[inds], LMadison[inds], LJay[inds] ) postscript("../../WriteUp/Graphics/Chapter2/prob_10_H_75_words.eps", onefile=FALSE, horizontal=FALSE) plot( txt_ranks, txt_freq, xlab="log(word rank)", ylab="log(word frequency)" ) dev.off() # fit a linear model: m_75 = lm( txt_freq ~ txt_ranks ) postscript("../../WriteUp/Graphics/Chapter2/prob_10_H_75_words.eps", onefile=FALSE, horizontal=FALSE) plot( txt_ranks, txt_freq, xlab="log(word rank)", ylab="log(word frequency)" ) abline( m_75 ) dev.off() # display the summary statistics for this linear model: summary(m_75) anova(m_75) inds = HamiltonRank <= 100 # <- select the list of words we want to study # plot everything: txt_ranks <- c( LHamiltonRank[inds], LMadisonRank[inds], LJayRank[inds] ) txt_freq <- c( LHamilton[inds], LMadison[inds], LJay[inds] ) postscript("../../WriteUp/Graphics/Chapter2/prob_10_H_100_words.eps", onefile=FALSE, horizontal=FALSE) plot( txt_ranks, txt_freq, xlab="log(word rank)", ylab="log(word frequency)" ) dev.off() # fit a linear model: m_100 = lm( txt_freq ~ txt_ranks ) postscript("../../WriteUp/Graphics/Chapter2/prob_10_H_100_words.eps", onefile=FALSE, horizontal=FALSE) plot( txt_ranks, txt_freq, xlab="log(word rank)", ylab="log(word frequency)" ) abline( m_100 ) dev.off() # display the summary statistics for this linear model: summary(m_100) #anova(m_100) # take ALL terms: # txt_ranks <- c( LHamiltonRank, LMadisonRank, LJayRank ) txt_freq <- c( LHamilton, LMadison, LJay ) # fit a linear model: m_all = lm( txt_freq ~ txt_ranks ) postscript("../../WriteUp/Graphics/Chapter2/prob_10_all_words.eps", onefile=FALSE, horizontal=FALSE) plot( txt_ranks, txt_freq, xlab="log(word rank)", ylab="log(word frequency)" ) abline( m_all ) dev.off() # display the summary statistics for this linear model: summary(m_all) #anova(m_all)