# # Duplicates Table 3.1 and 3.2 from the book # # Written by: # -- # John L. Weatherwax 2009-04-21 # # email: wax@alum.mit.edu # # Please send comments and especially bug reports to the # above email address. # #----- ## Note: to get numbers that match the book we first scale all the predictors AND then fit our model: ## source('load_prostate_data.R') res = load_prostate_data(globalScale=TRUE, trainingScale=FALSE, responseScale=FALSE) XTraining = res[[1]] XTesting = res[[2]] nrow = dim( XTraining )[1] p = dim( XTraining )[2] - 1 # the last column is the response D = XTraining[,1:p] # get the predictor data # This gives the raw data for Table 3.1 from the book: # print(round(cor(D), 3)) library(xtable) xtable( cor(D), caption="Duplication of the values from Table 3.1 from the book", digits=3 ) # # Duplicate Table 3.2 from the book # # Append a column of ones: # Dp = cbind( matrix(1,nrow,1), as.matrix( D ) ) lpsa = XTraining[,p+1] library(MASS) betaHat = ginv( t(Dp) %*% Dp ) %*% t(Dp) %*% as.matrix(lpsa) print('Table 3.2: first column: beta estimates') print(round(betaHat, 2)) # make predictions based on these estimated beta coefficients: # yhat = Dp %*% betaHat # estimate the variance: # sigmaHat = sum( ( lpsa - yhat )^2 ) / ( nrow - p - 1 ) # calculate the covariance of betaHat: # covarBetaHat = sigmaHat * ginv( t(Dp) %*% Dp ) # calulate the standard deviations of betahat: # stdBetaHat = sqrt(diag(covarBetaHat)) print('Table 3.2: second column: beta standard errors') print(round(as.matrix(stdBetaHat), 2)) # compute the z-scores: # z = betaHat / stdBetaHat print('Table 3.2: third column: beta z-scores') print(round(z, 2)) # reproduce/verify the above above results using R's "lm" function: # Db = cbind( D, lpsa ) Df = data.frame( Db ) m0 = lm( lpsa ~ lcavol+lweight+age+lbph+svi+lcp+gleason+pgg45, data=Df ) print('using the R lm function') print( summary(m0), digits=2 ) # display the results we get : F = data.frame( Term=names(coefficients(m0)), Coefficients=betaHat, Std_Error=stdBetaHat, Z_Score=z ) xtable( F, caption="Duplicated results for the books Table 3.2", digits=2 ) # Run this full linear model on the Testing data so that we can verify the comments at the end of section 3.2.1: Example: Prostate Cancer # pdt = predict( m0, newdata=XTesting, interval="prediction" )[,1] # get predictions on the test set lpsaTest = XTesting$lpsa NTest = length(lpsaTest) mErr = mean( (lpsaTest - pdt)^2 ) sErr = sqrt( var( (lpsaTest - pdt)^2 )/NTest ) print(sprintf('test data mean prediction error= %.3f; stdErr(mean)= %.3f', mErr, sErr)) mConstant = mean(Df$lpsa) ## the prediction we would make assuming a constant model mErrConstant = mean( (XTesting$lpsa - mConstant)^2 ) print(sprintf('test data mean prediction error= %.3f using the constant= %.3f', mErrConstant, mConstant))