# # Here we duplicate the results in Table 4,10 from the book. # The main purpose of this code is show how given a correlation # matrix and using the "sweep" operator we can perform in-sample # subset selection. # source('../../Data/data_loaders.R') source('../Chapter4/utils.R') DF = load_appendix_survival_data() DF$TIME = NULL # keep log(time) N = dim(DF)[1] p = dim(DF)[2] # This command duplicates the numbers in Table 4.11 given the raw data: # coefficients( lm( LTIME ~ LIV, data=DF ) ) # Extract the same numbers as we can get with the "lm" command but instead using the sweep operation: # X = model.matrix( LTIME ~ ., data=DF ) XTX = t( as.matrix( X ) ) %*% as.matrix( X ) XTy = t( as.matrix( X ) ) %*% as.matrix( DF$LTIME ) A = rbind( cbind( XTX, XTy ), c( t(XTy), sum( DF$LTIME^2 ) ) ) # see Appendix A.I.11 colnames(A) = c( 'Const', colnames(DF) ) rownames(A) = c( 'Const', colnames(DF) ) # Perform a sweep of every variable to get the full regression and s^2 value (needed for C_p): # AFull = lm_sweep(lm_sweep(lm_sweep(lm_sweep(lm_sweep(A, 1), 2), 3), 4), 5) k = 4 s2 = AFull[p+1, p+1]/(N-k-1) s = sqrt(s2) print( sprintf('Full regression: s2= %10.6f; s= %10.6f', s2, s) ) by_hand = AFull[1:p, p+1] by_lm = coefficients( lm( LTIME ~ ., data=DF ) ) print( rbind( by_hand, by_lm ) ) # # Following the notes on EPage 148 we use the sweep operator on each variable # one at at time extracting the information needed to compute R^2, C_p, and RMS # for different sized subsets # Syy = sum( ( DF$LTIME - mean(DF$LTIME) )^2 ) # Sweep out the constant term: # k = 0 A0 = lm_sweep( A, 1 ) RMS_0 = A0[p+1, p+1]/(N-k-1) # Sweep out the individual predictors one at at time: # k = 1 # subsets of size one # LIV: # var_index = 5 A0_LIV = lm_sweep( A0, var_index ) kp1 = k+1 beta_0 = A0_LIV[1, p+1] beta_1 = A0_LIV[var_index, p+1] RSS = A0_LIV[p+1, p+1] RMS = RSS/(N-k-1) Cp = (N-kp1) * ( RMS / s2 - 1 ) + kp1 R2 = 1 - RSS/Syy print(sprintf('LIV : R2= %6.3f; C_p= %8.3f; RMS= %6.3f; beta_0= %6.3f; beta_1= %6.3f', R2, Cp, RMS, beta_0, beta_1)) # ENZ # var_index = 4 A0_ENZ = lm_sweep( A0, var_index ) kp1 = k+1 beta_0 = A0_ENZ[1, p+1] beta_1 = A0_ENZ[var_index, p+1] RSS = A0_ENZ[p+1, p+1] RMS = RSS/(N-k-1) Cp = (N-kp1) * ( RMS / s2 - 1 ) + kp1 R2 = 1 - RSS/Syy print(sprintf('ENZ : R2= %6.3f; C_p= %8.3f; RMS= %6.3f; beta_0= %6.3f; beta_1= %6.3f', R2, Cp, RMS, beta_0, beta_1)) # PROG # var_index = 3 A0_PROG = lm_sweep( A0, var_index ) kp1 = k+1 beta_0 = A0_PROG[1, p+1] beta_1 = A0_PROG[var_index, p+1] RSS = A0_PROG[p+1, p+1] RMS = RSS/(N-k-1) Cp = (N-kp1) * ( RMS / s2 - 1 ) + kp1 R2 = 1 - RSS/Syy print(sprintf('PROG: R2= %6.3f; C_p= %8.3f; RMS= %6.3f; beta_0= %6.3f; beta_1= %6.3f', R2, Cp, RMS, beta_0, beta_1)) # CLOT # var_index = 2 A0_CLOT = lm_sweep( A0, var_index ) kp1 = k+1 beta_0 = A0_CLOT[1, p+1] beta_1 = A0_CLOT[var_index, p+1] RSS = A0_CLOT[p+1, p+1] RMS = RSS/(N-k-1) Cp = (N-kp1) * ( RMS / s2 - 1 ) + kp1 R2 = 1 - RSS/Syy print(sprintf('CLOT: R2= %6.3f; C_p= %8.3f; RMS= %6.3f; beta_0= %6.3f; beta_1= %6.3f', R2, Cp, RMS, beta_0, beta_1)) # After we pick LIV to include we then need to pick another predictor: # k = 2 # subsets of size two # ENZ # var_index = 4 A0_LIV_ENZ = lm_sweep( A0_LIV, var_index ) kp1 = k+1 beta_0 = A0_LIV_ENZ[1, p+1] beta_1 = A0_LIV_ENZ[5, p+1] # coefficient of LIV beta_2 = A0_LIV_ENZ[var_index, p+1] # coefficient of ENZ RSS = A0_LIV_ENZ[p+1, p+1] RMS = RSS/(N-k-1) Cp = (N-kp1) * ( RMS / s2 - 1 ) + kp1 R2 = 1 - RSS/Syy print(sprintf('LIV then ENZ : R2= %6.3f; C_p= %6.3f; RMS= %6.3f; beta_0= %6.3f; beta_1= %6.3f; beta_2= %6.3f', R2, Cp, RMS, beta_0, beta_1, beta_2)) # Compare the above result with the output of # coefficients( lm( LTIME ~ LIV + ENZ, data=DF ) ) # FROG # var_index = 3 A0_LIV_PROG = lm_sweep( A0_LIV, var_index ) kp1 = k+1 beta_0 = A0_LIV_PROG[1, p+1] beta_1 = A0_LIV_PROG[5, p+1] # coefficient of LIV beta_2 = A0_LIV_PROG[var_index, p+1] # coefficient of FROG RSS = A0_LIV_PROG[p+1, p+1] RMS = RSS/(N-k-1) Cp = (N-kp1) * ( RMS / s2 - 1 ) + kp1 R2 = 1 - RSS/Syy print(sprintf('LIV then PROG: R2= %6.3f; C_p= %6.3f; RMS= %6.3f; beta_0= %6.3f; beta_1= %6.3f; beta_2= %6.3f', R2, Cp, RMS, beta_0, beta_1, beta_2)) # Compare the above result with the output of # coefficients( lm( LTIME ~ LIV + PROG, data=DF ) )