#
# Here we duplicate the results in Table 4,10 from the book.
# The main purpose of this code is show how given a correlation
# matrix and using the "sweep" operator we can perform in-sample
# subset selection.
#

source('../../Data/data_loaders.R')
source('../Chapter4/utils.R')

DF = load_appendix_survival_data()
DF$TIME = NULL # keep log(time)

N = dim(DF)[1]
p = dim(DF)[2]

# This command duplicates the numbers in Table 4.11 given the raw data:
#
coefficients( lm( LTIME ~ LIV, data=DF ) )

# Extract the same numbers as we can get with the "lm" command but instead using the sweep operation:
#
X = model.matrix( LTIME ~ ., data=DF )
XTX = t( as.matrix( X ) ) %*% as.matrix( X )
XTy = t( as.matrix( X ) ) %*% as.matrix( DF$LTIME )
A = rbind( cbind( XTX, XTy ), c( t(XTy), sum( DF$LTIME^2 ) ) ) # see Appendix A.I.11
colnames(A) = c( 'Const', colnames(DF) )
rownames(A) = c( 'Const', colnames(DF) )

# Perform a sweep of every variable to get the full regression and s^2 value (needed for C_p):
#
AFull = lm_sweep(lm_sweep(lm_sweep(lm_sweep(lm_sweep(A, 1), 2), 3), 4), 5)

k = 4
s2 = AFull[p+1, p+1]/(N-k-1)
s = sqrt(s2)
print( sprintf('Full regression: s2= %10.6f; s= %10.6f', s2, s) )

by_hand = AFull[1:p, p+1]
by_lm = coefficients( lm( LTIME ~ ., data=DF ) )
print( rbind( by_hand, by_lm ) )

#
# Following the notes on EPage 148 we use the sweep operator on each variable
# one at at time extracting the information needed to compute R^2, C_p, and RMS
# for different sized subsets
#
Syy = sum( ( DF$LTIME - mean(DF$LTIME) )^2 )

# Sweep out the constant term:
#
k = 0
A0 = lm_sweep( A, 1 )
RMS_0 = A0[p+1, p+1]/(N-k-1)


# Sweep out the individual predictors one at at time:
#
k = 1 # subsets of size one

# LIV:
#
var_index = 5
A0_LIV = lm_sweep( A0, var_index )

kp1 = k+1
beta_0 = A0_LIV[1, p+1]
beta_1 = A0_LIV[var_index, p+1]
RSS = A0_LIV[p+1, p+1]
RMS = RSS/(N-k-1)
Cp = (N-kp1) * ( RMS / s2 - 1 ) + kp1
R2 = 1 - RSS/Syy
print(sprintf('LIV : R2= %6.3f; C_p= %8.3f; RMS= %6.3f; beta_0= %6.3f; beta_1= %6.3f', R2, Cp, RMS, beta_0, beta_1))

# ENZ
#
var_index = 4
A0_ENZ = lm_sweep( A0, var_index )

kp1 = k+1
beta_0 = A0_ENZ[1, p+1]
beta_1 = A0_ENZ[var_index, p+1]
RSS = A0_ENZ[p+1, p+1]
RMS = RSS/(N-k-1)
Cp = (N-kp1) * ( RMS / s2 - 1 ) + kp1
R2 = 1 - RSS/Syy
print(sprintf('ENZ : R2= %6.3f; C_p= %8.3f; RMS= %6.3f; beta_0= %6.3f; beta_1= %6.3f', R2, Cp, RMS, beta_0, beta_1))

# PROG
#
var_index = 3
A0_PROG = lm_sweep( A0, var_index )

kp1 = k+1
beta_0 = A0_PROG[1, p+1]
beta_1 = A0_PROG[var_index, p+1]
RSS = A0_PROG[p+1, p+1]
RMS = RSS/(N-k-1)
Cp = (N-kp1) * ( RMS / s2 - 1 ) + kp1
R2 = 1 - RSS/Syy
print(sprintf('PROG: R2= %6.3f; C_p= %8.3f; RMS= %6.3f; beta_0= %6.3f; beta_1= %6.3f', R2, Cp, RMS, beta_0, beta_1)) 

# CLOT
#
var_index = 2
A0_CLOT = lm_sweep( A0, var_index )

kp1 = k+1
beta_0 = A0_CLOT[1, p+1]
beta_1 = A0_CLOT[var_index, p+1]
RSS = A0_CLOT[p+1, p+1]
RMS = RSS/(N-k-1)
Cp = (N-kp1) * ( RMS / s2 - 1 ) + kp1
R2 = 1 - RSS/Syy
print(sprintf('CLOT: R2= %6.3f; C_p= %8.3f; RMS= %6.3f; beta_0= %6.3f; beta_1= %6.3f', R2, Cp, RMS, beta_0, beta_1)) 


# After we pick LIV to include we then need to pick another predictor:
#
k = 2 # subsets of size two

# ENZ
#
var_index = 4
A0_LIV_ENZ = lm_sweep( A0_LIV, var_index )

kp1 = k+1
beta_0 = A0_LIV_ENZ[1, p+1]
beta_1 = A0_LIV_ENZ[5, p+1] # coefficient of LIV
beta_2 = A0_LIV_ENZ[var_index, p+1] # coefficient of ENZ
RSS = A0_LIV_ENZ[p+1, p+1]
RMS = RSS/(N-k-1)
Cp = (N-kp1) * ( RMS / s2 - 1 ) + kp1
R2 = 1 - RSS/Syy
print(sprintf('LIV then ENZ : R2= %6.3f; C_p= %6.3f; RMS= %6.3f; beta_0= %6.3f; beta_1= %6.3f; beta_2= %6.3f', R2, Cp, RMS, beta_0, beta_1, beta_2))

# Compare the above result with the output of
#
coefficients( lm( LTIME ~ LIV + ENZ, data=DF ) )

# FROG
#
var_index = 3
A0_LIV_PROG = lm_sweep( A0_LIV, var_index )

kp1 = k+1
beta_0 = A0_LIV_PROG[1, p+1]
beta_1 = A0_LIV_PROG[5, p+1] # coefficient of LIV
beta_2 = A0_LIV_PROG[var_index, p+1] # coefficient of FROG
RSS = A0_LIV_PROG[p+1, p+1]
RMS = RSS/(N-k-1)
Cp = (N-kp1) * ( RMS / s2 - 1 ) + kp1
R2 = 1 - RSS/Syy
print(sprintf('LIV then PROG: R2= %6.3f; C_p= %6.3f; RMS= %6.3f; beta_0= %6.3f; beta_1= %6.3f; beta_2= %6.3f', R2, Cp, RMS, beta_0, beta_1, beta_2))

# Compare the above result with the output of
#
coefficients( lm( LTIME ~ LIV + PROG, data=DF ) )