findMostCorrelatedPredictor <- function( r, D ){

  N = dim(D)[1]
  p = dim(D)[2]

  max_abs_corr = 0.0
  best_j       = 0 
  for( j in 1:p ){
    c = abs( cor( r, D[,j] ) )
    #print(sprintf("j= %10d; c= %10.6f",j,c))
    if( c>max_abs_corr ){
      max_abs_corr = c
      best_j       = j 
    }
  }
  return(best_j)
}


IFSR <- function( DTrain, DTest, max_number_of_iterations = 200 ){
  #
  # Implements "Incremental Forward Stagewise Regression" using the algorithm discussed on
  # page 86 of the book:
  #
  # Elements of Statistical Learning by
  # Hastie, Tibshirani, and Friedman.
  #
  # Input:
  #   DTrain  = training data frame where the last column is the response variable
  #             of dimension [ N, p+1 ]
  #   DTest   = testing data frame on which we will return all possible prediction
  #             for the models derived from the training data above
  #   max_number_of_iterations = 
  #
  # Output:
  #   res[[1]] = beta_hat estimates coefficients for a linear model of dimension
  #              [ p, number_of_timesteps ]
  #              The first column corresponds to itration #0 (and all betas values are zero)
  #              The second column corresponds to iteration #1 (the first update of betas)
  #              ...
  #              The last column corresponds to the final iteration
  #                  and corresponds to the case when the residuals are uncorrelated with all
  #                  the predictors.  
  #   res[[2]] = y_hat column representing the predictions made by all models on the TESTING data
  #
  # Auxilary:
  # 
  #   epsilon = the increment of the update of the beta values:
  #             beta_{t+1} = beta_t + epsilon * sign( <x_j,r> )
  #             
  # Written by:
  # -- 
  # John L. Weatherwax                2010-03-11
  # 
  # email: wax@alum.mit.edu
  # 
  # Please send comments and especially bug reports to the
  # above email address.
  #
  #-----
  
  demeanResponse            = TRUE

  
  # estimate the least squares solution (and the number of iterations needed to get there):
  #
  library(MASS)
  XM       = as.matrix( DTrain )
  betaLS   = ginv( t(XM) %*% XM ) %*% t(XM) %*% r
  lOneNorm = sum( abs(betaLS) )
  epsilon  = lOneNorm/( 1.5 * max_number_of_iterations ) 

  correlation_min_threshold = epsilon 
  

  nTrain       = dim( DTrain )[1]
  p            = dim( DTrain )[2] - 1   # the last column is the response
  responseName = names( DTrain )[p+1]   # the the name of the response 

  nTest        = dim( DTest )[1]

  # Get the response and delete it from the data frame 
  response               = DTrain[,p+1] 
  DTrain[[responseName]] = NULL

  # We begin by standardizing the predictors and demeaning the response
  #
  # Note that one can get the centering value (the mean) with the command attr(DTrain,'scaled:center')
  #                   and the scaling value (the sample standard deviation) with the command attr(DTrain,'scaled:scale')
  # 
  DTrain               = scale( DTrain )
  r                    = response
  if( demeanResponse ){
    responseMean       = mean( response )
    r                  = r - responseMean # this is the initial residual
  }

    
  # extract the centering and scaling information:
  #
  means = attr(DTrain,"scaled:center")
  stds  = attr(DTrain,"scaled:scale")

  # apply the computed scaling based on the training data to the testing data:
  #
  responseTest          = DTest[,p+1]
  if( demeanResponse ){
    responseTest        = responseTest - responseMean
  }
  DTest[[responseName]] = NULL 
  DTest                 = t( apply( DTest, 1, '-', means ) ) 
  DTest                 = t( apply( DTest, 1, '/', stds ) )

  betaHat                  = as.matrix( mat.or.vec(p,1), p, 1 ) # initialize beta to zeros
  iteration_number         = 1
  max_residual_correlation = 1.e6
  while( max_residual_correlation > correlation_min_threshold && iteration_number < max_number_of_iterations ){
    #if( iteration_number %% 10 ==0 ) print(sprintf("iter_number= %10d; max_corr= %10.6f",iteration_number,max_residual_correlation))
    
    # Find the predictor that is the most correlated with the response:
    #
    j_index                  = findMostCorrelatedPredictor( r, DTrain )
    x_j                      = DTrain[,j_index]
    max_residual_correlation = abs( cor( x_j, r ) )

    # compute delta_j:
    #
    delta_j = epsilon * sign( x_j %*% r )
    
    # update beta_j (the jth component of beta)
    #
    newBeta          = betaHat[,iteration_number]
    newBeta[j_index] = newBeta[j_index] + delta_j 
    betaHat          = cbind(betaHat, newBeta)

    # update residual:
    #
    r = r - delta_j * x_j 

    iteration_number = iteration_number + 1 
  }
    
  # compute the predictions of each model on the test set:
  pTest = as.matrix( DTest[,1:p] ) %*% betaHat                # create a matrix of these results of size [ nTest, N_iterations ]
  if( demeanResponse ){ 
    pTest = pTest + responseMean                              # add back in the mean if we want the response in physical units (not mean adjusted)
  }
  
  res = list(betaHat,pTest)
  return(res)
}