% Implements a soft policy exploration Monte Carlo estimation algorithm
% to compute the optimal action-value function for the racetrack example.
%
% Mods to try:
% 0) Implement the recursive averaging update of Q
%
%      Q_{k+1} <- Q_k + (1/(k+1)) ( r_{k+1} - Q_k )
%
%    Result:
%
% 1) Add a fixed step size learning algorithm,
%
%      Q_{k+1} <- Q_k + \alpha ( r_{k+1} - Q_k )
%
%    which should be better for problems where the action-value function may
%    change over time, which is the case when we are performing value iteration.
%
%    Result:
%
% 2) Set Q initially very large to encourage exploration.  A value ~ +5
%    should be large enough.
%
%    Results:
%
%    With the geometric update (involving alpha):
%
%    With the recursive update:
%
% Written by:
% --
% John L. Weatherwax 2007-12-07
%
% email: wax@alum.mit.edu
%
% Please send comments and especially bug reports to the
% above email address.
%
%-----

close all; clc;

% the fixed step size learning parameter:
alpha = 0.1;

% the number of episodes to run -- a numerical approximation of +Inf
% (the last assignment below is the one used):
N_EPISODES=10;
N_EPISODES=100;
N_EPISODES=2e3;   % 17 seconds
N_EPISODES=5e5;   % 1 hour
N_EPISODES=1.2e7; % <- should take 24 hours ...

% needed for sample_discrete.m:
addpath( genpath( '../../../FullBNT-1.0.4/KPMstats/' ) );
addpath( genpath( '/home/wax/Programming/Matlab/MathPath' ) );
try, BayesianNetworks; catch, end

rand('seed',0); randn('seed',0);

% generate the race track and initialize some sizes:
RT = mk_rt(1);
[maxNPii,maxNPjj] = size(RT);

% the dimensions of the velocity state:
maxNVii = 6; maxNVjj = 6;

% the dimensions of the possible actions:
maxNAii = 3; maxNAjj = 3;

% the maximal state/action dimensions:
%
% a state consists of [pii,pjj,vii,vjj] with
% pii \in 1:maxNPii, pjj \in 1:maxNPjj, vii \in 0:5, vjj \in 0:5
maxNStates  = prod([maxNPii,maxNPjj,maxNVii,maxNVjj]); % ~ 9216 states!
maxNActions = prod([maxNAii,maxNAjj]);

% storage for the objects we will calculate:
Q = zeros(maxNStates,maxNActions);    % the initial action-value function
%Q = +5*ones(maxNStates,maxNActions); % the initial action-value function taken to encourage exploration
%Q = the_valid_spots;
firstSARewSum = zeros(maxNStates,maxNActions);
firstSARewCnt = zeros(maxNStates,maxNActions);
%timePerPlay = zeros(1,N_EPISODES);

% enumerate the possible starting locations (the last row of the track):
posStarts = find(RT(end,:)); nPosStarts = length(posStarts);

% initialize our policy:
%pol_pi = zeros(maxNStates,maxNActions); % the storage for our initial policy
pol_pi = init_unif_policy(RT, maxNStates,maxNActions,maxNPii,maxNPjj,maxNVii,maxNVjj,maxNAii,maxNAjj);

tic
for ei=1:N_EPISODES,
  % (A) generate an episode following the policy pol_pi:
  %
  [stateseen,act_taken,rew] = gen_rt_episode(ei,pol_pi, RT,posStarts,nPosStarts,maxNStates,maxNActions,maxNPii,maxNPjj,maxNVii,maxNVjj,maxNAii,maxNAjj);
  % (B) estimate the action-value function "Q" via Monte Carlo methods:
  %
  [Q,firstSARewCnt,firstSARewSum] = mcEstQ(stateseen,act_taken,rew, firstSARewCnt,firstSARewSum,Q, maxNPii,maxNPjj,maxNVii,maxNVjj);
  % (C) update our policy:
  %
  [pol_pi] = rt_pol_mod(stateseen,Q, pol_pi, maxNPii,maxNPjj,maxNVii,maxNVjj,maxNAii,maxNAjj);
end % end the number-of-episodes loop
toc
%fprintf('timePerPlay = %f\n',mean(timePerPlay));
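% Optional diagnostic (a minimal sketch): report how much of the state-action
% space the soft policy has actually visited.  This assumes firstSARewCnt holds
% one first-visit count per (state,action) pair, as its use in the loop above
% suggests; that detail is an assumption about mcEstQ, not something verified here.
nVisited = sum( firstSARewCnt(:) > 0 );
fprintf( 'visited %d of %d state-action pairs (%.1f%%)\n', ...
         nVisited, maxNStates*maxNActions, 100*nVisited/(maxNStates*maxNActions) );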
% plot the learned action-value function Q^{*}
% ... skipped for now

% plot the learned state-value function V^{*} (greedy from Q) as a function of position ONLY:
%
% This means that we average out
%   -- the action variables in Q
%   -- the velocity state variables (vii,vjj)
%
% We assume that elements equal to 0.0 have NOT been updated and correspond to
% INACCESSIBLE states, so they are excluded from the averages below in order to
% look at just the POSITION part of the state-value function.
%
Q( find(Q(:)==0.0) ) = NaN;   % <- replace zeros with NaN's
V = nanmean( Q, 2 );          % <- average out the action variables
V( find(isnan(V(:))) ) = 0.0; % <- replace the NaN's back with zeros
V = reshape( V, [maxNPii,maxNPjj,maxNVii,maxNVjj] );
% <- do the same for the velocities ...
V( find(V(:)==0.0) ) = NaN;
V = nanmean( V, 4 );
V = nanmean( V, 3 );
V( find(isnan(V(:))) ) = 0.0;

figure; imagesc( V ); colorbar;
xlabel( 'jj location' ); ylabel( 'ii location' );
drawnow;
saveas( gcf, sprintf('avg_state_value_fn_%d',N_EPISODES), 'png' );

return;
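% A minimal sketch of the two incremental Q updates listed under "Mods to try"
% at the top of this file: the recursive (sample-average) update and the fixed
% step-size update.  It sits after the "return" above so it is never reached by
% the script; the variables q_avg, q_alpha, kk, rr, and example_returns are
% illustrative only and are not the names used inside mcEstQ.
q_avg = 0; q_alpha = 0; kk = 0;
example_returns = [ -10, -12, -9, -11 ];  % made-up returns for a single (s,a) pair
for rr = example_returns
  q_avg   = q_avg   + ( 1/(kk+1) ) * ( rr - q_avg   ); % 0) recursive averaging update
  q_alpha = q_alpha + alpha        * ( rr - q_alpha ); % 1) fixed step-size update (alpha defined above)
  kk = kk + 1;
end
fprintf( 'sample-average estimate = %f; fixed-alpha estimate = %f\n', q_avg, q_alpha );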