% DYNAQPLUS_MAZE_SCRIPT - Compares the Dyna-Q+ and Dyna-Q algorithms on the blocking maze example found in Chapter 9
% 
% Written by:
% -- 
% John L. Weatherwax                2007-12-03
% 
% email: wax@alum.mit.edu
% 
% Please send comments and especially bug reports to the
% above email address.
% 
%-----

% Script body: runs dynaQplus_maze and dynaQ_maze for several planning-step
% counts, collects the cumulative reward returned by each run, plots the
% first part of every curve in one figure and saves the figure as a PNG.

close all; 
clc; 

% make sample_discrete.m (KPMstats toolbox) reachable on the path: 
addpath( genpath( '../../../FullBNT-1.0.4/KPMstats/' ) ); 

% the learning rate: 
alpha = 1e-1; 

% the probability of a random action (non-greedy): 
epsilon = 0.1; 

% the discount factor: 
gamma = 0.95;
%gamma = 1.0; 

% get our initial maze (the blocking maze).  
% NOTE(review): MZ, sideII and sideJJ are not used below; the call is kept
% because any side effects of mk_ex_9_2_mz are not visible from this script.
MZ = mk_ex_9_2_mz(0); [sideII,sideJJ] = size(MZ); 

% the beginning and terminal states (in matrix notation): 
s_start = [ 6, 4 ]; 
s_end   = [ 1, 9 ]; 

% the total number of timesteps to simulate (alternatives kept for reference): 
%MAX_N_STEPS=30; 
%MAX_N_STEPS=1e3;
%MAX_N_STEPS=1e4;
MAX_N_STEPS=1e5;
%MAX_N_STEPS=1e6;
%MAX_N_STEPS=10e6;

% the numbers of planning steps to compare (one run of each algorithm per entry): 
nPSV = [ 0, 5, 50 ]; 
nCfg = length(nPSV); 

% a factor relating how important revisiting old states is, relative to 
% the past received reward coming from these state/action pairs ... 
%kappa = 0.02; 
kappa = 2/sqrt(MAX_N_STEPS); 

% rows 1..nCfg hold the dynaQplus curves, rows nCfg+1..2*nCfg the dynaQ curves.
% The legend cell array is created fresh here so that a stale "fhl" left in
% the workspace by an earlier run cannot leak into the legend.
allCR = zeros(2*nCfg,MAX_N_STEPS); 
fhl   = cell(1,2*nCfg); 
for npsi=1:nCfg,
  nPlanningSteps = nPSV(npsi);
  
  % assumes cr has MAX_N_STEPS+1 entries (first one dropped) -- TODO confirm 
  [Q,ets,cr] = dynaQplus_maze(alpha,epsilon,gamma,kappa,nPlanningSteps,@mk_ex_9_2_mz,s_start,s_end,MAX_N_STEPS);
  allCR(npsi,:) = cr(2:end); 
  fhl{npsi} = sprintf('dynaQplus: %d planning steps',nPlanningSteps); 
  
  [Q,ets,dum1,dum2,dum3,cr] = dynaQ_maze(alpha,epsilon,gamma,nPlanningSteps,@mk_ex_9_2_mz,s_start,s_end,MAX_N_STEPS);
  allCR(nCfg+npsi,:) = cr(2:end); 
  fhl{nCfg+npsi} = sprintf('dynaQ: %d planning steps',nPlanningSteps); 
end

% plot (at most) the first 3000 timesteps of every curve; the window is
% clamped so that small values of MAX_N_STEPS do not index past the end of
% allCR.  Columns 2..nPlot+1 are shown, as in the original 2:3001 window. 
nPlot = min( 3000, MAX_N_STEPS-1 ); 

% pass the data column-wise so each column is unambiguously one curve 
figure; fhs=plot( (1:nPlot)', allCR(:,2:(nPlot+1))', '-' ); 
title( 'cummulative reward' ); grid on;
xlabel('timestep index'); ylabel('cum. reward'); drawnow; 
legend( fhs, fhl, 'Location', 'NorthWest' );

% output file name kept exactly as before so existing references still match 
fn = 'blocking_dynaQplus_vs_dyanQ_cum_reward'; saveas( gcf, fn, 'png' ); 

% flush compiled functions (and with them any persistent variables they hold) 
clear functions;
%close all; 
return;