function [Q_sarsa,Q_qlearn,rpt_sarsa,rpt_qlearn,n_sarsa,n_qlearn] = learn_cw(alpha,CF,s_start,s_end,MAX_N_EPISODES)
% LEARN_CW - Performs on-policy sarsa and (off-policy) Q-learning to learn a policy for the
%            cliff walking problem example.
%
% Written by:
% --
% John L. Weatherwax                2007-12-03
%
% email: wax@alum.mit.edu
%
% Please send comments and especially bug reports to the
% above email address.
%
%-----

PLOT_STEPS = 0;

gamma = 1;     % <- treat this as an undiscounted task
epsilon = 0.1; % <- for our epsilon greedy policy

% the number of states:
[sideII,sideJJ] = size(CF);
nStates = sideII*sideJJ;

% on each grid cell we can choose from among this many actions (except on edges where this number is reduced):
nActions = 4;

% arrays to hold the values of the action-value function:
Q_sarsa  = zeros(nStates,nActions);
Q_qlearn = zeros(nStates,nActions);
rpt_sarsa  = zeros(1,MAX_N_EPISODES);
rpt_qlearn = zeros(1,MAX_N_EPISODES);
n_sarsa  = zeros(nStates,nActions); % <- let's store the number of times we are in this state and take this action
n_qlearn = zeros(nStates,nActions);

% keep track of how many timesteps we take per episode and per method (episode time steps):
ets = zeros(MAX_N_EPISODES,2);

for ei=1:MAX_N_EPISODES,

  if( PLOT_STEPS )
    close all; f_plot_steps=figure;
    subplot(2,1,1); imagesc( CF ); colorbar; hold on;
    plot( s_start(2), s_start(1), 'x', 'MarkerSize', 10, 'MarkerFaceColor', 'k' );
    plot( s_end(2),   s_end(1),   'o', 'MarkerSize', 10, 'MarkerFaceColor', 'k' );
    subplot(2,1,2); imagesc( CF ); colorbar; hold on;
    plot( s_start(2), s_start(1), 'x', 'MarkerSize', 10, 'MarkerFaceColor', 'k' );
    plot( s_end(2),   s_end(1),   'o', 'MarkerSize', 10, 'MarkerFaceColor', 'k' );
  end

  tic;
  %if( ei==1 )
  %  fprintf('working on episode %d...\n',ei);
  %else
  %  fprintf('working on episode %d (ptt=%10.6f secs)...\n',ei, toc); tic;
  %end

  % set the control variables that tell us when sarsa and when q-learning have finished this episode:
  sarsa_finished=0; qlearning_finished=0;

  % initialize the starting state:
  st_sarsa  = s_start; sti_sarsa  = sub2ind( [sideII,sideJJ], st_sarsa(1),  st_sarsa(2)  );
  st_qlearn = s_start; sti_qlearn = sub2ind( [sideII,sideJJ], st_qlearn(1), st_qlearn(2) );

  % pick an initial action using an epsilon greedy policy derived from Q:
  %[dum,at_sarsa] = max(Q_sarsa(sti_sarsa,:)); % at \in [1,2,3,4]=[up,down,right,left]
  if( rand<epsilon )       % explore ... with a random action
    tmp=randperm(nActions); at_sarsa=tmp(1);
  else                     % ... otherwise take the greedy action
    [dum,at_sarsa] = max(Q_sarsa(sti_sarsa,:));
  end
  % similarly for q-learning:
  if( rand<epsilon )
    tmp=randperm(nActions); at_qlearn=tmp(1);
  else
    [dum,at_qlearn] = max(Q_qlearn(sti_qlearn,:));
  end

  % begin the episode ... keep stepping until both methods have reached the goal state:
  while( ~( sarsa_finished && qlearning_finished ) )

    % step whichever method has not yet finished.  below st/at/caller_id hold the state,
    % action, and identity (1=sarsa, 2=q-learning) of the method taking this step; the
    % sarsa step is shown, the q-learning step repeats the same code:
    st = st_sarsa; at = at_sarsa; caller_id = 1;

    % propose to move in the direction selected by at:
    switch at
     case 1, stp1 = st + [-1, 0]; % move up
     case 2, stp1 = st + [+1, 0]; % move down
     case 3, stp1 = st + [ 0,+1]; % move right
     case 4, stp1 = st + [ 0,-1]; % move left
    end

    % adjust the proposed state if this step would take us off of the grid:
    if( stp1(1)<1 )      stp1(1)=1;      end
    if( stp1(1)>sideII ) stp1(1)=sideII; end
    if( stp1(2)<1 )      stp1(2)=1;      end
    if( stp1(2)>sideJJ ) stp1(2)=sideJJ; end

    % get the reward for this step:
    %
    if( (stp1(1)==s_end(1)) && (stp1(2)==s_end(2)) ) % we're at the end :)
      %rew = +1;
      rew = 0;
    elseif( CF(stp1(1),stp1(2))==0 )                 % we fell off the cliff :(
      rew = -100;
      stp1 = s_start;
      %if( caller_id==1 )
      %  fprintf('Sarsa has fallen off the cliff\n');
      %else
      %  fprintf('Q-Learning has fallen off the cliff\n');
      %end
    else                                             % normal step
      rew = -1;
    end
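
    %-----
    % A minimal sketch (not from the original learn_cw.m) of the temporal difference
    % updates that sarsa and Q-learning would apply once a step's reward rew and
    % clipped next state stp1 are known (each method uses the reward and next state
    % produced by its own step).  stp1i is formed the same way as sti_sarsa/sti_qlearn
    % above; atp1_sarsa, sarsa's next epsilon greedy action, is a hypothetical name.
    stp1i = sub2ind( [sideII,sideJJ], stp1(1), stp1(2) );

    % sarsa (on-policy): bootstrap with the action the policy actually takes next:
    Q_sarsa(sti_sarsa,at_sarsa) = Q_sarsa(sti_sarsa,at_sarsa) + ...
        alpha*( rew + gamma*Q_sarsa(stp1i,atp1_sarsa) - Q_sarsa(sti_sarsa,at_sarsa) );

    % Q-learning (off-policy): bootstrap with the greedy (max) next action:
    Q_qlearn(sti_qlearn,at_qlearn) = Q_qlearn(sti_qlearn,at_qlearn) + ...
        alpha*( rew + gamma*max(Q_qlearn(stp1i,:)) - Q_qlearn(sti_qlearn,at_qlearn) );
    %-----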