% do_ex_9_1_exps - performs a comparison of the rate of learning with various
%                  numbers of planning steps.
%
% For each entry of nPSV the script calls dynaQ_maze on the maze returned by
% mk_ex_9_1_mz, smooths the returned per-run signal with a running average and
% plots all curves in one figure, which is then saved as a PNG file.
%
% Written by:
% --
% John L. Weatherwax                2007-12-03
%
% email: wax@alum.mit.edu
%
% Please send comments and especially bug reports to the
% above email address.
%
%-----

close all;
clc;

% sample_discrete.m
addpath( genpath( '../../../FullBNT-1.0.4/KPMstats/' ) );

% the learning rate:
alpha = 1e-1;

% the probability of a random action (non-greedy):
epsilon = 0.1;

% the discount factor:
gamma = 0.95;
%gamma = 1.0;

% get our maze:
MZ = mk_ex_9_1_mz(0);
[sideII,sideJJ] = size(MZ);

% the beginning and terminal states (in matrix notation):
s_start = [ 3, 1 ];
s_end   = [ 1, 9 ];

% the step limit handed to dynaQ_maze.  Only one value is ever in effect;
% other values that have been tried: 30, 1e3, 1e4, 1e5, 10e6.
MAX_N_STEPS = 1e6;

% the numbers of planning steps to compare:
nPSV = [ 0, 5, 50 ];

% set to false to plot the raw signal instead of its running average:
smoothSignal = true;

% length of the running average used to smooth the signal:
filterLength = 10;

% set to true to also plot the greedy policy and state-value function of the
% LAST run in the loop below (this part was disabled in the original script):
plotFinalPolicy = false;

nRuns  = length(nPSV);
fhs    = zeros(nRuns,1);   % plot handles (for the legend)
fhl    = cell(nRuns,1);    % legend labels
aets   = cell(nRuns,1);    % the raw signal returned by each run
colors = 'rbk';            % one line color per entry of nPSV

figure; hold on;
for npi=1:nRuns,
  nPlanningSteps = nPSV(npi);
  fprintf( 'the number of planning steps = %10d ...\n',nPlanningSteps);
  tic
  [Q,ets,numFinishes] = dynaQ_maze(alpha,epsilon,gamma,nPlanningSteps,@mk_ex_9_1_mz,s_start,s_end,MAX_N_STEPS);
  toc
  aets{npi} = ets;

  % optionally perform some basic smoothing of this signal:
  if( smoothSignal )
    tmp = filter( ones(filterLength,1)/filterLength, 1, ets );
    % drop the first filterLength elements, where the running average is
    % still dominated by the filter's zero initial state:
    tmp = tmp((filterLength+1):end);
    fhs(npi) = plot( 1:length(tmp), tmp, ['-',colors(npi)] );
  else
    fhs(npi) = plot( 1:length(ets), ets, ['-',colors(npi)] );
  end
  fhl{npi} = sprintf('%d planning steps',nPlanningSteps);
end
legend( fhs, fhl );
grid on;
xlabel( 'episode number' );
ylabel( 'number of timesteps until solution' );
axis( [0,500,0,70] );

% NOTE: the file name says "learning rates" although the quantity varied above
% is the number of planning steps; the name is kept unchanged so that anything
% that refers to the saved figure keeps working.
fn = 'dyna_q_various_learning_rates';
saveas( gcf, fn, 'png' );

if( plotFinalPolicy )
  % compute the (negative) cost to go and the optimal (greedy with respect to
  % the state-value function) policy.  Q is the action-value table returned by
  % the last call to dynaQ_maze above; its rows are addressed by the linear
  % index of the maze cell (ii,jj).
  pol_pi = zeros(sideII,sideJJ);
  V      = zeros(sideII,sideJJ);
  for ii=1:sideII,
    for jj=1:sideJJ,
      sti = sub2ind( [sideII,sideJJ], ii, jj );
      [V(ii,jj),pol_pi(ii,jj)] = max( Q(sti,:) );
    end
  end

  plot_mz_policy(pol_pi,MZ,s_start,s_end);
  title( 'policy (1=>up,2=>down,3=>right,4=>left); start=green; stop=red' );
  fn = sprintf('dyna_maze_policy_nE_%d',MAX_N_STEPS);
  saveas( gcf, fn, 'png' );

  figure;
  imagesc( V );
  colormap(flipud(jet));
  colorbar;
  title( 'state value function' );
  fn = sprintf('dyna_maze_state_value_fn_nE_%d',MAX_N_STEPS);
  saveas( gcf, fn, 'png' );
end

return;