% Implements a soft-policy exploration Monte Carlo estimation algorithm, using
% first-visit estimation, to compute the optimal action-value function for the
% blackjack example.
%
% Mods to try:
% 0) Implement the recursive averaging update of Q
%
%      Q_{k+1} <- Q_k + (1/(k+1)) (r_{k+1} - Q_k)
%
%    Result: seemed to change the final policy, but only in a few places; the
%    differences could just be due to the specific random draws, i.e. randomness.
%
% 1) Add a fixed step-size learning update,
%
%      Q_{k+1} <- Q_k + \alpha (r_{k+1} - Q_k)
%
%    which should be better for problems where the action-value function may
%    change over time, as is the case when we are performing value iteration.
%
%    Result: produces very spotty policies that don't seem correct.
%
% 2) Set Q initially very large to encourage exploration.  A value of about +5
%    should be large enough.
%
%    Results:
%
%    With the geometric update (involving alpha): quite poor, a very spotty policy.
%
%    With the recursive update: looks quite good, with results similar to the
%    exploring-starts algorithm.
%
% Written by:
% --
% John L. Weatherwax 2007-12-07
%
% email: wax@alum.mit.edu
%
% Please send comments and especially bug reports to the
% above email address.
%
%-----

close all; clc;

% the epsilon in our epsilon-soft policy: (eps=0 => greedy; eps=1 => random)
%eps = 0.03;
eps = 0.1;
%eps = 0.5;
%eps = 0.85;

% the fixed step-size learning parameter:
%alpha = 0.1;
%alpha = 0.05;

%N_HANDS_TO_PLAY=10;
% a numerical approximation of +Inf
N_HANDS_TO_PLAY=2*1e4;
%N_HANDS_TO_PLAY=5e5;
N_HANDS_TO_PLAY=4.4e7; % <- should take 12 hours ...

% sample_discrete.m
addpath( genpath( '../../../FullBNT-1.0.4/KPMstats/' ) );

rand('seed',0); randn('seed',0);

%--
% implement hands of bj
%--
nStates = prod([21-12+1,13,2]);
posHandSums = 12:21;
nActions = 2; % 0=>stick; 1=>hit
%Q = zeros(nStates,nActions);   % the initial action-value function
Q = +5*ones(nStates,nActions);  % the initial action-value function, taken large initially to encourage exploration
pol_pi = 0.5*ones(nStates,nActions); % our initial policy is random
firstSARewSum = zeros(nStates,nActions);
firstSARewCnt = zeros(nStates,nActions);

%timePerPlay = zeros(1,N_HANDS_TO_PLAY);
tic
for hi=1:N_HANDS_TO_PLAY,
  %tic;
  stateseen = [];
  pol_taken = [];

  deck = shufflecards();

  % the player gets the first two cards:
  p = deck(1:2); deck = deck(3:end);
  phv = handValue(p);

  % the dealer gets the next two cards (and shows his first card):
  d = deck(1:2); deck = deck(3:end);
  dhv = handValue(d);
  cardShowing = d(1);

  % discard states whose initial sum is less than 12 (the decision is always to hit):
  while( phv < 12 )
    p = [ p, deck(1) ]; deck = deck(2:end); phv = handValue(p); % HIT
  end

  % accumulate/store the first state seen:
  stateseen(1,:) = stateFromHand( p, cardShowing );

  % implement the policy specified by pol_pi (keep hitting till we should "stick"):
  si = 1;
  polInd = sub2ind( [21-12+1,13,2], stateseen(si,1)-12+1, stateseen(si,2), stateseen(si,3)+1 );
  pol_to_take = sample_discrete( pol_pi(polInd,:), 1, 1 )-1; % value in {0,1} => stick/hit
  pol_taken(1) = pol_to_take;
  while( pol_to_take && (phv < 22) )
    p = [ p, deck(1) ]; deck = deck(2:end); phv = handValue(p); % HIT
    stateseen(end+1,:) = stateFromHand( p, cardShowing );
    if( phv <= 21 ) % we only need to query the next policy action when we have not gone bust
      si = si+1;
      %[ stateseen(si,1), stateseen(si,2), stateseen(si,3) ]
      polInd = sub2ind( [21-12+1,13,2], stateseen(si,1)-12+1, stateseen(si,2), stateseen(si,3)+1 );
      pol_to_take = sample_discrete( pol_pi(polInd,:), 1, 1 )-1;
      pol_taken(end+1) = pol_to_take;
    end
  end
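  % (sample_discrete above comes from the FullBNT KPMstats toolbox added to the
  %  path earlier; if that toolbox is unavailable, an equivalent two-outcome
  %  draw would be, as a sketch:  pol_to_take = double( rand() < pol_pi(polInd,2) );  )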
  % implement the fixed deterministic policy of the dealer (hit until we have a hand value of 17):
  while( dhv < 17 )
    d = [ d, deck(1) ]; deck = deck(2:end); dhv = handValue(d); % HIT
  end

  % determine the reward for playing this game:
  rew = determineReward(phv,dhv);
  %fprintf( '[phv, dhv, rew] = \n' ); [ phv, dhv, rew ]

  % accumulate these values used in computing statistics on this action-value function Q^{\pi}:
  % Note 1) in this game there is no difference between first occurrence and every occurrence ...
  % Note 2) since each state is entered only once there is no need to "unjam" this for loop into
  %         separate for loops like "(b)" and "(c)" as seen in the algorithm given in Fig~5.6 of the book
  for si=1:size(stateseen,1),
    if( (stateseen(si,1)>=12) && (stateseen(si,1)<=21) ) % we don't count "initial" and terminal states
      %[stateseen(si,1)]
      %[stateseen(si,1)-12+1, stateseen(si,2), stateseen(si,3)+1]
      staInd = sub2ind( [21-12+1,13,2], stateseen(si,1)-12+1, stateseen(si,2), stateseen(si,3)+1 );
      actInd = pol_taken(si)+1;
      firstSARewCnt(staInd,actInd) = firstSARewCnt(staInd,actInd)+1;
      firstSARewSum(staInd,actInd) = firstSARewSum(staInd,actInd)+rew;
      %Q(staInd,actInd) = firstSARewSum(staInd,actInd)/firstSARewCnt(staInd,actInd);                   % <- take the direct average
      Q(staInd,actInd) = Q(staInd,actInd) + (1/firstSARewCnt(staInd,actInd))*(rew - Q(staInd,actInd)); % <- use incremental averaging
      %Q(staInd,actInd) = Q(staInd,actInd) + alpha*(rew - Q(staInd,actInd));                           % <- take a geometric average

      [dum,greedyChoice] = max( Q(staInd,:) );
      notGreedyChoice = 3-greedyChoice; % linear function that maps 1=>2 and 2=>1
      % perform the eps-soft on-policy MC update:
      pol_pi(staInd,greedyChoice)    = 1 - eps + eps/nActions;
      pol_pi(staInd,notGreedyChoice) = eps/nActions;
    end
  end
  %timePerPlay(hi) = toc;
end % end number of hands loop
toc
%fprintf('timePerPlay = %f\n',mean(timePerPlay));

% plot the optimal state-value function V^{*}:
%
mc_value_fn = max( Q, [], 2 );
mc_value_fn = reshape( mc_value_fn, [21-12+1,13,2] );
if( 1 )
  figure; mesh( 1:13, 12:21, mc_value_fn(:,:,1) );
  xlabel( 'dealer shows' ); ylabel( 'sum of cards in hand' ); axis xy; %view([67,5]);
  title( 'no usable ace' ); drawnow;
  fn=sprintf('state_value_fn_nua_%d_mesh.eps',N_HANDS_TO_PLAY); saveas( gcf, fn, 'eps2' );

  figure; mesh( 1:13, 12:21, mc_value_fn(:,:,2) );
  xlabel( 'dealer shows' ); ylabel( 'sum of cards in hand' ); axis xy; %view([67,5]);
  title( 'a usable ace' ); drawnow;
  fn=sprintf('state_value_fn_ua_%d_mesh.eps',N_HANDS_TO_PLAY); saveas( gcf, fn, 'eps2' );

  figure; imagesc( 1:13, 12:21, mc_value_fn(:,:,1) ); caxis( [-1,+1] ); colorbar;
  xlabel( 'dealer shows' ); ylabel( 'sum of cards in hand' ); axis xy;
  title( 'no usable ace' ); drawnow;
  fn=sprintf('state_value_fn_nua_%d_img.eps',N_HANDS_TO_PLAY); saveas( gcf, fn, 'eps2' );

  figure; imagesc( 1:13, 12:21, mc_value_fn(:,:,2) ); caxis( [-1,+1] ); colorbar;
  xlabel( 'dealer shows' ); ylabel( 'sum of cards in hand' ); axis xy;
  title( 'a usable ace' ); drawnow;
  fn=sprintf('state_value_fn_ua_%d_img.eps',N_HANDS_TO_PLAY); saveas( gcf, fn, 'eps2' );
end
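% As a sanity check, the learned greedy (deterministic) policy can also be read
% directly off Q rather than off the eps-soft pol_pi; "greedy_pol" is a
% hypothetical variable used only for this sketch (0=>stick; 1=>hit):
[dum,greedy_pol] = max( Q, [], 2 );
greedy_pol = reshape( greedy_pol-1, [21-12+1,13,2] );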
% plot the optimal policy:
%
pol_pi = pol_pi(:,2);
pol_pi = reshape( pol_pi, [21-12+1,13,2] );
if( 1 )
  figure; imagesc( 1:13, 12:21, pol_pi(:,:,1) ); colorbar;
  xlabel( 'dealer shows' ); ylabel( 'sum of cards in hand' ); axis xy; %view([67,5]);
  title( 'no usable ace' ); drawnow;
  fn=sprintf('bj_opt_pol_nua_%d_image.eps',N_HANDS_TO_PLAY); saveas( gcf, fn, 'eps2' );

  figure; imagesc( 1:13, 12:21, pol_pi(:,:,2) ); colorbar;
  xlabel( 'dealer shows' ); ylabel( 'sum of cards in hand' ); axis xy; %view([67,5]);
  title( 'a usable ace' ); drawnow;
  fn=sprintf('bj_opt_pol_ua_%d_image.eps',N_HANDS_TO_PLAY); saveas( gcf, fn, 'eps2' );
end

return;
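%-----
% The helper routines called above (shufflecards, handValue, stateFromHand, and
% determineReward) are distributed as separate m-files with this code and are
% not reproduced here.  The local functions below are NOT the author's
% implementations, only minimal stand-in sketches (assuming a single 52-card
% deck encoded as face values 1..13 and the usual +1/0/-1 win/push/loss reward)
% so the script can still run if those files are missing.  MATLAB R2016b or
% later is needed for local functions in a script; note they will shadow any
% same-named m-files on the path, so delete them if the originals are available.

function deck = shufflecards()
  % sketch: one 52-card deck encoded as face values 1..13 (ace=1, ..., king=13),
  % four of each face, in a uniformly random order
  deck = repmat( 1:13, 1, 4 );
  deck = deck( randperm(52) );
end

function hv = handValue( hand )
  % sketch: face cards count as 10; one ace counts as 11 when that does not bust the hand
  vals = min( hand, 10 );
  hv   = sum( vals );
  if( any( hand==1 ) && (hv+10 <= 21) )
    hv = hv + 10; % count one ace as 11 ... a "usable" ace
  end
end

function st = stateFromHand( hand, cardShowing )
  % sketch: state = [ player hand value, dealer card showing, usable ace flag ]
  vals = min( hand, 10 );
  hv   = sum( vals );
  usableAce = ( any( hand==1 ) && (hv+10 <= 21) );
  if( usableAce )
    hv = hv + 10;
  end
  st = [ hv, cardShowing, usableAce ];
end

function rew = determineReward( phv, dhv )
  % sketch: +1 win, 0 push, -1 loss; a player bust loses regardless of the dealer's hand
  if( phv > 21 )
    rew = -1;
  elseif( dhv > 21 )
    rew = +1;
  else
    rew = sign( phv - dhv );
  end
end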