%
% Implements Monte Carlo ES (exploring starts) with first-visit estimation to
% compute the action-value function for the blackjack example.
%
% Written by:
% --
% John L. Weatherwax 2007-12-07
%
% email: wax@alum.mit.edu
%
% Please send comments and especially bug reports to the
% above email address.
%
%-----

close all; clc;

N_HANDS_TO_PLAY=10;
% a numerical approximation of +Inf:
N_HANDS_TO_PLAY=2*1e4;
N_HANDS_TO_PLAY=5e5;
N_HANDS_TO_PLAY=5e6;
%N_HANDS_TO_PLAY=1e7;

rand('seed',0); randn('seed',0);

%--
% implement hands of blackjack
%--
nStates = prod([21-12+1,13,2]);
posHandSums = 12:21;
nActions = 2; % 0=>stick; 1=>hit

Q = zeros(nStates,nActions); % the initial action-value function

%pol_pi = zeros(1,nStates);        % our initial policy is to always stick "0"
pol_pi = ones(1,nStates);          % our initial policy is to always hit "1"
%pol_pi = unidrnd(2,1,nStates)-1;  % our initial policy is random

firstSARewSum = zeros(nStates,nActions);
firstSARewCnt = zeros(nStates,nActions);

tic
for hi=1:N_HANDS_TO_PLAY,
  stateseen = [];

  deck = shufflecards();

  % the player gets the first two cards:
  p = deck(1:2); deck = deck(3:end);
  phv = handValue(p);

  % the dealer gets the next two cards (and shows his first card):
  d = deck(1:2); deck = deck(3:end);
  dhv = handValue(d);
  cardShowing = d(1);

  % discard states whose initial sum is less than 12 (the decision is always to hit):
  while( phv < 12 )
    p = [ p, deck(1) ]; deck = deck(2:end); phv = handValue(p); % HIT
  end

  % accumulate/store the first state seen:
  stateseen(1,:) = stateFromHand( p, cardShowing );

  % implement the policy specified by pol_pi (keep hitting till we should "stick"):
  si = 1;
  polInd = sub2ind( [21-12+1,13,2], stateseen(si,1)-12+1, stateseen(si,2), stateseen(si,3)+1 );
  pol_pi(polInd) = unidrnd(2)-1; % FOR EXPLORING STARTS TAKE AN INITIAL RANDOM POLICY!!!
  pol_to_take = pol_pi(polInd);
  while( pol_to_take && (phv < 22) )
    p = [ p, deck(1) ]; deck = deck(2:end); phv = handValue(p); % HIT
    stateseen(end+1,:) = stateFromHand( p, cardShowing );
    if( phv <= 21 ) % we only need to query the next policy action if we have not gone bust
      si = si+1;
      %[ stateseen(si,1), stateseen(si,2), stateseen(si,3) ]
      polInd = sub2ind( [21-12+1,13,2], stateseen(si,1)-12+1, stateseen(si,2), stateseen(si,3)+1 );
      pol_to_take = pol_pi(polInd);
    end
  end

  % implement the fixed deterministic policy of the dealer (hit until the hand value is at least 17):
  while( dhv < 17 )
    d = [ d, deck(1) ]; deck = deck(2:end); dhv = handValue(d); % HIT
  end

  % determine the reward for playing this game:
  rew = determineReward(phv,dhv);
  %fprintf( '[phv, dhv, rew] = \n' ); [ phv, dhv, rew ]

  % accumulate these values used in computing statistics on this action-value function Q^{\pi}:
  for si=1:size(stateseen,1),
    if( (stateseen(si,1)>=12) && (stateseen(si,1)<=21) ) % we don't count "initial" and terminal states
      %[stateseen(si,1)]
      %[stateseen(si,1)-12+1, stateseen(si,2), stateseen(si,3)+1]
      staInd = sub2ind( [21-12+1,13,2], stateseen(si,1)-12+1, stateseen(si,2), stateseen(si,3)+1 );
      actInd = pol_pi(staInd)+1;
      firstSARewCnt(staInd,actInd) = firstSARewCnt(staInd,actInd)+1;
      firstSARewSum(staInd,actInd) = firstSARewSum(staInd,actInd)+rew;
      Q(staInd,actInd) = firstSARewSum(staInd,actInd)/firstSARewCnt(staInd,actInd); % <- take the average
      [dum,greedyChoice] = max( Q(staInd,:) );
      pol_pi(staInd) = greedyChoice-1;
    end
  end

end % end number of hands loop
toc

% plot the optimal state-value function V^{*}:
%
mc_value_fn = max( Q, [], 2 );
mc_value_fn = reshape( mc_value_fn, [21-12+1,13,2] );

if( 1 )
  figure; mesh( 1:13, 12:21, mc_value_fn(:,:,1) );
  xlabel( 'dealer shows' ); ylabel( 'sum of cards in hand' ); axis xy; %view([67,5]);
  title( 'no usable ace' ); drawnow;
  fn=sprintf('state_value_fn_nua_%d_mesh.eps',N_HANDS_TO_PLAY); saveas( gcf, fn, 'eps2' );

  figure; mesh( 1:13, 12:21, mc_value_fn(:,:,2) );
  xlabel( 'dealer shows' ); ylabel( 'sum of cards in hand' ); axis xy; %view([67,5]);
  title( 'a usable ace' ); drawnow;
  fn=sprintf('state_value_fn_ua_%d_mesh.eps',N_HANDS_TO_PLAY); saveas( gcf, fn, 'eps2' );

  figure; imagesc( 1:13, 12:21, mc_value_fn(:,:,1) ); caxis( [-1,+1] ); colorbar;
  xlabel( 'dealer shows' ); ylabel( 'sum of cards in hand' ); axis xy;
  title( 'no usable ace' ); drawnow;
  fn=sprintf('state_value_fn_nua_%d_img.eps',N_HANDS_TO_PLAY); saveas( gcf, fn, 'eps2' );

  figure; imagesc( 1:13, 12:21, mc_value_fn(:,:,2) ); caxis( [-1,+1] ); colorbar;
  xlabel( 'dealer shows' ); ylabel( 'sum of cards in hand' ); axis xy;
  title( 'a usable ace' ); drawnow;
  fn=sprintf('state_value_fn_ua_%d_img.eps',N_HANDS_TO_PLAY); saveas( gcf, fn, 'eps2' );
end

% plot the optimal policy:
%
pol_pi = reshape( pol_pi, [21-12+1,13,2] );

if( 1 )
  figure; imagesc( 1:13, 12:21, pol_pi(:,:,1) ); colorbar;
  xlabel( 'dealer shows' ); ylabel( 'sum of cards in hand' ); axis xy; %view([67,5]);
  title( 'no usable ace' ); drawnow;
  fn=sprintf('bj_opt_pol_nua_%d_image.eps',N_HANDS_TO_PLAY); saveas( gcf, fn, 'eps2' );

  figure; imagesc( 1:13, 12:21, pol_pi(:,:,2) ); colorbar;
  xlabel( 'dealer shows' ); ylabel( 'sum of cards in hand' ); axis xy; %view([67,5]);
  title( 'a usable ace' ); drawnow;
  fn=sprintf('bj_opt_pol_ua_%d_image.eps',N_HANDS_TO_PLAY); saveas( gcf, fn, 'eps2' );
end

return;
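
%
% NOTE: this script calls four helper routines (shufflecards.m, handValue.m,
% stateFromHand.m, determineReward.m) that are not part of this listing.  The
% commented sketches below are only guesses at their behavior (the card
% representation and state encoding are assumptions, not the author's actual
% implementations); if the real files are missing, each sketch would be saved
% as its own .m file and adjusted as needed.
%
% function deck = shufflecards()
%   % a 52-card deck in random order; card k has face value mod(k-1,13)+1 (1 = ace)
%   deck = randperm( 52 );
%
% function v = handValue( hand )
%   % blackjack value of a hand; one ace counts as 11 when that does not bust
%   vals = min( mod( hand-1, 13 ) + 1, 10 );   % face cards are worth ten
%   v = sum( vals );
%   if( any( vals==1 ) && (v+10 <= 21) ) v = v + 10; end
%
% function st = stateFromHand( hand, cardShowing )
%   % state = [ player sum, dealer card showing (1..13), usable ace flag (0/1) ]
%   vals = min( mod( hand-1, 13 ) + 1, 10 );
%   s    = sum( vals );
%   ua   = double( any( vals==1 ) && (s+10 <= 21) );
%   st   = [ s + 10*ua, mod( cardShowing-1, 13 ) + 1, ua ];
%
% function rew = determineReward( phv, dhv )
%   % reward from the player's point of view: +1 win, 0 draw, -1 loss
%   if( phv > 21 )        rew = -1;               % player busts
%   elseif( dhv > 21 )    rew = +1;               % dealer busts
%   else                  rew = sign( phv-dhv );
%   end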