function [] = binary_bandit_exps(nB,nP,p_win)
%
% Duplicates the binary bandit experiments.
%
% Inputs:
%   nB: the number of bandits
%   nP: the number of plays (times we will pull an arm)
%   p_win: p_win(i) is the probability we win when we pull arm i.
%
% Written by:
% --
% John L. Weatherwax 2007-11-13
%
% email: wax@alum.mit.edu
%
% Please send comments and especially bug reports to the
% above email address.
%
%-----

if( nargin<1 ) % the number of bandits:
  nB = 2000;
end
if( nargin<2 ) % the number of plays (times we will pull an arm):
  nP = 2000;
end
if( nargin<3 )
  %p_win = [ 0.1, 0.2 ];
  p_win = [ 0.8, 0.9 ];
end

% the number of arms:
nA = 2;
[dum,bestArm] = max( p_win );

%rand('seed',0); % uncomment for reproducible draws (this code uses rand, not randn)

if( 1 )
  % run the SUPERVISED experiment for two epsilon values:
  %   0   => fully greedy
  %   0.1 => in between fully greedy and fully exploratory
  % (an epsilon of 1.0 would explore on every trial)
  epsArray = [ 0, 0.1 ];
  perOptAction = zeros(length(epsArray),nP);
  for ei=1:length(epsArray),
    tEps = epsArray(ei);
    pickedMaxAction = zeros(nB,nP);
    for bi=1:nB, % pick a bandit
      % pick an arm to play initially (uniformly at random) ...
      [dum,arm] = histc(rand(1),linspace(0,1+eps,nA+1)); clear dum;
      for pi=1:nP, % make a play
        % determine if this move is exploratory or greedy:
        if( rand(1) <= tEps ) % pick a RANDOM arm:
          [dum,arm] = histc(rand(1),linspace(0,1+eps,nA+1)); clear dum;
        end
        if( arm==1 ) otherArm=2; else otherArm=1; end
        % determine if the arm selected is the best possible:
        if( arm==bestArm ) pickedMaxAction(bi,pi)=1; end
        % get the reward from drawing on that arm:
        prob = p_win(arm);
        if( rand(1) <= prob )
          % this arm gave SUCCESS ... keep playing it (do nothing)
        else
          % this arm gave FAILURE ... switch to the other arm
          arm = otherArm;
        end
      end
    end
    percentOptAction = mean(pickedMaxAction,1);
    perOptAction(ei,:) = percentOptAction(:).';
  end
end

%------------------------------------------------------------------------
% Learning with the L_{R-P} (linear reward-penalty) algorithm:
%------------------------------------------------------------------------
alpha = 0.1;
if( 1 )
  perOptActionRP = zeros(1,nP);
  % initialize to uniform the probability each arm gives a success (no knowledge):
  qT = 0.5*ones( nB, nA );
  pickedMaxAction = zeros(nB,nP);
  for bi=1:nB, % pick a bandit
    for pi=1:nP, % make a play
      % pick an arm based on the distribution in qT:
      if( rand(1) < qT(bi,1) )
        arm = 1;
      else
        arm = 2;
      end
      if( arm==1 ) otherArm=2; else otherArm=1; end
      % determine if the arm selected is the best possible:
      if( arm==bestArm ) pickedMaxAction(bi,pi)=1; end
      % get the reward from drawing on that arm:
      prob = p_win(arm);
      if( rand(1) <= prob )
        % this arm gave SUCCESS: increment the arm just played ...
        addTo = arm;
      else
        % this arm gave FAILURE: increment the other arm ...
        addTo = otherArm;
      end
      if( addTo==1 ) otherArm=2; else otherArm=1; end
      % update qT (the two probabilities always sum to one):
      qT(bi,addTo)    = qT(bi,addTo) + alpha*( 1 - qT(bi,addTo) );
      qT(bi,otherArm) = 1.0 - qT(bi,addTo);
    end
  end
  percentOptAction = mean(pickedMaxAction,1);
  perOptActionRP(1,:) = percentOptAction(:).';
end
perOptAction = [ perOptAction; perOptActionRP ];

%------------------------------------------------------------------------
% Learning with the L_{R-I} (linear reward-inaction) algorithm:
%------------------------------------------------------------------------
alpha = 0.1;
if( 1 )
  perOptActionRI = zeros(1,nP);
  % initialize to uniform the probability each arm gives a success (no knowledge):
  qT = 0.5*ones( nB, nA );
  pickedMaxAction = zeros(nB,nP);
  for bi=1:nB, % pick a bandit
    for pi=1:nP, % make a play
      % pick an arm based on the distribution in qT:
      if( rand(1) < qT(bi,1) )
        arm = 1;
      else
        arm = 2;
      end
      if( arm==1 ) otherArm=2; else otherArm=1; end
      % determine if the arm selected is the best possible:
      if( arm==bestArm ) pickedMaxAction(bi,pi)=1; end
      % get the reward from drawing on that arm:
      prob = p_win(arm);
      if( ~(rand(1) <= prob) )
        % this arm gave FAILURE ... no learning occurs
        continue
      end
      % this arm gave a SUCCESS: since playing arm "arm" won, we infer
      % this play to be correct and increment its probability ...
      addTo = arm;
      if( addTo==1 ) otherArm=2; else otherArm=1; end
      % update qT (the two probabilities always sum to one):
      qT(bi,addTo)    = qT(bi,addTo) + alpha*( 1 - qT(bi,addTo) );
      qT(bi,otherArm) = 1.0 - qT(bi,addTo);
    end
  end
  percentOptAction = mean(pickedMaxAction,1);
  perOptActionRI(1,:) = percentOptAction(:).';
end
perOptAction = [ perOptAction; perOptActionRI ];

% produce the percent optimal action plot:
%
figure; hold on;
clrStr = 'brkc';
all_hnds = [];
for ei=1:size(perOptAction,1)
  all_hnds(ei) = plot( 1:nP, perOptAction(ei,:), [clrStr(ei),'-'] );
end
legend( all_hnds, { '0', '0.1', 'L_{RP}', 'L_{RI}' }, 'Location', 'Best' );
axis( [ 0, nP, 0, 1 ] ); grid on;
xlabel( 'plays' ); ylabel( '% Optimal Action' );

return;
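
%------------------------------------------------------------------------
% Usage sketch: assuming this file is saved as binary_bandit_exps.m
% somewhere on the MATLAB path, the two p_win settings listed in the
% nargin defaults above can be compared directly:
%
%   binary_bandit_exps( 2000, 2000, [ 0.1, 0.2 ] );  % both arms rarely pay; win rates are close
%   binary_bandit_exps( 2000, 2000, [ 0.8, 0.9 ] );  % both arms usually pay
%
% Each call produces one "% Optimal Action" vs. plays figure comparing the
% epsilon-greedy learners (epsilon of 0 and 0.1), L_{RP}, and L_{RI}.
%------------------------------------------------------------------------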