/******************************************************************************
* blackjack.c
*
* e-soft, on-policy Monte Carlo control for Blackjack
* (as described in example 5.1)
*
*
*
* NOTES:
*
* - We don't consider cases in which player has a natural, or
* has less than twelve, because the action to be taken by the
* player in those cases is obvious.
*
* - The game described in this example doesn't recognize the
* the situation in which player reaches 5 cards without
* going busted as a special situation.
*
******************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
void initialize();
int request_card();
int select_action(int p, int d, int u);
void show_POL();
void update_card_sum(int *current, int *new, int *usable);
/* Show episodes on screen? */
#define SHOW_EPISODES 0
/* How many sample episode to simulate */
#define MAX_EPISODES 5000000
/* How "soft" the policy should be */
#define EPSILON 0.1
/* What is what */
#define HIT 0
#define STICK 1
/*
* Let p = player_sum,
* d = dealer_showing,
* u = usable_ace,
* a = action.
*
* Q[p][d][u][a]represents the 400 action values Q(s,a), where every
* state s is defined by a triple (p,d,u)
*/
double Q[22][11][2][2]; // <-- p={12..21}, d={1..10}, u={0..1}, a={0..1}
/*
* POL[p][d][u][a] represents the probability of taking action a
* at state s(p,d,u) under the current policy
*/
double POL[22][11][2][2]; // <-- p={12..21}, d={1..10}, u={0..1}, a={0..1}
/*
* W[p][d][u][a] holds the number of times a state-action pair (s,a)
* has been encoutered in the simulation.
* Needed to incrementally calculate the average of Q(s,a). (see section
* 2.5 and 5.7 for details)
*/
int W[22][11][2][2]; // <-- p={12..21}, d={1..10}, u={0..1}, a={0..1}
/*
* To be used as nodes in the linked list of state-action pairs.
*/
struct sa_pair{
int p;
int d;
int u;
int a;
struct sa_pair *next;
};
/*
* main()
*
*
*/
int main(int argc, char *argv[]){
struct sa_pair *appeared_pairs, *new_pair, *temp_ptr;
int i, p, d, u, dealer_u, a, hit_result;
int player_busted, dealer_busted;
int dealer_has_natural;
int episode_ending, card_count;
printf("\n");
printf("Doing %d episodes.\n", MAX_EPISODES);
printf("Change SHOW_EPISODES to 1 if you want to see them displayed.\n");
printf("\n");
initialize();
/*
* Repeat "forever":
*
* 1. Simulate an episode and update action-values for
* every state-action pair that appeared in it.
*
* 2. Greedily improve the policy according to the updated
* action value function
*/
for(i = 0; i < MAX_EPISODES; i++){
appeared_pairs = NULL;
player_busted = 0; dealer_busted = 0;
dealer_has_natural = 0;
p = 0; u = 0;
/*
* Give 2 cards to the player and 1 to the dealer
*/
hit_result = request_card(); // <-- player's first card
update_card_sum(&p, &hit_result, &u);
hit_result = request_card(); // <-- player's second card
update_card_sum(&p, &hit_result, &u);
if ((p < 12)||(p == 21)){ // <-- We don't consider these cases
continue;
}
d = request_card(); // <-- dealer's first card
if(SHOW_EPISODES) printf("New episode: p=%d, d=%d, u=%d\n", p, d, u);
/*
* Player plays
*/
while(select_action(p, d, u) == HIT){
/*
* Add this state-action pair to the list of appeared (s,a)
*/
new_pair = (struct sa_pair *) malloc(sizeof(struct sa_pair));
new_pair->p = p;
new_pair->d = d;
new_pair->u = u;
new_pair->a = HIT;
new_pair->next = appeared_pairs;
appeared_pairs = new_pair;
/*
* Now request for a card
*/
hit_result = request_card();
update_card_sum(&p, &hit_result, &u);
if (p > 21){
player_busted = 1;
if(SHOW_EPISODES) printf("Player hit: %d and went busted.\n", hit_result);
break;
}
if(SHOW_EPISODES) printf("Player hit: %d and becomes p=%d, d=%d, u=%d\n", hit_result, p, d, u);
}
/*
* If player was not busted, record the action value of the last (s,a)
*/
if (!player_busted){
new_pair = (struct sa_pair *) malloc(sizeof(struct sa_pair));
new_pair->p = p;
new_pair->d = d;
new_pair->u = u;
new_pair->a = STICK;
new_pair->next = appeared_pairs;
appeared_pairs = new_pair;
}
/*
* Dealer plays
*/
if (!player_busted){
/*
* An ace as the first and only card is a usable ace
* and is counted as 11
*/
if (d == 1){
dealer_u = 1;
d = 11;
}
else{
dealer_u = 0;
}
/*
* Request for more card(s)
*/
card_count = 1;
while (d < 17){ // <-- has to hit if less than 17
hit_result = request_card();
update_card_sum(&d, &hit_result, &dealer_u);
if (d > 21){
dealer_busted = 1;
if(SHOW_EPISODES) printf("Dealer hit: %d and went busted.\n", hit_result);
break;
}
card_count++;
/*
* The dealer is said to have a natural if he has 2 cards AND:
* - Both of them are aces
* OR
* - 1 is an ace, and 1 is a ten
*/
if((card_count == 2) && (((hit_result == 1)&&(d == 13)) || (d == 21))){
dealer_has_natural == 1;
if (SHOW_EPISODES) printf("Dealer has a natural (%d, %d)\n", d, hit_result);
break;
}
if(SHOW_EPISODES) printf("Dealer hit: %d and becomes d=%d, dealer_u=%d\n", hit_result, d, dealer_u);
}
}
/*
* Who won
*/
if ((dealer_has_natural) || (player_busted)){
episode_ending = -1;
}
else if (dealer_busted){
episode_ending = 1;
}
else{
if (d < p)
episode_ending = 1;
else if (d == p)
episode_ending = 0;
else
episode_ending = -1;
}
if(SHOW_EPISODES) printf("---> Episode ended: %d\n", episode_ending);
/*
* Now update the action-values for
* every state-action pair that appeared in this episode.
*/
if(SHOW_EPISODES) printf("---> Involved state-action pairs (in reverse order): \n");
while(appeared_pairs != NULL){
p = appeared_pairs->p;
d = appeared_pairs->d;
u = appeared_pairs->u;
a = appeared_pairs->a;
if(SHOW_EPISODES) printf(" %d, %d, %d, %d\n", p, d, u, a);
/*
* Incrementally update the average of Q(s,a):
* (See section 2.5)
*/
Q[p][d][u][a] = Q[p][d][u][a] + (episode_ending - Q[p][d][u][a])/(W[p][d][u][a] + 1);
W[p][d][u][a]++;
temp_ptr = appeared_pairs->next;
free(appeared_pairs);
appeared_pairs = temp_ptr;
}
if(SHOW_EPISODES) printf("\n");
/*
* Finally, improve the policy according to the updated value function
*/
for(p = 12; p < 22; p++){
for(d = 1; d < 11; d++){
for(u = 0; u < 2; u++){
if (Q[p][d][u][HIT] > Q[p][d][u][STICK]){
POL[p][d][u][HIT] = 1 - EPSILON + EPSILON/2;
POL[p][d][u][STICK] = EPSILON/2;
}
else{
POL[p][d][u][STICK] = 1 - EPSILON + EPSILON/2;
POL[p][d][u][HIT] = EPSILON/2;
}
}
}
}
}
show_POL();
exit(0);
}
/*
* intitialize()
*
*
*/
void initialize(){
int p, d, u, a;
/*
* Provide a seed to the random generator
*/
// srand(time(NULL));
/*
* Start with an arbitrary (zero) action value function
*/
for(p = 12; p < 22; p++){
for(d = 1; d < 11; d++){
for(u = 0; u < 2; u++){
for(a = 0; a < 2; a++){
Q[p][d][u][a] = 0;
W[p][d][u][a] = 0;
}
}
}
}
/*
* Start with an arbitrary (always hit) policy
*/
for(p = 12; p < 22; p++){
for(d = 1; d < 11; d++){
for(u = 0; u < 2; u++){
POL[p][d][u][HIT] = 1 - EPSILON + EPSILON/2;
POL[p][d][u][STICK] = EPSILON/2;
}
}
}
return;
}
/*
* request_card()
*
*
*/
int request_card(){
int card;
/*
* There are 13 different cards in each color
* Each should has an equal chance to appear
*/
// card = (random() % 13) + 1; // <-- Won't do. Type "man rand" to see why
card = 1 + (int) (13.0*rand()/(RAND_MAX+1.0)); // <-- {1..13}
/*
* All the cards after 9 are counted as 10
*/
if (card > 9)
return 10;
else
return card;
}
/*
* select_action()
*
* decide whether we should "hit" or "stick"
*/
int select_action(int p, int d, int u){
/*
* Take the greedy action (1- EPSILON + EPSILON/2) percents
* of the time, and non-greedy (EPSILON/2) percents of the times
*/
if ((rand()/(RAND_MAX+1.0)) < (EPSILON/2)){ // <-- non-greedy action should be taken
if (POL[p][d][u][HIT] < POL[p][d][u][STICK]){ // <-- is HIT the "minority" action?
return HIT;
}
else{
return STICK;
}
}
else{
if (POL[p][d][u][HIT] < POL[p][d][u][STICK]){
return STICK;
}
else{
return HIT;
}
}
}
/*
* update_card_sum()
*
*
*/
void update_card_sum(int *current, int *new, int *usable){
int p = *current;
int hit_result = *new;
int u = *usable;
if ((p + hit_result) > 21){ // <-- potentially busted
if (u == 1){ // <-- No problem, since he has a usable ace
p = p - 10 + hit_result;
u = 0;
}
else{
p = p + hit_result; // <-- busted
}
}
else{
if ((hit_result == 1) && ((p + 11) < 22)){ // <-- Just got a usable ace
p = p + 11;
u = 1;
}
else{
p = p + hit_result;
}
}
*current = p;
*new = hit_result;
*usable = u;
return;
}
/*
* show_POL()
*
*
*/
void show_POL(){
int action0[22][11]; // <-- No usable ace
int action1[22][11]; // <-- usable ace
int i, j;
/*
* Compile the "soft" policy into 2 deterministic policies
*/
for(i = 12; i < 22; i++){
for(j = 1; j < 11; j++){
/*
* No usable ace
*/
if (POL[i][j][0][HIT] < POL[i][j][0][STICK]){ // <-- HIT is not the favorite action
action0[i][j] = STICK;
}
else // <-- Hit is the favorite action
action0[i][j] = HIT;
/*
* Usable ace
*/
if (POL[i][j][1][HIT] < POL[i][j][1][STICK]) // <-- HIT is not the favorite action
action1[i][j] = STICK;
else // <-- Hit is the favorite action
action1[i][j] = HIT;
}
}
/*
* Now show them
*/
printf(" Policy for episodes that start with a usable ace\n");
printf(" | 1 2 3 4 5 6 7 8 9 10\n");
printf("-----+-------------------------------\n");
for(i = 21; i >11 ; i--){
printf(" %3d |", i);
for(j = 1; j < 11; j++){
printf(" %d", action1[i][j]);
}
printf("\n");
}
printf("\n");
printf(" Policy for episodes that start without a usable ace\n");
printf(" | 1 2 3 4 5 6 7 8 9 10\n");
printf("-----+-------------------------------\n");
for(i = 21; i >11 ; i--){
printf(" %3d |", i);
for(j = 1; j < 11; j++){
printf(" %d", action0[i][j]);
}
printf("\n");
}
printf("\n");
}