From 6633097d31ef6405ab1c10a2219a09db512bc14f Mon Sep 17 00:00:00 2001
From: Max Chiswick <max.chiswick@gmail.com>
Date: Mon, 10 Jun 2019 22:39:48 -0400
Subject: [PATCH 1/2] Added optional double down action

Extends the game beyond hit/stick with an optional third action, doubling down: the player doubles the bet and receives exactly one additional card.
---
 gym/envs/toy_text/blackjack.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/gym/envs/toy_text/blackjack.py b/gym/envs/toy_text/blackjack.py
index 0cb6193e298..e300336307e 100644
--- a/gym/envs/toy_text/blackjack.py
+++ b/gym/envs/toy_text/blackjack.py
@@ -70,8 +70,11 @@ class BlackjackEnv(gym.Env):
     by Sutton and Barto.
     http://incompleteideas.net/book/the-book-2nd.html
     """
-    def __init__(self, natural=False):
-        self.action_space = spaces.Discrete(2)
+    def __init__(self, natural=False, double_down=False):
+        if double_down:
+            self.action_space = spaces.Discrete(3)
+        else:
+            self.action_space = spaces.Discrete(2)
         self.observation_space = spaces.Tuple((
             spaces.Discrete(32),
             spaces.Discrete(11),
@@ -81,6 +84,8 @@ def __init__(self, natural=False):
         # Flag to payout 1.5 on a "natural" blackjack win, like casino rules
         # Ref: http://www.bicyclecards.com/how-to-play/blackjack/
         self.natural = natural
+        #Flag for allowing doubling down
+        self.double_down = double_down
         # Start the first game
         self.reset()
 
@@ -90,7 +95,17 @@ def seed(self, seed=None):
 
     def step(self, action):
         assert self.action_space.contains(action)
-        if action:  # hit: add a card to players hand and return
+        if self.double_down:
+            if action == 2: # double down: bet double and get only 1 card
+                self.player.append(draw_card(self.np_random))
+                done = True
+                if is_bust(self.player):
+                    reward = -2
+                else: 
+                    while sum_hand(self.dealer) < 17:
+                        self.dealer.append(draw_card(self.np_random))
+                    reward = 2 * cmp(score(self.player), score(self.dealer))
+        if action == 1:  # hit: add a card to players hand and return
             self.player.append(draw_card(self.np_random))
             if is_bust(self.player):
                 done = True
@@ -98,7 +113,7 @@ def step(self, action):
             else:
                 done = False
                 reward = 0
-        else:  # stick: play out the dealers hand, and score
+        elif action == 0:  # stick: play out the dealers hand, and score
             done = True
             while sum_hand(self.dealer) < 17:
                 self.dealer.append(draw_card(self.np_random))

From d153c85833c0761f48bdd1ba82b9899d1e4a74a5 Mon Sep 17 00:00:00 2001
From: Max Chiswick <max.chiswick@gmail.com>
Date: Tue, 25 Jun 2019 22:30:30 -0400
Subject: [PATCH 2/2] Corrections and tests for double down

1) Moved comments to docstring and updated docstring

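2) Removed the `if self.double_down` check; it is not needed because the action-space assert already rejects action 2 when double down is disabled (the space is then Discrete(2))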

3) Made rewards always floats

4) Made a natural blackjack an instant win when natural=True (a standard casino rule), instead of letting the dealer draw to 21 and force a draw. It is now a draw only if the dealer also has a natural blackjack.

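5) Test script (a player that always doubles down) with its output below. All rewards are in {-2, 0, 2}, except for 1.5 when the player is dealt a natural blackjack.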

import gym
from gym.envs.registration import register

# Integer action codes understood by the environment's step():
# 0 = stick, 1 = hit, 2 = double down.
STICK = 0
HIT = 1
DOUBLE = 2

# Id used both to register the environment and to create it via gym.make.
ENV_NAME = "BlackjackMax-v0"

# Make the class BlackjackEnv1 from module blackjack1 available under ENV_NAME.
register(id=ENV_NAME, entry_point='blackjack1:BlackjackEnv1')

class Player:
	"""Fixed-policy agent that doubles down on every decision.

	Owns an environment created with natural=True and double_down=True and
	keeps the observation returned by the first reset in ``state``.
	"""

	def __init__(self):
		env = gym.make(ENV_NAME, natural=True, double_down=True)
		self.env = env
		self.state = env.reset()

	def play_action(self, blackjack_state):
		"""Return the action for ``blackjack_state``; the observation is ignored.

		The policy is constant: it always answers DOUBLE.
		"""
		return DOUBLE

if __name__ == "__main__":
	# Play 100 hands with the always-double agent, printing every transition.
	agent = Player()
	new_state = agent.state

	for _hand in range(100):
		done = False
		# Step until the environment reports that the hand is over.
		while not done:
			print('state', new_state)
			action = agent.play_action(new_state)
			new_state, reward, done, _info = agent.env.step(action)
			print('new_state', new_state)
			print('reward', reward)
		# Hand finished: deal a fresh one and mark the boundary in the log.
		new_state = agent.env.reset()
		print('===New hand===')

state (13, 10, False)
new_state (18, 10, False)
reward 2.0
===New hand===
state (13, 8, False)
new_state (15, 8, False)
reward -2.0
===New hand===
state (13, 7, False)
new_state (23, 7, False)
reward -2.0
===New hand===
state (12, 10, False)
new_state (22, 10, False)
reward -2.0
===New hand===
state (13, 4, False)
new_state (14, 4, False)
reward 2.0
===New hand===
state (17, 1, True)
new_state (21, 1, True)
reward 2.0
===New hand===
state (13, 5, False)
new_state (20, 5, False)
reward 2.0
===New hand===
state (6, 9, False)
new_state (15, 9, False)
reward -2.0
===New hand===
state (14, 9, True)
new_state (19, 9, True)
reward 2.0
===New hand===
state (15, 7, True)
new_state (15, 7, False)
reward -2.0
===New hand===
state (13, 10, False)
new_state (21, 10, False)
reward 2.0
===New hand===
state (13, 9, False)
new_state (15, 9, False)
reward -2.0
===New hand===
state (19, 10, False)
new_state (23, 10, False)
reward -2.0
===New hand===
state (11, 1, False)
new_state (20, 1, False)
reward 2.0
===New hand===
state (16, 1, False)
new_state (26, 1, False)
reward -2.0
===New hand===
state (9, 10, False)
new_state (20, 10, True)
reward 0.0
===New hand===
state (13, 4, True)
new_state (13, 4, False)
reward -2.0
===New hand===
state (18, 6, False)
new_state (21, 6, False)
reward 2.0
===New hand===
state (12, 3, False)
new_state (14, 3, False)
reward -2.0
===New hand===
state (20, 3, False)
new_state (21, 3, False)
reward 2.0
===New hand===
state (16, 10, False)
new_state (26, 10, False)
reward -2.0
===New hand===
state (17, 10, False)
new_state (20, 10, False)
reward -2.0
===New hand===
state (11, 9, False)
new_state (21, 9, False)
reward 2.0
===New hand===
state (14, 3, False)
new_state (15, 3, False)
reward -2.0
===New hand===
state (21, 1, True)
new_state (21, 1, True)
reward 1.5
===New hand===
state (14, 6, False)
new_state (17, 6, False)
reward 2.0
===New hand===
state (13, 8, False)
new_state (21, 8, False)
reward 2.0
===New hand===
state (21, 10, True)
new_state (21, 10, True)
reward 1.5
===New hand===
state (16, 2, False)
new_state (20, 2, False)
reward 2.0
===New hand===
state (20, 6, False)
new_state (25, 6, False)
reward -2.0
===New hand===
state (13, 9, True)
new_state (17, 9, True)
reward 2.0
===New hand===
state (21, 10, True)
new_state (21, 10, True)
reward 1.5
===New hand===
state (20, 10, False)
new_state (21, 10, False)
reward 2.0
===New hand===
state (9, 5, False)
new_state (14, 5, False)
reward -2.0
===New hand===
state (19, 2, False)
new_state (21, 2, False)
reward 2.0
===New hand===
state (17, 10, True)
new_state (17, 10, False)
reward -2.0
===New hand===
state (14, 1, False)
new_state (20, 1, False)
reward 2.0
===New hand===
state (13, 7, False)
new_state (23, 7, False)
reward -2.0
===New hand===
state (21, 7, True)
new_state (21, 7, True)
reward 1.5
===New hand===
state (20, 2, False)
new_state (21, 2, False)
reward 0.0
===New hand===
state (8, 8, False)
new_state (14, 8, False)
reward 2.0
===New hand===
state (12, 10, False)
new_state (19, 10, False)
reward 0.0
===New hand===
state (20, 5, False)
new_state (30, 5, False)
reward -2.0
===New hand===
state (14, 6, False)
new_state (22, 6, False)
reward -2.0
===New hand===
state (17, 9, True)
new_state (12, 9, False)
reward -2.0
===New hand===
state (20, 8, False)
new_state (24, 8, False)
reward -2.0
===New hand===
state (12, 8, False)
new_state (16, 8, False)
reward -2.0
===New hand===
state (16, 5, False)
new_state (18, 5, False)
reward 2.0
===New hand===
state (16, 10, False)
new_state (18, 10, False)
reward 2.0
===New hand===
state (19, 8, False)
new_state (27, 8, False)
reward -2.0
===New hand===
state (13, 4, False)
new_state (17, 4, False)
reward -2.0
===New hand===
state (12, 4, False)
new_state (14, 4, False)
reward 2.0
===New hand===
state (18, 9, False)
new_state (19, 9, False)
reward 0.0
===New hand===
state (17, 4, False)
new_state (21, 4, False)
reward 2.0
===New hand===
state (17, 7, True)
new_state (12, 7, False)
reward 2.0
===New hand===
state (11, 8, False)
new_state (13, 8, False)
reward 2.0
===New hand===
state (20, 3, False)
new_state (24, 3, False)
reward -2.0
===New hand===
state (16, 2, False)
new_state (23, 2, False)
reward -2.0
===New hand===
state (18, 4, False)
new_state (25, 4, False)
reward -2.0
===New hand===
state (11, 7, False)
new_state (21, 7, False)
reward 2.0
===New hand===
state (20, 10, False)
new_state (23, 10, False)
reward -2.0
===New hand===
state (12, 10, False)
new_state (15, 10, False)
reward -2.0
===New hand===
state (17, 10, True)
new_state (17, 10, False)
reward -2.0
===New hand===
state (14, 2, False)
new_state (24, 2, False)
reward -2.0
===New hand===
state (17, 10, True)
new_state (14, 10, False)
reward 2.0
===New hand===
state (11, 10, False)
new_state (21, 10, False)
reward 2.0
===New hand===
state (17, 6, False)
new_state (23, 6, False)
reward -2.0
===New hand===
state (10, 1, False)
new_state (20, 1, False)
reward -2.0
===New hand===
state (8, 10, False)
new_state (18, 10, False)
reward -2.0
===New hand===
state (17, 9, True)
new_state (17, 9, False)
reward -2.0
===New hand===
state (6, 2, False)
new_state (10, 2, False)
reward -2.0
===New hand===
state (15, 6, False)
new_state (17, 6, False)
reward 2.0
===New hand===
state (5, 10, False)
new_state (8, 10, False)
reward -2.0
===New hand===
state (14, 8, False)
new_state (24, 8, False)
reward -2.0
===New hand===
state (20, 10, False)
new_state (29, 10, False)
reward -2.0
===New hand===
state (14, 10, False)
new_state (24, 10, False)
reward -2.0
===New hand===
state (18, 10, False)
new_state (28, 10, False)
reward -2.0
===New hand===
state (12, 8, False)
new_state (22, 8, False)
reward -2.0
===New hand===
state (11, 5, False)
new_state (20, 5, False)
reward -2.0
===New hand===
state (12, 2, False)
new_state (17, 2, False)
reward -2.0
===New hand===
state (15, 10, False)
new_state (17, 10, False)
reward -2.0
===New hand===
state (11, 3, False)
new_state (12, 3, False)
reward 2.0
===New hand===
state (21, 10, True)
new_state (21, 10, True)
reward 1.5
===New hand===
state (15, 9, True)
new_state (16, 9, True)
reward -2.0
===New hand===
state (6, 3, False)
new_state (12, 3, False)
reward 2.0
===New hand===
state (21, 10, True)
new_state (21, 10, True)
reward 1.5
===New hand===
state (10, 2, False)
new_state (20, 2, False)
reward 2.0
===New hand===
state (8, 3, False)
new_state (14, 3, False)
reward -2.0
===New hand===
state (10, 5, False)
new_state (13, 5, False)
reward -2.0
===New hand===
state (13, 1, False)
new_state (15, 1, False)
reward 2.0
===New hand===
state (11, 3, False)
new_state (21, 3, False)
reward 2.0
===New hand===
state (18, 10, False)
new_state (28, 10, False)
reward -2.0
===New hand===
state (16, 9, True)
new_state (12, 9, False)
reward -2.0
===New hand===
state (15, 4, True)
new_state (15, 4, False)
reward -2.0
===New hand===
state (17, 3, True)
new_state (17, 3, False)
reward 2.0
===New hand===
state (14, 7, False)
new_state (21, 7, False)
reward 2.0
===New hand===
state (16, 6, False)
new_state (24, 6, False)
reward -2.0
===New hand===
state (11, 6, False)
new_state (21, 6, False)
reward 2.0
===New hand===
state (20, 6, False)
new_state (22, 6, False)
reward -2.0
===New hand===
state (13, 2, False)
new_state (18, 2, False)
reward 2.0
===New hand===
---
 gym/envs/toy_text/blackjack.py | 78 +++++++++++++++++++---------------
 1 file changed, 44 insertions(+), 34 deletions(-)

diff --git a/gym/envs/toy_text/blackjack.py b/gym/envs/toy_text/blackjack.py
index e300336307e..fcec7c09ff0 100644
--- a/gym/envs/toy_text/blackjack.py
+++ b/gym/envs/toy_text/blackjack.py
@@ -31,7 +31,7 @@ def is_bust(hand):  # Is this hand a bust?
     return sum_hand(hand) > 21
 
 
-def score(hand):  # What is the score of this hand (0 if bust)
+def score(hand):  # What is the score of this hand (0 if bust)?
     return 0 if is_bust(hand) else sum_hand(hand)
 
 
@@ -43,27 +43,36 @@ class BlackjackEnv(gym.Env):
     """Simple blackjack environment
 
     Blackjack is a card game where the goal is to obtain cards that sum to as
-    near as possible to 21 without going over.  They're playing against a fixed
-    dealer.
+    near as possible to 21 without going over.  
+    The player is playing against a dealer with a fixed strategy. 
     Face cards (Jack, Queen, King) have point value 10.
     Aces can either count as 11 or 1, and it's called 'usable' at 11.
-    This game is placed with an infinite deck (or with replacement).
-    The game starts with each (player and dealer) having one face up and one
-    face down card.
+    This game is played with an infinite deck (or with replacement).
+    The game starts with the player and dealer each receiving two cards. 
+    One of the dealer's cards is facedown and the other is visible. 
 
-    The player can request additional cards (hit=1) until they decide to stop
-    (stick=0) or exceed 21 (bust).
+    The player can request additional cards (action=1, hit) until they decide to stop
+    (action=0, stick) or exceed 21 (bust). If double down is flagged (double_down=True), 
+    the player can double their bet (action=2, double) and then will receive exactly one 
+    additional card.
 
-    After the player sticks, the dealer reveals their facedown card, and draws
-    until their sum is 17 or greater.  If the dealer goes bust the player wins.
+    If the player is dealt a 10 or face card and an Ace, this is called a 
+    natural blackjack and if natural is flagged (natural=True), the player immediately
+    wins a payout of 1.5 (reward=1.5) unless the dealer also has a natural Blackjack. 
+
+    If the player busts, they immediately lose (reward=-1). After a stick or 
+    double down that does not result in a bust, the dealer draws until their sum 
+    is 17 or greater. If the dealer busts, the player wins (reward=1). Rewards 
+    are doubled in the double down case. 
 
     If neither player nor dealer busts, the outcome (win, lose, draw) is
     decided by whose sum is closer to 21.  The reward for winning is +1,
-    drawing is 0, and losing is -1.
+    drawing is 0, and losing is -1. These are again doubled in the double down
+    case. 
 
-    The observation of a 3-tuple of: the players current sum,
-    the dealer's one showing card (1-10 where 1 is ace),
-    and whether or not the player holds a usable ace (0 or 1).
+    The observation is a 3-tuple of: the player's current sum,
+    the dealer's one showing card (1-10 where 1 is Ace),
+    and whether or not the player holds a usable Ace (0 or 1).
 
     This environment corresponds to the version of the blackjack problem
     described in Example 5.1 in Reinforcement Learning: An Introduction
@@ -80,11 +89,7 @@ def __init__(self, natural=False, double_down=False):
             spaces.Discrete(11),
             spaces.Discrete(2)))
         self.seed()
-
-        # Flag to payout 1.5 on a "natural" blackjack win, like casino rules
-        # Ref: http://www.bicyclecards.com/how-to-play/blackjack/
         self.natural = natural
-        #Flag for allowing doubling down
         self.double_down = double_down
         # Start the first game
         self.reset()
@@ -94,32 +99,37 @@ def seed(self, seed=None):
         return [seed]
 
     def step(self, action):
+        if self.natural and is_natural(self.player):
+            if is_natural(self.dealer):
+                reward = 0.0 #player and dealer natural Blackjack
+            else:
+                reward = 1.5 #player natural Blackjack
+            done = True
+            return self._get_obs(), reward, done, {}
+
         assert self.action_space.contains(action)
-        if self.double_down:
-            if action == 2: # double down: bet double and get only 1 card
-                self.player.append(draw_card(self.np_random))
-                done = True
-                if is_bust(self.player):
-                    reward = -2
-                else: 
-                    while sum_hand(self.dealer) < 17:
-                        self.dealer.append(draw_card(self.np_random))
-                    reward = 2 * cmp(score(self.player), score(self.dealer))
-        if action == 1:  # hit: add a card to players hand and return
+        if action == 2: # double down: bet double and get only 1 card, then compare
+            self.player.append(draw_card(self.np_random))
+            done = True
+            if is_bust(self.player):
+                reward = -2.0
+            else: 
+                while sum_hand(self.dealer) < 17:
+                    self.dealer.append(draw_card(self.np_random))
+                reward = 2 * cmp(score(self.player), score(self.dealer))
+        elif action == 1:  # hit: add a card to players hand and return
             self.player.append(draw_card(self.np_random))
             if is_bust(self.player):
                 done = True
-                reward = -1
+                reward = -1.0
             else:
                 done = False
-                reward = 0
-        elif action == 0:  # stick: play out the dealers hand, and score
+                reward = 0.0
+        elif action == 0:  # stick: play out the dealer's hand, then compare
             done = True
             while sum_hand(self.dealer) < 17:
                 self.dealer.append(draw_card(self.np_random))
             reward = cmp(score(self.player), score(self.dealer))
-            if self.natural and is_natural(self.player) and reward == 1:
-                reward = 1.5
         return self._get_obs(), reward, done, {}
 
     def _get_obs(self):