Update due to the latest introduction of Gym 0.25.0 #369

Merged
12 commits · Jul 21, 2022
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -35,8 +35,8 @@ jobs:
pip install matplotlib
pip install dill
pip install stable-baselines3
pip install gym
pip install PettingZoo
pip install gym==0.25.0
pip install pyglet
pip install pymunk
pip install pygame
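The CI now pins gym==0.25.0 rather than pulling the latest release, since the 0.25 series changes the step and reset signatures that the wrappers depend on. As an illustration only (not part of this PR), a module could guard against an incompatible installation at import time; the snippet below is a minimal sketch that assumes gym is importable and exposes a plain version string:

import gym

# Minimal sketch: refuse to run against a Gym release older than 0.25.0
major, minor = (int(part) for part in gym.__version__.split('.')[:2])
if (major, minor) < (0, 25):
    raise RuntimeError(f'gym {gym.__version__} found, but gym>=0.25.0 is required here')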
@@ -15,10 +15,11 @@
## -- 2021-11-15 1.2.0 DA Refactoring
## -- 2021-11-16 1.2.1 DA Added explicit scenario reset with constant seeding
## -- 2021-12-03 1.2.2 DA Refactoring
## -- 2022-07-20 1.2.3 SY Update due to the latest introduction of Gym 0.25
## -------------------------------------------------------------------------------------------------

"""
Ver. 1.2.2 (2021-12-03)
Ver. 1.2.3 (2022-07-20)

This module shows how to run an own policy inside the standard agent model with an OpenAI Gym environment using
the fhswf_at_ml framework.
@@ -72,7 +73,7 @@ class MyScenario (RLScenario):

def _setup(self, p_mode, p_ada, p_logging):
# 1 Setup environment
gym_env = gym.make('CartPole-v1')
gym_env = gym.make('CartPole-v1', new_step_api=True, render_mode=None)
self._env = WrEnvGYM2MLPro(gym_env, p_logging=p_logging)

# 2 Setup standard single-agent with own policy
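All of the howto scripts now call gym.make('CartPole-v1', new_step_api=True, render_mode=None). In Gym 0.25 the new_step_api flag opts into the five-element step result (observation, reward, terminated, truncated, info) in place of the legacy four-element (observation, reward, done, info). The standalone sketch below contrasts the two purely as an illustration; it assumes gym==0.25.0 with the classic-control environments installed and is not code from this PR:

import gym

# Legacy API (default in 0.25): step() returns four values
env_old = gym.make('CartPole-v1', new_step_api=False, render_mode=None)
obs = env_old.reset()
obs, reward, done, info = env_old.step(env_old.action_space.sample())
env_old.close()

# New API (opt-in via new_step_api=True): step() returns five values,
# splitting 'done' into 'terminated' (task end) and 'truncated' (time limit)
env_new = gym.make('CartPole-v1', new_step_api=True, render_mode=None)
obs = env_new.reset()
obs, reward, terminated, truncated, info = env_new.step(env_new.action_space.sample())
env_new.close()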
@@ -18,10 +18,11 @@
## -- 2021-11-15 1.3.0 DA Refactoring
## -- 2021-12-03 1.3.1 DA Refactoring
## -- 2021-12-07 1.3.2 DA Refactoring
## -- 2022-07-20 1.3.3 SY Update due to the latest introduction of Gym 0.25
## -------------------------------------------------------------------------------------------------

"""
Ver. 1.3.2 (2021-12-07)
Ver. 1.3.3 (2022-07-20)

This module shows how to train an agent with a custom policy inside on an OpenAI Gym environment using the fhswf_at_ml framework.
"""
@@ -76,7 +77,7 @@ class MyScenario (RLScenario):

def _setup(self, p_mode, p_ada, p_logging):
# 1 Setup environment
gym_env = gym.make('CartPole-v1')
gym_env = gym.make('CartPole-v1', new_step_api=True, render_mode=None)
self._env = WrEnvGYM2MLPro(gym_env, p_logging=p_logging)

# 2 Setup and return standard single-agent with own policy
@@ -14,10 +14,11 @@
## -- 2021-12-03 1.0.5 DA Refactoring
## -- 2021-12-07 1.0.6 DA Refactoring
## -- 2022-02-25 1.0.7 SY Refactoring due to auto generated ID in class Dimension
## -- 2022-07-20 1.0.8 SY Update due to the latest introduction of Gym 0.25
## -------------------------------------------------------------------------------------------------

"""
Ver. 1.0.7 (2022-02-25)
Ver. 1.0.8 (2022-07-20)

This module shows how to train with SB3 Wrapper for On-Policy Algorithm
"""
@@ -38,7 +39,7 @@ class MyScenario(RLScenario):
def _setup(self, p_mode, p_ada, p_logging):
# 1 Setup environment
# self._env = RobotHTM(p_logging=False)
gym_env = gym.make('CartPole-v1')
gym_env = gym.make('CartPole-v1', new_step_api=True, render_mode=None)
self._env = WrEnvGYM2MLPro(gym_env, p_logging=p_logging)

# 2 Instantiate Policy From SB3
@@ -8,10 +8,11 @@
## -- 2022-01-20 0.0.0 MRD Creation
## -- 2022-01-20 1.0.0 MRD Released first version
## -- 2022-05-17 1.0.1 DA Just a little comment maintenance
## -- 2022-07-20 1.0.2 SY Update due to the latest introduction of Gym 0.25
## -------------------------------------------------------------------------------------------------

"""
Ver. 1.0.1 (2022-05-17)
Ver. 1.0.2 (2022-07-20)

This module shows how to train with SB3 Wrapper and stagnation detection
"""
@@ -31,7 +32,7 @@ class MyScenario(RLScenario):

def _setup(self, p_mode, p_ada, p_logging):
# 1 Setup environment
gym_env = gym.make('CartPole-v1')
gym_env = gym.make('CartPole-v1', new_step_api=True, render_mode=None)
self._env = WrEnvGYM2MLPro(gym_env, p_logging=p_logging)

# 2 Instantiate PPO Policy from SB3
@@ -14,10 +14,11 @@
## -- 2021-12-24 1.0.5 DA Replaced separator in log line by Training.C_LOG_SEPARATOR
## -- 2022-02-27 1.0.6 SY Refactoring due to auto generated ID in class Dimension
## -- 2022-03-21 1.0.7 WB Rewrite module description
## -- 2022-07-20 1.0.8 SY Update due to the latest introduction of Gym 0.25
## -------------------------------------------------------------------------------------------------

"""
Ver. 1.0.7 (2022-03-21)
Ver. 1.0.8 (2022-07-20)

This module compares the native and wrapped implementation of the SB3 Policy on an
environment.
@@ -77,7 +78,7 @@ def _reset(self, p_seed=None):
self._set_state(state)

# 1 Setup environment
gym_env = gym.make('CartPole-v1')
gym_env = gym.make('CartPole-v1', new_step_api=True, render_mode=None)
gym_env.seed(1)
# self._env = mlpro_env
self._env = CustomWrapperFixedSeed(gym_env, p_logging=p_logging)
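This comparison script still seeds the raw environment through gym_env.seed(1), which Gym keeps for backward compatibility but has deprecated in favour of passing the seed to reset(). The snippet below sketches the reset-based seeding style as an aside; it assumes gym==0.25.0 and is not a change made by this PR:

import gym

env = gym.make('CartPole-v1', new_step_api=True, render_mode=None)

# Gym 0.25 style: seed the RNG through reset() instead of the deprecated env.seed()
obs = env.reset(seed=1)
env.close()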
@@ -8,10 +8,11 @@
## -- 2022-01-11 0.0.0 MRD Creation
## -- 2022-01-18 1.0.0 MRD Released first version
## -- 2022-02-27 1.0.1 SY Refactoring due to auto generated ID in class Dimension
## -- 2022-07-20 1.0.2 SY Update due to the latest introduction of Gym 0.25
## -------------------------------------------------------------------------------------------------

"""
Ver. 1.0.1 (2022-02-27)
Ver. 1.0.2 (2022-07-20)

This module shows comparison between native and wrapped SB3 policy (Off-policy).
"""
@@ -69,7 +70,7 @@ def _reset(self, p_seed=None):
self._set_state(state)

# 1 Setup environment
gym_env = gym.make('CartPole-v1')
gym_env = gym.make('CartPole-v1', new_step_api=True, render_mode=None)
gym_env.seed(2)
self._env = CustomWrapperFixedSeed(gym_env, p_logging=p_logging)

@@ -10,10 +10,11 @@
## -- 2022-05-19 1.0.1 MRD Re-use the agent not for the re-training process
## -- Remove commenting and numbering
## -- 2022-05-19 1.0.2 MRD Re-add the commenting and reformat the numbering in comment
## -- 2022-07-20 1.0.3 SY Update due to the latest introduction of Gym 0.25
## -------------------------------------------------------------------------------------------------

"""
Ver. 1.0.2 (2022-05-19)
Ver. 1.0.3 (2022-07-20)

This module shows how to train a single agent and load it again to do some extra cycles
"""
@@ -33,7 +34,7 @@ class MyScenario(RLScenario):

def _setup(self, p_mode, p_ada, p_logging):
# 1.1 Setup environment
gym_env = gym.make('CartPole-v1')
gym_env = gym.make('CartPole-v1', new_step_api=True, render_mode=None)
self._env = WrEnvGYM2MLPro(gym_env, p_logging=p_logging)

# 1.2 Setup Policy From SB3
6 changes: 4 additions & 2 deletions src/mlpro/rl/pool/envs/multicartpole.py
@@ -18,10 +18,11 @@
## -- 2021-12-21 1.2.4 DA Class MultiCartPole: renamed method reset() to _reset()
## -- 2022-02-25 1.2.5 SY Refactoring due to auto generated ID in class Dimension
## -- 2022-04-06 1.2.6 LSB Freezing single environment after done returns true
## -- 2022-07-20 1.2.7 SY Update due to the latest introduction of Gym 0.25
## -------------------------------------------------------------------------------------------------

"""
Ver. 1.2.6 (2022-04-06)
Ver. 1.2.7 (2022-07-20)

This module provides an environment with multivariate state and action spaces based on the
OpenAI Gym environment 'CartPole-v1'.
@@ -67,7 +68,7 @@ def __init__(self,
action_space_id = self._action_space.get_dim_ids()
state_space_env = self._state_space.spawn([state_space_id[i*4], state_space_id[i*4+1], state_space_id[i*4+2], state_space_id[i*4+3]])
action_space_env = self._action_space.spawn([action_space_id[i]])
env = WrEnvGYM2MLPro(gym.make('CartPole-v1'), state_space_env, action_space_env, p_logging=p_logging)
env_make = gym.make('CartPole-v1', new_step_api=True, render_mode=None)
env = WrEnvGYM2MLPro(env_make, state_space_env, action_space_env, p_logging=p_logging)
env.C_NAME = env.C_NAME + ' (' + str(i) + ')'
self._envs.append(env)

68 changes: 57 additions & 11 deletions src/mlpro/wrappers/openai_gym.py
@@ -37,12 +37,15 @@
## -- 2022-02-27 1.3.4 SY Refactoring due to auto generated ID in class Dimension
## -- 2022-03-21 1.3.5 MRD Added new parameter to the WrEnvMLPro2GYM.reset()
## -- 2022-05-19 1.3.6 SY Gym 0.23: Replace function env.seed(seed) to env.reset(seed=seed)
## -- 2022-07-20 1.4.0 SY Update due to the latest introduction of Gym 0.25
## -------------------------------------------------------------------------------------------------

"""
Ver. 1.3.6 (2022-05-19)
Ver. 1.4.0 (2022-07-20)

This module provides wrapper classes for reinforcement learning tasks.
These wrappers have been updated to follow Gym version 0.25.0.
Previous Gym versions remain compatible for now, but this support will be removed in the future.
"""

import gym
@@ -147,14 +150,37 @@ def simulate_reaction(self, p_state: State, p_action: Action) -> State:

# 2 Process step of Gym environment
try:
observation, reward_gym, done, info = self._gym_env.step(action_gym)
# For gym version 0.25 or above
if self._gym_env.new_step_api:
try:
observation, reward_gym, termination, truncation, info = self._gym_env.step(action_gym)
except:
observation, reward_gym, termination, truncation, info = self._gym_env.step(np.atleast_1d(action_gym))
else:
try:
observation, reward_gym, done, info = self._gym_env.step(action_gym)
except:
observation, reward_gym, done, info = self._gym_env.step(np.atleast_1d(action_gym))
except:
observation, reward_gym, done, info = self._gym_env.step(np.atleast_1d(action_gym))

# For gym versions below 0.25 (this will be removed soon)
self.log(self.C_LOG_TYPE_W, 'Please upgrade your gym version to 0.25.0 or above. This behaviour will be removed in the near future.')
try:
observation, reward_gym, done, info = self._gym_env.step(action_gym)
except:
observation, reward_gym, done, info = self._gym_env.step(np.atleast_1d(action_gym))

obs = DataObject(observation)

# 3 Create state object from Gym observation
state = State(self._state_space, p_terminal=done)
try:
# For gym version 0.25 or above
if self._gym_env.new_step_api:
state = State(self._state_space, p_terminal=termination, p_timeout=truncation)
else:
state = State(self._state_space, p_terminal=done)
except:
# For gym versions below 0.25 (this will be removed soon)
state = State(self._state_space, p_terminal=done)
state.set_values(obs.get_data())

# 4 Create reward object
@@ -205,7 +231,8 @@ class WrEnvMLPro2GYM(gym.Env):
metadata = {'render.modes': ['human']}

## -------------------------------------------------------------------------------------------------
def __init__(self, p_mlpro_env, p_state_space: MSpace = None, p_action_space: MSpace = None):
def __init__(self, p_mlpro_env, p_state_space: MSpace = None, p_action_space: MSpace = None, p_new_step_api: bool = False,
p_render_mode: str = None):
"""
Parameters:
p_mlpro_env MLPro's Environment object
@@ -227,6 +254,13 @@ def __init__(self, p_mlpro_env, p_state_space: MSpace = None, p_action_space: MS
else:
self.action_space = self.recognize_space(self._mlpro_env.get_action_space())

if p_render_mode is not None:
self.render_mode = p_render_mode
else:
self.render_mode = 'human'

self.new_step_api = p_new_step_api

self.first_refresh = True

## -------------------------------------------------------------------------------------------------
@@ -281,22 +315,34 @@ def step(self, action):
obs = np.array(self._mlpro_env.get_state().get_values())

state = self._mlpro_env.get_state()
done = state.get_terminal()
terminated = state.get_terminal()
truncated = state.get_timeout()

info = {}
info["TimeLimit.truncated"] = state.get_timeout()

return obs, reward.get_overall_reward(), done, info
if self.new_step_api:
return obs, reward.get_overall_reward(), terminated, truncated, info
else:
info["TimeLimit.truncated"] = state.get_timeout()
return obs, reward.get_overall_reward(), terminated, info

## -------------------------------------------------------------------------------------------------
def reset(self, seed=None, options=None):
def reset(self, seed=None, return_info=False, options=None):
# We need the following line to seed self.np_random
super().reset(seed=seed)

self._mlpro_env.reset(seed)
obs = None
if isinstance(self.observation_space, gym.spaces.Box):
obs = np.array(self._mlpro_env.get_state().get_values(), dtype=np.float32)
else:
obs = np.array(self._mlpro_env.get_state().get_values())
return obs

if return_info:
info = {}
return obs, info
else:
return obs

## -------------------------------------------------------------------------------------------------
def render(self, mode='human'):
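The wrapper now branches on new_step_api in both directions: WrEnvGYM2MLPro maps the Gym 0.25 pair (termination, truncation) onto MLPro's p_terminal and p_timeout state flags, while WrEnvMLPro2GYM.step() returns either the five-element or the legacy four-element tuple. For readers porting their own code, the helper below sketches one common way to collapse the new tuple into the old one; the combined-done rule and the TimeLimit.truncated info key follow widespread Gym conventions and are an assumption here, with the wrapper's own step() above remaining the authoritative behaviour:

from typing import Any, Dict, Tuple

def to_old_step_api(obs: Any, reward: float, terminated: bool, truncated: bool,
                    info: Dict[str, Any]) -> Tuple[Any, float, bool, Dict[str, Any]]:
    """Collapse a Gym 0.25 five-element step result into the legacy four-element form."""
    done = terminated or truncated
    info = dict(info)
    # The legacy API reports time-limit truncation through the info dict
    info['TimeLimit.truncated'] = truncated and not terminated
    return obs, reward, done, info

# A truncated-but-not-terminated step becomes done=True with the info flag set
print(to_old_step_api([0.0, 0.0, 0.0, 0.0], 1.0, False, True, {}))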
7 changes: 4 additions & 3 deletions src/mlpro/wrappers/pettingzoo.py
@@ -34,10 +34,11 @@
## -- 2022-03-21 1.3.5 SY Refactoring due to PettingZoo version 1.17.0
## -- 2022-05-20 1.3.6 SY Refactoring: Action space boundaries in WrEnvPZOO2MLPro
## -- 2022-05-30 1.3.7 SY Replace function env.seed(seed) to env.reset(seed=seed)
## -- 2022-07-20 1.3.8 SY Update due to the latest introduction of Gym 0.25
## -------------------------------------------------------------------------------------------------

"""
Ver. 1.3.7 (2022-05-30)
Ver. 1.3.8 (2022-07-20)
This module provides wrapper classes for reinforcement learning tasks.
"""

@@ -357,7 +358,7 @@ def observe(self, agent_id):


## -------------------------------------------------------------------------------------------------
def reset(self):
def reset(self, seed, options):
self.agents = self.possible_agents[:]
self.rewards = {agent: 0 for agent in self.agents}
self._cumulative_rewards = {agent: 0 for agent in self.agents}
@@ -366,7 +367,7 @@ def reset(self):
self.state = {agent: None for agent in self.agents}
self.observations = {agent: None for agent in self.agents}

self._mlpro_env.reset()
self._mlpro_env.reset(seed)

self._agent_selector = agent_selector(self.agents)
self.agent_selection = self._agent_selector.next()
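The internal raw_env.reset() of the PettingZoo wrapper now takes seed and options and forwards the seed to the wrapped MLPro environment, matching the reset signature rolled out across Gym and PettingZoo. A small, self-contained sketch of that calling convention is shown below; the class and the optional defaults are illustrative assumptions, not code from this PR:

import random
from typing import Any, Dict, Optional

class ExampleRawEnv:
    """Illustrative stand-in showing the seed/options style of reset()."""

    def reset(self, seed: Optional[int] = None,
              options: Optional[Dict[str, Any]] = None) -> None:
        # Seed the environment's RNG, then rebuild its internal state
        self._rng = random.Random(seed)
        self._state = [self._rng.random() for _ in range(4)]

env = ExampleRawEnv()
env.reset(seed=42)   # reproducible
env.reset()          # unseeded call still works because of the defaults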
7 changes: 4 additions & 3 deletions test/test_sb3_policy_wrapper.py
@@ -12,10 +12,11 @@
## -- 2021-12-20 1.0.3 DA Refactoring
## -- 2022-01-18 2.0.0 MRD Add Off Policy Algorithm into the test
## -- 2022-01-21 2.0.1 MRD Include RobotHTM as the continuous action environment
## -- 2022-07-21 2.0.2 SY Update due to the latest introduction of Gym 0.25
## -------------------------------------------------------------------------------------------------

"""
Ver. 2.0.1 (2022-01-21)
Ver. 2.0.2 (2022-07-21)

Unit test classes for environment.
"""
@@ -76,7 +77,7 @@ def _reset(self, p_seed=None):
else:
if issubclass(env_cls, DQN):
# 1 Setup environment
gym_env = gym.make('CartPole-v1')
gym_env = gym.make('CartPole-v1', render_mode=None)
gym_env.seed(2)
self._env = CustomWrapperFixedSeed(gym_env, p_logging=False)
else:
@@ -199,7 +200,7 @@ def _on_rollout_end(self) -> None:
else:
if issubclass(env_cls, DQN):
# 1 Setup environment
gym_env = gym.make('CartPole-v1')
gym_env = gym.make('CartPole-v1', render_mode=None)
gym_env.seed(2)
else:
env = RobotHTM(p_reset_seed=False, p_target_mode="fix", p_logging=False)