opendilab · PaParaZz1 · Jun 28, 2022 · Jun 21, 2022 · Jun 26, 2022 · Jun 28, 2022
diff --git a/dizoo/classic_control/pendulum/config/pedulum_dqn_config.py b/dizoo/classic_control/pendulum/config/pedulum_dqn_config.py
@@ -0,0 +1,62 @@
+from easydict import EasyDict
+
+pendulum_dqn_config = dict(
+    exp_name='pendulum_dqn_seed0',
+    env=dict(
+        collector_env_num=10,
+        evaluator_env_num=5,
+        # (bool) Scale output action into legal range.
+        act_scale=True,
+        n_evaluator_episode=5,
+        stop_value=-250,
+        continuous=False,
+    ),
+    policy=dict(
+        cuda=False,
+        load_path='pendulum_dqn_seed0/ckpt/ckpt_best.pth.tar',  # necessary for eval
+        model=dict(
+            obs_shape=3,
+            action_shape=11,    # number of discrete actions; matches the 11 actions defined by the pendulum env
+            encoder_hidden_size_list=[128, 128, 64],
+            dueling=True,
+        ),
+        nstep=1,
+        discount_factor=0.97,
+        learn=dict(
+            batch_size=64,
+            learning_rate=0.001,
+        ),
+        collect=dict(n_sample=8),
+        eval=dict(evaluator=dict(eval_freq=40, )),
+        other=dict(
+            eps=dict(
+                type='exp',
+                start=0.95,
+                end=0.1,
+                decay=10000,
+            ),
+            replay_buffer=dict(replay_buffer_size=20000, ),
+        ),
+    ),
+)
+pendulum_dqn_config = EasyDict(pendulum_dqn_config)
+main_config = pendulum_dqn_config
+pendulum_dqn_create_config = dict(
+    env=dict(
+        type='pendulum',
+        import_names=['dizoo.classic_control.pendulum.envs.pendulum_env'],
+    ),
+    env_manager=dict(type='base'),
+    policy=dict(type='dqn'),
+    replay_buffer=dict(
+        type='deque',
+        import_names=['ding.data.buffer.deque_buffer_wrapper']
+    ),
+)
+pendulum_dqn_create_config = EasyDict(pendulum_dqn_create_config)
+create_config = pendulum_dqn_create_config
+
+if __name__ == "__main__":
+    # Alternatively, run `ding -m serial -c pedulum_dqn_config.py -s 0` (file name as spelled in this patch).
+    from ding.entry import serial_pipeline
+    serial_pipeline((main_config, create_config), seed=0)
diff --git a/dizoo/classic_control/pendulum/envs/pendulum_env.py b/dizoo/classic_control/pendulum/envs/pendulum_env.py
@@ -17,10 +17,19 @@ def __init__(self, cfg: dict) -> None:
         self._env = gym.make('Pendulum-v0')
         self._init_flag = False
         self._replay_path = None
+        if 'continuous' in cfg.keys():
+            self._continuous = cfg.continuous
+        else:
+            self._continuous = True
         self._observation_space = gym.spaces.Box(
             low=np.array([-1.0, -1.0, -8.0]), high=np.array([1.0, 1.0, 8.0]), shape=(3, ), dtype=np.float32
         )
-        self._action_space = gym.spaces.Box(low=-2.0, high=2.0, shape=(1, ), dtype=np.float32)
+        if self._continuous:
+            self._action_space = gym.spaces.Box(
+                low=-2.0, high=2.0, shape=(1, ), dtype=np.float32)
+        else:
+            self._discrete_action_num = 11
+            self._action_space = gym.spaces.Discrete(self._discrete_action_num)
         self._reward_space = gym.spaces.Box(
             low=-1 * (3.14 * 3.14 + 0.1 * 8 * 8 + 0.001 * 2 * 2), high=0.0, shape=(1, ), dtype=np.float32
         )
@@ -58,12 +67,18 @@ def seed(self, seed: int, dynamic_seed: bool = True) -> None:
 
     def step(self, action: np.ndarray) -> BaseEnvTimestep:
         assert isinstance(action, np.ndarray), type(action)
+        # In discrete mode, map the action index in [0, N - 1] linearly onto a float in [-1, 1].
+        if not self._continuous:
+            action = (action / (self._discrete_action_num-1)) * 2 - 1
+        # scale into [-2, 2]
         if self._act_scale:
-            action = affine_transform(action, min_val=self._env.action_space.low, max_val=self._env.action_space.high)
+            action = affine_transform(
+                action, min_val=self._env.action_space.low, max_val=self._env.action_space.high)
         obs, rew, done, info = self._env.step(action)
         self._final_eval_reward += rew
         obs = to_ndarray(obs).astype(np.float32)
-        rew = to_ndarray([rew]).astype(np.float32)  # wrapped to be transfered to a array with shape (1,)
+        # Wrap the scalar reward in a list so that it is converted to an array of shape (1,).
+        rew = to_ndarray([rew]).astype(np.float32)
         if done:
             info['final_eval_reward'] = self._final_eval_reward
         return BaseEnvTimestep(obs, rew, done, info)
@@ -74,7 +89,13 @@ def enable_save_replay(self, replay_path: Optional[str] = None) -> None:
         self._replay_path = replay_path
 
     def random_action(self) -> np.ndarray:
-        return self.action_space.sample().astype(np.float32)
+        # Continuous samples are cast to float32; discrete samples are wrapped into an int64 array.
+        if self._continuous:
+            random_action = self.action_space.sample().astype(np.float32)
+        else:
+            random_action = self.action_space.sample()
+            random_action = to_ndarray([random_action], dtype=np.int64)
+        return random_action
 
     @property
     def observation_space(self) -> gym.spaces.Space:

diff --git a/dizoo/classic_control/pendulum/envs/test_pendulum_env.py b/dizoo/classic_control/pendulum/envs/test_pendulum_env.py
@@ -29,3 +29,27 @@ def test_naive(self):
             # assert isinstance(timestep, tuple)
         print(env.observation_space, env.action_space, env.reward_space)
         env.close()
+
+    def test_discrete(self):
+        env = PendulumEnv(EasyDict({'act_scale': True, 'continuous': False}))
+        env.seed(314)
+        assert env._seed == 314
+        obs = env.reset()
+        assert obs.shape == (3, )
+        for i in range(10):
+            # Both sampling from ``env.action_space`` directly and calling ``env.random_action()``
+            # should yield legal actions, so the first five steps use the former and the rest the latter.
+            if i < 5:
+                random_action = np.array([env.action_space.sample()])
+            else:
+                random_action = env.random_action()
+            timestep = env.step(random_action)
+            print(env.observation_space, env.action_space, env.reward_space)
+            print(timestep.reward, timestep.obs, timestep.reward)
+            assert timestep.reward.shape == (1, )
+            assert timestep.obs.shape == (3, )
+            assert timestep.reward >= env.reward_space.low
+            assert timestep.reward <= env.reward_space.high
+            # assert isinstance(timestep, tuple)
+        print(env.observation_space, env.action_space, env.reward_space)
+        env.close()