Commit 800a8ad

Merge branch 'main' into rjy-ising-model

PaParaZz1 authored Apr 23, 2024
2 parents 9e7c140 + 1ac9ad5
Showing 34 changed files with 1,431 additions and 270 deletions.
10 changes: 10 additions & 0 deletions .github/workflows/unit_test.yml
@@ -26,6 +26,11 @@ jobs:
python -m pip install .
python -m pip install ".[test,k8s]"
python -m pip install transformers
if python --version | grep -q "Python 3.7"; then
    python -m pip install wandb==0.16.4
else
    echo "Python version is not 3.7, skipping wandb installation"
fi
./ding/scripts/install-k8s-tools.sh
make unittest
- name: Upload coverage to Codecov
@@ -55,5 +60,10 @@ jobs:
python -m pip install .
python -m pip install ".[test,k8s]"
python -m pip install transformers
if python --version | grep -q "Python 3.7"; then
    python -m pip install wandb==0.16.4
else
    echo "Python version is not 3.7, skipping wandb installation"
fi
./ding/scripts/install-k8s-tools.sh
make benchmark
402 changes: 205 additions & 197 deletions README.md

Large diffs are not rendered by default.

45 changes: 45 additions & 0 deletions ding/example/dqn_frozen_lake.py
@@ -0,0 +1,45 @@
from ditk import logging
from ding.model import DQN
from ding.policy import DQNPolicy
from ding.envs import DingEnvWrapper, BaseEnvManagerV2
from ding.data import DequeBuffer
from ding.config import compile_config
from ding.framework import task
from ding.framework.context import OnlineRLContext
from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \
    eps_greedy_handler, CkptSaver, nstep_reward_enhancer, final_ctx_saver
from ding.utils import set_pkg_seed
from dizoo.frozen_lake.config.frozen_lake_dqn_config import main_config, create_config
from dizoo.frozen_lake.envs import FrozenLakeEnv


def main():
    logging.getLogger().setLevel(logging.INFO)
    main_config.policy.nstep = 5
    cfg = compile_config(main_config, create_cfg=create_config, auto=True)
    with task.start(async_mode=False, ctx=OnlineRLContext()):
        # Vectorized environments for data collection and evaluation.
        collector_env = BaseEnvManagerV2(
            env_fn=[lambda: FrozenLakeEnv(cfg=cfg.env) for _ in range(cfg.env.collector_env_num)], cfg=cfg.env.manager
        )
        evaluator_env = BaseEnvManagerV2(
            env_fn=[lambda: FrozenLakeEnv(cfg=cfg.env) for _ in range(cfg.env.evaluator_env_num)], cfg=cfg.env.manager
        )
        set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda)

        model = DQN(**cfg.policy.model)
        buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size)
        policy = DQNPolicy(cfg.policy, model=model)

        # Middleware pipeline: evaluate, collect with epsilon-greedy, compute
        # n-step rewards, push to the buffer, train, and save checkpoints.
        task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env))
        task.use(eps_greedy_handler(cfg))
        task.use(StepCollector(cfg, policy.collect_mode, collector_env))
        task.use(nstep_reward_enhancer(cfg))
        task.use(data_pusher(cfg, buffer_))
        task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_))
        task.use(CkptSaver(policy, cfg.exp_name, train_freq=100))
        task.use(final_ctx_saver(cfg.exp_name))
        task.run()


if __name__ == "__main__":
    main()
7 changes: 6 additions & 1 deletion ding/example/ppo_with_complex_obs.py
@@ -30,7 +30,12 @@
    cuda=True,
    action_space='discrete',
    model=dict(
        obs_shape=None,
        obs_shape=dict(
            key_0=dict(k1=(), k2=()),
            key_1=(5, 10),
            key_2=(10, 10, 3),
            key_3=(2, ),
        ),
        action_shape=2,
        action_space='discrete',
        critic_head_hidden_size=138,
5 changes: 2 additions & 3 deletions ding/framework/middleware/functional/advantage_estimator.py
@@ -8,6 +8,7 @@
from ding.rl_utils import gae, gae_data, get_train_sample
from ding.framework import task
from ding.utils.data import ttorch_collate
from ding.utils.dict_helper import convert_easy_dict_to_dict
from ding.torch_utils import to_device

if TYPE_CHECKING:
@@ -33,10 +34,8 @@ def gae_estimator(cfg: EasyDict, policy: Policy, buffer_: Optional[Buffer] = Non
    # Unify the shape of obs and action
    obs_shape = cfg['policy']['model']['obs_shape']
    obs_shape = torch.Size(torch.tensor(obs_shape)) if isinstance(obs_shape, list) \
        else ttorch.size.Size(convert_easy_dict_to_dict(obs_shape)) if isinstance(obs_shape, dict) \
        else torch.Size(torch.tensor(obs_shape).unsqueeze(0))
    action_shape = cfg['policy']['model']['action_shape']
    action_shape = torch.Size(torch.tensor(action_shape)) if isinstance(action_shape, list) \
        else torch.Size(torch.tensor(action_shape).unsqueeze(0))

    def _gae(ctx: "OnlineRLContext"):
        """
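For context, a minimal sketch of the shape unification this hunk performs, covering the three kinds of ``obs_shape`` spec; the nested-dict case is exactly what the new ``ppo_with_complex_obs.py`` config above exercises, and ``unify_obs_shape`` is a hypothetical name for illustration:

```python
import torch
import treetensor.torch as ttorch

def unify_obs_shape(obs_shape):
    # List spec, e.g. [10, 10, 3] -> torch.Size([10, 10, 3]).
    if isinstance(obs_shape, list):
        return torch.Size(obs_shape)
    # Nested dict spec -> treetensor Size, mirroring the new branch above.
    if isinstance(obs_shape, dict):
        return ttorch.size.Size(obs_shape)
    # Scalar spec, e.g. 4 -> torch.Size([4]).
    return torch.Size([obs_shape])

print(unify_obs_shape([10, 10, 3]))                       # torch.Size([10, 10, 3])
print(unify_obs_shape(dict(key_1=(5, 10), key_3=(2, ))))  # nested Size tree
print(unify_obs_shape(4))                                 # torch.Size([4])
```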
9 changes: 5 additions & 4 deletions ding/rl_utils/adder.py
@@ -23,7 +23,8 @@ def get_gae(cls, data: List[Dict[str, Any]], last_value: torch.Tensor, gamma: fl
Overview:
Get GAE advantage for stacked transitions (T timestep, 1 batch). Call ``gae`` for calculation.
Arguments:
- data (:obj:`list`): Transitions list, each element is a transition dict with at least ['value', 'reward']
- data (:obj:`list`): Transitions list, each element is a transition dict with at least \
``['value', 'reward']``.
- last_value (:obj:`torch.Tensor`): The last value (i.e. the T+1 timestep)
- gamma (:obj:`float`): The future discount factor, should be in [0, 1], defaults to 0.99.
- gae_lambda (:obj:`float`): GAE lambda parameter, should be in [0, 1], defaults to 0.97, \
@@ -63,7 +64,7 @@ def get_gae_with_default_last_value(cls, data: deque, done: bool, gamma: float,
Overview:
Like ``get_gae`` above to get GAE advantage for stacked transitions. However, this function is designed in
case ``last_value`` is not passed. If the transition is not done yet, it would assign the last value in ``data``
as ``last_value``, discard the last element in ``data``(i.e. len(data) would decrease by 1), and then call
as ``last_value``, discard the last element in ``data`` (i.e. len(data) would decrease by 1), and then call
``get_gae``. Otherwise it would make ``last_value`` equal to 0.
Arguments:
- data (:obj:`deque`): Transitions list, each element is a transition dict with \
@@ -103,7 +104,7 @@ def get_nstep_return_data(
) -> deque:
"""
Overview:
Process raw traj data by updating keys ['next_obs', 'reward', 'done'] in data's dict element.
Process raw traj data by updating keys ``['next_obs', 'reward', 'done']`` in data's dict element.
Arguments:
- data (:obj:`deque`): Transitions list, each element is a transition dict
- nstep (:obj:`int`): Number of steps. If equal to 1, return ``data`` directly; \
@@ -163,7 +164,7 @@ def get_train_sample(
) -> List[Dict[str, Any]]:
"""
Overview:
Process raw traj data by updating keys ['next_obs', 'reward', 'done'] in data's dict element.
Process raw traj data by updating keys ``['next_obs', 'reward', 'done']`` in data's dict element.
If ``unroll_len`` equals 1, no processing is needed and ``data`` can be returned directly.
Otherwise, ``data`` will be split according to ``unroll_len``, the residual part processed according to
``last_fn_type``, and ``lists_to_dicts`` called to form sampled training data.
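As a refresher on what these GAE docstrings describe, here is a minimal, unbatched sketch of the computation that ``get_gae`` delegates to the ``gae`` helper; ``gae_sketch`` is a hypothetical name, and the defaults match the documented gamma=0.99 and gae_lambda=0.97:

```python
import torch

def gae_sketch(rewards, values, last_value, gamma=0.99, gae_lambda=0.97):
    # rewards, values: shape (T,); last_value: value estimate for step T + 1.
    T = rewards.shape[0]
    values_ext = torch.cat([values, last_value.view(1)])
    adv = torch.zeros(T)
    acc = 0.
    for t in reversed(range(T)):
        # One-step TD residual, then the exponentially weighted backward sum.
        delta = rewards[t] + gamma * values_ext[t + 1] - values_ext[t]
        acc = delta + gamma * gae_lambda * acc
        adv[t] = acc
    return adv

print(gae_sketch(torch.randn(5), torch.randn(5), torch.randn(())))
```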
27 changes: 27 additions & 0 deletions ding/rl_utils/beta_function.py
@@ -14,6 +14,15 @@
# For CPW, eta = 0.71 most closely matches human subjects.
# This function is locally concave for small values of τ and locally convex for larger values of τ.
def cpw(x: Union[torch.Tensor, float], eta: float = 0.71) -> Union[torch.Tensor, float]:
"""
Overview:
The implementation of the CPW function.
Arguments:
- x (:obj:`Union[torch.Tensor, float]`): The input value.
- eta (:obj:`float`): The hyperparameter of the CPW function.
Returns:
- output (:obj:`Union[torch.Tensor, float]`): The output value.
"""
return (x ** eta) / ((x ** eta + (1 - x) ** eta) ** (1 / eta))


@@ -22,6 +31,15 @@ def cpw(x: Union[torch.Tensor, float], eta: float = 0.71) -> Union[torch.Tensor,

# CVaR is risk-averse
def CVaR(x: Union[torch.Tensor, float], eta: float = 0.71) -> Union[torch.Tensor, float]:
"""
Overview:
The implementation of the CVaR function, which is risk-averse.
Arguments:
- x (:obj:`Union[torch.Tensor, float]`): The input value.
- eta (:obj:`float`): The hyperparameter of the CVaR function, which should be no greater than 1.0.
Returns:
- output (:obj:`Union[torch.Tensor, float]`): The output value.
"""
assert eta <= 1.0
return x * eta

@@ -31,6 +49,15 @@ def CVaR(x: Union[torch.Tensor, float], eta: float = 0.71) -> Union[torch.Tensor

# risk-averse (eta < 0) or risk-seeking (eta > 0)
def Pow(x: Union[torch.Tensor, float], eta: float = 0.0) -> Union[torch.Tensor, float]:
"""
Overview:
The implementation of the Pow function, which is risk-averse when eta < 0 and risk-seeking when eta > 0.
Arguments:
- x (:obj:`Union[torch.Tensor, float]`): The input value.
- eta (:obj:`float`): The hyperparameter of the Pow function.
Returns:
- output (:obj:`Union[torch.Tensor, float]`): The output value.
"""
if eta >= 0:
return x ** (1 / (1 + eta))
else:
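A quick usage sketch of these three risk-distortion functions on a batch of quantile levels τ ∈ (0, 1); the η values here are illustrative:

```python
import torch
from ding.rl_utils.beta_function import cpw, CVaR, Pow

tau = torch.linspace(0.05, 0.95, 5)
print(cpw(tau))        # concave for small tau, convex for large tau
print(CVaR(tau, 0.5))  # compresses quantiles toward 0, i.e. risk-averse
print(Pow(tau, -1.0))  # eta < 0: risk-averse (the eta < 0 branch is truncated above)
```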
59 changes: 30 additions & 29 deletions ding/rl_utils/exploration.py
@@ -12,13 +12,13 @@ def get_epsilon_greedy_fn(start: float, end: float, decay: int, type_: str = 'ex
Overview:
Generate an epsilon_greedy function with decay, which inputs current timestep and outputs current epsilon.
Arguments:
- start (:obj:`float`): Epsilon start value. For 'linear', it should be 1.0.
- start (:obj:`float`): Epsilon start value. For ``linear``, it should be 1.0.
- end (:obj:`float`): Epsilon end value.
- decay (:obj:`int`): Controls the speed that epsilon decreases from ``start`` to ``end``. \
We recommend epsilon decays according to env step rather than iteration.
- type (:obj:`str`): How epsilon decays, now supports ['linear', 'exp'(exponential)]
- type (:obj:`str`): How epsilon decays, now supports ``['linear', 'exp']`` (exponential).
Returns:
- eps_fn (:obj:`function`): The epsilon greedy function with decay
- eps_fn (:obj:`function`): The epsilon greedy function with decay.
"""
assert type_ in ['linear', 'exp'], type_
if type_ == 'exp':
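A usage sketch of the returned schedule; the exact curve of the ``exp`` branch is truncated above, so the printed values are indicative only:

```python
from ding.rl_utils.exploration import get_epsilon_greedy_fn

# Build an exponentially decaying schedule and query it at a few env steps.
eps_fn = get_epsilon_greedy_fn(start=0.95, end=0.1, decay=10000, type_='exp')
for env_step in (0, 10000, 100000):
    print(env_step, eps_fn(env_step))  # epsilon decays from start toward end
```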
@@ -48,27 +48,27 @@ class BaseNoise(ABC):
def __init__(self) -> None:
"""
Overview:
Initialization method
Initialization method.
"""
super().__init__()

@abstractmethod
def __call__(self, shape: tuple, device: str) -> torch.Tensor:
"""
Overview:
Generate noise according to action tensor's shape, device
Generate noise according to action tensor's shape, device.
Arguments:
- shape (:obj:`tuple`): size of the action tensor, output noise's size should be the same
- device (:obj:`str`): device of the action tensor, output noise's device should be the same as it
- shape (:obj:`tuple`): size of the action tensor, output noise's size should be the same.
- device (:obj:`str`): device of the action tensor, output noise's device should be the same as it.
Returns:
- noise (:obj:`torch.Tensor`): generated action noise, \
have the same shape and device with the input action tensor
have the same shape and device with the input action tensor.
"""
raise NotImplementedError


class GaussianNoise(BaseNoise):
r"""
"""
Overview:
Derived class for generating gaussian noise, which satisfies :math:`X \sim N(\mu, \sigma^2)`
Interface:
Expand All @@ -78,10 +78,10 @@ class GaussianNoise(BaseNoise):
def __init__(self, mu: float = 0.0, sigma: float = 1.0) -> None:
"""
Overview:
Initialize :math:`\mu` and :math:`\sigma` in Gaussian Distribution
Initialize :math:`\mu` and :math:`\sigma` in Gaussian Distribution.
Arguments:
- mu (:obj:`float`): :math:`\mu` , mean value
- sigma (:obj:`float`): :math:`\sigma` , standard deviation, should be positive
- mu (:obj:`float`): :math:`\mu` , mean value.
- sigma (:obj:`float`): :math:`\sigma` , standard deviation, should be positive.
"""
super(GaussianNoise, self).__init__()
self._mu = mu
@@ -125,14 +125,15 @@ def __init__(
"""
Overview:
Initialize ``_alpha`` :math:`= \theta * dt`,
``beta`` :math:`= \sigma * \sqrt{dt}`, in Ornstein-Uhlenbeck process
``beta`` :math:`= \sigma * \sqrt{dt}`, in Ornstein-Uhlenbeck process.
Arguments:
- mu (:obj:`float`): :math:`\mu` , mean value
- sigma (:obj:`float`): :math:`\sigma` , standard deviation of the perturbation noise
- theta (:obj:`float`): how strongly the noise reacts to perturbations, \
greater value means stronger reaction
- dt (:obj:`float`): derivative of time t
- x0 (:obj:`float` or :obj:`torch.Tensor`): initial action
- mu (:obj:`float`): :math:`\mu` , mean value.
- sigma (:obj:`float`): :math:`\sigma` , standard deviation of the perturbation noise.
- theta (:obj:`float`): How strongly the noise reacts to perturbations, \
greater value means stronger reaction.
- dt (:obj:`float`): The derivative of time t.
- x0 (:obj:`Union[float, torch.Tensor]`): The initial state of the noise, \
should be a scalar or tensor with the same shape as the action tensor.
"""
super().__init__()
self._mu = mu
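For intuition, a minimal sketch of the discretized Ornstein-Uhlenbeck update that this class implements, using the ``_alpha`` and ``beta`` definitions above; the parameter values are illustrative:

```python
import torch

mu, sigma, theta, dt = 0.0, 0.3, 0.15, 0.01
alpha = theta * dt        # the class's _alpha
beta = sigma * dt ** 0.5  # the class's beta, sigma * sqrt(dt)
x = torch.zeros(3)        # noise state, one entry per action dimension
for _ in range(1000):
    # x drifts toward mu at rate alpha; beta scales the Gaussian kicks.
    x = x + alpha * (mu - x) + beta * torch.randn(3)
print(x)  # stationary std is about sigma / sqrt(2 * theta)
```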
@@ -144,21 +145,21 @@ def __init__(
def reset(self) -> None:
"""
Overview:
Reset ``_x`` to the initial state ``_x0``
Reset ``_x`` to the initial state ``_x0``.
"""
self._x = deepcopy(self._x0)

def __call__(self, shape: tuple, device: str, mu: Optional[float] = None) -> torch.Tensor:
"""
Overview:
Generate gaussian noise according to action tensor's shape, device
Generate gaussian noise according to action tensor's shape, device.
Arguments:
- shape (:obj:`tuple`): size of the action tensor, output noise's size should be the same
- device (:obj:`str`): device of the action tensor, output noise's device should be the same as it
- mu (:obj:`float`): new mean value :math:`\mu`, you can set it to `None` if don't need it
- shape (:obj:`tuple`): The size of the action tensor, output noise's size should be the same.
- device (:obj:`str`): The device of the action tensor, output noise's device should be the same as it.
- mu (:obj:`float`): The new mean value :math:`\mu`, you can set it to `None` if you don't need it.
Returns:
- noise (:obj:`torch.Tensor`): generated action noise, \
have the same shape and device with the input action tensor
have the same shape and device with the input action tensor.
"""
if self._x is None or \
(isinstance(self._x, torch.Tensor) and self._x.shape != shape):
@@ -174,15 +175,15 @@ def __call__(self, shape: tuple, device: str, mu: Optional[float] = None) -> tor
def x0(self) -> Union[float, torch.Tensor]:
"""
Overview:
Get ``self._x0``
Get ``self._x0``.
"""
return self._x0

@x0.setter
def x0(self, _x0: Union[float, torch.Tensor]) -> None:
"""
Overview:
Set ``self._x0`` and reset ``self.x`` to ``self._x0`` as well
Set ``self._x0`` and reset ``self.x`` to ``self._x0`` as well.
"""
self._x0 = _x0
self.reset()
@@ -198,10 +199,10 @@ def create_noise_generator(noise_type: str, noise_kwargs: dict) -> BaseNoise:
or raise a KeyError. In other words, a derived noise generator must first be registered,
then ``create_noise_generator`` called to get the instance object.
Arguments:
- noise_type (:obj:`str`): the type of noise generator to be created
- noise_type (:obj:`str`): the type of noise generator to be created.
Returns:
- noise (:obj:`BaseNoise`): the created new noise generator, should be an instance of one of \
noise_mapping's values
noise_mapping's values.
"""
if noise_type not in noise_mapping.keys():
raise KeyError("not support noise type: {}".format(noise_type))
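A usage sketch of the registry lookup; the ``'gauss'`` key and the kwargs-unpacking behavior are assumptions about ``noise_mapping``, which this hunk does not show:

```python
from ding.rl_utils.exploration import create_noise_generator

# Assumed: 'gauss' maps to GaussianNoise and noise_kwargs is unpacked into it.
noise = create_noise_generator('gauss', {'mu': 0.0, 'sigma': 0.1})
print(noise((2, 3), 'cpu'))  # gaussian action noise of shape (2, 3)
```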
18 changes: 8 additions & 10 deletions ding/rl_utils/td.py
@@ -580,7 +580,7 @@ def v_nstep_td_error(
nstep: int = 1,
criterion: torch.nn.modules = nn.MSELoss(reduction='none') # noqa
) -> torch.Tensor:
r"""
"""
Overview:
Multistep (n step) td_error for distributed value-based algorithms
Arguments:
@@ -590,14 +590,14 @@ def v_nstep_td_error(
Returns:
- loss (:obj:`torch.Tensor`): nstep td error, 0-dim tensor
Shapes:
- data (:obj:`dist_nstep_td_data`): The v_nstep_td_data containing\
- data (:obj:`dist_nstep_td_data`): The v_nstep_td_data containing \
['v', 'next_n_v', 'reward', 'done', 'weight', 'value_gamma']
- v (:obj:`torch.FloatTensor`): :math:`(B, )` i.e. [batch_size, ]
- next_v (:obj:`torch.FloatTensor`): :math:`(B, )`
- reward (:obj:`torch.FloatTensor`): :math:`(T, B)`, where T is timestep(nstep)
- done (:obj:`torch.BoolTensor`) :math:`(B, )`, whether done in last timestep
- weight (:obj:`torch.FloatTensor` or None): :math:`(B, )`, the training sample weight
- value_gamma (:obj:`torch.Tensor`): If the remaining data in the buffer is less than n_step\
- value_gamma (:obj:`torch.Tensor`): If the remaining data in the buffer is less than n_step \
we use value_gamma as the gamma discount value for next_v rather than gamma**n_step
Examples:
>>> v = torch.randn(5).requires_grad_(True)
@@ -1103,7 +1103,7 @@ def qrdqn_nstep_td_error(
Overview:
Multistep (1 step or n step) td_error in QRDQN
Arguments:
- data (:obj:`iqn_nstep_td_data`): The input data, iqn_nstep_td_data to calculate loss
- data (:obj:`qrdqn_nstep_td_data`): The input data, qrdqn_nstep_td_data to calculate loss
- gamma (:obj:`float`): Discount factor
- nstep (:obj:`int`): nstep num, default set to 1
Returns:
@@ -1610,18 +1610,16 @@ def multistep_forward_view(
lambda_: float,
done: Optional[torch.Tensor] = None
) -> torch.Tensor:
r"""
"""
Overview:
Same as trfl.sequence_ops.multistep_forward_view
Implementing (12.18) in Sutton & Barto
Same as trfl.sequence_ops.multistep_forward_view, which implements (12.18) in Sutton & Barto.
Assuming the first dim of input tensors corresponds to the index in batch.
```
.. note::
    result[T-1] = rewards[T-1] + gammas[T-1] * bootstrap_values[T]
    for t in 0...T-2:
        result[t] = rewards[t] + gammas[t]*(lambdas[t]*result[t+1] + (1-lambdas[t])*bootstrap_values[t+1])
```
Assuming the first dim of input tensors correspond to the index in batch
Arguments:
- bootstrap_values (:obj:`torch.Tensor`): Estimation of the value at *step 1 to T*, of size [T_traj, batchsize]
- rewards (:obj:`torch.Tensor`): The returns from 0 to T-1, of size [T_traj, batchsize]
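The note above fully determines the computation; below is a direct, unvectorized reading of that recursion, with a hypothetical helper name, the batch dimension dropped, a scalar lambda, and 0-based indexing so that ``bootstrap_values[t]`` holds the docstring's value at step ``t + 1``:

```python
import torch

def multistep_forward_view_sketch(bootstrap_values, rewards, gammas, lambda_):
    # All inputs have shape (T,); bootstrap_values[t] estimates the value at step t + 1.
    T = rewards.shape[0]
    result = torch.empty_like(rewards)
    # Base case: the docstring's result[T-1] = rewards[T-1] + gammas[T-1] * bootstrap_values[T].
    result[T - 1] = rewards[T - 1] + gammas[T - 1] * bootstrap_values[T - 1]
    for t in range(T - 2, -1, -1):
        # Blend the recursive lambda-return with the one-step bootstrap target.
        result[t] = rewards[t] + gammas[t] * (
            lambda_ * result[t + 1] + (1 - lambda_) * bootstrap_values[t]
        )
    return result

out = multistep_forward_view_sketch(torch.randn(5), torch.randn(5), torch.full((5, ), 0.99), 0.95)
print(out)
```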
Diffs for the remaining changed files are not rendered.
