From af4f3123735683cf66f0736418a6cd86f6c7e3e5 Mon Sep 17 00:00:00 2001
From: nighood
Date: Tue, 16 Jan 2024 20:40:19 +0800
Subject: [PATCH 1/2] polish(rjy): polish pg/iqn/edac policy doc

---
 ding/policy/edac.py |  49 +++++++++++++++--
 ding/policy/iqn.py  |  79 +++++++++++++++++++++++++--
 ding/policy/pg.py   | 130 +++++++++++++++++++++++++++++++++++++-------
 3 files changed, 229 insertions(+), 29 deletions(-)

diff --git a/ding/policy/edac.py b/ding/policy/edac.py
index 5e315ccbf9..80827eeeae 100755
--- a/ding/policy/edac.py
+++ b/ding/policy/edac.py
@@ -20,7 +20,7 @@ class EDACPolicy(SACPolicy):
     """
     Overview:
-        Policy class of EDAC algorithm. https://arxiv.org/pdf/2110.01548.pdf
+        Policy class of EDAC algorithm. Paper link: https://arxiv.org/pdf/2110.01548.pdf

     Config:
         == ==================== ======== ============= ================================= =======================
@@ -139,10 +139,23 @@ class EDACPolicy(SACPolicy):
     )

     def _init_learn(self) -> None:
-        r"""
+        """
         Overview:
-            Learn mode init method. Called by ``self.__init__``.
-            Init q, value and policy's optimizers, algorithm config, main and target models.
+            Initialize the learn mode of policy, including related attributes and modules. For EDAC, in addition \
                to what needs to be initialized in SAC, it is also necessary to define \
                eta/with_q_entropy/forward_learn_cnt. \
+            This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``.
+
+        .. note::
+            For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \
                and ``_load_state_dict_learn`` methods.
+
+        .. note::
+            For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method.
+
+        .. note::
+            If you want to set some special member variables in ``_init_learn`` method, you'd better name them \
                with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``.
         """
         super()._init_learn()
         # EDAC special implementation
@@ -151,6 +164,34 @@ def _init_learn(self) -> None:
         self._forward_learn_cnt = 0

     def _forward_learn(self, data: dict) -> Dict[str, Any]:
+        """
+        Overview:
+            Policy forward function of learn mode (training policy and updating parameters). Forward means \
                that the policy inputs some training batch data from the replay buffer and then returns the output \
                result, including various training information such as loss, action, priority.
        Arguments:
            - data (:obj:`List[Dict[int, Any]]`): The input data used for policy forward, including a batch of \
                training samples. For each element in list, the key of the dict is the name of data items and the \
                value is the corresponding data. Usually, the value is torch.Tensor or np.ndarray or their dict/list \
                combinations. In the ``_forward_learn`` method, data often needs to be first stacked in the batch \
                dimension by some utility functions such as ``default_preprocess_learn``. \
                For EDAC, each element in list is a dict containing at least the following keys: ``obs``, ``action``, \
                ``logit``, ``reward``, ``next_obs``, ``done``. Sometimes, it also contains other keys like ``weight``.
        Returns:
            - info_dict (:obj:`Dict[str, Any]`): The information dict that indicates the training result, which will be \
                recorded in text log and tensorboard, values must be python scalar or a list of scalars. For the \
                detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method.
+
+        .. note::
            The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \
                For the data types that are not supported, the main reason is that the corresponding model does not support them. \
                You can implement your own model rather than use the default model. For more information, please raise an \
                issue in GitHub repo and we will continue to follow up.
+
+        .. note::
+            For more detailed examples, please refer to our unittest for DiscreteEDACPolicy: \
                ``ding.policy.tests.test_edac``.
+        """
        loss_dict = {}
        data = default_preprocess_learn(
            data,

diff --git a/ding/policy/iqn.py b/ding/policy/iqn.py
index 1bb9b683cc..b2d60c0c82 100644
--- a/ding/policy/iqn.py
+++ b/ding/policy/iqn.py
@@ -13,9 +13,9 @@
 @POLICY_REGISTRY.register('iqn')
 class IQNPolicy(DQNPolicy):
-    r"""
+    """
     Overview:
-        Policy class of IQN algorithm.
+        Policy class of IQN algorithm. Paper link: https://arxiv.org/pdf/1806.06923.pdf.

     Config:
         == ==================== ======== ============== ======================================== =======================
@@ -98,13 +98,37 @@ class IQNPolicy(DQNPolicy):
     )

     def default_model(self) -> Tuple[str, List[str]]:
+        """
+        Overview:
+            Return this algorithm default neural network model setting for demonstration. ``__init__`` method will \
                automatically call this method to get the default model setting and create model.
+        Returns:
+            - model_info (:obj:`Tuple[str, List[str]]`): The registered model name and model's import_names.
+
+        .. note::
+            The user can define and use customized network model but must obey the same interface definition indicated \
                by import_names path. For example, for IQN, its registered name is ``iqn`` and the import_names is \
                ``ding.model.template.q_learning``.
+        """
        return 'iqn', ['ding.model.template.q_learning']

     def _init_learn(self) -> None:
-        r"""
+        """
         Overview:
-            Learn mode init method. Called by ``self.__init__``.
-            Init the optimizer, algorithm config, main and target models.
+            Initialize the learn mode of policy, including related attributes and modules. For IQN, it mainly contains \
                the optimizer, algorithm-specific arguments such as nstep, kappa and gamma, and the main and target model.
+            This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``.
+
+        .. note::
+            For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \
                and ``_load_state_dict_learn`` methods.
+
+        .. note::
+            For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method.
+
+        .. note::
+            If you want to set some special member variables in ``_init_learn`` method, you'd better name them \
                with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``.
         """
        self._priority = self._cfg.priority
        # Optimizer
@@ -135,6 +159,34 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]:
        Returns:
            - info_dict (:obj:`Dict[str, Any]`): Including current lr and loss.
        """
+        """
+        Overview:
+            Policy forward function of learn mode (training policy and updating parameters). Forward means \
                that the policy inputs some training batch data from the replay buffer and then returns the output \
                result, including various training information such as loss, priority.
        Arguments:
            - data (:obj:`List[Dict[int, Any]]`): The input data used for policy forward, including a batch of \
                training samples. For each element in list, the key of the dict is the name of data items and the \
                value is the corresponding data. \
                Usually, the value is torch.Tensor or np.ndarray or their dict/list \
                combinations. In the ``_forward_learn`` method, data often needs to be first stacked in the batch \
                dimension by some utility functions such as ``default_preprocess_learn``. \
                For IQN, each element in list is a dict containing at least the following keys: ``obs``, ``action``, \
                ``reward``, ``next_obs``, ``done``. Sometimes, it also contains other keys such as ``weight`` \
                and ``value_gamma``.
        Returns:
            - info_dict (:obj:`Dict[str, Any]`): The information dict that indicates the training result, which will be \
                recorded in text log and tensorboard, values must be python scalar or a list of scalars. For the \
                detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method.
+
+        .. note::
+            The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \
                For the data types that are not supported, the main reason is that the corresponding model does not support them. \
                You can implement your own model rather than use the default model. For more information, please raise an \
                issue in GitHub repo and we will continue to follow up.
+
+        .. note::
+            For more detailed examples, please refer to our unittest for IQNPolicy: ``ding.policy.tests.test_iqn``.
+        """
        data = default_preprocess_learn(
            data, use_priority=self._priority, ignore_done=self._cfg.learn.ignore_done, use_nstep=True
        )
@@ -186,6 +238,12 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]:
        }

    def _state_dict_learn(self) -> Dict[str, Any]:
+        """
+        Overview:
+            Return the state_dict of learn mode, usually including model, target_model and optimizer.
+        Returns:
+            - state_dict (:obj:`Dict[str, Any]`): The dict of current policy learn state, for saving and restoring.
+        """
        return {
            'model': self._learn_model.state_dict(),
            'target_model': self._target_model.state_dict(),
@@ -193,6 +251,17 @@ def _state_dict_learn(self) -> Dict[str, Any]:
        }

    def _load_state_dict_learn(self, state_dict: Dict[str, Any]) -> None:
+        """
+        Overview:
+            Load the state_dict variable into policy learn mode.
+        Arguments:
+            - state_dict (:obj:`Dict[str, Any]`): The dict of policy learn state saved before.
+
+        .. tip::
+            If you want to only load some parts of model, you can simply set the ``strict`` argument in \
                load_state_dict to ``False``, or refer to ``ding.torch_utils.checkpoint_helper`` for more \
                complicated operation.
+        """
        self._learn_model.load_state_dict(state_dict['model'])
        self._target_model.load_state_dict(state_dict['target_model'])
        self._optimizer.load_state_dict(state_dict['optimizer'])

diff --git a/ding/policy/pg.py b/ding/policy/pg.py
index 667439d07b..67f0ce8904 100644
--- a/ding/policy/pg.py
+++ b/ding/policy/pg.py
@@ -13,9 +13,10 @@
 @POLICY_REGISTRY.register('pg')
 class PGPolicy(Policy):
-    r"""
+    """
     Overview:
-        Policy class of Policy Gradient (REINFORCE) algorithm.
+        Policy class of Policy Gradient (REINFORCE) algorithm. Paper link: \
+        https://proceedings.neurips.cc/paper_files/paper/1999/file/464d828b85b0bed98e80ade0a5c43b0f-Paper.pdf
     """
     config = dict(
         # (string) RL policy register name (refer to function "register_policy").
@@ -60,13 +61,33 @@ class PGPolicy(Policy):
     )

     def default_model(self) -> Tuple[str, List[str]]:
+        """
+        Overview:
+            Return this algorithm default neural network model setting for demonstration. ``__init__`` method will \
                automatically call this method to get the default model setting and create model.
+        Returns:
+            - model_info (:obj:`Tuple[str, List[str]]`): The registered model name and model's import_names.
+        """
        return 'pg', ['ding.model.template.pg']

    def _init_learn(self) -> None:
-        r"""
+        """
        Overview:
-            Learn mode init method. Called by ``self.__init__``.
-            Init the optimizer, algorithm config, main and target models.
+            Initialize the learn mode of policy, including related attributes and modules. For PG, it mainly \
                contains the optimizer and algorithm-specific arguments such as entropy weight and grad norm. This method \
                also executes some special network initializations.
+            This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``.
+
+        .. note::
+            For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \
                and ``_load_state_dict_learn`` methods.
+
+        .. note::
+            For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method.
+
+        .. note::
+            If you want to set some special member variables in ``_init_learn`` method, you'd better name them \
                with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``.
        """
        # Optimizer
        self._optimizer = Adam(self._model.parameters(), lr=self._cfg.learn.learning_rate)
@@ -76,13 +97,31 @@ def _init_learn(self) -> None:
        self._learn_model = self._model  # for compatibility

    def _forward_learn(self, data: dict) -> Dict[str, Any]:
-        r"""
+        """
        Overview:
-            Forward and backward function of learn mode.
+            Policy forward function of learn mode (training policy and updating parameters). Forward means \
                that the policy inputs some training batch data from the replay buffer and then returns the output \
                result, including various training information such as policy loss, entropy loss and grad norm.
        Arguments:
-            - data (:obj:`dict`): Dict type data, including at least ['obs', 'action', 'reward', 'next_obs','adv']
+            - data (:obj:`List[Dict[int, Any]]`): The input data used for policy forward, including the latest \
                collected training samples for on-policy algorithms like PG. For each element in list, the key of the \
                dict is the name of data items and the value is the corresponding data. Usually, the value is \
                torch.Tensor or np.ndarray or their dict/list combinations. In the ``_forward_learn`` method, data \
                often needs to be first stacked in the batch dimension by some utility functions such as \
                ``default_preprocess_learn``. \
+                For PG, each element in list is a dict containing at least the following keys: ``obs``, ``action``, \
+                ``return``.
        Returns:
-            - info_dict (:obj:`Dict[str, Any]`): Including current lr and loss.
+            - return_infos (:obj:`List[Dict[str, Any]]`): The information list that indicates the training result; each \
                training iteration appends an information dict to the final list. The list will be processed \
                and recorded in text log and tensorboard. The value of the dict must be python scalar or a list of \
                scalars. For the detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method.
+
+        .. note::
+            The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \
                For the data types that are not supported, the main reason is that the corresponding model does not support them. \
                You can implement your own model rather than use the default model. For more information, please raise an \
                issue in GitHub repo and we will continue to follow up.
""" data = default_preprocess_learn(data, ignore_done=self._cfg.learn.ignore_done, use_nstep=False) if self._cuda: @@ -124,6 +163,16 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: return return_infos def _init_collect(self) -> None: + """ + Overview: + Initialize the collect mode of policy, including related attributes and modules. For PPG, it contains \ + algorithm-specific arguments such as unroll_len and gamma. + This method will be called in ``__init__`` method if ``collect`` field is in ``enable_field``. + + .. note:: + If you want to set some spacial member variables in ``_init_collect`` method, you'd better name them \ + with prefix ``_collect_`` to avoid conflict with other modes, such as ``self._collect_attr1``. + """ self._unroll_len = self._cfg.collect.unroll_len self._gamma = self._cfg.collect.discount_factor @@ -142,16 +191,19 @@ def _forward_collect(self, data: dict) -> dict: return {i: d for i, d in zip(data_id, output)} def _process_transition(self, obs: Any, model_output: dict, timestep: namedtuple) -> dict: - r""" + """ Overview: - Generate dict type transition data from inputs. + Process and pack one timestep transition data into a dict, which can be directly used for training and \ + saved in replay buffer. For PG, it contains obs, action, reward, done. Arguments: - - obs (:obj:`Any`): Env observation - - model_output (:obj:`dict`): Output of collect model, including at least ['action'] - - timestep (:obj:`namedtuple`): Output after env step, including at least ['obs', 'reward', 'done'] \ - (here 'obs' indicates obs after env step). + - obs (:obj:`torch.Tensor`): The env observation of current timestep, such as stacked 2D image in Atari. + - model_output (:obj:`Dict[str, torch.Tensor]`): The output of the policy network with the observation \ + as input. For PG, it contains the action. + - timestep (:obj:`namedtuple`): The execution result namedtuple returned by the environment step method, \ + except all the elements have been transformed into tensor data. Usually, it contains the next obs, \ + reward, done, info, etc. Returns: - - transition (:obj:`dict`): Dict type transition data. + - transition (:obj:`Dict[str, torch.Tensor]`): The processed transition data of the current timestep. """ return { 'obs': obs, @@ -161,13 +213,21 @@ def _process_transition(self, obs: Any, model_output: dict, timestep: namedtuple } def _get_train_sample(self, data: list) -> Union[None, List[Any]]: - r""" + """ Overview: - Get the trajectory and the n step return data, then sample from the n_step return data + For a given entire episode data (a list of transition), process it into a list of sample that \ + can be used for training directly. In PG, a train sample is a processed transition with new computed \ + ``return`` field. This method is usually used in collectors to execute necessary \ + RL data preprocessing before training, which can help learner amortize revelant time consumption. \ + In addition, you can also implement this method as an identity function and do the data processing \ + in ``self._forward_learn`` method. Arguments: - - data (:obj:`list`): The trajectory's buffer list + - data (:obj:`List[Dict[str, Any]`): The episode data (a list of transition), each element is \ + the same format as the return value of ``self._process_transition`` method. 
                a complete episode.
        Returns:
-            - samples (:obj:`dict`): The training samples generated
+            - samples (:obj:`List[Dict[str, Any]]`): The processed train samples, each element is in a similar format \
                as input transitions, but may contain more data for training, such as discounted episode return.
        """
        assert data[-1]['done'], "PG needs a complete epsiode"
@@ -194,6 +254,29 @@ def _get_train_sample(self, data: list) -> Union[None, List[Any]]:
        return []

    def _init_eval(self) -> None:
        pass

    def _forward_eval(self, data: dict) -> dict:
+        """
+        Overview:
+            Policy forward function of eval mode (evaluation policy performance by interacting with envs). Forward \
                means that the policy gets some necessary data (mainly observation) from the envs and then returns the \
                action to interact with the envs. ``_forward_eval`` in PG often uses deterministic sample method to get \
                actions while ``_forward_collect`` usually uses stochastic sample method to balance exploration and \
                exploitation.
+        Arguments:
+            - data (:obj:`Dict[int, Any]`): The input data used for policy forward, including at least the obs. The \
                key of the dict is environment id and the value is the corresponding data of the env.
+        Returns:
+            - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action. The \
                key of the dict is the same as the input data, i.e. environment id.
+
+        .. note::
+            The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \
                For the data types that are not supported, the main reason is that the corresponding model does not support them. \
                You can implement your own model rather than use the default model. For more information, please raise an \
                issue in GitHub repo and we will continue to follow up.
+
+        .. note::
+            For more detailed examples, please refer to our unittest for PGPolicy: ``ding.policy.tests.test_pg``.
+        """
        data_id = list(data.keys())
        data = default_collate(list(data.values()))
        if self._cuda:
@@ -216,4 +299,11 @@ def _forward_eval(self, data: dict) -> dict:
        return {i: d for i, d in zip(data_id, output)}

    def _monitor_vars_learn(self) -> List[str]:
+        """
+        Overview:
+            Return the necessary keys for logging the return dict of ``self._forward_learn``. The logger module, such \
                as text logger, tensorboard logger, will use these keys to save the corresponding data.
+        Returns:
+            - necessary_keys (:obj:`List[str]`): The list of the necessary keys to be logged.
+        """
        return super()._monitor_vars_learn() + ['policy_loss', 'entropy_loss', 'return_abs_max', 'grad_norm']

From 925663dc8b4bc18ee36e657e7ddd25bcab90622a Mon Sep 17 00:00:00 2001
From: nighood
Date: Mon, 29 Jan 2024 11:38:44 +0800
Subject: [PATCH 2/2] polish(rjy): polish pg/iqn/edac policy doc

---
 ding/policy/edac.py |  4 ++--
 ding/policy/iqn.py  | 16 ++++++----------
 ding/policy/pg.py   | 33 ++++++++++++++++++++++++++-----
 3 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/ding/policy/edac.py b/ding/policy/edac.py
index 80827eeeae..b67b1203c8 100755
--- a/ding/policy/edac.py
+++ b/ding/policy/edac.py
@@ -163,7 +163,7 @@ def _init_learn(self) -> None:
        self._with_q_entropy = self._cfg.learn.with_q_entropy
        self._forward_learn_cnt = 0

-    def _forward_learn(self, data: dict) -> Dict[str, Any]:
+    def _forward_learn(self, data: List[Dict[int, Any]]) -> Dict[str, Any]:
        """
        Overview:
            Policy forward function of learn mode (training policy and updating parameters). Forward means \
@@ -189,7 +189,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]:
            issue in GitHub repo and we will continue to follow up.

        .. note::
-                For more detailed examples, please refer to our unittest for DiscreteEDACPolicy: \
+                For more detailed examples, please refer to our unittest for EDACPolicy: \
                ``ding.policy.tests.test_edac``.
        """
        loss_dict = {}

diff --git a/ding/policy/iqn.py b/ding/policy/iqn.py
index b2d60c0c82..7ff69d0528 100644
--- a/ding/policy/iqn.py
+++ b/ding/policy/iqn.py
@@ -15,7 +15,11 @@ class IQNPolicy(DQNPolicy):
     """
     Overview:
-        Policy class of IQN algorithm. Paper link: https://arxiv.org/pdf/1806.06923.pdf.
+        Policy class of IQN algorithm. Paper link: https://arxiv.org/pdf/1806.06923.pdf. \
+        Distributional RL is a new direction of RL, which is often more stable than traditional expectation-based RL. \
+        The core idea of distributional RL is to estimate the distribution of the action value instead of the \
+        expectation. The difference between IQN and DQN is that IQN uses quantile regression to estimate the \
+        quantile values of the action-value distribution, while DQN uses the expectation of the action distribution. \

     Config:
         == ==================== ======== ============== ======================================== =======================
@@ -150,15 +154,7 @@ def _init_learn(self) -> None:
        self._learn_model.reset()
        self._target_model.reset()

-    def _forward_learn(self, data: dict) -> Dict[str, Any]:
-        r"""
-        Overview:
-            Forward and backward function of learn mode.
-        Arguments:
-            - data (:obj:`dict`): Dict type data, including at least ['obs', 'action', 'reward', 'next_obs']
-        Returns:
-            - info_dict (:obj:`Dict[str, Any]`): Including current lr and loss.
-        """
+    def _forward_learn(self, data: List[Dict[int, Any]]) -> Dict[str, Any]:
        """
        Overview:
            Policy forward function of learn mode (training policy and updating parameters). Forward means \

diff --git a/ding/policy/pg.py b/ding/policy/pg.py
index 67f0ce8904..9ae0c87f25 100644
--- a/ding/policy/pg.py
+++ b/ding/policy/pg.py
@@ -96,7 +96,7 @@ def _init_learn(self) -> None:
        self._grad_norm = self._cfg.learn.grad_norm
        self._learn_model = self._model  # for compatibility

-    def _forward_learn(self, data: dict) -> Dict[str, Any]:
+    def _forward_learn(self, data: List[Dict[int, Any]]) -> Dict[str, Any]:
        """
        Overview:
            Policy forward function of learn mode (training policy and updating parameters). Forward means \
@@ -176,7 +176,30 @@ def _init_collect(self) -> None:
        self._unroll_len = self._cfg.collect.unroll_len
        self._gamma = self._cfg.collect.discount_factor

-    def _forward_collect(self, data: dict) -> dict:
+    def _forward_collect(self, data: Dict[int, Any]) -> dict:
+        """
+        Overview:
+            Policy forward function of collect mode (collecting training data by interacting with envs). Forward means \
                that the policy gets some necessary data (mainly observation) from the envs and then returns the output \
                data, such as the action to interact with the envs.
+        Arguments:
+            - data (:obj:`Dict[int, Any]`): The input data used for policy forward, including at least the obs. The \
                key of the dict is environment id and the value is the corresponding data of the env.
+        Returns:
+            - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action and \
                other necessary data (action logit) for learn mode defined in ``self._process_transition`` \
                method. The key of the dict is the same as the input data, i.e. environment id.
+
+        .. tip::
+            If you want to add more tricks on this policy, like temperature factor in multinomial sample, you can pass \
                related data as extra keyword arguments of this method.
+
+        .. note::
            The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \
                For the data types that are not supported, the main reason is that the corresponding model does not support them. \
                You can implement your own model rather than use the default model. For more information, please raise an \
                issue in GitHub repo and we will continue to follow up.
+        """
        data_id = list(data.keys())
        data = default_collate(list(data.values()))
        if self._cuda:
@@ -213,7 +236,7 @@ def _forward_collect(self, data: dict) -> dict:
            output = default_decollate(output)
        return {i: d for i, d in zip(data_id, output)}

-    def _process_transition(self, obs: Any, model_output: dict, timestep: namedtuple) -> dict:
+    def _process_transition(self, obs: Any, model_output: Dict[str, torch.Tensor], timestep: namedtuple) -> dict:
        """
        Overview:
            Process and pack one timestep transition data into a dict, which can be directly used for training and \
@@ -235,7 +258,7 @@ def _process_transition(self, obs: Any, model_output: dict, timestep: namedtuple
            'done': timestep.done,
        }

-    def _get_train_sample(self, data: list) -> Union[None, List[Any]]:
+    def _get_train_sample(self, data: List[Dict[str, Any]]) -> Union[None, List[Any]]:
        """
        Overview:
            For a given entire episode data (a list of transition), process it into a list of sample that \
@@ -276,7 +299,7 @@ def _get_train_sample(self, data: list) -> Union[None, List[Any]]:
    def _init_eval(self) -> None:
        pass

-    def _forward_eval(self, data: dict) -> dict:
+    def _forward_eval(self, data: Dict[int, Any]) -> dict:
        """
        Overview:
            Policy forward function of eval mode (evaluation policy performance by interacting with envs). Forward \