From 9d76b0f15f3c3f3445e6f1a804e489538aed83b0 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 25 Oct 2023 16:18:22 +0800 Subject: [PATCH 1/5] polish ding.model.template --- ding/model/common/encoder.py | 106 ++++- ding/model/common/head.py | 80 ++-- ding/model/common/utils.py | 9 +- ding/model/template/acer.py | 40 +- ding/model/template/atoc.py | 201 ++++++--- ding/model/template/bc.py | 11 +- ding/model/template/bcq.py | 82 +++- ding/model/template/collaq.py | 80 +++- ding/model/template/coma.py | 98 ++++- ding/model/template/decision_transformer.py | 8 - ding/model/template/ebm.py | 437 +++++++++++++++++--- ding/model/template/maqac.py | 12 +- ding/model/template/ngu.py | 6 +- ding/model/template/ppg.py | 72 +++- ding/model/template/q_learning.py | 18 +- ding/model/template/qac_dist.py | 10 +- ding/model/template/vae.py | 6 +- ding/model/template/wqmix.py | 4 +- 18 files changed, 1055 insertions(+), 225 deletions(-) diff --git a/ding/model/common/encoder.py b/ding/model/common/encoder.py index e22112601e..bb128ca41b 100644 --- a/ding/model/common/encoder.py +++ b/ding/model/common/encoder.py @@ -23,7 +23,7 @@ def prod(iterable): class ConvEncoder(nn.Module): """ Overview: - The ``Convolution Encoder`` used to encode raw 2-dim image observations (e.g. Atari/Procgen). + The Convolution Encoder is used to encode 2-dim image observations. Interfaces: ``__init__``, ``forward``. """ @@ -106,6 +106,18 @@ def _get_flatten_size(self) -> int: - outputs (:obj:`torch.Tensor`): Size ``int`` Tensor representing the number of ``in-features``. Shapes: - outputs: :math:`(1,)`. + Examples: + >>> conv = ConvEncoder( + >>> obs_shape=(4, 84, 84), + >>> hidden_size_list=[32, 64, 64, 128], + >>> activation=nn.ReLU(), + >>> kernel_size=[8, 4, 3], + >>> stride=[4, 2, 1], + >>> padding=None, + >>> layer_norm=False, + >>> norm_type=None + >>> ) + >>> flatten_size = conv._get_flatten_size() """ test_data = torch.randn(1, *self.obs_shape) with torch.no_grad(): @@ -123,6 +135,19 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: Shapes: - x : :math:`(B, C, H, W)`, where ``B`` is batch size, ``C`` is channel, ``H`` is height, ``W`` is width. - outputs: :math:`(B, N)`, where ``N = hidden_size_list[-1]`` . + Examples: + >>> conv = ConvEncoder( + >>> obs_shape=(4, 84, 84), + >>> hidden_size_list=[32, 64, 64, 128], + >>> activation=nn.ReLU(), + >>> kernel_size=[8, 4, 3], + >>> stride=[4, 2, 1], + >>> padding=None, + >>> layer_norm=False, + >>> norm_type=None + >>> ) + >>> x = torch.randn(1, 4, 84, 84) + >>> output = conv(x) """ x = self.main(x) x = self.mid(x) @@ -132,7 +157,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class FCEncoder(nn.Module): """ Overview: - The ``FCEncoder`` used in models to encode raw 1-dim observations (e.g. MuJoCo). + The full connected encoder is used to encode 1-dim input variable. Interfaces: ``__init__``, ``forward``. """ @@ -148,7 +173,7 @@ def __init__( ) -> None: """ Overview: - Init the FC Encoder according to arguments. + Initialize the FC Encoder according to arguments. Arguments: - obs_shape (:obj:`int`): Observation shape. - hidden_size_list (:obj:`SequenceType`): Sequence of ``hidden_size`` of subsequent FC layers. @@ -194,6 +219,16 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: Shapes: - x : :math:`(B, M)`, where ``M = obs_shape``. - outputs: :math:`(B, N)`, where ``N = hidden_size_list[-1]``. 
+        Examples:
+            >>> fc = FCEncoder(
+            >>> obs_shape=4,
+            >>> hidden_size_list=[32, 64, 64, 128],
+            >>> activation=nn.ReLU(),
+            >>> norm_type=None,
+            >>> dropout=None
+            >>> )
+            >>> x = torch.randn(1, 4)
+            >>> output = fc(x)
        """
        x = self.act(self.init(x))
        x = self.main(x)
@@ -211,13 +246,18 @@ def __init__(self, obs_shape: Dict[str, Union[int, List[int]]]) -> None:
class IMPALACnnResidualBlock(nn.Module):
    """
    Overview:
-        Residual basic block (without batchnorm) in IMPALA CNN encoder, which preserves the channel number and shape.
+        This CNN encoder residual block is a residual basic block used in the IMPALA algorithm, \
+        which preserves the channel number and shape.
+        IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner Architectures
+        https://arxiv.org/pdf/1802.01561.pdf
+    Interfaces:
+        ``__init__``, ``forward``.
    """

    def __init__(self, in_channnel: int, scale: float = 1, batch_norm: bool = False):
        """
        Overview:
-            Init the IMPALA CNN residual block according to arguments.
+            Initialize the IMPALA CNN residual block according to arguments.
        Arguments:
            - in_channnel (:obj:`int`): Channel number of input features.
            - scale (:obj:`float`): Scale of module, defaults to 1.
@@ -234,9 +274,16 @@ def __init__(self, in_channnel: int, scale: float = 1, batch_norm: bool = False)
            self.bn1 = nn.BatchNorm2d(self.in_channnel)

    def residual(self, x: torch.Tensor) -> torch.Tensor:
-        # inplace should be False for the first relu, so that it does not change the input,
-        # which will be used for skip connection.
-        # getattr is for backwards compatibility with loaded models
+        """
+        Overview:
+            Return the output tensor of the residual block, keeping the shape and channel number unchanged.
+            The inplace flag of the activation function should be False for the first relu, \
+            so that it does not change the original input tensor of the residual block.
+        Arguments:
+            - x (:obj:`torch.Tensor`): Input tensor.
+        Returns:
+            - output (:obj:`torch.Tensor`): Output tensor.
+        """
        if self.batch_norm:
            x = self.bn0(x)
        x = F.relu(x, inplace=False)
@@ -255,6 +302,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
            - x (:obj:`torch.Tensor`): Input tensor.
        Returns:
            - output (:obj:`torch.Tensor`): Output tensor.
+        Examples:
+            >>> block = IMPALACnnResidualBlock(16)
+            >>> x = torch.randn(1, 16, 84, 84)
+            >>> output = block(x)
        """
        return x + self.residual(x)

@@ -262,13 +313,19 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
class IMPALACnnDownStack(nn.Module):
    """
    Overview:
-        Downsampling stack from IMPALA CNN encoder, which reduces the spatial size by 2 with maxpooling.
+        Downsampling stack of the CNN encoder used in the IMPALA algorithm.
+        Every IMPALACnnDownStack consists of n IMPALACnnResidualBlocks,
+        which reduces the spatial size by 2 with maxpooling.
+        IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner Architectures
+        https://arxiv.org/pdf/1802.01561.pdf
+    Interfaces:
+        ``__init__``, ``forward``.
    """

    def __init__(self, in_channnel, nblock, out_channel, scale=1, pool=True, **kwargs):
        """
        Overview:
-            Init every impala cnn block of the Impala Cnn Encoder.
+            Initialize every IMPALA CNN block of the IMPALA CNN encoder.
        Arguments:
            - in_channnel (:obj:`int`): Channel number of input features.
            - nblock (:obj:`int`): Residual Block number in each block.
@@ -293,6 +350,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
            - x (:obj:`torch.Tensor`): Input tensor.
        Returns:
            - output (:obj:`torch.Tensor`): Output tensor.
+ Examples: + >>> stack = IMPALACnnDownStack(16, 2, 32) + >>> x = torch.randn(1, 16, 84, 84) + >>> output = stack(x) """ x = self.firstconv(x) if self.pool: @@ -305,6 +366,17 @@ def output_shape(self, inshape: tuple) -> tuple: """ Overview: Calculate the output shape of the downsampling stack according to input shape and related arguments. + Arguments: + - inshape (:obj:`tuple`): Input shape. + Returns: + - output_shape (:obj:`tuple`): Output shape. + Shapes: + - inshape (:obj:`tuple`): :math:`(C, H, W)`, where C is channel number, H is height and W is width. + - output_shape (:obj:`tuple`): :math:`(C, H, W)`, where C is channel number, H is height and W is width. + Examples: + >>> stack = IMPALACnnDownStack(16, 2, 32) + >>> inshape = (16, 84, 84) + >>> output_shape = stack.output_shape(inshape) """ c, h, w = inshape assert c == self.in_channnel @@ -337,7 +409,7 @@ def __init__( ) -> None: """ Overview: - Init the IMPALA CNN encoder according to arguments. + Initialize the IMPALA CNN encoder according to arguments. Arguments: - obs_shape (:obj:`SequenceType`): 2D image observation shape. - channels (:obj:`SequenceType`): The channel number of a series of impala cnn blocks. \ @@ -348,6 +420,7 @@ def __init__( observation, such as dividing 255.0 for the raw image observation. - nblock (:obj:`int`): The number of Residual Block in each block. - final_relu (:obj:`bool`): Whether to use ReLU activation in the final output of encoder. + - kwargs (:obj:`Dict[str, Any]`): Other arguments for ``IMPALACnnDownStack``. """ super().__init__() self.scale_ob = scale_ob @@ -375,6 +448,17 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: - x (:obj:`torch.Tensor`): :math:`(B, C, H, W)`, where B is batch size, C is channel number, H is height \ and W is width. - output (:obj:`torch.Tensor`): :math:`(B, outsize)`, where B is batch size. + Examples: + >>> encoder = IMPALAConvEncoder( + >>> obs_shape=(4, 84, 84), + >>> channels=(16, 32, 32), + >>> outsize=256, + >>> scale_ob=255.0, + >>> nblock=2, + >>> final_relu=True, + >>> ) + >>> x = torch.randn(1, 4, 84, 84) + >>> output = encoder(x) """ x = x / self.scale_ob for (i, layer) in enumerate(self.stacks): diff --git a/ding/model/common/head.py b/ding/model/common/head.py index c1d27fba89..09d73ac578 100755 --- a/ding/model/common/head.py +++ b/ding/model/common/head.py @@ -14,8 +14,8 @@ class DiscreteHead(nn.Module): """ Overview: - The ``DiscreteHead`` used to output discrete actions logit or Q-value logit, which is often used in DQN \ - and policy head in actor-critic algorithms for discrete action space. + The ``DiscreteHead`` is used to generate discrete actions logit or Q-value logit, \ + which is often used in q-learning algorithmns or actor-critic algorithms for discrete action space. Interfaces: ``__init__``, ``forward``. """ @@ -73,7 +73,6 @@ def forward(self, x: torch.Tensor) -> Dict: Shapes: - x: :math:`(B, N)`, where ``B = batch_size`` and ``N = hidden_size``. - logit: :math:`(B, M)`, where ``M = output_size``. - Examples: >>> head = DiscreteHead(64, 64) >>> inputs = torch.randn(4, 64) @@ -87,7 +86,8 @@ def forward(self, x: torch.Tensor) -> Dict: class DistributionHead(nn.Module): """ Overview: - The ``DistributionHead`` used to output Q-value distribution, which is often used in C51 algorithm. + The ``DistributionHead`` is used to generate distribution for Q-value. + This module is used in C51 algorithm. Interfaces: ``__init__``, ``forward``. 
""" @@ -156,7 +156,6 @@ def forward(self, x: torch.Tensor) -> Dict: - x: :math:`(B, N)`, where ``B = batch_size`` and ``N = hidden_size``. - logit: :math:`(B, M)`, where ``M = output_size``. - distribution: :math:`(B, M, n_atom)`. - Examples: >>> head = DistributionHead(64, 64) >>> inputs = torch.randn(4, 64) @@ -177,7 +176,8 @@ def forward(self, x: torch.Tensor) -> Dict: class BranchingHead(nn.Module): """ Overview: - The ``BranchingHead`` used to output different branches Q-value, which is used in Branch DQN. + The ``BranchingHead`` is used to generate Q-value with different branches. + This module is used in Branch DQN. Interfaces: ``__init__``, ``forward``. """ @@ -267,7 +267,6 @@ def forward(self, x: torch.Tensor) -> Dict: Shapes: - x: :math:`(B, N)`, where ``B = batch_size`` and ``N = hidden_size``. - logit: :math:`(B, M)`, where ``M = output_size``. - Examples: >>> head = BranchingHead(64, 5, 2) >>> inputs = torch.randn(4, 64) @@ -290,7 +289,8 @@ def forward(self, x: torch.Tensor) -> Dict: class RainbowHead(nn.Module): """ Overview: - The ``RainbowHead`` used to output Q-value distribution, which is used in Rainbow DQN. + The ``RainbowHead`` is used to generate distribution of Q-value. + This module is used in Rainbow DQN. Interfaces: ``__init__``, ``forward``. """ @@ -370,7 +370,6 @@ def forward(self, x: torch.Tensor) -> Dict: - x: :math:`(B, N)`, where ``B = batch_size`` and ``N = hidden_size``. - logit: :math:`(B, M)`, where ``M = output_size``. - distribution: :math:`(B, M, n_atom)`. - Examples: >>> head = RainbowHead(64, 64) >>> inputs = torch.randn(4, 64) @@ -394,7 +393,7 @@ def forward(self, x: torch.Tensor) -> Dict: class QRDQNHead(nn.Module): """ Overview: - The ``QRDQNHead`` (Quantile Regression DQN) used to output action quantiles. + The ``QRDQNHead`` (Quantile Regression DQN) is used to output action quantiles. Interfaces: ``__init__``, ``forward``. """ @@ -455,7 +454,6 @@ def forward(self, x: torch.Tensor) -> Dict: - logit: :math:`(B, M)`, where ``M = output_size``. - q: :math:`(B, M, num_quantiles)`. - tau: :math:`(B, M, 1)`. - Examples: >>> head = QRDQNHead(64, 64) >>> inputs = torch.randn(4, 64) @@ -478,7 +476,8 @@ def forward(self, x: torch.Tensor) -> Dict: class QuantileHead(nn.Module): """ Overview: - The ``QuantileHead`` used to output action quantiles, which is used in IQN. + The ``QuantileHead`` is used to output action quantiles. + This module is used in IQN. Interfaces: ``__init__``, ``forward``, ``quantile_net``. """ @@ -574,7 +573,6 @@ def forward(self, x: torch.Tensor, num_quantiles: Optional[int] = None) -> Dict: - logit: :math:`(B, M)`, where ``M = output_size``. - q: :math:`(num_quantiles, B, M)`. - quantiles: :math:`(quantile_embedding_size, 1)`. - Examples: >>> head = QuantileHead(64, 64) >>> inputs = torch.randn(4, 64) @@ -609,7 +607,8 @@ def forward(self, x: torch.Tensor, num_quantiles: Optional[int] = None) -> Dict: class FQFHead(nn.Module): """ Overview: - The ``FQFHead`` used to output action quantiles, which is used in ``FQF``. + The ``FQFHead`` is used to output action quantiles. + This module is used in FQF. Interfaces: ``__init__``, ``forward``, ``quantile_net``. """ @@ -779,7 +778,8 @@ def forward(self, x: torch.Tensor, num_quantiles: Optional[int] = None) -> Dict: class DuelingHead(nn.Module): """ Overview: - The ``DuelingHead`` used to output discrete actions logit, which is used in Dueling DQN. + The ``DuelingHead`` is used to output discrete actions logit. + This module is used in Dueling DQN. Interfaces: ``__init__``, ``forward``. 
""" @@ -857,7 +857,6 @@ def forward(self, x: torch.Tensor) -> Dict: Shapes: - x: :math:`(B, N)`, where ``B = batch_size`` and ``N = hidden_size``. - logit: :math:`(B, M)`, where ``M = output_size``. - Examples: >>> head = DuelingHead(64, 64) >>> inputs = torch.randn(4, 64) @@ -874,7 +873,7 @@ def forward(self, x: torch.Tensor) -> Dict: class StochasticDuelingHead(nn.Module): """ Overview: - The ``Stochastic Dueling Network`` proposed in paper ACER (arxiv 1611.01224). \ + The ``Stochastic Dueling Network`` is proposed in paper ACER (arxiv 1611.01224). \ That is to say, dueling network architecture in continuous action space. Interfaces: ``__init__``, ``forward``. @@ -975,6 +974,16 @@ def forward( - sigma: :math:`(B, A)`. - q_value: :math:`(B, 1)`. - v_value: :math:`(B, 1)`. + Examples: + >>> head = StochasticDuelingHead(64, 64) + >>> inputs = torch.randn(4, 64) + >>> a = torch.randn(4, 64) + >>> mu = torch.randn(4, 64) + >>> sigma = torch.ones(4, 64) + >>> outputs = head(inputs, a, mu, sigma) + >>> assert isinstance(outputs, dict) + >>> assert outputs['q_value'].shape == torch.Size([4, 1]) + >>> assert outputs['v_value'].shape == torch.Size([4, 1]) """ batch_size = s.shape[0] # batch_size or batch_size * T @@ -1005,8 +1014,9 @@ def forward( class RegressionHead(nn.Module): """ Overview: - The ``RegressionHead`` used to output continuous actions Q-value (DDPG critic), state value (A2C/PPO), and \ - directly predict continuous action (DDPG actor). + The ``RegressionHead`` is used to regress continuous variables. + This module is used for generating Q-value (DDPG critic) of continuous actions, \ + or state value (A2C/PPO), or directly predicting continuous action (DDPG actor). Interfaces: ``__init__``, ``forward``. """ @@ -1054,7 +1064,6 @@ def forward(self, x: torch.Tensor) -> Dict: Shapes: - x: :math:`(B, N)`, where ``B = batch_size`` and ``N = hidden_size``. - pred: :math:`(B, M)`, where ``M = output_size``. - Examples: >>> head = RegressionHead(64, 64) >>> inputs = torch.randn(4, 64) @@ -1074,7 +1083,9 @@ def forward(self, x: torch.Tensor) -> Dict: class ReparameterizationHead(nn.Module): """ Overview: - The ``ReparameterizationHead`` used to output action ``mu`` and ``sigma``, which is often used in PPO and SAC. + The ``ReparameterizationHead`` is used to generate Gaussian distribution of continuous variable, \ + which is parameterized by ``mu`` and ``sigma``. + This module is often used in stochastic policies, such as PPO and SAC. Interfaces: ``__init__``, ``forward``. """ @@ -1146,7 +1157,6 @@ def forward(self, x: torch.Tensor) -> Dict: - x: :math:`(B, N)`, where ``B = batch_size`` and ``N = hidden_size``. - mu: :math:`(B, M)`, where ``M = output_size``. - sigma: :math:`(B, M)`. - Examples: >>> head = ReparameterizationHead(64, 64, sigma_type='fixed') >>> inputs = torch.randn(4, 64) @@ -1173,7 +1183,8 @@ def forward(self, x: torch.Tensor) -> Dict: class PopArtVHead(nn.Module): """ Overview: - The ``PopArtVHead`` used to output adaptive normalized state value, which is used in PPO/IMPALA. + The ``PopArtVHead`` is used to generate adaptive normalized state value. + This module is used in PPO or IMPALA. Interfaces: ``__init__``, ``forward``. """ @@ -1261,7 +1272,12 @@ def forward(self, key: torch.Tensor, query: torch.Tensor) -> torch.Tensor: ``K = hidden_size``. - query: :math:`(B, K)`. - logit: :math:`(B, N)`. 
- + Examples: + >>> head = AttentionPolicyHead() + >>> key = torch.randn(4, 5, 64) + >>> query = torch.randn(4, 64) + >>> logit = head(key, query) + >>> assert logit.shape == torch.Size([4, 5]) .. note:: In this head, we assume that the ``key`` and ``query`` tensor are both normalized. """ @@ -1274,8 +1290,8 @@ def forward(self, key: torch.Tensor, query: torch.Tensor) -> torch.Tensor: class MultiHead(nn.Module): """ Overview: - The ``MultiHead`` used to output multiple similar results. For example, we can combine ``Distribution`` and \ - ``MultiHead`` to output multi-discrete action space logit. + The ``MultiHead`` is used to generate multiple similar results. + For example, we can combine ``Distribution`` and ``MultiHead`` to generate multi-discrete action space logit. Interfaces: ``__init__``, ``forward``. """ @@ -1308,7 +1324,6 @@ def forward(self, x: torch.Tensor) -> Dict: Shapes: - x: :math:`(B, N)`, where ``B = batch_size`` and ``N = hidden_size``. - logit: :math:`(B, Mi)`, where ``Mi = output_size`` corresponding to output ``i``. - Examples: >>> head = MultiHead(DuelingHead, 64, [2, 3, 5], v_layer_num=2) >>> inputs = torch.randn(4, 64) @@ -1329,7 +1344,7 @@ def forward(self, x: torch.Tensor) -> Dict: class EnsembleHead(nn.Module): """ Overview: - The ``EnsembleHead`` used to output action Q-value for Q-ensemble in model-based RL algorithms. + The ``EnsembleHead`` is used to generate Q-value for Q-ensemble in model-based RL algorithms. Interfaces: ``__init__``, ``forward``. """ @@ -1400,11 +1415,18 @@ def forward(self, x: torch.Tensor) -> Dict: def independent_normal_dist(logits: Union[List, Dict]) -> torch.distributions.Distribution: """ Overview: - The compatibility function to convert different types logit to independent normal distribution. + Convert different types logit to independent normal distribution. Arguments: - logits (:obj:`Union[List, Dict]`): The logits to be converted. Returns: - dist (:obj:`torch.distributions.Distribution`): The converted normal distribution. + Examples: + >>> logits = [torch.randn(4, 5), torch.ones(4, 5)] + >>> dist = independent_normal_dist(logits) + >>> assert isinstance(dist, torch.distributions.Independent) + >>> assert isinstance(dist.base_dist, torch.distributions.Normal) + >>> assert dist.base_dist.loc.shape == torch.Size([4, 5]) + >>> assert dist.base_dist.scale.shape == torch.Size([4, 5]) Raises: - TypeError: If the type of logits is not ``list`` or ``dict``. """ diff --git a/ding/model/common/utils.py b/ding/model/common/utils.py index 0f508de0b8..0ca8df7fb5 100644 --- a/ding/model/common/utils.py +++ b/ding/model/common/utils.py @@ -13,7 +13,14 @@ def create_model(cfg: EasyDict) -> torch.nn.Module: used to import modules, and they key ``type`` is used to indicate the model. Returns: - (:obj:`torch.nn.Module`): The created neural network model. - + Examples: + >>> cfg = EasyDict({ + >>> 'import_names': ['ding.model.template.q_learning'], + >>> 'type': 'dqn', + >>> 'obs_shape': 4, + >>> 'action_shape': 2, + >>> }) + >>> model = create_model(cfg) .. tip:: This method will not modify the ``cfg`` , it will deepcopy the ``cfg`` and then modify it. """ diff --git a/ding/model/template/acer.py b/ding/model/template/acer.py index 2e28ef0b2c..bb46b22bec 100644 --- a/ding/model/template/acer.py +++ b/ding/model/template/acer.py @@ -9,9 +9,11 @@ @MODEL_REGISTRY.register('acer') class ACER(nn.Module): - r""" + """ Overview: - The ACER model. 
+ The model of algorithmn ACER(Actor Critic with Experience Replay) + Sample Efficient Actor-Critic with Experience Replay. + https://arxiv.org/abs/1611.01224 Interfaces: ``__init__``, ``forward``, ``compute_actor``, ``compute_critic`` """ @@ -29,7 +31,7 @@ def __init__( activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, ) -> None: - r""" + """ Overview: Init the ACER Model according to arguments. Arguments: @@ -78,10 +80,10 @@ def __init__( self.critic = nn.ModuleList(self.critic) def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: - r""" + """ Overview: - Use observation to predict output. - Parameter updates with ACER's MLPs forward setup. + Use observation to predict output. + Parameter updates with ACER's MLPs forward setup. Arguments: Forward with ``'compute_actor'``: - inputs (:obj:`torch.Tensor`): @@ -101,11 +103,9 @@ def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: Forward with ``'compute_critic'``, Necessary Keys: - q_value (:obj:`torch.Tensor`): Q value tensor. - Actor Shapes: - obs (:obj:`torch.Tensor`): :math:`(B, N1)`, where B is batch size and N1 is ``obs_shape`` - logit (:obj:`torch.FloatTensor`): :math:`(B, N2)`, where B is batch size and N2 is ``action_shape`` - Critic Shapes: - inputs (:obj:`torch.Tensor`): :math:`(B, N1)`, B is batch size and N1 corresponds to ``obs_shape`` - q_value (:obj:`torch.FloatTensor`): :math:`(B, N2)`, where B is batch size and N2 is ``action_shape`` @@ -115,24 +115,16 @@ def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: >>> inputs = torch.randn(4, 64) >>> actor_outputs = model(inputs,'compute_actor') >>> assert actor_outputs['logit'].shape == torch.Size([4, 64]) - Critic Examples: >>> inputs = torch.randn(4,N) >>> model = ACER(obs_shape=(N, ),action_shape=5) - >>> model(inputs, mode='compute_critic')['q_value'] # q value - tensor([[-0.0681, -0.0431, -0.0530, 0.1454, -0.1093], - [-0.0647, -0.0281, -0.0527, 0.1409, -0.1162], - [-0.0596, -0.0321, -0.0676, 0.1386, -0.1113], - [-0.0874, -0.0406, -0.0487, 0.1346, -0.1135]], - grad_fn=) - - + >>> model(inputs, mode='compute_critic')['q_value'] """ assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) return getattr(self, mode)(inputs) def compute_actor(self, inputs: torch.Tensor) -> Dict: - r""" + """ Overview: Use encoded embedding tensor to predict output. Execute parameter updates with ``'compute_actor'`` mode @@ -144,7 +136,6 @@ def compute_actor(self, inputs: torch.Tensor) -> Dict: - mode (:obj:`str`): Name of the forward mode. Returns: - outputs (:obj:`Dict`): Outputs of forward pass encoder and head. - ReturnsKeys (either): - logit (:obj:`torch.FloatTensor`): :math:`(B, N1)`, where B is batch size and N1 is ``action_shape`` Shapes: @@ -163,7 +154,7 @@ def compute_actor(self, inputs: torch.Tensor) -> Dict: return x def compute_critic(self, inputs: torch.Tensor) -> Dict: - r""" + """ Overview: Execute parameter updates with ``'compute_critic'`` mode Use encoded embedding tensor to predict output. @@ -172,22 +163,15 @@ def compute_critic(self, inputs: torch.Tensor) -> Dict: - mode (:obj:`str`): Name of the forward mode. Returns: - outputs (:obj:`Dict`): Q-value output. - ReturnKeys: - q_value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. Shapes: - obs (:obj:`torch.Tensor`): :math:`(B, N1)`, where B is batch size and N1 is ``obs_shape`` - q_value (:obj:`torch.FloatTensor`): :math:`(B, N2)`, where B is batch size and N2 is ``action_shape``. 
-
        Examples:
            >>> inputs =torch.randn(4, N)
            >>> model = ACER(obs_shape=(N, ),action_shape=5)
-            >>> model(inputs, mode='compute_critic')['q_value'] # q value
-            tensor([[-0.0681, -0.0431, -0.0530, 0.1454, -0.1093],
-                [-0.0647, -0.0281, -0.0527, 0.1409, -0.1162],
-                [-0.0596, -0.0321, -0.0676, 0.1386, -0.1113],
-                [-0.0874, -0.0406, -0.0487, 0.1346, -0.1135]],
-                grad_fn=)
+            >>> model(inputs, mode='compute_critic')['q_value']
        """
        obs = inputs
diff --git a/ding/model/template/atoc.py b/ding/model/template/atoc.py
index f0863481c2..5d05226505 100644
--- a/ding/model/template/atoc.py
+++ b/ding/model/template/atoc.py
@@ -5,28 +5,24 @@
 from ding.utils import squeeze, MODEL_REGISTRY, SequenceType
 from ding.torch_utils import MLP
-from ..common import RegressionHead
+from ding.model.common import RegressionHead


class ATOCAttentionUnit(nn.Module):
-    r"""
+    """
    Overview:
-        the attention unit of the atoc network. We now implement it as two-layer MLP, same as the original paper
-
+        The attention unit of the ATOC network. We now implement it as a two-layer MLP, the same as in the original paper.
    Interface:
-        __init__, forward
+        ``__init__``, ``forward``

    .. note::
-
        "ATOC paper: We use two-layer MLP to implement the attention unit but it is also can be realized by RNN."
-
    """

    def __init__(self, thought_size: int, embedding_size: int) -> None:
-        r"""
+        """
        Overview:
-            init the attention unit according to the size of input args
-
+            Initialize the attention unit according to the size of input arguments.
        Arguments:
            - thought_size (:obj:`int`): the size of input thought
            - embedding_size (:obj:`int`): the size of hidden layers
@@ -42,15 +38,19 @@ def __init__(self, thought_size: int, embedding_size: int) -> None:
        self._act2 = nn.Sigmoid()

    def forward(self, data: Union[Dict, torch.Tensor]) -> torch.Tensor:
-        r"""
+        """
        Overview:
-            forward method take the thought of agents as input and output the prob of these agent\
-            being initiator
-
+            Take the thought of agents as input and generate the probability of these agents being initiators.
        Arguments:
            - x (:obj:`Union[Dict, torch.Tensor`): the input tensor or dict contain the thoughts tensor
-            - ret (:obj:`torch.Tensor`): the output initiator prob
-
+            - ret (:obj:`torch.Tensor`): the output initiator probability
+        Shapes:
+            - data['thought']: :math:`(M, B, N)`, M is the num of thoughts to integrate,\
+                B is batch_size and N is thought size
+        Examples:
+            >>> attention_unit = ATOCAttentionUnit(64, 64)
+            >>> thought = torch.randn(2, 3, 64)
+            >>> attention_unit(thought)
        """
        x = data
        if isinstance(data, Dict):
@@ -61,24 +61,21 @@ def forward(self, data: Union[Dict, torch.Tensor]) -> torch.Tensor:
        x = self._act1(x)
        x = self._fc3(x)
        x = self._act2(x)
-        # return {'initiator': x}
        return x.squeeze(-1)


class ATOCCommunicationNet(nn.Module):
-    r"""
+    """
    Overview:
-        atoc commnication net is a bi-direction LSTM, so it can integrate all the thoughts in the group
-
+        This ATOC communication net is a bi-directional LSTM, so it can integrate all the thoughts in the group.
    Interface:
-        __init__, forward
+        ``__init__``, ``forward``
    """

    def __init__(self, thought_size: int) -> None:
-        r"""
+        """
        Overview:
-            init method of the communication network
-
+            Initialize the communication network according to the size of input arguments.
Arguments: - thought_size (:obj:`int`): the size of input thought @@ -93,32 +90,34 @@ def __init__(self, thought_size: int) -> None: self._bi_lstm = nn.LSTM(self._thought_size, self._comm_hidden_size, bidirectional=True) def forward(self, data: Union[Dict, torch.Tensor]): - r""" + """ Overview: - the forward method that integrate thoughts + The forward of ATOCCommunicationNet integrates thoughts in the group. Arguments: - x (:obj:`Union[Dict, torch.Tensor`): the input tensor or dict contain the thoughts tensor - out (:obj:`torch.Tensor`): the integrated thoughts Shapes: - data['thoughts']: :math:`(M, B, N)`, M is the num of thoughts to integrate,\ B is batch_size and N is thought size + Examples: + >>> comm_net = ATOCCommunicationNet(64) + >>> thoughts = torch.randn(2, 3, 64) + >>> comm_net(thoughts) """ self._bi_lstm.flatten_parameters() x = data if isinstance(data, Dict): x = data['thoughts'] out, _ = self._bi_lstm(x) - # return {'thoughts': out} return out class ATOCActorNet(nn.Module): - r""" + """ Overview: - the overall ATOC actor network - + The actor network of ATOC. Interface: - __init__, forward + ``__init__``, ``forward`` .. note:: "ATOC paper: The neural networks use ReLU and batch normalization for some hidden layers." @@ -139,10 +138,9 @@ def __init__( activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, ): - r""" + """ Overview: - the init method of atoc actor network - + Initialize the actor network of ATOC Arguments: - obs_shape(:obj:`Union[Tuple, int]`): the observation size - thought_size (:obj:`int`): the size of thoughts @@ -194,11 +192,9 @@ def __init__( self.comm_net = ATOCCommunicationNet(self._thought_size) def forward(self, obs: torch.Tensor) -> Dict: - r""" + """ Overview: - the forward method of actor network, take the input obs, and calculate the corresponding action, group, \ - initiator_prob, thoughts, etc... - + Take the input obs, and calculate the corresponding action, group, initiator_prob, thoughts, etc... 
Arguments: - obs (:obj:`Dict`): the input obs containing the observation Returns: @@ -207,6 +203,18 @@ def forward(self, obs: torch.Tensor) -> Dict: ReturnsKeys: - necessary: ``action`` - optional: ``group``, ``initiator_prob``, ``is_initiator``, ``new_thoughts``, ``old_thoughts`` + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, A, N)`, where B is batch size, A is agent num, N is obs size + - action (:obj:`torch.Tensor`): :math:`(B, A, M)`, where M is action size + - group (:obj:`torch.Tensor`): :math:`(B, A, A)` + - initiator_prob (:obj:`torch.Tensor`): :math:`(B, A)` + - is_initiator (:obj:`torch.Tensor`): :math:`(B, A)` + - new_thoughts (:obj:`torch.Tensor`): :math:`(B, A, M)` + - old_thoughts (:obj:`torch.Tensor`): :math:`(B, A, M)` + Examples: + >>> actor_net = ATOCActorNet(64, 64, 64, 3) + >>> obs = torch.randn(2, 3, 64) + >>> actor_net(obs) """ assert len(obs.shape) == 3 self._cur_batch_size = obs.shape[0] @@ -238,6 +246,25 @@ def forward(self, obs: torch.Tensor) -> Dict: return {'action': action} def _get_initiate_group(self, current_thoughts): + """ + Overview: + Calculate the initiator probability, group and is_initiator + Arguments: + - current_thoughts (:obj:`torch.Tensor`): tensor of current thoughts + Returns: + - init_prob (:obj:`torch.Tensor`): tesnor of initiator probability + - is_initiator (:obj:`torch.Tensor`): tensor of is initiator + - group (:obj:`torch.Tensor`): tensor of group + Shapes: + - current_thoughts (:obj:`torch.Tensor`): :math:`(B, A, M)`, where M is thought size + - init_prob (:obj:`torch.Tensor`): :math:`(B, A)` + - is_initiator (:obj:`torch.Tensor`): :math:`(B, A)` + - group (:obj:`torch.Tensor`): :math:`(B, A, A)` + Examples: + >>> actor_net = ATOCActorNet(64, 64, 64, 3) + >>> current_thoughts = torch.randn(2, 3, 64) + >>> actor_net._get_initiate_group(current_thoughts) + """ if not self._communication: raise NotImplementedError init_prob = self.attention(current_thoughts) # B, A @@ -267,10 +294,25 @@ def _get_initiate_group(self, current_thoughts): def _get_new_thoughts(self, current_thoughts, group, is_initiator): """ + Overview: + Calculate the new thoughts according to current thoughts, group and is_initiator + Arguments: + - current_thoughts (:obj:`torch.Tensor`): tensor of current thoughts + - group (:obj:`torch.Tensor`): tensor of group + - is_initiator (:obj:`torch.Tensor`): tensor of is initiator + Returns: + - new_thoughts (:obj:`torch.Tensor`): tensor of new thoughts Shapes: - current_thoughts (:obj:`torch.Tensor`): :math:`(B, A, M)`, where M is thought size - group: (:obj:`torch.Tensor`): :math:`(B, A, A)` - is_initiator (:obj:`torch.Tensor`): :math:`(B, A)` + - new_thoughts (:obj:`torch.Tensor`): :math:`(B, A, M)` + Examples: + >>> actor_net = ATOCActorNet(64, 64, 64, 3) + >>> current_thoughts = torch.randn(2, 3, 64) + >>> group = torch.randn(2, 3, 3) + >>> is_initiator = torch.randn(2, 3) + >>> actor_net._get_new_thoughts(current_thoughts, group, is_initiator) """ if not self._communication: raise NotImplementedError @@ -306,12 +348,13 @@ def _get_new_thoughts(self, current_thoughts, group, is_initiator): @MODEL_REGISTRY.register('atoc') class ATOC(nn.Module): - r""" + """ Overview: The QAC network of ATOC, a kind of extension of DDPG for MARL. 
- + Learning Attentional Communication for Multi-Agent Cooperation + https://arxiv.org/abs/1805.07733 Interface: - __init__, forward, compute_critic, compute_actor, optimize_actor_attention + ``__init__``, ``forward``, ``compute_critic``, ``compute_actor``, ``optimize_actor_attention`` """ mode = ['compute_actor', 'compute_critic', 'optimize_actor_attention'] @@ -330,10 +373,9 @@ def __init__( activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, ) -> None: - r""" + """ Overview: - init the atoc QAC network - + Initialize the ATOC QAC network Arguments: - obs_shape(:obj:`Union[Tuple, int]`): the observation space shape - thought_size (:obj:`int`): the size of thoughts @@ -367,16 +409,33 @@ def __init__( ) def _compute_delta_q(self, obs: torch.Tensor, actor_outputs: Dict) -> torch.Tensor: - r""" + """ Overview: calculate the delta_q according to obs and actor_outputs - Arguments: - obs (:obj:`torch.Tensor`): the observations - actor_outputs (:obj:`dict`): the output of actors - delta_q (:obj:`Dict`): the calculated delta_q + Returns: + - delta_q (:obj:`Dict`): the calculated delta_q ArgumentsKeys: - necessary: ``new_thoughts``, ``old_thoughts``, ``group``, ``is_initiator`` + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, A, N)`, where B is batch size, A is agent num, N is obs size + - actor_outputs (:obj:`Dict`): the output of actor network, including ``action``, ``new_thoughts``, ``old_thoughts``, \ + ``group``, ``initiator_prob``, ``is_initiator`` + - action (:obj:`torch.Tensor`): :math:`(B, A, M)` where M is action size + - new_thoughts (:obj:`torch.Tensor`): :math:`(B, A, M)` where M is thought size + - old_thoughts (:obj:`torch.Tensor`): :math:`(B, A, M)` where M is thought size + - group (:obj:`torch.Tensor`): :math:`(B, A, A)` + - initiator_prob (:obj:`torch.Tensor`): :math:`(B, A)` + - is_initiator (:obj:`torch.Tensor`): :math:`(B, A)` + - delta_q (:obj:`torch.Tensor`): :math:`(B, A)` + Examples: + >>> net = ATOC(64, 64, 64, 3) + >>> obs = torch.randn(2, 3, 64) + >>> actor_outputs = net.compute_actor(obs) + >>> net._compute_delta_q(obs, actor_outputs) """ if not self._communication: raise NotImplementedError @@ -413,10 +472,9 @@ def _compute_delta_q(self, obs: torch.Tensor, actor_outputs: Dict) -> torch.Tens return curr_delta_q def compute_actor(self, obs: torch.Tensor, get_delta_q: bool = False) -> Dict[str, torch.Tensor]: - r''' + ''' Overview: compute the action according to inputs, call the _compute_delta_q function to compute delta_q - Arguments: - obs (:obj:`torch.Tensor`): observation - get_delta_q (:obj:`bool`) : whether need to get delta_q @@ -425,7 +483,19 @@ def compute_actor(self, obs: torch.Tensor, get_delta_q: bool = False) -> Dict[st ReturnsKeys: - necessary: ``action`` - optional: ``group``, ``initiator_prob``, ``is_initiator``, ``new_thoughts``, ``old_thoughts``, ``delta_q`` - + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, A, N)`, where B is batch size, A is agent num, N is obs size + - action (:obj:`torch.Tensor`): :math:`(B, A, M)`, where M is action size + - group (:obj:`torch.Tensor`): :math:`(B, A, A)` + - initiator_prob (:obj:`torch.Tensor`): :math:`(B, A)` + - is_initiator (:obj:`torch.Tensor`): :math:`(B, A)` + - new_thoughts (:obj:`torch.Tensor`): :math:`(B, A, M)` + - old_thoughts (:obj:`torch.Tensor`): :math:`(B, A, M)` + - delta_q (:obj:`torch.Tensor`): :math:`(B, A)` + Examples: + >>> net = ATOC(64, 64, 64, 3) + >>> obs = torch.randn(2, 3, 64) + >>> net.compute_actor(obs) ''' outputs = self.actor(obs) if get_delta_q and 
self._communication: @@ -435,10 +505,25 @@ def compute_actor(self, obs: torch.Tensor, get_delta_q: bool = False) -> Dict[st def compute_critic(self, inputs: Dict) -> Dict: """ + Overview: + compute the q_value according to inputs + Arguments: + - inputs (:obj:`Dict`): the inputs contain the obs and action + Returns: + - outputs (:obj:`Dict`): the output of critic network ArgumentsKeys: - necessary: ``obs``, ``action`` ReturnsKeys: - necessary: ``q_value`` + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, A, N)`, where B is batch size, A is agent num, N is obs size + - action (:obj:`torch.Tensor`): :math:`(B, A, M)`, where M is action size + - q_value (:obj:`torch.Tensor`): :math:`(B, A)` + Examples: + >>> net = ATOC(64, 64, 64, 3) + >>> obs = torch.randn(2, 3, 64) + >>> action = torch.randn(2, 3, 64) + >>> net.compute_critic({'obs': obs, 'action': action}) """ obs, action = inputs['obs'], inputs['action'] if len(action.shape) == 2: # (B, A) -> (B, A, 1) @@ -448,14 +533,28 @@ def compute_critic(self, inputs: Dict) -> Dict: return {'q_value': x} def optimize_actor_attention(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: - r""" + """ Overview: return the actor attention loss - Arguments: - inputs (:obj:`Dict`): the inputs contain the delta_q, initiator_prob, and is_initiator Returns - loss (:obj:`Dict`): the loss of actor attention unit + ArgumentsKeys: + - necessary: ``delta_q``, ``initiator_prob``, ``is_initiator`` + ReturnsKeys: + - necessary: ``loss`` + Shapes: + - delta_q (:obj:`torch.Tensor`): :math:`(B, A)` + - initiator_prob (:obj:`torch.Tensor`): :math:`(B, A)` + - is_initiator (:obj:`torch.Tensor`): :math:`(B, A)` + - loss (:obj:`torch.Tensor`): :math:`(1)` + Examples: + >>> net = ATOC(64, 64, 64, 3) + >>> delta_q = torch.randn(2, 3) + >>> initiator_prob = torch.randn(2, 3) + >>> is_initiator = torch.randn(2, 3) + >>> net.optimize_actor_attention({'delta_q': delta_q, 'initiator_prob': initiator_prob, 'is_initiator': is_initiator}) """ if not self._communication: raise NotImplementedError diff --git a/ding/model/template/bc.py b/ding/model/template/bc.py index ce58ca8c5f..753868437b 100644 --- a/ding/model/template/bc.py +++ b/ding/model/template/bc.py @@ -188,13 +188,20 @@ def forward(self, inputs: Union[torch.Tensor, Dict[str, torch.Tensor]]) -> Dict: - inputs (:obj:`torch.Tensor`): Observation data, defaults to tensor. Returns: - output (:obj:`Dict`): Output dict data, including different key-values among distinct action_space. - + ReturnsKeys: + - action (:obj:`torch.Tensor`): Continuous action output of actor network, \ + with shape :math:`(B, action_shape)`. + - logit (:obj:`List[torch.Tensor]`): Continuous action output of actor network, \ + with shape :math:`(B, action_shape)`. 
+ Shapes: + - inputs (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is ``obs_shape`` + - action (:obj:`torch.FloatTensor`): :math:`(B, M)`, where B is batch size and M is ``action_shape`` + - logit (:obj:`List[torch.FloatTensor]`): :math:`(B, M)`, where B is batch size and M is ``action_shape`` Examples (Regression): >>> model = ContinuousBC(32, 6, action_space='regression') >>> inputs = torch.randn(4, 32) >>> outputs = model(inputs) >>> assert isinstance(outputs, dict) and outputs['action'].shape == torch.Size([4, 6]) - Examples (Reparameterization): >>> model = ContinuousBC(32, 6, action_space='reparameterization') >>> inputs = torch.randn(4, 32) diff --git a/ding/model/template/bcq.py b/ding/model/template/bcq.py index 7b8d013e9e..bd37afbdc1 100755 --- a/ding/model/template/bcq.py +++ b/ding/model/template/bcq.py @@ -11,6 +11,16 @@ @MODEL_REGISTRY.register('bcq') class BCQ(nn.Module): + """ + Overview: + Model of BCQ (Batch-Constrained deep Q-learning). + Off-Policy Deep Reinforcement Learning without Exploration. + https://arxiv.org/abs/1812.02900 + Interface: + ``forward``, ``compute_actor``, ``compute_critic``, ``compute_vae``, ``compute_eval`` + Property: + ``mode`` + """ mode = ['compute_actor', 'compute_critic', 'compute_vae', 'compute_eval'] @@ -93,8 +103,14 @@ def forward(self, inputs: Dict[str, torch.Tensor], mode: str) -> Dict[str, torch - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. Returns: - output (:obj:`Dict`): Output dict data, including action tensor. - - + Examples: + >>> inputs = {'obs': torch.randn(4, 32), 'action': torch.randn(4, 6)} + >>> model = BCQ(32, 6) + >>> outputs = model(inputs, mode='compute_actor') + >>> outputs = model(inputs, mode='compute_critic') + >>> outputs = model(inputs, mode='compute_vae') + >>> outputs = model(inputs, mode='compute_eval') + .. note:: For specific examples, one can refer to API doc of ``compute_actor`` and ``compute_critic`` respectively. """ @@ -102,6 +118,21 @@ def forward(self, inputs: Dict[str, torch.Tensor], mode: str) -> Dict[str, torch return getattr(self, mode)(inputs) def compute_critic(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + """ + Overview: + Use critic network to compute q value. + Arguments: + - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. + Returns: + - outputs (:obj:`Dict`): Dict containing keywords ``q_value`` (:obj:`torch.Tensor`). + Shapes: + - inputs (:obj:`Dict`): :math:`(B, N, D)`, where B is batch size, N is sample number, D is input dimension. + - outputs (:obj:`Dict`): :math:`(B, N)`. + Examples: + >>> inputs = {'obs': torch.randn(4, 32), 'action': torch.randn(4, 6)} + >>> model = BCQ(32, 6) + >>> outputs = model.compute_critic(inputs) + """ obs, action = inputs['obs'], inputs['action'] if len(action.shape) == 1: # (B, ) -> (B, 1) action = action.unsqueeze(1) @@ -110,6 +141,21 @@ def compute_critic(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Ten return {'q_value': x} def compute_actor(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]: + """ + Overview: + Use actor network to compute action. + Arguments: + - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. + Returns: + - outputs (:obj:`Dict`): Dict containing keywords ``action`` (:obj:`torch.Tensor`). + Shapes: + - inputs (:obj:`Dict`): :math:`(B, N, D)`, where B is batch size, N is sample number, D is input dimension. + - outputs (:obj:`Dict`): :math:`(B, N)`. 
+ Examples: + >>> inputs = {'obs': torch.randn(4, 32), 'action': torch.randn(4, 6)} + >>> model = BCQ(32, 6) + >>> outputs = model.compute_actor(inputs) + """ input = torch.cat([inputs['obs'], inputs['action']], -1) x = self.actor(input) action = self.phi * 1 * torch.tanh(x) @@ -117,9 +163,41 @@ def compute_actor(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, Union[torc return {'action': action} def compute_vae(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + """ + Overview: + Use vae network to compute action. + Arguments: + - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. + Returns: + - outputs (:obj:`Dict`): Dict containing keywords ``recons_action`` (:obj:`torch.Tensor`), \ + ``prediction_residual`` (:obj:`torch.Tensor`), ``input`` (:obj:`torch.Tensor`), \ + ``mu`` (:obj:`torch.Tensor`), ``log_var`` (:obj:`torch.Tensor`) and ``z`` (:obj:`torch.Tensor`). + Shapes: + - inputs (:obj:`Dict`): :math:`(B, N, D)`, where B is batch size, N is sample number, D is input dimension. + - outputs (:obj:`Dict`): :math:`(B, N)`. + Examples: + >>> inputs = {'obs': torch.randn(4, 32), 'action': torch.randn(4, 6)} + >>> model = BCQ(32, 6) + >>> outputs = model.compute_vae(inputs) + """ return self.vae.forward(inputs) def compute_eval(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + """ + Overview: + Use actor network to compute action. + Arguments: + - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. + Returns: + - outputs (:obj:`Dict`): Dict containing keywords ``action`` (:obj:`torch.Tensor`). + Shapes: + - inputs (:obj:`Dict`): :math:`(B, N, D)`, where B is batch size, N is sample number, D is input dimension. + - outputs (:obj:`Dict`): :math:`(B, N)`. + Examples: + >>> inputs = {'obs': torch.randn(4, 32), 'action': torch.randn(4, 6)} + >>> model = BCQ(32, 6) + >>> outputs = model.compute_eval(inputs) + """ obs = inputs['obs'] obs_rep = obs.clone().unsqueeze(0).repeat_interleave(100, dim=0) z = torch.randn((obs_rep.shape[0], obs_rep.shape[1], self.action_shape * 2)).to(obs.device).clamp(-0.5, 0.5) diff --git a/ding/model/template/collaq.py b/ding/model/template/collaq.py index dd8db59986..01136211f5 100644 --- a/ding/model/template/collaq.py +++ b/ding/model/template/collaq.py @@ -14,7 +14,7 @@ class CollaQMultiHeadAttention(nn.Module): Overview: The head of collaq attention module. 
Interface: - __init__, forward + ``__init__``, ``forward`` """ def __init__( @@ -69,8 +69,26 @@ def forward(self, q, k, v, mask=None): - q (:obj:`torch.nn.Sequential`): the transformer information q - k (:obj:`torch.nn.Sequential`): the transformer information k - v (:obj:`torch.nn.Sequential`): the transformer information v - Output: + Returns: - q (:obj:`torch.nn.Sequential`): the transformer output q + - residual (:obj:`torch.nn.Sequential`): the transformer output residual + Shapes: + - q (:obj:`torch.nn.Sequential`): :math:`(B, L, N)` where B is batch_size, L is sequence length, \ + N is the size of input q + - k (:obj:`torch.nn.Sequential`): :math:`(B, L, N)` where B is batch_size, L is sequence length, \ + N is the size of input k + - v (:obj:`torch.nn.Sequential`): :math:`(B, L, N)` where B is batch_size, L is sequence length, \ + N is the size of input v + - q (:obj:`torch.nn.Sequential`): :math:`(B, L, N)` where B is batch_size, L is sequence length, \ + N is the size of output q + - residual (:obj:`torch.nn.Sequential`): :math:`(B, L, N)` where B is batch_size, L is sequence length, \ + N is the size of output residual + Examples: + >>> net = CollaQMultiHeadAttention(1, 2, 3, 4, 5, 6) + >>> q = torch.randn(1, 2, 2) + >>> k = torch.randn(1, 3, 3) + >>> v = torch.randn(1, 3, 3) + >>> q, residual = net(q, k, v) """ d_k, d_v, n_head = self.d_k, self.d_v, self.n_head batch_size, len_q, len_k, len_v = q.size(0), q.size(1), k.size(1), v.size(1) @@ -104,7 +122,7 @@ class CollaQSMACAttentionModule(nn.Module): Collaq attention module. Used to get agent's attention observation. It includes agent's observation\ and agent's part of the observation information of the agent's concerned allies Interface: - __init__, _cut_obs, forward + ``__init__``, ``_cut_obs``, ``forward`` """ def __init__( @@ -140,9 +158,16 @@ def _cut_obs(self, obs: torch.Tensor): cut the observed information into self's observation and allay's observation Arguments: - obs (:obj:`torch.Tensor`): input each agent's observation - Return: + Returns: - self_features (:obj:`torch.Tensor`): output self agent's attention observation - ally_features (:obj:`torch.Tensor`): output ally agent's attention observation + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(T, B, A, N)` where T is timestep, B is batch_size, \ + A is agent_num, N is obs_shape + - self_features (:obj:`torch.Tensor`): :math:`(T, B, A, N)` where T is timestep, B is batch_size, \ + A is agent_num, N is self_feature_range[1] - self_feature_range[0] + - ally_features (:obj:`torch.Tensor`): :math:`(T, B, A, N)` where T is timestep, B is batch_size, \ + A is agent_num, N is ally_feature_range[1] - ally_feature_range[0] """ # obs shape = (T, B, A, obs_shape) self_features = obs[:, :, :, self.self_feature_range[0]:self.self_feature_range[1]] @@ -155,8 +180,11 @@ def forward(self, inputs: torch.Tensor): forward computation to get agent's attention observation information Arguments: - obs (:obj:`torch.Tensor`): input each agent's observation - Return: + Returns: - obs (:obj:`torch.Tensor`): output agent's attention observation + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(T, B, A, N)` where T is timestep, B is batch_size, \ + A is agent_num, N is obs_shape """ # obs shape = (T, B ,A, obs_shape) obs = inputs @@ -182,9 +210,16 @@ def forward(self, inputs: torch.Tensor): class CollaQ(nn.Module): """ Overview: - CollaQ network + The network of CollaQ (Collaborative Q-learning) algorithm. + It includes two parts: q_network and q_alone_network. 
+ The q_network is used to get the q_value of the agent's observation and \ + the agent's part of the observation information of the agent's concerned allies. + The q_alone_network is used to get the q_value of the agent's observation and \ + the agent's observation information without the agent's concerned allies. + Multi-Agent Collaboration via Reward Attribution Decomposition + https://arxiv.org/abs/2010.08531 Interface: - __init__, forward, _setup_global_encoder + ``__init__``, ``forward``, ``_setup_global_encoder`` """ def __init__( @@ -275,7 +310,8 @@ def __init__( def forward(self, data: dict, single_step: bool = True) -> dict: """ Overview: - forward computation graph of collaQ network + The forward method calculates the q_value of each agent and the total q_value of all agents. + The q_value of each agent is calculated by the q_network, and the total q_value is calculated by the mixer. Arguments: - data (:obj:`dict`): input data dict with keys ['obs', 'prev_state', 'action'] - agent_state (:obj:`torch.Tensor`): each agent local state(obs) @@ -302,6 +338,32 @@ def forward(self, data: dict, single_step: bool = True) -> dict: - total_q (:obj:`torch.Tensor`): :math:`(T, B)` - agent_q (:obj:`torch.Tensor`): :math:`(T, B, A, P)`, where P is action_shape - next_state (:obj:`list`): math:`(B, A)`, a list of length B, and each element is a list of length A + Examples: + >>> collaQ_model = CollaQ( + >>> agent_num=4, + >>> obs_shape=32, + >>> alone_obs_shape=24, + >>> global_obs_shape=32 * 4, + >>> action_shape=9, + >>> hidden_size_list=[128, 64], + >>> self_feature_range=[8, 10], + >>> ally_feature_range=[10, 16], + >>> attention_size=64, + >>> mixer=True, + >>> activation=torch.nn.Tanh() + >>> ) + >>> data={ + >>> 'obs': { + >>> 'agent_state': torch.randn(8, 4, 4, 32), + >>> 'agent_alone_state': torch.randn(8, 4, 4, 24), + >>> 'agent_alone_padding_state': torch.randn(8, 4, 4, 32), + >>> 'global_state': torch.randn(8, 4, 32 * 4), + >>> 'action_mask': torch.randint(0, 2, size=(8, 4, 4, 9)) + >>> }, + >>> 'prev_state': [[[None for _ in range(4)] for _ in range(3)] for _ in range(4)], + >>> 'action': torch.randint(0, 9, size=(8, 4, 4)) + >>> } + >>> output = collaQ_model(data, single_step=False) """ agent_state, agent_alone_state = data['obs']['agent_state'], data['obs']['agent_alone_state'] agent_alone_padding_state = data['obs']['agent_alone_padding_state'] @@ -426,7 +488,7 @@ def _setup_global_encoder(self, global_obs_shape: int, embedding_size: int) -> t Arguments: - global_obs_shape (:obj:`int`): the dimension of global observation state - embedding_size (:obj:`int`): the dimension of state emdedding - Return: + Returns: - outputs (:obj:`torch.nn.Module`): Global observation encoding network """ return MLP(global_obs_shape, embedding_size, embedding_size, 2, activation=self._act) diff --git a/ding/model/template/coma.py b/ding/model/template/coma.py index c120dfd78d..cccabff9f4 100644 --- a/ding/model/template/coma.py +++ b/ding/model/template/coma.py @@ -11,9 +11,9 @@ class COMAActorNetwork(nn.Module): """ Overview: - Decentralized actor network in COMA + Decentralized actor network in COMA algorithm. 
Interface: - __init__, forward + ``__init__``, ``forward`` """ def __init__( @@ -24,7 +24,7 @@ def __init__( ): """ Overview: - initialize COMA actor network + Initialize COMA actor network Arguments: - obs_shape (:obj:`int`): the dimension of each agent's observation state - action_shape (:obj:`int`): the dimension of action shape @@ -35,10 +35,30 @@ def __init__( def forward(self, inputs: Dict) -> Dict: """ + Overview: + The forward computation graph of COMA actor network + Arguments: + - inputs (:obj:`dict`): input data dict with keys ['obs', 'prev_state'] + - agent_state (:obj:`torch.Tensor`): each agent local state(obs) + - action_mask (:obj:`torch.Tensor`): the masked action + - prev_state (:obj:`torch.Tensor`): the previous hidden state + Returns: + - output (:obj:`dict`): output data dict with keys ['logit', 'next_state', 'action_mask'] ArgumentsKeys: - necessary: ``obs`` { ``agent_state``, ``action_mask`` }, ``prev_state`` ReturnsKeys: - necessary: ``logit``, ``next_state``, ``action_mask`` + Examples: + >>> T, B, A, N = 4, 8, 3, 32 + >>> embedding_dim = 64 + >>> action_dim = 6 + >>> data = torch.randn(T, B, A, N) + >>> model = COMAActorNetwork((N, ), action_dim, [128, embedding_dim]) + >>> prev_state = [[None for _ in range(A)] for _ in range(B)] + >>> for t in range(T): + >>> inputs = {'obs': {'agent_state': data[t], 'action_mask': None}, 'prev_state': prev_state} + >>> outputs = model(inputs) + >>> logit, prev_state = outputs['logit'], outputs['next_state'] """ agent_state = inputs['obs']['agent_state'] prev_state = inputs['prev_state'] @@ -62,9 +82,9 @@ def forward(self, inputs: Dict) -> Dict: class COMACriticNetwork(nn.Module): """ Overview: - Centralized critic network in COMA + Centralized critic network in COMA algorithm. Interface: - __init__, forward + ``__init__``, ``forward`` """ def __init__( @@ -80,6 +100,14 @@ def __init__( - input_size (:obj:`int`): the size of input global observation - action_shape (:obj:`int`): the dimension of action shape - hidden_size_list (:obj:`list`): the list of hidden size, default to 128 + Returns: + - output (:obj:`dict`): output data dict with keys ['q_value'] + Shapes: + - obs (:obj:`dict`): ``agent_state``: :math:`(T, B, A, N, D)`, ``action_mask``: :math:`(T, B, A, N, A)` + - prev_state (:obj:`list`): :math:`[[[h, c] for _ in range(A)] for _ in range(B)]` + - logit (:obj:`torch.Tensor`): :math:`(T, B, A, N, A)` + - next_state (:obj:`list`): :math:`[[[h, c] for _ in range(A)] for _ in range(B)]` + - action_mask (:obj:`torch.Tensor`): :math:`(T, B, A, N, A)` """ super(COMACriticNetwork, self).__init__() self.action_shape = action_shape @@ -101,6 +129,18 @@ def forward(self, data: Dict) -> Dict: - necessary: ``obs`` { ``agent_state``, ``global_state`` }, ``action``, ``prev_state`` ReturnsKeys: - necessary: ``q_value`` + Examples: + >>> agent_num, bs, T = 4, 3, 8 + >>> obs_dim, global_obs_dim, action_dim = 32, 32 * 4, 9 + >>> coma_model = COMACriticNetwork(obs_dim - action_dim + global_obs_dim + 2 * action_dim * agent_num, action_dim) + >>> data = { + >>> 'obs': { + >>> 'agent_state': torch.randn(T, bs, agent_num, obs_dim), + >>> 'global_state': torch.randn(T, bs, global_obs_dim), + >>> }, + >>> 'action': torch.randint(0, action_dim, size=(T, bs, agent_num)), + >>> } + >>> output = coma_model(data) """ x = self._preprocess_data(data) q = self.mlp(x) @@ -145,8 +185,13 @@ def _preprocess_data(self, data: Dict) -> torch.Tensor: class COMA(nn.Module): """ Overview: - COMA network is QAC-type actor-critic. 
+ The network of COMA algorithm, which is QAC-type actor-critic. + Interface: + ``__init__``, ``forward`` + Properties: + - mode (:obj:`list`): The list of forward mode, including ``compute_actor`` and ``compute_critic`` """ + mode = ['compute_actor', 'compute_critic'] def __init__( @@ -174,12 +219,53 @@ def __init__( def forward(self, inputs: Dict, mode: str) -> Dict: """ + Overview: + forward computation graph of COMA network + Arguments: + - inputs (:obj:`dict`): input data dict with keys ['obs', 'prev_state', 'action'] + - agent_state (:obj:`torch.Tensor`): each agent local state(obs) + - global_state (:obj:`torch.Tensor`): global state(obs) + - action (:obj:`torch.Tensor`): the masked action ArgumentsKeys: - necessary: ``obs`` { ``agent_state``, ``global_state``, ``action_mask`` }, ``action``, ``prev_state`` ReturnsKeys: - necessary: - compute_critic: ``q_value`` - compute_actor: ``logit``, ``next_state``, ``action_mask`` + Shapes: + - obs (:obj:`dict`): ``agent_state``: :math:`(T, B, A, N, D)`, ``action_mask``: :math:`(T, B, A, N, A)` + - prev_state (:obj:`list`): :math:`[[[h, c] for _ in range(A)] for _ in range(B)]` + - logit (:obj:`torch.Tensor`): :math:`(T, B, A, N, A)` + - next_state (:obj:`list`): :math:`[[[h, c] for _ in range(A)] for _ in range(B)]` + - action_mask (:obj:`torch.Tensor`): :math:`(T, B, A, N, A)` + - q_value (:obj:`torch.Tensor`): :math:`(T, B, A, N, A)` + Examples: + >>> agent_num, bs, T = 4, 3, 8 + >>> agent_num, bs, T = 4, 3, 8 + >>> obs_dim, global_obs_dim, action_dim = 32, 32 * 4, 9 + >>> coma_model = COMA( + >>> agent_num=agent_num, + >>> obs_shape=dict(agent_state=(obs_dim, ), global_state=(global_obs_dim, )), + >>> action_shape=action_dim, + >>> actor_hidden_size_list=[128, 64], + >>> ) + >>> prev_state = [[None for _ in range(agent_num)] for _ in range(bs)] + >>> data = { + >>> 'obs': { + >>> 'agent_state': torch.randn(T, bs, agent_num, obs_dim), + >>> 'action_mask': None, + >>> }, + >>> 'prev_state': prev_state, + >>> } + >>> output = coma_model(data, mode='compute_actor') + >>> data= { + >>> 'obs': { + >>> 'agent_state': torch.randn(T, bs, agent_num, obs_dim), + >>> 'global_state': torch.randn(T, bs, global_obs_dim), + >>> }, + >>> 'action': torch.randint(0, action_dim, size=(T, bs, agent_num)), + >>> } + >>> output = coma_model(data, mode='compute_critic') """ assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) if mode == 'compute_actor': diff --git a/ding/model/template/decision_transformer.py b/ding/model/template/decision_transformer.py index 6aca4041da..3d35497383 100644 --- a/ding/model/template/decision_transformer.py +++ b/ding/model/template/decision_transformer.py @@ -71,7 +71,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: - x (:obj:`torch.Tensor`): The input tensor. Returns: - out (:obj:`torch.Tensor`): Output tensor, the shape is the same as the input. - Examples: >>> inputs = torch.randn(2, 4, 64) >>> model = MaskedCausalAttention(64, 5, 4, 0.1) @@ -142,7 +141,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: - x (:obj:`torch.Tensor`): The input tensor. Returns: - output (:obj:`torch.Tensor`): Output tensor, the shape is the same as the input. - Examples: >>> inputs = torch.randn(2, 4, 64) >>> model = Block(64, 5, 4, 0.1) @@ -260,7 +258,6 @@ def forward( Returns: - output (:obj:`Tuple[torch.Tensor, torch.Tensor, torch.Tensor]`): Output contains three tensors, \ they are correspondingly the predicted states, predicted actions and predicted return-to-go. 
- Examples: >>> B, T = 4, 6 >>> state_dim = 3 @@ -274,21 +271,16 @@ def forward( n_heads=2,\ drop_p=0.1,\ ) - >>> timesteps = torch.randint(0, 100, [B, 3 * T - 1, 1], dtype=torch.long) # B x T >>> states = torch.randn([B, T, state_dim]) # B x T x state_dim - >>> actions = torch.randint(0, act_dim, [B, T, 1]) >>> action_target = torch.randint(0, act_dim, [B, T, 1]) >>> returns_to_go_sample = torch.tensor([1, 0.8, 0.6, 0.4, 0.2, 0.]).repeat([B, 1]).unsqueeze(-1).float() - >>> traj_mask = torch.ones([B, T], dtype=torch.long) # B x T >>> actions = actions.squeeze(-1) - >>> state_preds, action_preds, return_preds = DT_model.forward(\ timesteps=timesteps, states=states, actions=actions, returns_to_go=returns_to_go\ ) - >>> assert state_preds.shape == torch.Size([B, T, state_dim]) >>> assert return_preds.shape == torch.Size([B, T, 1]) >>> assert action_preds.shape == torch.Size([B, T, act_dim]) diff --git a/ding/model/template/ebm.py b/ding/model/template/ebm.py index fe1c073b17..83f05168e3 100644 --- a/ding/model/template/ebm.py +++ b/ding/model/template/ebm.py @@ -15,10 +15,17 @@ from ding.utils import MODEL_REGISTRY, STOCHASTIC_OPTIMIZER_REGISTRY from ding.torch_utils import unsqueeze_repeat from ding.model.wrapper import IModelWrapper -from ..common import RegressionHead +from ding.model.common import RegressionHead def create_stochastic_optimizer(device: str, stochastic_optimizer_config: dict): + """ + Overview: + Create stochastic optimizer. + Arguments: + - device (:obj:`str`): Device. + - stochastic_optimizer_config (:obj:`dict`): Stochastic optimizer config. + """ return STOCHASTIC_OPTIMIZER_REGISTRY.build( stochastic_optimizer_config.pop("type"), device=device, **stochastic_optimizer_config ) @@ -45,20 +52,34 @@ def wrapper(*args, **kwargs): class StochasticOptimizer(ABC): + """ + Overview: + Base class for stochastic optimizers. + Interface: + ``__init__``, ``_sample``, ``_get_best_action_sample``, ``set_action_bounds``, ``sample``, ``infer`` + """ def _sample(self, obs: torch.Tensor, num_samples: int) -> Tuple[torch.Tensor, torch.Tensor]: """ Overview: - Helper method for drawing action samples from the uniform random distribution \ + Drawing action samples from the uniform random distribution \ and tiling observations to the same shape as action samples. - Arguments: - - obs (:obj:`torch.Tensor`): Observation of shape (B, O). - - num_samples (:obj:`int`): The number of negative samples (N). - + - obs (:obj:`torch.Tensor`): Observation. + - num_samples (:obj:`int`): The number of negative samples. Returns: - - tiled_obs (:obj:`torch.Tensor`): Observation of shape (B, N, O). - - action (:obj:`torch.Tensor`): Action of shape (B, N, A). + - tiled_obs (:obj:`torch.Tensor`): Observations tiled. + - action (:obj:`torch.Tensor`): Action sampled. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, O)`. + - num_samples (:obj:`int`): :math:`N`. + - tiled_obs (:obj:`torch.Tensor`): :math:`(B, N, O)`. + - action (:obj:`torch.Tensor`): :math:`(B, N, A)`. + Examples: + >>> obs = torch.randn(2, 4) + >>> opt = StochasticOptimizer() + >>> opt.set_action_bounds(np.stack([np.zeros(5), np.ones(5)], axis=0)) + >>> tiled_obs, action = opt._sample(obs, 8) """ size = (obs.shape[0], num_samples, self.action_bounds.shape[1]) low, high = self.action_bounds[0, :], self.action_bounds[1, :] @@ -72,13 +93,22 @@ def _get_best_action_sample(obs: torch.Tensor, action_samples: torch.Tensor, ebm """ Overview: Return one action for each batch with highest probability (lowest energy). 
- Arguments: - - obs (:obj:`torch.Tensor`): Observation of shape (B, N, O). - - action_samples (:obj:`torch.Tensor`): Action of shape (B, N, A). - + - obs (:obj:`torch.Tensor`): Observation. + - action_samples (:obj:`torch.Tensor`): Action from uniform distributions. Returns: - - best_action_samples (:obj:`torch.Tensor`): Action of shape (B, A). + - best_action_samples (:obj:`torch.Tensor`): Best action. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, O)`. + - action_samples (:obj:`torch.Tensor`): :math:`(B, N, A)`. + - best_action_samples (:obj:`torch.Tensor`): :math:`(B, A)`. + Examples: + >>> obs = torch.randn(2, 4) + >>> action_samples = torch.randn(2, 8, 5) + >>> ebm = EBM(4, 5) + >>> opt = StochasticOptimizer() + >>> opt.set_action_bounds(np.stack([np.zeros(5), np.ones(5)], axis=0)) + >>> best_action_samples = opt._get_best_action_sample(obs, action_samples, ebm) """ # (B, N) energies = ebm.forward(obs, action_samples) @@ -91,10 +121,17 @@ def set_action_bounds(self, action_bounds: np.ndarray): """ Overview: Set action bounds calculated from the dataset statistics. - Arguments: - action_bounds (:obj:`np.ndarray`): Array of shape (2, A), \ where action_bounds[0] is lower bound and action_bounds[1] is upper bound. + Returns: + - action_bounds (:obj:`torch.Tensor`): Action bounds. + Shapes: + - action_bounds (:obj:`np.ndarray`): :math:`(2, A)`. + - action_bounds (:obj:`torch.Tensor`): :math:`(2, A)`. + Examples: + >>> opt = StochasticOptimizer() + >>> opt.set_action_bounds(np.stack([np.zeros(5), np.ones(5)], axis=0)) """ self.action_bounds = torch.as_tensor(action_bounds, dtype=torch.float32).to(self.device) @@ -103,14 +140,17 @@ def sample(self, obs: torch.Tensor, ebm: nn.Module) -> Tuple[torch.Tensor, torch """ Overview: Create tiled observations and sample counter-negatives for InfoNCE loss. - Arguments: - - obs (:obj:`torch.Tensor`): Observation of shape (B, O). + - obs (:obj:`torch.Tensor`): Observations. - ebm (:obj:`torch.nn.Module`): Energy based model. - Returns: - - tiled_obs (:obj:`torch.Tensor`): Observation of shape (B, N, O). - - action (:obj:`torch.Tensor`): Action of shape (B, N, A). + - tiled_obs (:obj:`torch.Tensor`): Tiled observations. + - action (:obj:`torch.Tensor`): Actions. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, O)`. + - ebm (:obj:`torch.nn.Module`): :math:`(B, N, O)`. + - tiled_obs (:obj:`torch.Tensor`): :math:`(B, N, O)`. + - action (:obj:`torch.Tensor`): :math:`(B, N, A)`. .. note:: In the case of derivative-free optimization, this function will simply call _sample. """ @@ -122,16 +162,27 @@ def infer(self, obs: torch.Tensor, ebm: nn.Module) -> torch.Tensor: Overview: Optimize for the best action conditioned on the current observation. Arguments: - - obs (:obj:`torch.Tensor`): Observation of shape (B, O). + - obs (:obj:`torch.Tensor`): Observations. - ebm (:obj:`torch.nn.Module`): Energy based model. Returns: - - best_action_samples (:obj:`torch.Tensor`): Action of shape (B, A). + - best_action_samples (:obj:`torch.Tensor`): Best actions. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, O)`. + - ebm (:obj:`torch.nn.Module`): :math:`(B, N, O)`. + - best_action_samples (:obj:`torch.Tensor`): :math:`(B, A)`. """ raise NotImplementedError @STOCHASTIC_OPTIMIZER_REGISTRY.register('dfo') class DFO(StochasticOptimizer): + """ + Overview: + Derivative-Free Optimizer in paper Implicit Behavioral Cloning. 
+ https://arxiv.org/abs/2109.00137 + Interface: + ``init``, ``sample``, ``infer`` + """ def __init__( self, @@ -142,6 +193,17 @@ def __init__( inference_samples: int = 16384, device: str = 'cpu', ): + """ + Overview: + Initialize the Derivative-Free Optimizer + Arguments: + - noise_scale (:obj:`float`): Initial noise scale. + - noise_shrink (:obj:`float`): Noise scale shrink rate. + - iters (:obj:`int`): Number of iterations. + - train_samples (:obj:`int`): Number of samples for training. + - inference_samples (:obj:`int`): Number of samples for inference. + - device (:obj:`str`): Device. + """ self.action_bounds = None self.noise_scale = noise_scale self.noise_shrink = noise_shrink @@ -151,10 +213,51 @@ def __init__( self.device = device def sample(self, obs: torch.Tensor, ebm: nn.Module) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Overview: + Drawing action samples from the uniform random distribution \ + and tiling observations to the same shape as action samples. + Arguments: + - obs (:obj:`torch.Tensor`): Observations. + - ebm (:obj:`torch.nn.Module`): Energy based model. + Returns: + - tiled_obs (:obj:`torch.Tensor`): Tiled observation. + - action_samples (:obj:`torch.Tensor`): Action samples. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, O)`. + - ebm (:obj:`torch.nn.Module`): :math:`(B, N, O)`. + - tiled_obs (:obj:`torch.Tensor`): :math:`(B, N, O)`. + - action_samples (:obj:`torch.Tensor`): :math:`(B, N, A)`. + Examples: + >>> obs = torch.randn(2, 4) + >>> ebm = EBM(4, 5) + >>> opt = DFO() + >>> opt.set_action_bounds(np.stack([np.zeros(5), np.ones(5)], axis=0)) + >>> tiled_obs, action_samples = opt.sample(obs, ebm) + """ return self._sample(obs, self.train_samples) @torch.no_grad() def infer(self, obs: torch.Tensor, ebm: nn.Module) -> torch.Tensor: + """ + Overview: + Optimize for the best action conditioned on the current observation. + Arguments: + - obs (:obj:`torch.Tensor`): Observations. + - ebm (:obj:`torch.nn.Module`): Energy based model. + Returns: + - best_action_samples (:obj:`torch.Tensor`): Actions. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, O)`. + - ebm (:obj:`torch.nn.Module`): :math:`(B, N, O)`. + - best_action_samples (:obj:`torch.Tensor`): :math:`(B, A)`. + Examples: + >>> obs = torch.randn(2, 4) + >>> ebm = EBM(4, 5) + >>> opt = DFO() + >>> opt.set_action_bounds(np.stack([np.zeros(5), np.ones(5)], axis=0)) + >>> best_action_samples = opt.infer(obs, ebm) + """ noise_scale = self.noise_scale # (B, N, O), (B, N, A) @@ -181,6 +284,13 @@ def infer(self, obs: torch.Tensor, ebm: nn.Module) -> torch.Tensor: @STOCHASTIC_OPTIMIZER_REGISTRY.register('ardfo') class AutoRegressiveDFO(DFO): + """ + Overview: + AutoRegressive Derivative-Free Optimizer in paper Implicit Behavioral Cloning. + https://arxiv.org/abs/2109.00137 + Interface: + ``__init__``, ``infer`` + """ def __init__( self, @@ -191,10 +301,40 @@ def __init__( inference_samples: int = 4096, device: str = 'cpu', ): + """ + Overview: + Initialize the AutoRegressive Derivative-Free Optimizer + Arguments: + - noise_scale (:obj:`float`): Initial noise scale. + - noise_shrink (:obj:`float`): Noise scale shrink rate. + - iters (:obj:`int`): Number of iterations. + - train_samples (:obj:`int`): Number of samples for training. + - inference_samples (:obj:`int`): Number of samples for inference. + - device (:obj:`str`): Device. 
+ """ super().__init__(noise_scale, noise_shrink, iters, train_samples, inference_samples, device) @torch.no_grad() def infer(self, obs: torch.Tensor, ebm: nn.Module) -> torch.Tensor: + """ + Overview: + Optimize for the best action conditioned on the current observation. + Arguments: + - obs (:obj:`torch.Tensor`): Observations. + - ebm (:obj:`torch.nn.Module`): Energy based model. + Returns: + - best_action_samples (:obj:`torch.Tensor`): Actions. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, O)`. + - ebm (:obj:`torch.nn.Module`): :math:`(B, N, O)`. + - best_action_samples (:obj:`torch.Tensor`): :math:`(B, A)`. + Examples: + >>> obs = torch.randn(2, 4) + >>> ebm = EBM(4, 5) + >>> opt = AutoRegressiveDFO() + >>> opt.set_action_bounds(np.stack([np.zeros(5), np.ones(5)], axis=0)) + >>> best_action_samples = opt.infer(obs, ebm) + """ noise_scale = self.noise_scale # (B, N, O), (B, N, A) @@ -230,38 +370,91 @@ def infer(self, obs: torch.Tensor, ebm: nn.Module) -> torch.Tensor: @STOCHASTIC_OPTIMIZER_REGISTRY.register('mcmc') class MCMC(StochasticOptimizer): + """ + Overview: + MCMC method as stochastic optimizers in paper Implicit Behavioral Cloning. + https://arxiv.org/abs/2109.00137 + Interface: + ``__init__``, ``sample``, ``infer``, ``grad_penalty`` + """ class BaseScheduler(ABC): + """ + Overview: + Base class for learning rate scheduler. + Interface: + ``get_rate`` + """ @abstractmethod def get_rate(self, index): + """ + Overview: + Abstract method for getting learning rate. + """ raise NotImplementedError class ExponentialScheduler: - """Exponential learning rate schedule for Langevin sampler.""" + """ + Overview: + Exponential learning rate schedule for Langevin sampler. + Interface: + ``__init__``, ``get_rate`` + """ def __init__(self, init, decay): + """ + Overview: + Initialize the ExponentialScheduler. + Arguments: + - init (:obj:`float`): Initial learning rate. + - decay (:obj:`float`): Decay rate. + """ self._decay = decay self._latest_lr = init def get_rate(self, index): - """Get learning rate. Assumes calling sequentially.""" + """ + Overview: + Get learning rate. Assumes calling sequentially. + Arguments: + - index (:obj:`int`): Current iteration. + """ del index lr = self._latest_lr self._latest_lr *= self._decay return lr class PolynomialScheduler: - """Polynomial learning rate schedule for Langevin sampler.""" + """ + Overview: + Polynomial learning rate schedule for Langevin sampler. + Interface: + ``__init__``, ``get_rate`` + """ def __init__(self, init, final, power, num_steps): + """ + Overview: + Initialize the PolynomialScheduler. + Arguments: + - init (:obj:`float`): Initial learning rate. + - final (:obj:`float`): Final learning rate. + - power (:obj:`float`): Power of polynomial. + - num_steps (:obj:`int`): Number of steps. + """ self._init = init self._final = final self._power = power self._num_steps = num_steps def get_rate(self, index): - """Get learning rate for index.""" + """ + Overview: + Get learning rate for index. + Arguments: + - index (:obj:`int`): Current iteration. + """ if index == -1: return self._init return ( @@ -298,6 +491,26 @@ def __init__( grad_loss_weight: float = 1.0, **kwargs, ): + """ + Overview: + Initialize the MCMC. + Arguments: + - iters (:obj:`int`): Number of iterations. + - use_langevin_negative_samples (:obj:`bool`): Whether to use Langevin sampler. + - train_samples (:obj:`int`): Number of samples for training. + - inference_samples (:obj:`int`): Number of samples for inference. 
+ - stepsize_scheduler (:obj:`dict`): Step size scheduler for Langevin sampler. + - optimize_again (:obj:`bool`): Whether to run a second optimization. + - again_stepsize_scheduler (:obj:`dict`): Step size scheduler for the second optimization. + - device (:obj:`str`): Device. + - noise_scale (:obj:`float`): Initial noise scale. + - grad_clip (:obj:`float`): Gradient clip. + - delta_action_clip (:obj:`float`): Action clip. + - add_grad_penalty (:obj:`bool`): Whether to add gradient penalty. + - grad_norm_type (:obj:`str`): Gradient norm type. + - grad_margin (:obj:`float`): Gradient margin. + - grad_loss_weight (:obj:`float`): Gradient loss weight. + """ self.iters = iters self.use_langevin_negative_samples = use_langevin_negative_samples self.train_samples = train_samples @@ -323,9 +536,20 @@ def _gradient_wrt_act( create_graph: bool = False, ) -> torch.Tensor: """ - Calculate gradient w.r.t action. - obs: (B, N, O), action: (B, N, A). - return: (B, N, A). + Overview: + Calculate gradient w.r.t action. + Arguments: + - obs (:obj:`torch.Tensor`): Observations. + - action (:obj:`torch.Tensor`): Actions. + - ebm (:obj:`torch.nn.Module`): Energy based model. + - create_graph (:obj:`bool`): Whether to create graph. + Returns: + - grad (:obj:`torch.Tensor`): Gradient w.r.t action. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, N, O)`. + - action (:obj:`torch.Tensor`): :math:`(B, N, A)`. + - ebm (:obj:`torch.nn.Module`): :math:`(B, N, O)`. + - grad (:obj:`torch.Tensor`): :math:`(B, N, A)`. """ action.requires_grad_(True) energy = ebm.forward(obs, action).sum() @@ -337,9 +561,19 @@ def _gradient_wrt_act( def grad_penalty(self, obs: torch.Tensor, action: torch.Tensor, ebm: nn.Module) -> torch.Tensor: """ - Calculate gradient penalty. - obs: (B, N+1, O), action: (B, N+1, A). - return: loss. + Overview: + Calculate gradient penalty. + Arguments: + - obs (:obj:`torch.Tensor`): Observations. + - action (:obj:`torch.Tensor`): Actions. + - ebm (:obj:`torch.nn.Module`): Energy based model. + Returns: + - loss (:obj:`torch.Tensor`): Gradient penalty. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, N+1, O)`. + - action (:obj:`torch.Tensor`): :math:`(B, N+1, A)`. + - ebm (:obj:`torch.nn.Module`): :math:`(B, N+1, O)`. + - loss (:obj:`torch.Tensor`): :math:`(B, )`. """ if not self.add_grad_penalty: return 0. @@ -371,9 +605,20 @@ def compute_grad_norm(grad_norm_type, de_dact) -> torch.Tensor: @no_ebm_grad() def _langevin_step(self, obs: torch.Tensor, action: torch.Tensor, stepsize: float, ebm: nn.Module) -> torch.Tensor: """ - Run one langevin MCMC step. - obs: (B, N, O), action: (B, N, A) - return: (B, N, A). + Overview: + Run one langevin MCMC step. + Arguments: + - obs (:obj:`torch.Tensor`): Observations. + - action (:obj:`torch.Tensor`): Actions. + - stepsize (:obj:`float`): Step size. + - ebm (:obj:`torch.nn.Module`): Energy based model. + Returns: + - action (:obj:`torch.Tensor`): Actions. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, N, O)`. + - action (:obj:`torch.Tensor`): :math:`(B, N, A)`. + - stepsize (:obj:`float`): :math:`(B, )`. + - ebm (:obj:`torch.nn.Module`): :math:`(B, N, O)`. """ l_lambda = 1.0 de_dact = MCMC._gradient_wrt_act(obs, action, ebm) @@ -402,9 +647,19 @@ def _langevin_action_given_obs( scheduler: BaseScheduler = None ) -> torch.Tensor: """ - Run langevin MCMC for `self.iters` steps. - obs: (B, N, O), action: (B, N, A) - return: (B, N, A) + Overview: + Run langevin MCMC for `self.iters` steps. + Arguments: + - obs (:obj:`torch.Tensor`): Observations. 
+ - action (:obj:`torch.Tensor`): Actions. + - ebm (:obj:`torch.nn.Module`): Energy based model. + - scheduler (:obj:`BaseScheduler`): Learning rate scheduler. + Returns: + - action (:obj:`torch.Tensor`): Actions. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, N, O)`. + - action (:obj:`torch.Tensor`): :math:`(B, N, A)`. + - ebm (:obj:`torch.nn.Module`): :math:`(B, N, O)`. """ if not scheduler: self.stepsize_scheduler['num_steps'] = self.iters @@ -417,6 +672,27 @@ def _langevin_action_given_obs( @no_ebm_grad() def sample(self, obs: torch.Tensor, ebm: nn.Module) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Overview: + Create tiled observations and sample counter-negatives for InfoNCE loss. + Arguments: + - obs (:obj:`torch.Tensor`): Observations. + - ebm (:obj:`torch.nn.Module`): Energy based model. + Returns: + - tiled_obs (:obj:`torch.Tensor`): Tiled observations. + - action_samples (:obj:`torch.Tensor`): Action samples. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, O)`. + - ebm (:obj:`torch.nn.Module`): :math:`(B, N, O)`. + - tiled_obs (:obj:`torch.Tensor`): :math:`(B, N, O)`. + - action_samples (:obj:`torch.Tensor`): :math:`(B, N, A)`. + Examples: + >>> obs = torch.randn(2, 4) + >>> ebm = EBM(4, 5) + >>> opt = MCMC() + >>> opt.set_action_bounds(np.stack([np.zeros(5), np.ones(5)], axis=0)) + >>> tiled_obs, action_samples = opt.sample(obs, ebm) + """ obs, uniform_action_samples = self._sample(obs, self.train_samples) if not self.use_langevin_negative_samples: return obs, uniform_action_samples @@ -425,6 +701,25 @@ def sample(self, obs: torch.Tensor, ebm: nn.Module) -> Tuple[torch.Tensor, torch @no_ebm_grad() def infer(self, obs: torch.Tensor, ebm: nn.Module) -> torch.Tensor: + """ + Overview: + Optimize for the best action conditioned on the current observation. + Arguments: + - obs (:obj:`torch.Tensor`): Observations. + - ebm (:obj:`torch.nn.Module`): Energy based model. + Returns: + - best_action_samples (:obj:`torch.Tensor`): Actions. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, O)`. + - ebm (:obj:`torch.nn.Module`): :math:`(B, N, O)`. + - best_action_samples (:obj:`torch.Tensor`): :math:`(B, A)`. + Examples: + >>> obs = torch.randn(2, 4) + >>> ebm = EBM(4, 5) + >>> opt = MCMC() + >>> opt.set_action_bounds(np.stack([np.zeros(5), np.ones(5)], axis=0)) + >>> best_action_samples = opt.infer(obs, ebm) + """ # (B, N, O), (B, N, A) obs, uniform_action_samples = self._sample(obs, self.inference_samples) action_samples = self._langevin_action_given_obs( @@ -449,6 +744,12 @@ def infer(self, obs: torch.Tensor, ebm: nn.Module) -> torch.Tensor: @MODEL_REGISTRY.register('ebm') class EBM(nn.Module): + """ + Overview: + Energy based model. + Interface: + ``__init__``, ``forward`` + """ def __init__( self, @@ -458,6 +759,15 @@ def __init__( hidden_layer_num: int = 4, **kwargs, ): + """ + Overview: + Initialize the EBM. + Arguments: + - obs_shape (:obj:`int`): Observation shape. + - action_shape (:obj:`int`): Action shape. + - hidden_size (:obj:`int`): Hidden size. + - hidden_layer_num (:obj:`int`): Number of hidden layers. + """ super().__init__() input_size = obs_shape + action_shape self.net = nn.Sequential( @@ -471,9 +781,20 @@ def __init__( ) def forward(self, obs, action): - # obs: (B, N, O) - # action: (B, N, A) - # return: (B, N) + """ + Overview: + Forward computation graph of EBM. + Arguments: + - obs (:obj:`torch.Tensor`): Observation of shape (B, N, O). + - action (:obj:`torch.Tensor`): Action of shape (B, N, A). 
+ Returns: + - pred (:obj:`torch.Tensor`): Energy of shape (B, N). + Examples: + >>> obs = torch.randn(2, 3, 4) + >>> action = torch.randn(2, 3, 5) + >>> ebm = EBM(4, 5) + >>> pred = ebm(obs, action) + """ x = torch.cat([obs, action], -1) x = self.net(x) return x['pred'] @@ -481,6 +802,12 @@ def forward(self, obs, action): @MODEL_REGISTRY.register('arebm') class AutoregressiveEBM(nn.Module): + """ + Overview: + Autoregressive energy based model. + Interface: + ``__init__``, ``forward`` + """ def __init__( self, @@ -488,21 +815,37 @@ def __init__( action_shape: int, hidden_size: int = 512, hidden_layer_num: int = 4, - **kwargs, ): + """ + Overview: + Initialize the AutoregressiveEBM. + Arguments: + - obs_shape (:obj:`int`): Observation shape. + - action_shape (:obj:`int`): Action shape. + - hidden_size (:obj:`int`): Hidden size. + - hidden_layer_num (:obj:`int`): Number of hidden layers. + """ super().__init__() self.ebm_list = nn.ModuleList() for i in range(action_shape): self.ebm_list.append(EBM(obs_shape, i + 1, hidden_size, hidden_layer_num)) def forward(self, obs, action): - # obs: (B, N, O) - # action: (B, N, A) - # return: (B, N, A) - - # (B, N) + """ + Overview: + Forward computation graph of AutoregressiveEBM. + Arguments: + - obs (:obj:`torch.Tensor`): Observation of shape (B, N, O). + - action (:obj:`torch.Tensor`): Action of shape (B, N, A). + Returns: + - pred (:obj:`torch.Tensor`): Energy of shape (B, N, A). + Examples: + >>> obs = torch.randn(2, 3, 4) + >>> action = torch.randn(2, 3, 5) + >>> arebm = AutoregressiveEBM(4, 5) + >>> pred = arebm(obs, action) + """ output_list = [] for i, ebm in enumerate(self.ebm_list): output_list.append(ebm(obs, action[..., :i + 1])) - # (B, N, A) return torch.stack(output_list, axis=-1) diff --git a/ding/model/template/maqac.py b/ding/model/template/maqac.py index 69c6d4cee0..9fab8da685 100644 --- a/ding/model/template/maqac.py +++ b/ding/model/template/maqac.py @@ -91,7 +91,7 @@ def __init__( ) def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: - r""" + """ Overview: Use observation and action tensor to predict output. Parameter updates with QAC's MLPs forward setup. @@ -122,7 +122,7 @@ def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: return getattr(self, mode)(inputs) def compute_actor(self, inputs: Dict) -> Dict: - r""" + """ Overview: Use encoded embedding tensor to predict output. Execute parameter updates with ``'compute_actor'`` mode @@ -163,7 +163,7 @@ def compute_actor(self, inputs: Dict) -> Dict: return {'logit': x['logit'], 'action_mask': action_mask} def compute_critic(self, inputs: Dict) -> Dict: - r""" + """ Overview: Execute parameter updates with ``'compute_critic'`` mode Use encoded embedding tensor to predict output. @@ -293,7 +293,7 @@ def __init__( ) def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: - r""" + """ Overview: Use observation and action tensor to predict output. Parameter updates with QAC's MLPs forward setup. @@ -347,7 +347,7 @@ def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: return getattr(self, mode)(inputs) def compute_actor(self, inputs: Dict) -> Dict: - r""" + """ Overview: Use encoded embedding tensor to predict output. 
Execute parameter updates with ``'compute_actor'`` mode @@ -398,7 +398,7 @@ def compute_actor(self, inputs: Dict) -> Dict: return {'logit': [x['mu'], x['sigma']]} def compute_critic(self, inputs: Dict) -> Dict: - r""" + """ Overview: Execute parameter updates with ``'compute_critic'`` mode Use encoded embedding tensor to predict output. diff --git a/ding/model/template/ngu.py b/ding/model/template/ngu.py index eb605982bd..1577115559 100644 --- a/ding/model/template/ngu.py +++ b/ding/model/template/ngu.py @@ -10,7 +10,7 @@ def parallel_wrapper(forward_fn: Callable) -> Callable: - r""" + """ Overview: Process timestep T and batch_size B at the same time, in other words, treat different timestep data as different trajectories in a batch. @@ -62,7 +62,7 @@ def __init__( activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None ) -> None: - r""" + """ Overview: Init the DRQN Model according to arguments. Arguments: @@ -120,7 +120,7 @@ def __init__( ) def forward(self, inputs: Dict, inference: bool = False, saved_state_timesteps: Optional[list] = None) -> Dict: - r""" + """ Overview: Use observation, prev_action prev_reward_extrinsic to predict NGU Q output. Parameter updates with NGU's MLPs forward setup. diff --git a/ding/model/template/ppg.py b/ding/model/template/ppg.py index f9dd64d21e..8b5669f4bd 100644 --- a/ding/model/template/ppg.py +++ b/ding/model/template/ppg.py @@ -8,6 +8,14 @@ @MODEL_REGISTRY.register('ppg') class PPG(nn.Module): + """ + Overview: + Phasic Policy Gradient (PPG) model from paper `Phasic Policy Gradient` + https://arxiv.org/abs/2009.04416 + Interfaces: + ``forward``, ``compute_actor``, ``compute_critic``, ``compute_actor_critic`` + """ + mode = ['compute_actor', 'compute_critic', 'compute_actor_critic'] def __init__( @@ -25,6 +33,27 @@ def __init__( norm_type: Optional[str] = None, impala_cnn_encoder: bool = False, ) -> None: + """ + Overview: + Initailize the PPG Model according to input arguments. + Arguments: + - obs_shape (:obj:`Union[int, SequenceType]`): Observation's shape, such as 128, (156, ). + - action_shape (:obj:`Union[int, SequenceType]`): Action's shape, such as 4, (3, ). + - action_space (:obj:`str`): The action space type, such as 'discrete', 'continuous'. + - share_encoder (:obj:`bool`): Whether to share encoder. + - encoder_hidden_size_list (:obj:`SequenceType`): The hidden size list of encoder. + - actor_head_hidden_size (:obj:`int`): The ``hidden_size`` to pass to actor head. + - actor_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ + for actor head. + - critic_head_hidden_size (:obj:`int`): The ``hidden_size`` to pass to critic head. + - critic_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ + for critic head. + - activation (:obj:`Optional[nn.Module]`): The type of activation function to use in ``MLP`` \ + after each FC layer, if ``None`` then default set to ``nn.ReLU()``. + - norm_type (:obj:`Optional[str]`): The type of normalization to after network layer (FC, Conv), \ + see ``ding.torch_utils.network`` for more details. + - impala_cnn_encoder (:obj:`bool`): Whether to use impala cnn encoder. + """ super(PPG, self).__init__() self.actor_critic = VAC( obs_shape, @@ -43,20 +72,47 @@ def __init__( self.aux_critic = copy.deepcopy(self.actor_critic.critic) def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: + """ + Overview: + Use different forward function according to mode. 
+ Arguments: + - inputs (:obj:`Union[torch.Tensor, Dict]`): The input data. + - mode (:obj:`str`): The mode to forward. + Returns: + - output (:obj:`Dict`): The output data. + """ assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) return getattr(self, mode)(inputs) def compute_actor(self, x: torch.Tensor) -> Dict: """ + Overview: + Use actor to compute action logits. + Arguments: + - x (:obj:`torch.Tensor`): The input data. + Returns: + - output (:obj:`Dict`): The output data. ReturnsKeys: - necessary: ``logit`` + Shapes: + - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is the input feature size. + - output (:obj:`Dict`): ``logit``: :math:`(B, A)`, where B is batch size and A is the action space size. """ return self.actor_critic(x, mode='compute_actor') def compute_critic(self, x: torch.Tensor) -> Dict: """ + Overview: + Use critic to compute value. + Arguments: + - x (:obj:`torch.Tensor`): The input data. + Returns: + - output (:obj:`Dict`): The output data. ReturnsKeys: - necessary: ``value`` + Shapes: + - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is the input feature size. + - output (:obj:`Dict`): ``value``: :math:`(B, 1)`, where B is batch size. """ x = self.aux_critic[0](x) # encoder x = self.aux_critic[1](x) # head @@ -64,10 +120,20 @@ def compute_critic(self, x: torch.Tensor) -> Dict: def compute_actor_critic(self, x: torch.Tensor) -> Dict: """ - .. note:: - ``compute_actor_critic`` interface aims to save computation when shares encoder. - + Overview: + Use actor and critic to compute action logits and value. + Arguments: + - x (:obj:`torch.Tensor`): The input data. + Returns: + - output (:obj:`Dict`): The output data. ReturnsKeys: - necessary: ``value``, ``logit`` + Shapes: + - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is the input feature size. + - output (:obj:`Dict`): ``value``: :math:`(B, 1)`, where B is batch size. + - output (:obj:`Dict`): ``logit``: :math:`(B, A)`, where B is batch size and A is the action space size. + + .. note:: + ``compute_actor_critic`` interface aims to save computation when shares encoder. """ return self.actor_critic(x, mode='compute_actor_critic') diff --git a/ding/model/template/q_learning.py b/ding/model/template/q_learning.py index 43c831c174..e36e5bdf6c 100644 --- a/ding/model/template/q_learning.py +++ b/ding/model/template/q_learning.py @@ -199,7 +199,7 @@ def __init__( ) def forward(self, x: torch.Tensor) -> Dict: - r""" + """ Overview: BDQ forward computation graph, input observation tensor to predict q_value. Arguments: @@ -375,7 +375,7 @@ def __init__( activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, ) -> None: - r""" + """ Overview: Init the QRDQN Model according to input arguments. Arguments: @@ -429,7 +429,7 @@ def __init__( ) def forward(self, x: torch.Tensor) -> Dict: - r""" + """ Overview: Use observation tensor to predict QRDQN's output. Parameter updates with QRDQN's MLPs forward setup. @@ -479,7 +479,7 @@ def __init__( activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None ) -> None: - r""" + """ Overview: Init the IQN Model according to input arguments. Arguments: @@ -536,7 +536,7 @@ def __init__( ) def forward(self, x: torch.Tensor) -> Dict: - r""" + """ Overview: Use encoded embedding tensor to predict IQN's output. Parameter updates with IQN's MLPs forward setup. 
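The QRDQN forward described in the hunk above returns both a scalar Q-value per action (``logit``) and the per-action quantile estimates (``q`` with fractions ``tau``). A minimal usage sketch, assuming the module path and constructor defaults shown in this patch and purely illustrative tensor sizes:

import torch
from ding.model.template.q_learning import QRDQN

B, obs_shape, action_shape = 4, 64, 6
model = QRDQN(obs_shape, action_shape)
out = model(torch.randn(B, obs_shape))
assert out['logit'].shape == (B, action_shape)  # scalar Q-value per action
# 'q' carries one estimate per (action, quantile) pair and 'tau' the quantile
# fractions; in QR-DQN the scalar Q-value is the uniform average of the quantiles.
print(out['q'].shape, out['tau'].shape)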
@@ -586,7 +586,7 @@ def __init__( activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None ) -> None: - r""" + """ Overview: Init the FQF Model according to input arguments. Arguments: @@ -643,7 +643,7 @@ def __init__( ) def forward(self, x: torch.Tensor) -> Dict: - r""" + """ Overview: Use encoded embedding tensor to predict FQF's output. Parameter updates with FQF's MLPs forward setup. @@ -762,7 +762,7 @@ def __init__( ) def forward(self, x: torch.Tensor) -> Dict: - r""" + """ Overview: Use observation tensor to predict Rainbow output. Parameter updates with Rainbow's MLPs forward setup. @@ -796,7 +796,7 @@ def forward(self, x: torch.Tensor) -> Dict: def parallel_wrapper(forward_fn: Callable) -> Callable: - r""" + """ Overview: Process timestep T and batch_size B at the same time, in other words, treat different timestep data as different trajectories in a batch. diff --git a/ding/model/template/qac_dist.py b/ding/model/template/qac_dist.py index e2ce65f34c..d9390cb06e 100644 --- a/ding/model/template/qac_dist.py +++ b/ding/model/template/qac_dist.py @@ -8,7 +8,7 @@ @MODEL_REGISTRY.register('qac_dist') class QACDIST(nn.Module): - r""" + """ Overview: The QAC model with distributional Q-value. Interfaces: @@ -32,7 +32,7 @@ def __init__( v_max: Optional[float] = 10, n_atom: Optional[int] = 51, ) -> None: - r""" + """ Overview: Init the QAC Distributional Model according to arguments. Arguments: @@ -102,7 +102,7 @@ def __init__( ) def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: - r""" + """ Overview: Use observation and action tensor to predict output. Parameter updates with QACDIST's MLPs forward setup. @@ -166,7 +166,7 @@ def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: return getattr(self, mode)(inputs) def compute_actor(self, inputs: torch.Tensor) -> Dict: - r""" + """ Overview: Use encoded embedding tensor to predict output. Execute parameter updates with ``'compute_actor'`` mode @@ -210,7 +210,7 @@ def compute_actor(self, inputs: torch.Tensor) -> Dict: return {'logit': [x['mu'], x['sigma']]} def compute_critic(self, inputs: Dict) -> Dict: - r""" + """ Overview: Execute parameter updates with ``'compute_critic'`` mode Use encoded embedding tensor to predict output. diff --git a/ding/model/template/vae.py b/ding/model/template/vae.py index c8d7546ddc..9839f0e905 100644 --- a/ding/model/template/vae.py +++ b/ding/model/template/vae.py @@ -83,7 +83,7 @@ def encode(self, input: Dict[str, Tensor]) -> Dict[str, Any]: return {'mu': mu, 'log_var': log_var, 'obs_encoding': obs_encoding} def decode(self, z: Tensor, obs_encoding: Tensor) -> Dict[str, Any]: - r""" + """ Overview: Maps the given latent action and obs_encoding onto the original action space. Arguments: @@ -108,7 +108,7 @@ def decode(self, z: Tensor, obs_encoding: Tensor) -> Dict[str, Any]: return {'reconstruction_action': reconstruction_action, 'predition_residual': predition_residual} def decode_with_obs(self, z: Tensor, obs: Tensor) -> Dict[str, Any]: - r""" + """ Overview: Maps the given latent action and obs onto the original action space. Using the method self.encode_obs_head(obs) to get the obs_encoding. @@ -136,7 +136,7 @@ def decode_with_obs(self, z: Tensor, obs: Tensor) -> Dict[str, Any]: return {'reconstruction_action': reconstruction_action, 'predition_residual': predition_residual} def reparameterize(self, mu: Tensor, logvar: Tensor) -> Tensor: - r""" + """ Overview: Reparameterization trick to sample from N(mu, var) from N(0,1). 
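As a supplementary illustration of the reparameterization trick referenced in the VAE docstring above, a self-contained sketch of sampling z ~ N(mu, sigma^2) through a differentiable transform of unit Gaussian noise (tensor sizes are illustrative only):

import torch

mu = torch.zeros(4, 8)          # mean predicted by the encoder
log_var = torch.zeros(4, 8)     # log-variance predicted by the encoder
std = torch.exp(0.5 * log_var)  # sigma = exp(log_var / 2)
eps = torch.randn_like(std)     # eps ~ N(0, 1)
z = mu + eps * std              # z ~ N(mu, sigma^2); gradients flow to mu and log_var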
Arguments: diff --git a/ding/model/template/wqmix.py b/ding/model/template/wqmix.py index c5307d9bbf..5a2bcbb615 100644 --- a/ding/model/template/wqmix.py +++ b/ding/model/template/wqmix.py @@ -17,7 +17,7 @@ class MixerStar(nn.Module): each agent to a total q_value and is diffrent from the Qmix's mixer network, here the mixing network is a feedforward network with 3 hidden layers of 256 dim. Interface: - __init__, forward + ``__init__``, ``forward`` """ def __init__(self, agent_num: int, state_dim: int, mixing_embed_dim: int) -> None: @@ -79,7 +79,7 @@ class WQMix(nn.Module): Overview: WQMIX network, which is same as Qmix network Interface: - __init__, forward, _setup_global_encoder + ``__init__``, ``forward``, _setup_global_encoder """ def __init__( From 9eb72dcb54bbf3fdfe5d7336b46c45910d60cecf Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 26 Oct 2023 19:41:44 +0800 Subject: [PATCH 2/5] polish code --- ding/model/template/atoc.py | 9 ++++++--- ding/model/template/bcq.py | 2 +- ding/model/template/collaq.py | 4 ++-- ding/model/template/coma.py | 3 ++- ding/model/template/ebm.py | 4 ++-- 5 files changed, 13 insertions(+), 9 deletions(-) diff --git a/ding/model/template/atoc.py b/ding/model/template/atoc.py index 5d05226505..a06f536aef 100644 --- a/ding/model/template/atoc.py +++ b/ding/model/template/atoc.py @@ -422,8 +422,8 @@ def _compute_delta_q(self, obs: torch.Tensor, actor_outputs: Dict) -> torch.Tens - necessary: ``new_thoughts``, ``old_thoughts``, ``group``, ``is_initiator`` Shapes: - obs (:obj:`torch.Tensor`): :math:`(B, A, N)`, where B is batch size, A is agent num, N is obs size - - actor_outputs (:obj:`Dict`): the output of actor network, including ``action``, ``new_thoughts``, ``old_thoughts``, \ - ``group``, ``initiator_prob``, ``is_initiator`` + - actor_outputs (:obj:`Dict`): the output of actor network, including ``action``, ``new_thoughts``, \ + ``old_thoughts``, ``group``, ``initiator_prob``, ``is_initiator`` - action (:obj:`torch.Tensor`): :math:`(B, A, M)` where M is action size - new_thoughts (:obj:`torch.Tensor`): :math:`(B, A, M)` where M is thought size - old_thoughts (:obj:`torch.Tensor`): :math:`(B, A, M)` where M is thought size @@ -554,7 +554,10 @@ def optimize_actor_attention(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, >>> delta_q = torch.randn(2, 3) >>> initiator_prob = torch.randn(2, 3) >>> is_initiator = torch.randn(2, 3) - >>> net.optimize_actor_attention({'delta_q': delta_q, 'initiator_prob': initiator_prob, 'is_initiator': is_initiator}) + >>> net.optimize_actor_attention( + >>> {'delta_q': delta_q, + >>> 'initiator_prob': initiator_prob, + >>> 'is_initiator': is_initiator}) """ if not self._communication: raise NotImplementedError diff --git a/ding/model/template/bcq.py b/ding/model/template/bcq.py index bd37afbdc1..49b266b94d 100755 --- a/ding/model/template/bcq.py +++ b/ding/model/template/bcq.py @@ -110,7 +110,7 @@ def forward(self, inputs: Dict[str, torch.Tensor], mode: str) -> Dict[str, torch >>> outputs = model(inputs, mode='compute_critic') >>> outputs = model(inputs, mode='compute_vae') >>> outputs = model(inputs, mode='compute_eval') - + .. note:: For specific examples, one can refer to API doc of ``compute_actor`` and ``compute_critic`` respectively. 
""" diff --git a/ding/model/template/collaq.py b/ding/model/template/collaq.py index 01136211f5..9872d0684a 100644 --- a/ding/model/template/collaq.py +++ b/ding/model/template/collaq.py @@ -211,9 +211,9 @@ class CollaQ(nn.Module): """ Overview: The network of CollaQ (Collaborative Q-learning) algorithm. - It includes two parts: q_network and q_alone_network. + It includes two parts: q_network and q_alone_network. The q_network is used to get the q_value of the agent's observation and \ - the agent's part of the observation information of the agent's concerned allies. + the agent's part of the observation information of the agent's concerned allies. The q_alone_network is used to get the q_value of the agent's observation and \ the agent's observation information without the agent's concerned allies. Multi-Agent Collaboration via Reward Attribution Decomposition diff --git a/ding/model/template/coma.py b/ding/model/template/coma.py index cccabff9f4..02eb286e84 100644 --- a/ding/model/template/coma.py +++ b/ding/model/template/coma.py @@ -132,7 +132,8 @@ def forward(self, data: Dict) -> Dict: Examples: >>> agent_num, bs, T = 4, 3, 8 >>> obs_dim, global_obs_dim, action_dim = 32, 32 * 4, 9 - >>> coma_model = COMACriticNetwork(obs_dim - action_dim + global_obs_dim + 2 * action_dim * agent_num, action_dim) + >>> coma_model = COMACriticNetwork( + >>> obs_dim - action_dim + global_obs_dim + 2 * action_dim * agent_num, action_dim) >>> data = { >>> 'obs': { >>> 'agent_state': torch.randn(T, bs, agent_num, obs_dim), diff --git a/ding/model/template/ebm.py b/ding/model/template/ebm.py index 83f05168e3..4b91fd1b6d 100644 --- a/ding/model/template/ebm.py +++ b/ding/model/template/ebm.py @@ -418,7 +418,7 @@ def get_rate(self, index): Overview: Get learning rate. Assumes calling sequentially. Arguments: - - index (:obj:`int`): Current iteration. + - index (:obj:`int`): Current iteration. """ del index lr = self._latest_lr @@ -453,7 +453,7 @@ def get_rate(self, index): Overview: Get learning rate for index. Arguments: - - index (:obj:`int`): Current iteration. + - index (:obj:`int`): Current iteration. """ if index == -1: return self._init From 561330cdc1132a00c5fe8ac40153f60fb2a8acde Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 27 Oct 2023 14:58:00 +0800 Subject: [PATCH 3/5] polish code --- ding/model/common/encoder.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ding/model/common/encoder.py b/ding/model/common/encoder.py index bb128ca41b..82dab4808a 100644 --- a/ding/model/common/encoder.py +++ b/ding/model/common/encoder.py @@ -246,7 +246,7 @@ def __init__(self, obs_shape: Dict[str, Union[int, List[int]]]) -> None: class IMPALACnnResidualBlock(nn.Module): """ Overview: - This CNN encoder residual block is residual basic block used in IMPALA algorithm, \ + This CNN encoder residual block is residual basic block used in IMPALA algorithm, which preserves the channel number and shape. IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner Architectures https://arxiv.org/pdf/1802.01561.pdf @@ -277,7 +277,7 @@ def residual(self, x: torch.Tensor) -> torch.Tensor: """ Overview: Return output tensor of the residual block, keep the shape and channel number unchanged. - The inplace of activation function should be False for the first relu, \ + The inplace of activation function should be False for the first relu, so that it does not change the origin input tensor of the residual block. Arguments: - x (:obj:`torch.Tensor`): Input tensor. 
@@ -389,7 +389,7 @@ def output_shape(self, inshape: tuple) -> tuple: class IMPALAConvEncoder(nn.Module): """ Overview: - IMPALA CNN encoder, which is used in IMPALA algorithm. \ + IMPALA CNN encoder, which is used in IMPALA algorithm. IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner Architectures, \ https://arxiv.org/pdf/1802.01561.pdf, Interface: From 749f6a8a1e51b6bddb9a595323fa01f61ff7f871 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Mon, 30 Oct 2023 22:50:15 +0800 Subject: [PATCH 4/5] polish code --- ding/model/common/head.py | 19 ++++++++++-- ding/model/template/bc.py | 4 +-- ding/model/template/bcq.py | 4 +-- ding/model/template/maqac.py | 34 ++++++++++----------- ding/model/template/ppg.py | 39 ++++++++++++++++-------- ding/model/template/q_learning.py | 50 ++++++++++++++++++++----------- 6 files changed, 95 insertions(+), 55 deletions(-) diff --git a/ding/model/common/head.py b/ding/model/common/head.py index 09d73ac578..30f5b58d98 100755 --- a/ding/model/common/head.py +++ b/ding/model/common/head.py @@ -15,7 +15,7 @@ class DiscreteHead(nn.Module): """ Overview: The ``DiscreteHead`` is used to generate discrete actions logit or Q-value logit, \ - which is often used in q-learning algorithmns or actor-critic algorithms for discrete action space. + which is often used in q-learning algorithms or actor-critic algorithms for discrete action space. Interfaces: ``__init__``, ``forward``. """ @@ -480,6 +480,11 @@ class QuantileHead(nn.Module): This module is used in IQN. Interfaces: ``__init__``, ``forward``, ``quantile_net``. + + .. note:: + The difference between ``QuantileHead`` and ``QRDQNHead`` is that ``QuantileHead`` models the \ + state-action quantile function as a mapping from state-actions and samples from some base distribution \ + while ``QRDQNHead`` approximates random returns by a uniform mixture of Diracs functions. """ def __init__( @@ -611,6 +616,14 @@ class FQFHead(nn.Module): This module is used in FQF. Interfaces: ``__init__``, ``forward``, ``quantile_net``. + + .. note:: + The implementation of FQFHead is based on the paper https://arxiv.org/abs/1911.02140. + The difference between FQFHead and QuantileHead is that, in FQF, \ + N adjustable quantile values for N adjustable quantile fractions are estimated to approximate \ + the quantile function. The distribution of the return is approximated by a weighted mixture of N \ + Diracs functions. While in IQN, the state-action quantile function is modeled as a mapping from \ + state-actions and samples from some base distribution. """ def __init__( @@ -1183,7 +1196,9 @@ def forward(self, x: torch.Tensor) -> Dict: class PopArtVHead(nn.Module): """ Overview: - The ``PopArtVHead`` is used to generate adaptive normalized state value. + The ``PopArtVHead`` is used to generate adaptive normalized state value. More information can be found in \ + paper Multi-task Deep Reinforcement Learning with PopArt. \ + https://arxiv.org/abs/1809.04474 \ This module is used in PPO or IMPALA. Interfaces: ``__init__``, ``forward``. diff --git a/ding/model/template/bc.py b/ding/model/template/bc.py index 753868437b..5348c750a6 100644 --- a/ding/model/template/bc.py +++ b/ding/model/template/bc.py @@ -189,9 +189,9 @@ def forward(self, inputs: Union[torch.Tensor, Dict[str, torch.Tensor]]) -> Dict: Returns: - output (:obj:`Dict`): Output dict data, including different key-values among distinct action_space. 
ReturnsKeys: - - action (:obj:`torch.Tensor`): Continuous action output of actor network, \ + - action (:obj:`torch.Tensor`): action output of actor network, \ with shape :math:`(B, action_shape)`. - - logit (:obj:`List[torch.Tensor]`): Continuous action output of actor network, \ + - logit (:obj:`List[torch.Tensor]`): reparameterized action output of actor network, \ with shape :math:`(B, action_shape)`. Shapes: - inputs (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is ``obs_shape`` diff --git a/ding/model/template/bcq.py b/ding/model/template/bcq.py index 49b266b94d..0e72927a76 100755 --- a/ding/model/template/bcq.py +++ b/ding/model/template/bcq.py @@ -170,8 +170,8 @@ def compute_vae(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. Returns: - outputs (:obj:`Dict`): Dict containing keywords ``recons_action`` (:obj:`torch.Tensor`), \ - ``prediction_residual`` (:obj:`torch.Tensor`), ``input`` (:obj:`torch.Tensor`), \ - ``mu`` (:obj:`torch.Tensor`), ``log_var`` (:obj:`torch.Tensor`) and ``z`` (:obj:`torch.Tensor`). + ``prediction_residual`` (:obj:`torch.Tensor`), ``input`` (:obj:`torch.Tensor`), \ + ``mu`` (:obj:`torch.Tensor`), ``log_var`` (:obj:`torch.Tensor`) and ``z`` (:obj:`torch.Tensor`). Shapes: - inputs (:obj:`Dict`): :math:`(B, N, D)`, where B is batch size, N is sample number, D is input dimension. - outputs (:obj:`Dict`): :math:`(B, N)`. diff --git a/ding/model/template/maqac.py b/ding/model/template/maqac.py index 9fab8da685..ae832f0bd8 100644 --- a/ding/model/template/maqac.py +++ b/ding/model/template/maqac.py @@ -13,7 +13,10 @@ class DiscreteMAQAC(nn.Module): """ Overview: - The discrete action Multi-Agent Q-value Actor-CritiC (MAQAC) model. + The neural network and computation graph of algorithms related to discrete action Multi-Agent Q-value \ + Actor-CritiC (MAQAC) model. The model is composed of actor and critic, where actor is a MLP network and \ + critic is a MLP network. The actor network is used to predict the action probability distribution, and the \ + critic network is used to predict the Q value of the state-action pair. Interfaces: ``__init__``, ``forward``, ``compute_actor``, ``compute_critic`` """ @@ -34,7 +37,7 @@ def __init__( ) -> None: """ Overview: - Init the DiscreteMAQAC Model according to arguments. + Initiate the DiscreteMAQAC Model according to arguments. Arguments: - agent_obs_shape (:obj:`Union[int, SequenceType]`): Agent's observation's space. - global_obs_shape (:obj:`Union[int, SequenceType]`): Global observation's space. @@ -93,8 +96,7 @@ def __init__( def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: """ Overview: - Use observation and action tensor to predict output. - Parameter updates with QAC's MLPs forward setup. + Use observation and action tensor to predict output, with ``'compute_actor'`` or ``'compute_critic'`` mode. Arguments: Forward with ``'compute_actor'``: - inputs (:obj:`torch.Tensor`): @@ -124,9 +126,7 @@ def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: def compute_actor(self, inputs: Dict) -> Dict: """ Overview: - Use encoded embedding tensor to predict output. - Execute parameter updates with ``'compute_actor'`` mode - Use encoded embedding tensor to predict output. + Use encoded embedding tensor to predict output of actor network. Arguments: - inputs (:obj:`torch.Tensor`): The encoded embedding tensor, determined with given ``hidden_size``, i.e. ``(B, N=hidden_size)``. 
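The DiscreteMAQAC docstrings above describe a dict-style input with ``agent_state``, ``global_state`` and ``action_mask`` keys; a supplementary sketch, assuming the module path and keyword defaults from this patch and illustrative tensor sizes:

import torch
from ding.model.template.maqac import DiscreteMAQAC

B, A = 3, 5                      # batch size and agent number
agent_obs, global_obs, act = 16, 32, 4
model = DiscreteMAQAC(agent_obs, global_obs, act, twin_critic=False)
data = {
    'obs': {
        'agent_state': torch.randn(B, A, agent_obs),
        'global_state': torch.randn(B, A, global_obs),
        'action_mask': torch.ones(B, A, act),
    }
}
logit = model(data, mode='compute_actor')['logit']        # (B, A, act)
q_value = model(data, mode='compute_critic')['q_value']   # (B, A, act) when twin_critic is False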
@@ -165,8 +165,7 @@ def compute_actor(self, inputs: Dict) -> Dict: def compute_critic(self, inputs: Dict) -> Dict: """ Overview: - Execute parameter updates with ``'compute_critic'`` mode - Use encoded embedding tensor to predict output. + use encoded embedding tensor to predict output of critic network. Arguments: - ``obs``, ``action`` encoded tensors. - mode (:obj:`str`): Name of the forward mode. @@ -191,7 +190,10 @@ def compute_critic(self, inputs: Dict) -> Dict: class ContinuousMAQAC(nn.Module): """ Overview: - The continuous action Multi-Agent Q-value Actor-CritiC (MAQAC) model. + The neural network and computation graph of algorithms related to continuous action Multi-Agent Q-value \ + Actor-CritiC (MAQAC) model. The model is composed of actor and critic, where actor is a MLP network and \ + critic is a MLP network. The actor network is used to predict the action probability distribution, and the \ + critic network is used to predict the Q value of the state-action pair. Interfaces: ``__init__``, ``forward``, ``compute_actor``, ``compute_critic`` """ @@ -213,7 +215,7 @@ def __init__( ) -> None: """ Overview: - Init the QAC Model according to arguments. + Initiate the QAC Model according to arguments. Arguments: - obs_shape (:obj:`Union[int, SequenceType]`): Observation's space. - action_shape (:obj:`Union[int, SequenceType, EasyDict]`): Action's space, such as 4, (3, ) @@ -295,8 +297,7 @@ def __init__( def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: """ Overview: - Use observation and action tensor to predict output. - Parameter updates with QAC's MLPs forward setup. + Use observation and action tensor to predict output in ``'compute_actor'`` or ``'compute_critic'`` mode. Arguments: Forward with ``'compute_actor'``: - inputs (:obj:`torch.Tensor`): @@ -349,9 +350,7 @@ def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: def compute_actor(self, inputs: Dict) -> Dict: """ Overview: - Use encoded embedding tensor to predict output. - Execute parameter updates with ``'compute_actor'`` mode - Use encoded embedding tensor to predict output. + Use encoded embedding tensor to predict output of actor network. Arguments: - inputs (:obj:`torch.Tensor`): The encoded embedding tensor, determined with given ``hidden_size``, i.e. ``(B, N=hidden_size)``. @@ -400,8 +399,7 @@ def compute_actor(self, inputs: Dict) -> Dict: def compute_critic(self, inputs: Dict) -> Dict: """ Overview: - Execute parameter updates with ``'compute_critic'`` mode - Use encoded embedding tensor to predict output. + Use encoded embedding tensor to predict output of critic network. Arguments: - inputs (:obj: `Dict`): ``obs``, ``action`` and ``logit` tensors. - mode (:obj:`str`): Name of the forward mode. diff --git a/ding/model/template/ppg.py b/ding/model/template/ppg.py index 8b5669f4bd..76df579e71 100644 --- a/ding/model/template/ppg.py +++ b/ding/model/template/ppg.py @@ -11,7 +11,8 @@ class PPG(nn.Module): """ Overview: Phasic Policy Gradient (PPG) model from paper `Phasic Policy Gradient` - https://arxiv.org/abs/2009.04416 + https://arxiv.org/abs/2009.04416 \ + This module contains VAC module and an auxiliary critic module. Interfaces: ``forward``, ``compute_actor``, ``compute_critic``, ``compute_actor_critic`` """ @@ -74,12 +75,14 @@ def __init__( def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: """ Overview: - Use different forward function according to mode. 
+ Compute action logits or value according to mode being ``compute_actor``, ``compute_critic`` or \ + ``compute_actor_critic``. Arguments: - - inputs (:obj:`Union[torch.Tensor, Dict]`): The input data. - - mode (:obj:`str`): The mode to forward. + - x (:obj:`torch.Tensor`): The input observation tensor data. + - mode (:obj:`str`): The forward mode, all the modes are defined in the beginning of this class. Returns: - - output (:obj:`Dict`): The output data. + - outputs (:obj:`Dict`): The output dict of PPG's forward computation graph, whose key-values vary from \ + different ``mode``. """ assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) return getattr(self, mode)(inputs) @@ -89,11 +92,15 @@ def compute_actor(self, x: torch.Tensor) -> Dict: Overview: Use actor to compute action logits. Arguments: - - x (:obj:`torch.Tensor`): The input data. + - x (:obj:`torch.Tensor`): The input observation tensor data. Returns: - - output (:obj:`Dict`): The output data. + - output (:obj:`Dict`): The output data containing action logits. ReturnsKeys: - - necessary: ``logit`` + - logit (:obj:`torch.Tensor`): The predicted action logit tensor, for discrete action space, it will be \ + the same dimension real-value ranged tensor of possible action choices, and for continuous action \ + space, it will be the mu and sigma of the Gaussian distribution, and the number of mu and sigma is the \ + same as the number of continuous actions. Hybrid action space is a kind of combination of discrete \ + and continuous action space, so the logit will be a dict with ``action_type`` and ``action_args``. Shapes: - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is the input feature size. - output (:obj:`Dict`): ``logit``: :math:`(B, A)`, where B is batch size and A is the action space size. @@ -105,9 +112,9 @@ def compute_critic(self, x: torch.Tensor) -> Dict: Overview: Use critic to compute value. Arguments: - - x (:obj:`torch.Tensor`): The input data. + - x (:obj:`torch.Tensor`): The input observation tensor data. Returns: - - output (:obj:`Dict`): The output data. + - output (:obj:`Dict`): The output dict of VAC's forward computation graph for critic, including ``value``. ReturnsKeys: - necessary: ``value`` Shapes: @@ -123,11 +130,17 @@ def compute_actor_critic(self, x: torch.Tensor) -> Dict: Overview: Use actor and critic to compute action logits and value. Arguments: - - x (:obj:`torch.Tensor`): The input data. + - x (:obj:`torch.Tensor`): The input observation tensor data. Returns: - - output (:obj:`Dict`): The output data. + - outputs (:obj:`Dict`): The output dict of PPG's forward computation graph for both actor and critic, \ + including ``logit`` and ``value``. ReturnsKeys: - - necessary: ``value``, ``logit`` + - logit (:obj:`torch.Tensor`): The predicted action logit tensor, for discrete action space, it will be \ + the same dimension real-value ranged tensor of possible action choices, and for continuous action \ + space, it will be the mu and sigma of the Gaussian distribution, and the number of mu and sigma is the \ + same as the number of continuous actions. Hybrid action space is a kind of combination of discrete \ + and continuous action space, so the logit will be a dict with ``action_type`` and ``action_args``. + - value (:obj:`torch.Tensor`): The predicted state value tensor. Shapes: - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is the input feature size. - output (:obj:`Dict`): ``value``: :math:`(B, 1)`, where B is batch size. 
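The three PPG forward modes documented above share one entry point; a hedged usage sketch, assuming a discrete action space and the constructor defaults implied by this patch:

import torch
from ding.model.template.ppg import PPG

model = PPG(obs_shape=32, action_shape=6, action_space='discrete')
x = torch.randn(4, 32)
logit = model(x, mode='compute_actor')['logit']   # (4, 6) action logits
value = model(x, mode='compute_critic')['value']  # one state value per sample, from the auxiliary critic
both = model(x, mode='compute_actor_critic')      # {'logit': ..., 'value': ...}; saves an encoder pass when shared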
diff --git a/ding/model/template/q_learning.py b/ding/model/template/q_learning.py index e36e5bdf6c..8ba8487796 100644 --- a/ding/model/template/q_learning.py +++ b/ding/model/template/q_learning.py @@ -316,13 +316,6 @@ def __init__( def forward(self, x: torch.Tensor) -> Dict: """ - Returns: - - outputs (:obj:`Dict`): The output of DQN's forward, including q_value. - ReturnsKeys: - - logit (:obj:`torch.Tensor`): Discrete Q-value output of each possible action dimension. - Shapes: - - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is ``obs_shape`` - - logit (:obj:`torch.Tensor`): :math:`(B, M)`, where B is batch size and M is ``action_shape`` Overview: C51DQN forward computation graph, input observation tensor to predict q_value and its distribution. Arguments: @@ -337,7 +330,6 @@ def forward(self, x: torch.Tensor) -> Dict: - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is head_hidden_size. - logit (:obj:`torch.Tensor`): :math:`(B, M)`, where M is action_shape. - distribution(:obj:`torch.Tensor`): :math:`(B, M, P)`, where P is n_atom. - Examples: >>> model = C51DQN(128, 64) # arguments: 'obs_shape' and 'action_shape' >>> inputs = torch.randn(4, 128) @@ -363,6 +355,14 @@ def forward(self, x: torch.Tensor) -> Dict: @MODEL_REGISTRY.register('qrdqn') class QRDQN(nn.Module): + """ + Overview: + The neural network structure and computation graph of QRDQN, which combines distributional RL and DQN. \ + You can refer to Distributional Reinforcement Learning with Quantile Regression \ + https://arxiv.org/pdf/1710.10044.pdf for more details. + Interfaces: + ``__init__``, ``forward`` + """ def __init__( self, @@ -377,7 +377,7 @@ def __init__( ) -> None: """ Overview: - Init the QRDQN Model according to input arguments. + Initiate the QRDQN Model according to input arguments. Arguments: - obs_shape (:obj:`Union[int, SequenceType]`): Observation's space. - action_shape (:obj:`Union[int, SequenceType]`): Action's space. @@ -439,7 +439,6 @@ def forward(self, x: torch.Tensor) -> Dict: Returns: - outputs (:obj:`Dict`): Run with encoder and head. Return the result prediction dictionary. - ReturnsKeys: - logit (:obj:`torch.Tensor`): Logit tensor with same size as input ``x``. - q (:obj:`torch.Tensor`): Q valye tensor tensor of size ``(B, N, num_quantiles)`` @@ -448,7 +447,6 @@ def forward(self, x: torch.Tensor) -> Dict: - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is head_hidden_size. - logit (:obj:`torch.FloatTensor`): :math:`(B, M)`, where M is action_shape. - tau (:obj:`torch.Tensor`): :math:`(B, M, 1)` - Examples: >>> model = QRDQN(64, 64) >>> inputs = torch.randn(4, 64) @@ -466,6 +464,14 @@ def forward(self, x: torch.Tensor) -> Dict: @MODEL_REGISTRY.register('iqn') class IQN(nn.Module): + """ + Overview: + The neural network structure and computation graph of IQN, which combines distributional RL and DQN. \ + You can refer to paper Implicit Quantile Networks for Distributional Reinforcement Learning \ + https://arxiv.org/pdf/1806.06923.pdf for more details. + Interfaces: + ``__init__``, ``forward`` + """ def __init__( self, @@ -481,7 +487,7 @@ def __init__( ) -> None: """ Overview: - Init the IQN Model according to input arguments. + Initiate the IQN Model according to input arguments. Arguments: - obs_shape (:obj:`Union[int, SequenceType]`): Observation space shape. - action_shape (:obj:`Union[int, SequenceType]`): Action space shape. 
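The C51 forward cleaned up above predicts a categorical distribution over value atoms for every action; a short sketch mirroring the shapes in the docstring examples of this patch (default ``n_atom`` of 51 assumed):

import torch
from ding.model.template.q_learning import C51DQN

B, obs_shape, action_shape = 4, 128, 6
model = C51DQN(obs_shape, action_shape)
out = model(torch.randn(B, obs_shape))
assert out['logit'].shape == (B, action_shape)
assert out['distribution'].shape == (B, action_shape, 51)
# each per-action distribution is (approximately) normalized over the atom dimension
print(out['distribution'].sum(-1))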
@@ -546,7 +552,6 @@ def forward(self, x: torch.Tensor) -> Dict: Returns: - outputs (:obj:`Dict`): Run with encoder and head. Return the result prediction dictionary. - ReturnsKeys: - logit (:obj:`torch.Tensor`): Logit tensor with same size as input ``x``. - q (:obj:`torch.Tensor`): Q valye tensor tensor of size ``(num_quantiles, N, B)`` @@ -573,6 +578,14 @@ def forward(self, x: torch.Tensor) -> Dict: @MODEL_REGISTRY.register('fqf') class FQF(nn.Module): + """ + Overview: + The neural network structure and computation graph of FQF, which combines distributional RL and DQN. \ + You can refer to paper Fully Parameterized Quantile Function for Distributional Reinforcement Learning \ + https://arxiv.org/pdf/1911.02140.pdf for more details. + Interface: + ``__init__``, ``forward`` + """ def __init__( self, @@ -588,7 +601,7 @@ def __init__( ) -> None: """ Overview: - Init the FQF Model according to input arguments. + Initiate the FQF Model according to input arguments. Arguments: - obs_shape (:obj:`Union[int, SequenceType]`): Observation space shape. - action_shape (:obj:`Union[int, SequenceType]`): Action space shape. @@ -685,7 +698,11 @@ def forward(self, x: torch.Tensor) -> Dict: class RainbowDQN(nn.Module): """ Overview: - RainbowDQN network (C51 + Dueling + Noisy Block) + The neural network structure and computation graph of RainbowDQN, which combines distributional RL and DQN. \ + You can refer to paper Rainbow: Combining Improvements in Deep Reinforcement Learning \ + https://arxiv.org/pdf/1710.02298.pdf for more details. + Interfaces: + ``__init__``, ``forward`` .. note:: RainbowDQN contains dueling architecture by default. @@ -772,7 +789,6 @@ def forward(self, x: torch.Tensor) -> Dict: Returns: - outputs (:obj:`Dict`): Run ``MLP`` with ``RainbowHead`` setups and return the result prediction dictionary. - ReturnsKeys: - logit (:obj:`torch.Tensor`): Logit tensor with same size as input ``x``. - distribution (:obj:`torch.Tensor`): Distribution tensor of size ``(B, N, n_atom)`` @@ -780,7 +796,6 @@ def forward(self, x: torch.Tensor) -> Dict: - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is head_hidden_size. - logit (:obj:`torch.FloatTensor`): :math:`(B, M)`, where M is action_shape. - distribution(:obj:`torch.FloatTensor`): :math:`(B, M, P)`, where P is n_atom. - Examples: >>> model = RainbowDQN(64, 64) # arguments: 'obs_shape' and 'action_shape' >>> inputs = torch.randn(4, 64) @@ -941,7 +956,6 @@ def forward(self, inputs: Dict, inference: bool = False, saved_state_timesteps: Shapes: - obs (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is ``obs_shape`` - logit (:obj:`torch.Tensor`): :math:`(B, M)`, where B is batch size and M is ``action_shape`` - Examples: >>> # Init input's Keys: >>> prev_state = [[torch.randn(1, 1, 64) for __ in range(2)] for _ in range(4)] # B=4 From 87819c3a3ec0e4109e1058e560bb647f70496243 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 31 Oct 2023 22:05:46 +0800 Subject: [PATCH 5/5] polish code --- ding/model/template/maqac.py | 471 ++++++++++++++++++++---------- ding/model/template/q_learning.py | 6 +- 2 files changed, 325 insertions(+), 152 deletions(-) diff --git a/ding/model/template/maqac.py b/ding/model/template/maqac.py index ae832f0bd8..ba74b97573 100644 --- a/ding/model/template/maqac.py +++ b/ding/model/template/maqac.py @@ -37,7 +37,7 @@ def __init__( ) -> None: """ Overview: - Initiate the DiscreteMAQAC Model according to arguments. + Initialize the DiscreteMAQAC Model according to arguments. 
Arguments: - agent_obs_shape (:obj:`Union[int, SequenceType]`): Agent's observation's space. - global_obs_shape (:obj:`Union[int, SequenceType]`): Global observation's space. @@ -96,29 +96,70 @@ def __init__( def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: """ Overview: - Use observation and action tensor to predict output, with ``'compute_actor'`` or ``'compute_critic'`` mode. + Use observation tensor to predict output, with ``'compute_actor'`` or ``'compute_critic'`` mode. Arguments: - Forward with ``'compute_actor'``: - - inputs (:obj:`torch.Tensor`): - The encoded embedding tensor, determined with given ``hidden_size``, i.e. ``(B, N=hidden_size)``. - Whether ``actor_head_hidden_size`` or ``critic_head_hidden_size`` depend on ``mode``. - Forward with ``'compute_critic'``, inputs (`Dict`) Necessary Keys: - - ``obs``, ``action`` encoded tensors. - - mode (:obj:`str`): Name of the forward mode. + - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: + - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: + - ``agent_state`` (:obj:`torch.Tensor`): The agent's observation tensor data, \ + with shape :math:`(B, A, N0)`, where B is batch size and A is agent num. \ + N0 corresponds to ``agent_obs_shape``. + - ``global_state`` (:obj:`torch.Tensor`): The global observation tensor data, \ + with shape :math:`(B, A, N1)`, where B is batch size and A is agent num. \ + N1 corresponds to ``global_obs_shape``. + - ``action_mask`` (:obj:`torch.Tensor`): The action mask tensor data, \ + with shape :math:`(B, A, N2)`, where B is batch size and A is agent num. \ + N2 corresponds to ``action_shape``. + - mode (:obj:`str`): The forward mode, all the modes are defined in the beginning of this class. Returns: - - outputs (:obj:`Dict`): Outputs of network forward. - Forward with ``'compute_actor'``, Necessary Keys (either): - - action (:obj:`torch.Tensor`): Action tensor with same size as input ``x``. - - logit (:obj:`torch.Tensor`): Action's probabilities. - Forward with ``'compute_critic'``, Necessary Keys: - - q_value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. - Actor Shapes: - - inputs (:obj:`torch.Tensor`): :math:`(B, N0)`, B is batch size and N0 corresponds to ``hidden_size`` - - action (:obj:`torch.Tensor`): :math:`(B, N0)` - - q_value (:obj:`torch.FloatTensor`): :math:`(B, )`, where B is batch size. - Critic Shapes: - - obs (:obj:`torch.Tensor`): :math:`(B, N1)`, where B is batch size and N1 is ``global_obs_shape`` - - logit (:obj:`torch.FloatTensor`): :math:`(B, N2)`, where B is batch size and N2 is ``action_shape`` + - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, whose \ + key-values vary in different forward modes. + Forward with ``'compute_actor'``, Necessary Keys (either): + - logit (:obj:`torch.Tensor`): Action's probabilities. + - action_mask (:obj:`torch.Tensor`): Action mask tensor with same size as ``action_shape``. + Forward with ``'compute_critic'``, if ``twin_critic`` is ``False``, Necessary Keys: + - q_value (:obj:`torch.Tensor`): Q value tensor is the shape of :math:`(B, A, N2)`, where B is batch size \ + and A is agent num. N2 corresponds to ``action_shape``. + Forward with ``'compute_critic'``, if ``twin_critic`` is ``True``, Necessary Keys: + - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N2)`, where B is batch size and \ + A is agent num. N2 corresponds to ``action_shape``. 
+ Shapes: + - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: + - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: + - ``agent_state`` (:obj:`torch.Tensor`): :math:`(B, A, N0)`, where B is batch size and A is agent num. \ + N0 corresponds to ``agent_obs_shape``. + - ``global_state`` (:obj:`torch.Tensor`): :math:`(B, A, N1)`, where B is batch size and A is agent num. \ + N1 corresponds to ``global_obs_shape``. + - ``action_mask`` (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ + N2 corresponds to ``action_shape``. + - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, whose \ + key-values vary in different forward modes. + Forward with ``'compute_actor'``, Necessary Keys (either): + - logit (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ + N2 corresponds to ``action_shape``. + - action_mask (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ + N2 corresponds to ``action_shape``. + Forward with ``'compute_critic'``, if ``twin_critic`` is ``True``, Necessary Keys: + - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N2)`, where B is batch size and \ + A is agent num. N2 corresponds to ``action_shape``. + Forward with ``'compute_critic'``, if ``twin_critic`` is ``False``, Necessary Keys: + - q_value (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ + N2 corresponds to ``action_shape``. + Examples: + >>> B = 32 + >>> agent_obs_shape = 216 + >>> global_obs_shape = 264 + >>> agent_num = 8 + >>> action_shape = 14 + >>> data = { + >>> 'obs': { + >>> 'agent_state': torch.randn(B, agent_num, agent_obs_shape), + >>> 'global_state': torch.randn(B, agent_num, global_obs_shape), + >>> 'action_mask': torch.randint(0, 2, size=(B, agent_num, action_shape)) + >>> } + >>> } + >>> model = DiscreteMAQAC(agent_obs_shape, global_obs_shape, action_shape, twin_critic=True) + >>> logit = model(data, mode='compute_actor')['logit'] + >>> value = model(data, mode='compute_critic')['q_value'] """ assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) return getattr(self, mode)(inputs) @@ -126,37 +167,54 @@ def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: def compute_actor(self, inputs: Dict) -> Dict: """ Overview: - Use encoded embedding tensor to predict output of actor network. + Use observation tensor to predict action logits. Arguments: - - inputs (:obj:`torch.Tensor`): - The encoded embedding tensor, determined with given ``hidden_size``, i.e. ``(B, N=hidden_size)``. - ``hidden_size = actor_head_hidden_size`` - - mode (:obj:`str`): Name of the forward mode. + - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: + - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: + - ``agent_state`` (:obj:`torch.Tensor`): The agent's observation tensor data, \ + with shape :math:`(B, A, N0)`, where B is batch size and A is agent num. \ + N0 corresponds to ``agent_obs_shape``. + - ``global_state`` (:obj:`torch.Tensor`): The global observation tensor data, \ + with shape :math:`(B, A, N1)`, where B is batch size and A is agent num. \ + N1 corresponds to ``global_obs_shape``. + - ``action_mask`` (:obj:`torch.Tensor`): The action mask tensor data, \ + with shape :math:`(B, A, N2)`, where B is batch size and A is agent num. 
\ + N2 corresponds to ``action_shape``. Returns: - - outputs (:obj:`Dict`): Outputs of forward pass encoder and head. - ReturnsKeys (either): - - action (:obj:`torch.Tensor`): Continuous action tensor with same size as ``action_shape``. - - logit (:obj:`torch.Tensor`): - Logit tensor encoding ``mu`` and ``sigma``, both with same size as input ``x``. + - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, whose \ + key-values vary in different forward modes. + - logit (:obj:`torch.Tensor`): Action's probabilities. + - action_mask (:obj:`torch.Tensor`): Action mask tensor with same size as ``action_shape``. Shapes: - - inputs (:obj:`torch.Tensor`): :math:`(B, N0)`, B is batch size and N0 corresponds to ``hidden_size`` - - action (:obj:`torch.Tensor`): :math:`(B, N0)` - - logit (:obj:`list`): 2 elements, mu and sigma, each is the shape of :math:`(B, N0)`. - - q_value (:obj:`torch.FloatTensor`): :math:`(B, )`, B is batch size. + - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: + - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: + - ``agent_state`` (:obj:`torch.Tensor`): :math:`(B, A, N0)`, where B is batch size and A is agent num. \ + N0 corresponds to ``agent_obs_shape``. + - ``global_state`` (:obj:`torch.Tensor`): :math:`(B, A, N1)`, where B is batch size and A is agent num. \ + N1 corresponds to ``global_obs_shape``. + - ``action_mask`` (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ + N2 corresponds to ``action_shape``. + - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, whose \ + key-values vary in different forward modes. + - logit (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ + N2 corresponds to ``action_shape``. + - action_mask (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ + N2 corresponds to ``action_shape``. Examples: - >>> # Regression mode - >>> model = DiscreteQAC(64, 64, 'regression') - >>> inputs = torch.randn(4, 64) - >>> actor_outputs = model(inputs,'compute_actor') - >>> assert actor_outputs['action'].shape == torch.Size([4, 64]) - >>> # Reparameterization Mode - >>> model = DiscreteQAC(64, 64, 'reparameterization') - >>> inputs = torch.randn(4, 64) - >>> actor_outputs = model(inputs,'compute_actor') - >>> actor_outputs['logit'][0].shape # mu - >>> torch.Size([4, 64]) - >>> actor_outputs['logit'][1].shape # sigma - >>> torch.Size([4, 64]) + >>> B = 32 + >>> agent_obs_shape = 216 + >>> global_obs_shape = 264 + >>> agent_num = 8 + >>> action_shape = 14 + >>> data = { + >>> 'obs': { + >>> 'agent_state': torch.randn(B, agent_num, agent_obs_shape), + >>> 'global_state': torch.randn(B, agent_num, global_obs_shape), + >>> 'action_mask': torch.randint(0, 2, size=(B, agent_num, action_shape)) + >>> } + >>> } + >>> model = DiscreteMAQAC(agent_obs_shape, global_obs_shape, action_shape, twin_critic=True) + >>> logit = model.compute_actor(data)['logit'] """ action_mask = inputs['obs']['action_mask'] x = self.actor(inputs['obs']['agent_state']) @@ -165,18 +223,60 @@ def compute_actor(self, inputs: Dict) -> Dict: def compute_critic(self, inputs: Dict) -> Dict: """ Overview: - use encoded embedding tensor to predict output of critic network. + use observation tensor to predict Q value. Arguments: - - ``obs``, ``action`` encoded tensors. - - mode (:obj:`str`): Name of the forward mode. 
+ - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: + - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: + - ``agent_state`` (:obj:`torch.Tensor`): The agent's observation tensor data, \ + with shape :math:`(B, A, N0)`, where B is batch size and A is agent num. \ + N0 corresponds to ``agent_obs_shape``. + - ``global_state`` (:obj:`torch.Tensor`): The global observation tensor data, \ + with shape :math:`(B, A, N1)`, where B is batch size and A is agent num. \ + N1 corresponds to ``global_obs_shape``. + - ``action_mask`` (:obj:`torch.Tensor`): The action mask tensor data, \ + with shape :math:`(B, A, N2)`, where B is batch size and A is agent num. \ + N2 corresponds to ``action_shape``. Returns: - - outputs (:obj:`Dict`): Q-value output. - ReturnKeys: - - q_value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. + - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, whose \ + key-values vary in different forward modes. + Forward with ``'compute_critic'``, if ``twin_critic`` is ``False``, Necessary Keys: + - q_value (:obj:`torch.Tensor`): Q value tensor is the shape of :math:`(B, A, N2)`, where B is batch size \ + and A is agent num. N2 corresponds to ``action_shape``. + Forward with ``'compute_critic'``, if ``twin_critic`` is ``True``, Necessary Keys: + - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N2)`, where B is batch size and \ + A is agent num. N2 corresponds to ``action_shape``. Shapes: - - obs (:obj:`torch.Tensor`): :math:`(B, N1)`, where B is batch size and N1 is ``obs_shape`` - - action (:obj:`torch.Tensor`): :math:`(B, N2)`, where B is batch size and N2 is ``action_shape`` - - q_value (:obj:`torch.FloatTensor`): :math:`(B, )`, where B is batch size. + - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: + - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: + - ``agent_state`` (:obj:`torch.Tensor`): :math:`(B, A, N0)`, where B is batch size and A is agent num. \ + N0 corresponds to ``agent_obs_shape``. + - ``global_state`` (:obj:`torch.Tensor`): :math:`(B, A, N1)`, where B is batch size and A is agent num. \ + N1 corresponds to ``global_obs_shape``. + - ``action_mask`` (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ + N2 corresponds to ``action_shape``. + - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, whose \ + key-values vary in different forward modes. + if ``twin_critic`` is ``True``, Necessary Keys: + - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N2)`, where B is batch size and \ + A is agent num. N2 corresponds to ``action_shape``. + if ``twin_critic`` is ``False``, Necessary Keys: + - q_value (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ + N2 corresponds to ``action_shape``. 
+ Examples: + >>> B = 32 + >>> agent_obs_shape = 216 + >>> global_obs_shape = 264 + >>> agent_num = 8 + >>> action_shape = 14 + >>> data = { + >>> 'obs': { + >>> 'agent_state': torch.randn(B, agent_num, agent_obs_shape), + >>> 'global_state': torch.randn(B, agent_num, global_obs_shape), + >>> 'action_mask': torch.randint(0, 2, size=(B, agent_num, action_shape)) + >>> } + >>> } + >>> model = DiscreteMAQAC(agent_obs_shape, global_obs_shape, action_shape, twin_critic=True) + >>> value = model.compute_critic(data)['q_value'] """ if self.twin_critic: @@ -215,7 +315,7 @@ def __init__( ) -> None: """ Overview: - Initiate the QAC Model according to arguments. + Initialize the QAC Model according to arguments. Arguments: - obs_shape (:obj:`Union[int, SequenceType]`): Observation's space. - action_shape (:obj:`Union[int, SequenceType, EasyDict]`): Action's space, such as 4, (3, ) @@ -299,50 +399,79 @@ def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: Overview: Use observation and action tensor to predict output in ``'compute_actor'`` or ``'compute_critic'`` mode. Arguments: - Forward with ``'compute_actor'``: - - inputs (:obj:`torch.Tensor`): - The encoded embedding tensor, determined with given ``hidden_size``, i.e. ``(B, N=hidden_size)``. - Whether ``actor_head_hidden_size`` or ``critic_head_hidden_size`` depend on ``mode``. - - Forward with ``'compute_critic'``, inputs (`Dict`) Necessary Keys: - - ``obs``, ``action`` encoded tensors. - + - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: + - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: + - ``agent_state`` (:obj:`torch.Tensor`): The agent's observation tensor data, \ + with shape :math:`(B, A, N0)`, where B is batch size and A is agent num. \ + N0 corresponds to ``agent_obs_shape``. + - ``global_state`` (:obj:`torch.Tensor`): The global observation tensor data, \ + with shape :math:`(B, A, N1)`, where B is batch size and A is agent num. \ + N1 corresponds to ``global_obs_shape``. + - ``action_mask`` (:obj:`torch.Tensor`): The action mask tensor data, \ + with shape :math:`(B, A, N2)`, where B is batch size and A is agent num. \ + N2 corresponds to ``action_shape``. + - ``action`` (:obj:`torch.Tensor`): The action tensor data, \ + with shape :math:`(B, A, N3)`, where B is batch size and A is agent num. \ + N3 corresponds to ``action_shape``. - mode (:obj:`str`): Name of the forward mode. Returns: - outputs (:obj:`Dict`): Outputs of network forward. - - Forward with ``'compute_actor'``, Necessary Keys (either): - - action (:obj:`torch.Tensor`): Action tensor with same size as input ``x``. - - logit (:obj:`torch.Tensor`): - Logit tensor encoding ``mu`` and ``sigma``, both with same size as input ``x``. - - Forward with ``'compute_critic'``, Necessary Keys: - - q_value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. - Actor Shapes: - - inputs (:obj:`torch.Tensor`): :math:`(B, N0)`, B is batch size and N0 corresponds to ``hidden_size`` - - action (:obj:`torch.Tensor`): :math:`(B, N0)` - - q_value (:obj:`torch.FloatTensor`): :math:`(B, )`, where B is batch size. 
- - Critic Shapes: - - obs (:obj:`torch.Tensor`): :math:`(B, N1)`, where B is batch size and N1 is ``obs_shape`` - - action (:obj:`torch.Tensor`): :math:`(B, N2)`, where B is batch size and N2 is``action_shape`` - - logit (:obj:`torch.FloatTensor`): :math:`(B, N2)`, where B is batch size and N3 is ``action_shape`` - - Actor Examples: - >>> # Regression mode - >>> model = ContinuousQAC(64, 64, 'regression') - >>> inputs = torch.randn(4, 64) - >>> actor_outputs = model(inputs,'compute_actor') - >>> assert actor_outputs['action'].shape == torch.Size([4, 64]) - >>> # Reparameterization Mode - >>> model = ContinuousQAC(64, 64, 'reparameterization') - >>> inputs = torch.randn(4, 64) - >>> actor_outputs = model(inputs,'compute_actor') - >>> actor_outputs['logit'][0].shape # mu - >>> torch.Size([4, 64]) - >>> actor_outputs['logit'][1].shape # sigma - >>> torch.Size([4, 64]) - + Forward with ``'compute_actor'``, if action_space == 'regression', Necessary Keys: + - action (:obj:`torch.Tensor`): Action tensor with same size as ``action_shape``. + Forward with ``'compute_actor'``, if action_space == 'reparameterization', Necessary Keys: + - logit (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N3)`, where B is batch size and \ + A is agent num. N3 corresponds to ``action_shape``. + Forward with ``'compute_critic'``, if ``twin_critic`` is ``True``, Necessary Keys: + - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A)`, where B is batch size and \ + A is agent num. + Forward with ``'compute_critic'``, if ``twin_critic`` is ``False``, Necessary Keys: + - q_value (:obj:`torch.Tensor`): :math:`(B, A)`, where B is batch size and A is agent num. + Shapes: + - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: + - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: + - ``agent_state`` (:obj:`torch.Tensor`): :math:`(B, A, N0)`, where B is batch size and A is agent num. \ + N0 corresponds to ``agent_obs_shape``. + - ``global_state`` (:obj:`torch.Tensor`): :math:`(B, A, N1)`, where B is batch size and A is agent num. \ + N1 corresponds to ``global_obs_shape``. + - ``action_mask`` (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ + N2 corresponds to ``action_shape``. + - ``action`` (:obj:`torch.Tensor`): :math:`(B, A, N3)`, where B is batch size and A is agent num. \ + N3 corresponds to ``action_shape``. + - outputs (:obj:`Dict`): Outputs of network forward. + Forward with ``'compute_actor'``, if action_space == 'regression', Necessary Keys: + - action (:obj:`torch.Tensor`): :math:`(B, A, N3)`, where B is batch size and A is agent num. \ + N3 corresponds to ``action_shape``. + Forward with ``'compute_actor'``, if action_space == 'reparameterization', Necessary Keys: + - logit (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N3)`, where B is batch size and \ + A is agent num. N3 corresponds to ``action_shape``. + Forward with ``'compute_critic'``, if ``twin_critic`` is ``True``, Necessary Keys: + - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A)`, where B is batch size and \ + A is agent num. + Forward with ``'compute_critic'``, if ``twin_critic`` is ``False``, Necessary Keys: + - q_value (:obj:`torch.Tensor`): :math:`(B, A)`, where B is batch size and A is agent num. 
+ Examples: + >>> B = 32 + >>> agent_obs_shape = 216 + >>> global_obs_shape = 264 + >>> agent_num = 8 + >>> action_shape = 14 + >>> action_space = 'regression' + >>> # or + >>> action_space = 'reparameterization' + >>> data = { + >>> 'obs': { + >>> 'agent_state': torch.randn(B, agent_num, agent_obs_shape), + >>> 'global_state': torch.randn(B, agent_num, global_obs_shape), + >>> 'action_mask': torch.randint(0, 2, size=(B, agent_num, action_shape)) + >>> }, + >>> 'action': torch.randn(B, agent_num, squeeze(action_shape)) + >>> } + >>> model = ContinuousMAQAC(agent_obs_shape, global_obs_shape, action_shape, action_space, twin_critic=False) + >>> if action_space == 'regression': + >>> action = model(data['obs'], mode='compute_actor')['action'] + >>> elif action_space == 'reparameterization': + >>> (mu, sigma) = model(data['obs'], mode='compute_actor')['logit'] + >>> value = model(data, mode='compute_critic')['q_value'] """ assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) return getattr(self, mode)(inputs) @@ -350,43 +479,47 @@ def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: def compute_actor(self, inputs: Dict) -> Dict: """ Overview: - Use encoded embedding tensor to predict output of actor network. + Use observation tensor to predict action logits. Arguments: - - inputs (:obj:`torch.Tensor`): - The encoded embedding tensor, determined with given ``hidden_size``, i.e. ``(B, N=hidden_size)``. - ``hidden_size = actor_head_hidden_size`` - - mode (:obj:`str`): Name of the forward mode. + - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: + - ``agent_state`` (:obj:`torch.Tensor`): The agent's observation tensor data, \ + with shape :math:`(B, A, N0)`, where B is batch size and A is agent num. \ + N0 corresponds to ``agent_obs_shape``. Returns: - - outputs (:obj:`Dict`): Outputs of forward pass encoder and head. - - ReturnsKeys (either): - - action (:obj:`torch.Tensor`): Continuous action tensor with same size as ``action_shape``. - - logit (:obj:`torch.Tensor`): - Logit tensor encoding ``mu`` and ``sigma``, both with same size as input ``x``. - - logit + action_args + - outputs (:obj:`Dict`): Outputs of network forward. + if action_space == 'regression', Necessary Keys: + - action (:obj:`torch.Tensor`): Action tensor with same size as ``action_shape``. + if action_space == 'reparameterization', Necessary Keys: + - logit (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N3)`, where B is batch size and \ + A is agent num. N3 corresponds to ``action_shape``. Shapes: - - inputs (:obj:`torch.Tensor`): :math:`(B, N0)`, B is batch size and N0 corresponds to ``hidden_size`` - - action (:obj:`torch.Tensor`): :math:`(B, N0)` - - logit (:obj:`Union[list, torch.Tensor]`): - - case1(continuous space, list): 2 elements, mu and sigma, each is the shape of :math:`(B, N0)`. - - case2(hybrid space, torch.Tensor): :math:`(B, N1)`, where N1 is action_type_shape - - q_value (:obj:`torch.FloatTensor`): :math:`(B, )`, B is batch size. - - action_args (:obj:`torch.FloatTensor`): :math:`(B, N2)`, where N2 is action_args_shape - (action_args are continuous real value) + - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: + - ``agent_state`` (:obj:`torch.Tensor`): :math:`(B, A, N0)`, where B is batch size and A is agent num. \ + N0 corresponds to ``agent_obs_shape``. + - outputs (:obj:`Dict`): Outputs of network forward. 
+ if action_space == 'regression', Necessary Keys: + - action (:obj:`torch.Tensor`): :math:`(B, A, N3)`, where B is batch size and A is agent num. \ + N3 corresponds to ``action_shape``. + if action_space == 'reparameterization', Necessary Keys: + - logit (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N3)`, where B is batch size and \ + A is agent num. N3 corresponds to ``action_shape``. Examples: - >>> # Regression mode - >>> model = ContinuousQAC(64, 64, 'regression') - >>> inputs = torch.randn(4, 64) - >>> actor_outputs = model(inputs,'compute_actor') - >>> assert actor_outputs['action'].shape == torch.Size([4, 64]) - >>> # Reparameterization Mode - >>> model = ContinuousQAC(64, 64, 'reparameterization') - >>> inputs = torch.randn(4, 64) - >>> actor_outputs = model(inputs,'compute_actor') - >>> actor_outputs['logit'][0].shape # mu - >>> torch.Size([4, 64]) - >>> actor_outputs['logit'][1].shape # sigma - >>> torch.Size([4, 64]) + >>> B = 32 + >>> agent_obs_shape = 216 + >>> global_obs_shape = 264 + >>> agent_num = 8 + >>> action_shape = 14 + >>> action_space = 'regression' + >>> # or + >>> action_space = 'reparameterization' + >>> data = { + >>> 'agent_state': torch.randn(B, agent_num, agent_obs_shape), + >>> } + >>> model = ContinuousMAQAC(agent_obs_shape, global_obs_shape, action_shape, action_space, twin_critic=False) + >>> if action_space == 'regression': + >>> action = model.compute_actor(data)['action'] + >>> elif action_space == 'reparameterization': + >>> (mu, sigma) = model.compute_actor(data)['logit'] """ inputs = inputs['agent_state'] if self.action_space == 'regression': @@ -399,25 +532,65 @@ def compute_actor(self, inputs: Dict) -> Dict: def compute_critic(self, inputs: Dict) -> Dict: """ Overview: - Use encoded embedding tensor to predict output of critic network. + Use observation tensor and action tensor to predict Q value. Arguments: - - inputs (:obj: `Dict`): ``obs``, ``action`` and ``logit` tensors. - - mode (:obj:`str`): Name of the forward mode. + - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: + - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: + - ``agent_state`` (:obj:`torch.Tensor`): The agent's observation tensor data, \ + with shape :math:`(B, A, N0)`, where B is batch size and A is agent num. \ + N0 corresponds to ``agent_obs_shape``. + - ``global_state`` (:obj:`torch.Tensor`): The global observation tensor data, \ + with shape :math:`(B, A, N1)`, where B is batch size and A is agent num. \ + N1 corresponds to ``global_obs_shape``. + - ``action_mask`` (:obj:`torch.Tensor`): The action mask tensor data, \ + with shape :math:`(B, A, N2)`, where B is batch size and A is agent num. \ + N2 corresponds to ``action_shape``. + - ``action`` (:obj:`torch.Tensor`): The action tensor data, \ + with shape :math:`(B, A, N3)`, where B is batch size and A is agent num. \ + N3 corresponds to ``action_shape``. Returns: - - outputs (:obj:`Dict`): Q-value output. - - ArgumentsKeys: - - necessary: - - obs: (:obj:`torch.Tensor`): 2-dim vector observation - - action (:obj:`Union[torch.Tensor, Dict]`): action from actor - - optional: - - logit (:obj:`torch.Tensor`): discrete action logit - ReturnKeys: - - q_value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. + - outputs (:obj:`Dict`): Outputs of network forward. + if ``twin_critic`` is ``True``, Necessary Keys: + - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A)`, where B is batch size and \ + A is agent num. 
+ if ``twin_critic`` is ``False``, Necessary Keys: + - q_value (:obj:`torch.Tensor`): :math:`(B, A)`, where B is batch size and A is agent num. Shapes: - - obs (:obj:`torch.Tensor`): :math:`(B, N1)`, where B is batch size and N1 is ``obs_shape`` - - action (:obj:`torch.Tensor`): :math:`(B, N2)`, where B is batch size and N2 is ``action_shape`` - - q_value (:obj:`torch.FloatTensor`): :math:`(B, )`, where B is batch size. + - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: + - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: + - ``agent_state`` (:obj:`torch.Tensor`): :math:`(B, A, N0)`, where B is batch size and A is agent num. \ + N0 corresponds to ``agent_obs_shape``. + - ``global_state`` (:obj:`torch.Tensor`): :math:`(B, A, N1)`, where B is batch size and A is agent num. \ + N1 corresponds to ``global_obs_shape``. + - ``action_mask`` (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ + N2 corresponds to ``action_shape``. + - ``action`` (:obj:`torch.Tensor`): :math:`(B, A, N3)`, where B is batch size and A is agent num. \ + N3 corresponds to ``action_shape``. + - outputs (:obj:`Dict`): Outputs of network forward. + if ``twin_critic`` is ``True``, Necessary Keys: + - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A)`, where B is batch size and \ + A is agent num. + if ``twin_critic`` is ``False``, Necessary Keys: + - q_value (:obj:`torch.Tensor`): :math:`(B, A)`, where B is batch size and A is agent num. + Examples: + >>> B = 32 + >>> agent_obs_shape = 216 + >>> global_obs_shape = 264 + >>> agent_num = 8 + >>> action_shape = 14 + >>> action_space = 'regression' + >>> # or + >>> action_space = 'reparameterization' + >>> data = { + >>> 'obs': { + >>> 'agent_state': torch.randn(B, agent_num, agent_obs_shape), + >>> 'global_state': torch.randn(B, agent_num, global_obs_shape), + >>> 'action_mask': torch.randint(0, 2, size=(B, agent_num, action_shape)) + >>> }, + >>> 'action': torch.randn(B, agent_num, squeeze(action_shape)) + >>> } + >>> model = ContinuousMAQAC(agent_obs_shape, global_obs_shape, action_shape, action_space, twin_critic=False) + >>> value = model.compute_critic(data)['q_value'] """ obs, action = inputs['obs']['global_state'], inputs['action'] diff --git a/ding/model/template/q_learning.py b/ding/model/template/q_learning.py index 8ba8487796..ece076bd81 100644 --- a/ding/model/template/q_learning.py +++ b/ding/model/template/q_learning.py @@ -377,7 +377,7 @@ def __init__( ) -> None: """ Overview: - Initiate the QRDQN Model according to input arguments. + Initialize the QRDQN Model according to input arguments. Arguments: - obs_shape (:obj:`Union[int, SequenceType]`): Observation's space. - action_shape (:obj:`Union[int, SequenceType]`): Action's space. @@ -487,7 +487,7 @@ def __init__( ) -> None: """ Overview: - Initiate the IQN Model according to input arguments. + Initialize the IQN Model according to input arguments. Arguments: - obs_shape (:obj:`Union[int, SequenceType]`): Observation space shape. - action_shape (:obj:`Union[int, SequenceType]`): Action space shape. @@ -601,7 +601,7 @@ def __init__( ) -> None: """ Overview: - Initiate the FQF Model according to input arguments. + Initialize the FQF Model according to input arguments. Arguments: - obs_shape (:obj:`Union[int, SequenceType]`): Observation space shape. - action_shape (:obj:`Union[int, SequenceType]`): Action space shape.