diff --git a/ding/entry/tests/test_serial_entry.py b/ding/entry/tests/test_serial_entry.py index a9123d0fcf..1a44c7b548 100644 --- a/ding/entry/tests/test_serial_entry.py +++ b/ding/entry/tests/test_serial_entry.py @@ -284,15 +284,10 @@ def test_sac_log_space(): assert False, "pipeline fail" -auto_alpha = [True, False] -log_space = [True, False] -args = [item for item in product(*[auto_alpha, log_space])] - - @pytest.mark.platformtest @pytest.mark.unittest -@pytest.mark.parametrize('auto_alpha, log_space', args) -def test_discrete_sac(auto_alpha, log_space): +def test_discrete_sac(): + auto_alpha, log_space = True, False config = [deepcopy(cartpole_sac_config), deepcopy(cartpole_sac_create_config)] config[0].policy.learn.update_per_collect = 1 config[0].policy.learn.auto_alpha = auto_alpha diff --git a/ding/framework/tests/test_parallel.py b/ding/framework/tests/test_parallel.py index 8d2cf648c2..7bdf6ea343 100644 --- a/ding/framework/tests/test_parallel.py +++ b/ding/framework/tests/test_parallel.py @@ -39,7 +39,7 @@ def uncaught_exception_main(): time.sleep(0.2) -@pytest.mark.unittest +@pytest.mark.tmp def test_uncaught_exception(): # Make one process crash, then the parent process will also crash and output the stack of the wrong process. with pytest.raises(Exception) as exc_info: @@ -70,7 +70,7 @@ def disconnected_main(): assert i == 9 -@pytest.mark.unittest +@pytest.mark.tmp def test_disconnected(): # Make one process exit normally and the rest will still run, even if the network request # is not received by other processes. @@ -141,7 +141,7 @@ def main(cls): raise Exception("Invalid node id") -@pytest.mark.unittest +@pytest.mark.tmp def test_auto_recover(): # With max_retries=1 Parallel.runner( diff --git a/ding/framework/tests/test_supervisor.py b/ding/framework/tests/test_supervisor.py index b4fdb95dc0..d6f4c646fa 100644 --- a/ding/framework/tests/test_supervisor.py +++ b/ding/framework/tests/test_supervisor.py @@ -29,7 +29,7 @@ def sleep1(self): sleep(1) -@pytest.mark.unittest +@pytest.mark.tmp @pytest.mark.parametrize("type_", [ChildType.PROCESS, ChildType.THREAD]) def test_supervisor(type_): sv = Supervisor(type_=type_) @@ -74,7 +74,7 @@ def test_supervisor(type_): sv.shutdown() -@pytest.mark.unittest +@pytest.mark.tmp def test_supervisor_spawn(): sv = Supervisor(type_=ChildType.PROCESS, mp_ctx=mp.get_context("spawn")) for _ in range(3): @@ -103,7 +103,7 @@ def step(self, _): return self._counter -# @pytest.mark.unittest +@pytest.mark.tmp @pytest.mark.parametrize("type_", [ChildType.PROCESS, ChildType.THREAD]) def test_crash_supervisor(type_): sv = Supervisor(type_=type_) @@ -143,7 +143,7 @@ def test_crash_supervisor(type_): sv.shutdown() -@pytest.mark.unittest +@pytest.mark.tmp @pytest.mark.parametrize("type_", [ChildType.PROCESS, ChildType.THREAD]) def test_recv_all(type_): sv = Supervisor(type_=type_) diff --git a/ding/framework/tests/test_task.py b/ding/framework/tests/test_task.py index 8b6f9ee1de..67f3dc34c7 100644 --- a/ding/framework/tests/test_task.py +++ b/ding/framework/tests/test_task.py @@ -124,12 +124,12 @@ def _counter(ctx): assert sync_count > 0 -@pytest.mark.unittest +@pytest.mark.tmp def test_parallel_pipeline(): Parallel.runner(n_parallel_workers=2, startup_interval=0.1)(parallel_main) -@pytest.mark.unittest +@pytest.mark.tmp def test_emit(): with task.start(): greets = [] @@ -161,12 +161,12 @@ def emit_remote_main(): assert len(greets) == 0 -@pytest.mark.unittest +@pytest.mark.tmp def test_emit_remote(): Parallel.runner(n_parallel_workers=2, 
startup_interval=0.1)(emit_remote_main) -@pytest.mark.unittest +@pytest.mark.tmp def test_wait_for(): # Wait for will only work in async or parallel mode with task.start(async_mode=True, n_async_workers=2): @@ -198,7 +198,7 @@ def step1(_): task.run(max_step=1) -@pytest.mark.unittest +@pytest.mark.tmp def test_async_exception(): with task.start(async_mode=True, n_async_workers=2): @@ -227,12 +227,12 @@ def early_stop_main(): assert task.ctx.total_step < 7 -@pytest.mark.unittest +@pytest.mark.tmp def test_early_stop(): Parallel.runner(n_parallel_workers=2, startup_interval=0.1)(early_stop_main) -@pytest.mark.unittest +@pytest.mark.tmp def test_parallel_in_sequencial(): result = [] @@ -250,7 +250,7 @@ def slow(_): assert result == ["begin", "fast", "slow"] -@pytest.mark.unittest +@pytest.mark.tmp def test_serial_in_parallel(): result = [] diff --git a/ding/model/common/encoder.py b/ding/model/common/encoder.py index 8e3f8a5c2a..e22112601e 100644 --- a/ding/model/common/encoder.py +++ b/ding/model/common/encoder.py @@ -23,7 +23,7 @@ def prod(iterable): class ConvEncoder(nn.Module): """ Overview: - The ``Convolution Encoder`` used to encode raw 2-dim image observations (e.g. Atari/Procgen). + The ``Convolution Encoder`` used to encode raw 2-dim image observations (e.g. Atari/Procgen). Interfaces: ``__init__``, ``forward``. """ diff --git a/ding/model/common/head.py b/ding/model/common/head.py index 1f2d324430..c1d27fba89 100755 --- a/ding/model/common/head.py +++ b/ding/model/common/head.py @@ -290,7 +290,7 @@ def forward(self, x: torch.Tensor) -> Dict: class RainbowHead(nn.Module): """ Overview: - The ``RainbowHead`` used to output Q-value distribution, which is used in Rainbow DQN. + The ``RainbowHead`` used to output Q-value distribution, which is used in Rainbow DQN. Interfaces: ``__init__``, ``forward``. """ @@ -394,7 +394,7 @@ def forward(self, x: torch.Tensor) -> Dict: class QRDQNHead(nn.Module): """ Overview: - The ``QRDQNHead`` (Quantile Regression DQN) used to output action quantiles. + The ``QRDQNHead`` (Quantile Regression DQN) used to output action quantiles. Interfaces: ``__init__``, ``forward``. 
""" diff --git a/ding/model/common/utils.py b/ding/model/common/utils.py index b7c2159215..0f508de0b8 100644 --- a/ding/model/common/utils.py +++ b/ding/model/common/utils.py @@ -1,3 +1,4 @@ +import copy import torch from easydict import EasyDict from ding.utils import import_module, MODEL_REGISTRY diff --git a/ding/model/template/__init__.py b/ding/model/template/__init__.py index a291936d64..df5f337888 100755 --- a/ding/model/template/__init__.py +++ b/ding/model/template/__init__.py @@ -1,12 +1,12 @@ # general -from .q_learning import DQN, RainbowDQN, QRDQN, IQN, FQF, DRQN, C51DQN, BDQ -from .qac import QAC, DiscreteQAC +from .q_learning import DQN, RainbowDQN, QRDQN, IQN, FQF, DRQN, C51DQN, BDQ, GTrXLDQN +from .qac import DiscreteQAC, ContinuousQAC from .pdqn import PDQN from .vac import VAC, DREAMERVAC from .bc import DiscreteBC, ContinuousBC -from .pg import PG from .language_transformer import LanguageTransformer # algorithm-specific +from .pg import PG from .ppg import PPG from .qmix import Mixer, QMix from .collaq import CollaQ @@ -19,10 +19,10 @@ from .mavac import MAVAC from .ngu import NGU from .qac_dist import QACDIST -from .maqac import MAQAC, ContinuousMAQAC +from .maqac import DiscreteMAQAC, ContinuousMAQAC from .madqn import MADQN from .vae import VanillaVAE from .dt import DecisionTransformer from .procedure_cloning import ProcedureCloningMCTS, ProcedureCloningBFS from .bcq import BCQ -from .edac import QACEnsemble +from .edac import EDAC diff --git a/ding/model/template/bc.py b/ding/model/template/bc.py index b40ef4f118..4568e3ce1c 100644 --- a/ding/model/template/bc.py +++ b/ding/model/template/bc.py @@ -177,10 +177,10 @@ def forward(self, inputs: Union[torch.Tensor, Dict[str, torch.Tensor]]) -> Dict[ """ Overview: The unique execution (forward) method of ContinuousBC method. - Arguments: - - inputs (:obj:`torch.Tensor`): Observation data, defaults to tensor. - Returns: - - output (:obj:`Dict`): Output dict data, including differnet key-values among distinct action_space. + Arguments: + - inputs (:obj:`torch.Tensor`): Observation data, defaults to tensor. + Returns: + - output (:obj:`Dict`): Output dict data, including differnet key-values among distinct action_space. """ if self.action_space == 'regression': x = self.actor(inputs) diff --git a/ding/model/template/edac.py b/ding/model/template/edac.py index 49789f8acc..397ba69763 100755 --- a/ding/model/template/edac.py +++ b/ding/model/template/edac.py @@ -10,10 +10,10 @@ @MODEL_REGISTRY.register('edac') -class QACEnsemble(nn.Module): - r""" +class EDAC(nn.Module): + """ Overview: - The QAC network with ensemble, which is used in EDAC. + The Q-value Actor-Critic network with the ensemble mechanism, which is used in EDAC. Interfaces: ``__init__``, ``forward``, ``compute_actor``, ``compute_critic`` """ @@ -51,7 +51,7 @@ def __init__( - norm_type (:obj:`Optional[str]`): The type of normalization to after network layer (FC, Conv), \ see ``ding.torch_utils.network`` for more details. """ - super(QACEnsemble, self).__init__() + super(EDAC, self).__init__() obs_shape: int = squeeze(obs_shape) action_shape = squeeze(action_shape) self.action_shape = action_shape @@ -94,6 +94,7 @@ def forward(self, inputs: Union[torch.Tensor, Dict[str, torch.Tensor]], mode: st - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. Returns: - output (:obj:`Dict`): Output dict data, including q_value tensor. + .. note:: For specific examples, one can refer to API doc of ``compute_actor`` and ``compute_critic`` respectively. 
""" @@ -125,7 +126,7 @@ def compute_actor(self, obs: torch.Tensor) -> Dict[str, Union[torch.Tensor, Dict - action_args (:obj:`torch.Tensor`): :math:`(B, N3)`, B is batch size and N3 corresponds to \ ``action_shape.action_args_shape``. Examples: - >>> model = QACEnsemble(64, 64,) + >>> model = EDAC(64, 64,) >>> obs = torch.randn(4, 64) >>> actor_outputs = model(obs,'compute_actor') >>> assert actor_outputs['logit'][0].shape == torch.Size([4, 64]) # mu diff --git a/ding/model/template/maqac.py b/ding/model/template/maqac.py index e6ddd996dd..798a3753f5 100644 --- a/ding/model/template/maqac.py +++ b/ding/model/template/maqac.py @@ -9,11 +9,11 @@ FCEncoder, ConvEncoder -@MODEL_REGISTRY.register('maqac') -class MAQAC(nn.Module): - r""" +@MODEL_REGISTRY.register('discrete_maqac') +class DiscreteMAQAC(nn.Module): + """ Overview: - The MAQAC model. + The discrete action Multi-Agent Q-value Actor-CritiC (MAQAC) model. Interfaces: ``__init__``, ``forward``, ``compute_actor``, ``compute_critic`` """ @@ -32,9 +32,9 @@ def __init__( activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, ) -> None: - r""" + """ Overview: - Init the MAQAC Model according to arguments. + Init the DiscreteMAQAC Model according to arguments. Arguments: - agent_obs_shape (:obj:`Union[int, SequenceType]`): Agent's observation's space. - global_obs_shape (:obj:`Union[int, SequenceType]`): Global observation's space. @@ -42,18 +42,17 @@ def __init__( - action_shape (:obj:`Union[int, SequenceType]`): Action's space. - twin_critic (:obj:`bool`): Whether include twin critic. - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor-nn's ``Head``. - - actor_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for actor's nn. + - actor_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ + for actor's nn. - critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to critic-nn's ``Head``. - - critic_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for critic's nn. - - activation (:obj:`Optional[nn.Module]`): - The type of activation function to use in ``MLP`` the after ``layer_fn``, - if ``None`` then default set to ``nn.ReLU()`` - - norm_type (:obj:`Optional[str]`): - The type of normalization to use, see ``ding.torch_utils.fc_block`` for more details. + - critic_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ + for critic's nn. + - activation (:obj:`Optional[nn.Module]`): The type of activation function to use in ``MLP`` the after \ + ``layer_fn``, if ``None`` then default set to ``nn.ReLU()`` + - norm_type (:obj:`Optional[str]`): The type of normalization to use, see ``ding.torch_utils.fc_block`` \ + for more details. """ - super(MAQAC, self).__init__() + super(DiscreteMAQAC, self).__init__() agent_obs_shape: int = squeeze(agent_obs_shape) action_shape: int = squeeze(action_shape) self.actor = nn.Sequential( @@ -188,11 +187,11 @@ def compute_critic(self, inputs: Dict) -> Dict: return {'q_value': x} -@MODEL_REGISTRY.register('maqac_continuous') +@MODEL_REGISTRY.register('continuous_maqac') class ContinuousMAQAC(nn.Module): - r""" + """ Overview: - The Continuous MAQAC model. + The continuous action Multi-Agent Q-value Actor-CritiC (MAQAC) model. 
Interfaces: ``__init__``, ``forward``, ``compute_actor``, ``compute_critic`` """ @@ -212,7 +211,7 @@ def __init__( activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, ) -> None: - r""" + """ Overview: Init the QAC Model according to arguments. Arguments: @@ -221,16 +220,15 @@ def __init__( - action_space (:obj:`str`): Whether choose ``regression`` or ``reparameterization``. - twin_critic (:obj:`bool`): Whether include twin critic. - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor-nn's ``Head``. - - actor_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for actor's nn. + - actor_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ + for actor's nn. - critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to critic-nn's ``Head``. - - critic_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for critic's nn. - - activation (:obj:`Optional[nn.Module]`): - The type of activation function to use in ``MLP`` the after ``layer_fn``, - if ``None`` then default set to ``nn.ReLU()`` - - norm_type (:obj:`Optional[str]`): - The type of normalization to use, see ``ding.torch_utils.fc_block`` for more details. + - critic_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ + for critic's nn. + - activation (:obj:`Optional[nn.Module]`): The type of activation function to use in ``MLP`` the after \ + ``layer_fn``, if ``None`` then default set to ``nn.ReLU()`` + - norm_type (:obj:`Optional[str]`): The type of normalization to use, see ``ding.torch_utils.fc_block`` \ + for more details. """ super(ContinuousMAQAC, self).__init__() obs_shape: int = squeeze(agent_obs_shape) @@ -238,7 +236,7 @@ def __init__( action_shape = squeeze(action_shape) self.action_shape = action_shape self.action_space = action_space - assert self.action_space in ['regression', 'reparameterization'] + assert self.action_space in ['regression', 'reparameterization'], self.action_space if self.action_space == 'regression': # DDPG, TD3 self.actor = nn.Sequential( nn.Linear(obs_shape, actor_head_hidden_size), activation, diff --git a/ding/model/template/q_learning.py b/ding/model/template/q_learning.py index 013790cd65..e1ddbd6e5f 100644 --- a/ding/model/template/q_learning.py +++ b/ding/model/template/q_learning.py @@ -11,6 +11,19 @@ @MODEL_REGISTRY.register('dqn') class DQN(nn.Module): + """ + Overview: + The neural nework structure and computation graph of Deep Q Network (DQN) algorithm, which is the most classic \ + value-based RL algorithm for discrete action. The DQN is composed of two parts: ``encoder`` and ``head``. \ + The ``encoder`` is used to extract the feature from various observation, and the ``head`` is used to compute \ + the Q value of each action dimension. + Interfaces: + ``__init__``, ``forward``. + + .. note:: + Current ``DQN`` supports two types of encoder: ``FCEncoder`` and ``ConvEncoder``, two types of head: \ + ``DiscreteHead`` and ``DuelingHead``. You can customize your own encoder or head by inheriting this class. + """ def __init__( self, @@ -26,21 +39,22 @@ def __init__( ) -> None: """ Overview: - Init the DQN (encoder + head) Model according to input arguments. + initialize the DQN (encoder + head) Model according to corresponding input arguments. Arguments: - obs_shape (:obj:`Union[int, SequenceType]`): Observation space shape, such as 8 or [4, 84, 84]. 
- action_shape (:obj:`Union[int, SequenceType]`): Action space shape, such as 6 or [2, 3, 3]. - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder``, \ the last element must match ``head_hidden_size``. - - dueling (:obj:`Optional[bool]`): Whether choose ``DuelingHead`` or ``DiscreteHead(default)``. - - head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` of head network. - - head_layer_num (:obj:`int`): The number of layers used in the head network to compute Q value output + - dueling (:obj:`Optional[bool]`): Whether choose ``DuelingHead`` or ``DiscreteHead (default)``. + - head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` of head network, defaults to None, \ + then it will be set to the last element of ``encoder_hidden_size_list``. + - head_layer_num (:obj:`int`): The number of layers used in the head network to compute Q value output. - activation (:obj:`Optional[nn.Module]`): The type of activation function in networks \ if ``None`` then default set it to ``nn.ReLU()``. - norm_type (:obj:`Optional[str]`): The type of normalization in networks, see \ ``ding.torch_utils.fc_block`` for more details. you can choose one of ['BN', 'IN', 'SyncBN', 'LN'] - dropout (:obj:`Optional[float]`): The dropout rate of the dropout layer. \ - if ``None`` then default no dropout layer. + if ``None`` then default disable dropout layer. """ super(DQN, self).__init__() # Squeeze data from tuple, list or dict to single object. For example, from (4, ) to 4 @@ -91,19 +105,23 @@ def forward(self, x: torch.Tensor) -> Dict: Overview: DQN forward computation graph, input observation tensor to predict q_value. Arguments: - - x (:obj:`torch.Tensor`): Observation inputs + - x (:obj:`torch.Tensor`): The input observation tensor data. Returns: - - outputs (:obj:`Dict`): DQN forward outputs, such as q_value. + - outputs (:obj:`Dict`): The output of DQN's forward, including q_value. ReturnsKeys: - - logit (:obj:`torch.Tensor`): Discrete Q-value output of each action dimension. + - logit (:obj:`torch.Tensor`): Discrete Q-value output of each possible action dimension. Shapes: - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is ``obs_shape`` - - logit (:obj:`torch.FloatTensor`): :math:`(B, M)`, where B is batch size and M is ``action_shape`` + - logit (:obj:`torch.Tensor`): :math:`(B, M)`, where B is batch size and M is ``action_shape`` Examples: >>> model = DQN(32, 6) # arguments: 'obs_shape' and 'action_shape' >>> inputs = torch.randn(4, 32) >>> outputs = model(inputs) >>> assert isinstance(outputs, dict) and outputs['logit'].shape == torch.Size([4, 6]) + + .. note:: + For consistency and compatibility, we name all the outputs of the network which are related to action \ + selections as ``logit``. """ x = self.encoder(x) x = self.head(x) @@ -207,6 +225,18 @@ def forward(self, x: torch.Tensor) -> Dict: @MODEL_REGISTRY.register('c51dqn') class C51DQN(nn.Module): + """ + Overview: + The neural network structure and computation graph of C51DQN, which combines distributional RL and DQN. \ + You can refer to https://arxiv.org/pdf/1707.06887.pdf for more details. The C51DQN is composed of \ + ``encoder`` and ``head``. ``encoder`` is used to extract the feature of observation, and ``head`` is \ + used to compute the distribution of Q-value. + Interfaces: + ``__init__``, ``forward`` + + .. note:: + Current C51DQN supports two types of encoder: ``FCEncoder`` and ``ConvEncoder``. 
+ """ def __init__( self, @@ -221,21 +251,27 @@ def __init__( v_max: Optional[float] = 10, n_atom: Optional[int] = 51, ) -> None: - r""" + """ Overview: - Init the C51 Model according to input arguments. + initialize the C51 Model according to corresponding input arguments. Arguments: - - obs_shape (:obj:`Union[int, SequenceType]`): Observation's space. - - action_shape (:obj:`Union[int, SequenceType]`): Action's space. - - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder`` - - head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to ``Head``. - - head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output - - activation (:obj:`Optional[nn.Module]`): - The type of activation function to use in ``MLP`` the after ``layer_fn``, - if ``None`` then default set to ``nn.ReLU()`` - - norm_type (:obj:`Optional[str]`): - The type of normalization to use, see ``ding.torch_utils.fc_block`` for more details` - - n_atom (:obj:`Optional[int]`): Number of atoms in the prediction distribution. + - obs_shape (:obj:`Union[int, SequenceType]`): Observation space shape, such as 8 or [4, 84, 84]. + - action_shape (:obj:`Union[int, SequenceType]`): Action space shape, such as 6 or [2, 3, 3]. + - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder``, \ + the last element must match ``head_hidden_size``. + - head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` of head network, defaults to None, \ + then it will be set to the last element of ``encoder_hidden_size_list``. + - head_layer_num (:obj:`int`): The number of layers used in the head network to compute Q value output. + - activation (:obj:`Optional[nn.Module]`): The type of activation function in networks \ + if ``None`` then default set it to ``nn.ReLU()``. + - norm_type (:obj:`Optional[str]`): The type of normalization in networks, see \ + ``ding.torch_utils.fc_block`` for more details. you can choose one of ['BN', 'IN', 'SyncBN', 'LN'] + - v_min (:obj:`Optional[float]`): The minimum value of the support of the distribution, which is related \ + to the value (discounted sum of reward) scale of the specific environment. Defaults to -10. + - v_max (:obj:`Optional[float]`): The maximum value of the support of the distribution, which is related \ + to the value (discounted sum of reward) scale of the specific environment. Defaults to 10. + - n_atom (:obj:`Optional[int]`): The number of atoms in the prediction distribution, 51 is the default \ + value in the paper, you can also try other values such as 301. """ super(C51DQN, self).__init__() # For compatibility: 1, (1, ), [4, 32, 32] @@ -279,24 +315,28 @@ def __init__( ) def forward(self, x: torch.Tensor) -> Dict: - r""" + """ + Returns: + - outputs (:obj:`Dict`): The output of DQN's forward, including q_value. + ReturnsKeys: + - logit (:obj:`torch.Tensor`): Discrete Q-value output of each possible action dimension. + Shapes: + - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is ``obs_shape`` + - logit (:obj:`torch.Tensor`): :math:`(B, M)`, where B is batch size and M is ``action_shape`` Overview: - Use observation tensor to predict C51DQN's output. - Parameter updates with C51DQN's MLPs forward setup. + C51DQN forward computation graph, input observation tensor to predict q_value and its distribution. Arguments: - - x (:obj:`torch.Tensor`): - The encoded embedding tensor w/ ``(B, N=head_hidden_size)``. 
+ - x (:obj:`torch.Tensor`): The input observation tensor data. Returns: - - outputs (:obj:`Dict`): - Run with encoder and head. Return the result prediction dictionary. - + - outputs (:obj:`Dict`): The output of DQN's forward, including q_value, and distribution. ReturnsKeys: - - logit (:obj:`torch.Tensor`): Logit tensor with same size as input ``x``. - - distribution (:obj:`torch.Tensor`): Distribution tensor of size ``(B, N, n_atom)`` + - logit (:obj:`torch.Tensor`): Discrete Q-value output of each possible action dimension. + - distribution (:obj:`torch.Tensor`): Q-Value discretized distribution, i.e., probability of each \ + uniformly spaced atom Q-value, such as dividing [-10, 10] into 51 uniform spaces. Shapes: - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is head_hidden_size. - - logit (:obj:`torch.FloatTensor`): :math:`(B, M)`, where M is action_shape. - - distribution(:obj:`torch.FloatTensor`): :math:`(B, M, P)`, where P is n_atom. + - logit (:obj:`torch.Tensor`): :math:`(B, M)`, where M is action_shape. + - distribution(:obj:`torch.Tensor`): :math:`(B, M, P)`, where P is n_atom. Examples: >>> model = C51DQN(128, 64) # arguments: 'obs_shape' and 'action_shape' @@ -307,6 +347,14 @@ def forward(self, x: torch.Tensor) -> Dict: >>> assert outputs['logit'].shape == torch.Size([4, 64]) >>> # default n_atom: int = 51 >>> assert outputs['distribution'].shape == torch.Size([4, 64, 51]) + + .. note:: + For consistency and compatibility, we name all the outputs of the network which are related to action \ + selections as ``logit``. + + .. note:: + For convenience, we recommend that the number of atoms should be odd, so that the middle atom is exactly \ + the value of the Q-value. """ x = self.encoder(x) x = self.head(x) @@ -640,7 +688,7 @@ class RainbowDQN(nn.Module): RainbowDQN network (C51 + Dueling + Noisy Block) .. note:: - RainbowDQN contains dueling architecture by default + RainbowDQN contains dueling architecture by default. """ def __init__( @@ -787,7 +835,18 @@ def reshape(d): class DRQN(nn.Module): """ Overview: - DQN + RNN = DRQN + The neural network structure and computation graph of DRQN (DQN + RNN = DRQN) algorithm, which is the most \ + common DQN variant for sequential data and paratially observable environment. The DRQN is composed of three \ + parts: ``encoder``, ``head`` and ``rnn``. The ``encoder`` is used to extract the feature from various \ + observation, the ``rnn`` is used to process the sequential observation and other data, and the ``head`` is \ + used to compute the Q value of each action dimension. + Interfaces: + ``__init__``, ``forward``. + + .. note:: + Current ``DRQN`` supports two types of encoder: ``FCEncoder`` and ``ConvEncoder``, two types of head: \ + ``DiscreteHead`` and ``DuelingHead``, three types of rnn: ``normal (LSTM with LayerNorm)``, ``pytorch`` and \ + ``gru``. You can customize your own encoder, rnn or head by inheriting this class. """ def __init__( @@ -803,21 +862,25 @@ def __init__( norm_type: Optional[str] = None, res_link: bool = False ) -> None: - r""" + """ Overview: - Init the DRQN Model according to arguments. + Initialize the DRQN Model according to the corresponding input arguments. Arguments: - - obs_shape (:obj:`Union[int, SequenceType]`): Observation's space. - - action_shape (:obj:`Union[int, SequenceType]`): Action's space. 
- - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder`` - - head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to ``Head``. - - lstm_type (:obj:`Optional[str]`): Version of rnn cell, now support ['normal', 'pytorch', 'hpc', 'gru'] - - activation (:obj:`Optional[nn.Module]`): - The type of activation function to use in ``MLP`` the after ``layer_fn``, - if ``None`` then default set to ``nn.ReLU()`` - - norm_type (:obj:`Optional[str]`): - The type of normalization to use, see ``ding.torch_utils.fc_block`` for more details` - - res_link (:obj:`bool`): use the residual link or not, default to False + - obs_shape (:obj:`Union[int, SequenceType]`): Observation space shape, such as 8 or [4, 84, 84]. + - action_shape (:obj:`Union[int, SequenceType]`): Action space shape, such as 6 or [2, 3, 3]. + - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder``, \ + the last element must match ``head_hidden_size``. + - dueling (:obj:`Optional[bool]`): Whether choose ``DuelingHead`` or ``DiscreteHead (default)``. + - head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` of head network, defaults to None, \ + then it will be set to the last element of ``encoder_hidden_size_list``. + - head_layer_num (:obj:`int`): The number of layers used in the head network to compute Q value output. + - lstm_type (:obj:`Optional[str]`): The type of RNN module, now support ['normal', 'pytorch', 'gru']. + - activation (:obj:`Optional[nn.Module]`): The type of activation function in networks \ + if ``None`` then default set it to ``nn.ReLU()``. + - norm_type (:obj:`Optional[str]`): The type of normalization in networks, see \ + ``ding.torch_utils.fc_block`` for more details. you can choose one of ['BN', 'IN', 'SyncBN', 'LN'] + - res_link (:obj:`bool`): Whether to enable the residual link, which is the skip connection between \ + single frame data and the sequential data, defaults to False. """ super(DRQN, self).__init__() # For compatibility: 1, (1, ), [4, 32, 32] @@ -858,34 +921,26 @@ def __init__( ) def forward(self, inputs: Dict, inference: bool = False, saved_state_timesteps: Optional[list] = None) -> Dict: - r""" + """ Overview: - Use observation tensor to predict DRQN output. - Parameter updates with DRQN's MLPs forward setup. + DRQN forward computation graph, input observation tensor to predict q_value. Arguments: - - inputs (:obj:`Dict`): - - inference: (:obj:'bool'): if inference is True, we unroll the one timestep transition, - if inference is False, we unroll the sequence transitions. - - saved_state_timesteps: (:obj:'Optional[list]'): when inference is False, - we unroll the sequence transitions, then we would save rnn hidden states at timesteps - that are listed in list saved_state_timesteps. - - ArgumentsKeys: - - obs (:obj:`torch.Tensor`): Encoded observation - - prev_state (:obj:`list`): Previous state's tensor of size ``(B, N)`` - + - inputs (:obj:`torch.Tensor`): The dict of input data, including observation and previous rnn state. + - inference: (:obj:'bool'): Whether to enable inference forward mode, if True, we unroll the one timestep \ + transition, otherwise, we unroll the entire sequence transitions. + - saved_state_timesteps: (:obj:'Optional[list]'): When inference is False, we unroll the sequence \ + transitions, then we would use this list to indicate how to save and return hidden state. + ArgumentsKeys: + - obs (:obj:`torch.Tensor`): The raw observation tensor.
+ - prev_state (:obj:`list`): The previous rnn state tensor, whose structure depends on ``lstm_type``. Returns: - - outputs (:obj:`Dict`): - Run ``MLP`` with ``DRQN`` setups and return the result prediction dictionary. - + - outputs (:obj:`Dict`): The output of DRQN's forward, including logit (q_value) and next state. ReturnsKeys: - - logit (:obj:`torch.Tensor`): Logit tensor with same size as input ``obs``. - - next_state (:obj:`list`): Next state's tensor of size ``(B, N)`` + - logit (:obj:`torch.Tensor`): Discrete Q-value output of each possible action dimension. + - next_state (:obj:`list`): The next rnn state tensor, whose structure depends on ``lstm_type``. Shapes: - - obs (:obj:`torch.Tensor`): :math:`(B, N=obs_space)`, where B is batch size. - - prev_state(:obj:`torch.FloatTensor list`): :math:`[(B, N)]` - - logit (:obj:`torch.FloatTensor`): :math:`(B, N)` - - next_state(:obj:`torch.FloatTensor list`): :math:`[(B, N)]` + - obs (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is ``obs_shape`` + - logit (:obj:`torch.Tensor`): :math:`(B, M)`, where B is batch size and M is ``action_shape`` Examples: >>> # Init input's Keys: @@ -958,18 +1013,24 @@ def forward(self, inputs: Dict, inference: bool = False, saved_state_timesteps: x['hidden_state'] = torch.cat(hidden_state_list, dim=0) if saved_state_timesteps is not None: # the selected saved hidden states, including the hidden state (h) and the cell state (c) - # in r2d2, set 'saved_hidden_​​state_timesteps=[self._burnin_step, self._burnin_step + self._nstep]', + # in r2d2, set 'saved_hidden_state_timesteps=[self._burnin_step, self._burnin_step + self._nstep]', # then saved_state will record the hidden_state for main_obs and target_obs to # initialize their lstm (h c) x['saved_state'] = saved_state return x -@MODEL_REGISTRY.register('gtrxl_discrete') -class GTrXLDiscreteHead(nn.Module): +@MODEL_REGISTRY.register('gtrxldqn') +class GTrXLDQN(nn.Module): """ Overview: - Add a discrete head on top of the GTrXL module. + The neural network structure and computation graph of Gated Transformer-XL DQN algorithm, which is the \ + enhanced version of DRQN, using Transformer-XL to improve long-term sequential modelling ability. The \ + GTrXL-DQN is composed of three parts: ``encoder``, ``head`` and ``core``. The ``encoder`` is used to extract \ + the feature from various observation, the ``core`` is used to process the sequential observation and other \ + data, and the ``head`` is used to compute the Q value of each action dimension. + Interfaces: + ``__init__``, ``forward``, ``reset_memory``, ``get_memory`` . """ def __init__( @@ -992,11 +1053,15 @@ def __init__( encoder_hidden_size_list: SequenceType = [128, 128, 256], encoder_norm_type: Optional[str] = None, ) -> None: - r""" + """ Overview: - Init the model according to arguments. + Initialize the GTrXLDQN model accoding to corresponding input arguments. + + .. tip:: + You can refer to GTrXl class in ``ding.torch_utils.network.gtrxl`` for more details about the input \ + arguments. + Arguments: - Refer to GTrXl class in `ding.torch_utils.network.gtrxl` for more details about the input arguments. - obs_shape (:obj:`Union[int, SequenceType]`): Used by Transformer. Observation's space. - action_shape (:obj:Union[int, SequenceType]): Used by Head. Action's space. - head_layer_num (:obj:`int`): Used by Head. Number of layers. @@ -1006,20 +1071,20 @@ def __init__( - att_mlp_num (:obj:`int`): Used by Transformer. - att_layer_num (:obj:`int`): Used by Transformer. 
- memory_len (:obj:`int`): Used by Transformer. - - activation (:obj:`Optional[nn.Module]`): Used by Transformer and Head. if ``None`` then default set to - ``nn.ReLU()``. - - head_norm_type (:obj:`Optional[str]`): Used by Head. The type of normalization to use, see - ``ding.torch_utils.fc_block`` for more details`. + - activation (:obj:`Optional[nn.Module]`): Used by Transformer and Head. if ``None`` then default set to \ + ``nn.ReLU()``. + - head_norm_type (:obj:`Optional[str]`): Used by Head. The type of normalization to use, see \ + ``ding.torch_utils.fc_block`` for more details`. - dropout (:obj:`bool`): Used by Transformer. - gru_gating (:obj:`bool`): Used by Transformer. - gru_bias (:obj:`float`): Used by Transformer. - dueling (:obj:`bool`): Used by Head. Make the head dueling. - - encoder_hidden_size_list(:obj:`SequenceType`): Used by Encoder. The collection of ``hidden_size`` if using - a custom convolutional encoder. - - encoder_norm_type (:obj:`Optional[str]`): Used by Encoder. The type of normalization to use, see + - encoder_hidden_size_list(:obj:`SequenceType`): Used by Encoder. The collection of ``hidden_size`` if \ + using a custom convolutional encoder. + - encoder_norm_type (:obj:`Optional[str]`): Used by Encoder. The type of normalization to use, see \ ``ding.torch_utils.fc_block`` for more details`. """ - super(GTrXLDiscreteHead, self).__init__() + super(GTrXLDQN, self).__init__() self.core = GTrXL( input_dim=obs_shape, head_dim=att_head_dim, @@ -1035,7 +1100,7 @@ def __init__( ) if isinstance(obs_shape, int) or len(obs_shape) == 1: - pass + raise NotImplementedError("not support obs_shape for pre-defined encoder: {}".format(obs_shape)) # replace the embedding layer of Transformer with Conv Encoder elif len(obs_shape) == 3: assert encoder_hidden_size_list[-1] == hidden_size @@ -1069,19 +1134,17 @@ def __init__( ) def forward(self, x: torch.Tensor) -> Dict: - r""" + """ Overview: Let input tensor go through GTrXl and the Head sequentially. Arguments: - x (:obj:`torch.Tensor`): input tensor of shape (seq_len, bs, obs_shape). Returns: - out (:obj:`Dict`): run ``GTrXL`` with ``DiscreteHead`` setups and return the result prediction dictionary. - Necessary Keys: - - logit (:obj:`torch.Tensor`): discrete Q-value output of each action dimension. - Shape is (bs, action_space) - - memory (:obj:`torch.Tensor`): - memory tensor of size ``(bs x layer_num+1 x memory_len x embedding_dim)`` - - transformer_out (:obj:`torch.Tensor`): output tensor of transformer with same size as input ``x``. + ReturnKeys: + - logit (:obj:`torch.Tensor`): discrete Q-value output of each action dimension, shape is (B, action_space). + - memory (:obj:`torch.Tensor`): memory tensor of size ``(bs x layer_num+1 x memory_len x embedding_dim)``. + - transformer_out (:obj:`torch.Tensor`): output tensor of transformer with same size as input ``x``. Examples: >>> # Init input's Keys: >>> obs_dim, seq_len, bs, action_dim = 128, 64, 32, 4 @@ -1102,27 +1165,23 @@ def forward(self, x: torch.Tensor) -> Dict: out['transformer_out'] = o1['logit'] # output of gtrxl, out['logit'] is final output return out - def reset_memory(self, batch_size: Optional[int] = None, state: Optional[torch.Tensor] = None): - r""" + def reset_memory(self, batch_size: Optional[int] = None, state: Optional[torch.Tensor] = None) -> None: + """ Overview: - Clear or set the memory of GTrXL. - Arguments: - - batch_size (:obj:`Optional[int]`): batch size - - state (:obj:`Optional[torch.Tensor]`): input memory. 
- Shape is (layer_num, memory_len, bs, embedding_dim). + Clear or reset the memory of GTrXL. + Arguments: + - batch_size (:obj:`Optional[int]`): The number of samples in a training batch. + - state (:obj:`Optional[torch.Tensor]`): The input memory data, whose shape is \ + (layer_num, memory_len, bs, embedding_dim). """ self.core.reset_memory(batch_size, state) def get_memory(self) -> Optional[torch.Tensor]: - r""" + """ Overview: Return the memory of GTrXL. Returns: - - memory: (:obj:`Optional[torch.Tensor]`): output memory or None if memory has not been initialized. - Shape is (layer_num, memory_len, bs, embedding_dim). + - memory: (:obj:`Optional[torch.Tensor]`): output memory or None if memory has not been initialized, \ + whose shape is (layer_num, memory_len, bs, embedding_dim). """ return self.core.get_memory() - - -class GeneralQNetwork(nn.Module): - pass diff --git a/ding/model/template/qac.py b/ding/model/template/qac.py index aa0cc42b0e..6034a4d74c 100755 --- a/ding/model/template/qac.py +++ b/ding/model/template/qac.py @@ -9,11 +9,16 @@ FCEncoder, ConvEncoder -@MODEL_REGISTRY.register('qac') -class QAC(nn.Module): - r""" +@MODEL_REGISTRY.register('continuous_qac') +class ContinuousQAC(nn.Module): + """ Overview: - The QAC network, which is used in DDPG/TD3/SAC. + The neural network and computation graph of algorithms related to Q-value Actor-Critic (QAC), such as \ + DDPG/TD3/SAC. This model now supports continuous and hybrid action space. The ContinuousQAC is composed of \ + four parts: ``actor_encoder``, ``critic_encoder``, ``actor_head`` and ``critic_head``. Encoders are used to \ + extract the feature from various observation. Heads are used to predict corresponding Q-value or action logit. \ + In high-dimensional observation space like 2D image, we often use a shared encoder for both ``actor_encoder`` \ + and ``critic_encoder``. In low-dimensional observation space like 1D vector, we often use different encoders. Interfaces: ``__init__``, ``forward``, ``compute_actor``, ``compute_critic`` """ @@ -31,32 +36,33 @@ def __init__( critic_head_layer_num: int = 1, activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, - encoder_hidden_size_list: Optional[SequenceType] = [32, 64, 256], + encoder_hidden_size_list: Optional[SequenceType] = None, share_encoder: Optional[bool] = False, ) -> None: """ Overview: - Initailize the QAC Model according to input arguments. + Initailize the ContinuousQAC Model according to input arguments. Arguments: - obs_shape (:obj:`Union[int, SequenceType]`): Observation's shape, such as 128, (156, ). - action_shape (:obj:`Union[int, SequenceType, EasyDict]`): Action's shape, such as 4, (3, ), \ EasyDict({'action_type_shape': 3, 'action_args_shape': 4}). - - action_space (:obj:`str`): The type of action space, \ - including [``regression``, ``reparameterization``, ``hybrid``]. + - action_space (:obj:`str`): The type of action space, including [``regression``, ``reparameterization``, \ + ``hybrid``], ``regression`` is used for DDPG/TD3, ``reparameterization`` is used for SAC and \ + ``hybrid`` for PADDPG. - twin_critic (:obj:`bool`): Whether to use twin critic, one of tricks in TD3. - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor head. - - actor_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ - for actor head. + - actor_head_layer_num (:obj:`int`): The num of layers used in the actor network to compute action. 
- critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to critic head. - - critic_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ - for critic head. + - critic_head_layer_num (:obj:`int`): The num of layers used in the critic network to compute Q-value. - activation (:obj:`Optional[nn.Module]`): The type of activation function to use in ``MLP`` \ after each FC layer, if ``None`` then default set to ``nn.ReLU()``. - norm_type (:obj:`Optional[str]`): The type of normalization to after network layer (FC, Conv), \ see ``ding.torch_utils.network`` for more details. + - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder``, \ + the last element must match ``head_hidden_size``, this argument is only used in image observation. - share_encoder (:obj:`Optional[bool]`): Whether to share encoder between actor and critic. """ - super(QAC, self).__init__() + super(ContinuousQAC, self).__init__() obs_shape: int = squeeze(obs_shape) action_shape = squeeze(action_shape) self.action_shape = action_shape @@ -67,8 +73,12 @@ def __init__( self.share_encoder = share_encoder if np.isscalar(obs_shape) or len(obs_shape) == 1: assert not self.share_encoder, "Vector observation doesn't need share encoder." - self.encoder = None - self.input_size = obs_shape + assert encoder_hidden_size_list is None, "Vector obs encoder only uses one layer nn.Linear" + # Because there is already a layer nn.Linear in the head, so we use nn.Identity here to keep + # compatible with the image observation and avoid adding an extra layer nn.Linear. + self.actor_encoder = nn.Identity() + self.critic_encoder = nn.Identity() + encoder_output_size = obs_shape elif len(obs_shape) == 3: def setup_conv_encoder(): @@ -84,20 +94,18 @@ def setup_conv_encoder(): ) if self.share_encoder: - self.encoder = setup_conv_encoder() - self.input_size = self.encoder.output_size + encoder = setup_conv_encoder() + self.actor_encoder = self.critic_encoder = encoder else: - self.encoder = nn.ModuleDict({ - 'actor': setup_conv_encoder(), - 'critic': setup_conv_encoder(), - }) - self.input_size = self.encoder['actor'].output_size + self.actor_encoder = setup_conv_encoder() + self.critic_encoder = setup_conv_encoder() + encoder_output_size = self.actor_encoder.output_size else: raise RuntimeError("not support observation shape: {}".format(obs_shape)) # head if self.action_space == 'regression': # DDPG, TD3 - self.actor = nn.Sequential( - nn.Linear(self.input_size, actor_head_hidden_size), activation, + self.actor_head = nn.Sequential( + nn.Linear(encoder_output_size, actor_head_hidden_size), activation, RegressionHead( actor_head_hidden_size, action_shape, @@ -108,8 +116,8 @@ def setup_conv_encoder(): ) ) elif self.action_space == 'reparameterization': # SAC - self.actor = nn.Sequential( - nn.Linear(self.input_size, actor_head_hidden_size), activation, + self.actor_head = nn.Sequential( + nn.Linear(encoder_output_size, actor_head_hidden_size), activation, ReparameterizationHead( actor_head_hidden_size, action_shape, @@ -125,7 +133,7 @@ def setup_conv_encoder(): action_shape.action_args_shape = squeeze(action_shape.action_args_shape) action_shape.action_type_shape = squeeze(action_shape.action_type_shape) actor_action_args = nn.Sequential( - nn.Linear(self.input_size, actor_head_hidden_size), activation, + nn.Linear(encoder_output_size, actor_head_hidden_size), activation, RegressionHead( actor_head_hidden_size, action_shape.action_args_shape, @@ 
-136,7 +144,7 @@ def setup_conv_encoder(): ) ) actor_action_type = nn.Sequential( - nn.Linear(self.input_size, actor_head_hidden_size), activation, + nn.Linear(encoder_output_size, actor_head_hidden_size), activation, DiscreteHead( actor_head_hidden_size, action_shape.action_type_shape, @@ -145,17 +153,17 @@ def setup_conv_encoder(): norm_type=norm_type, ) ) - self.actor = nn.ModuleList([actor_action_type, actor_action_args]) + self.actor_head = nn.ModuleList([actor_action_type, actor_action_args]) self.twin_critic = twin_critic if self.action_space == 'hybrid': - critic_input_size = self.input_size + action_shape.action_type_shape + action_shape.action_args_shape + critic_input_size = encoder_output_size + action_shape.action_type_shape + action_shape.action_args_shape else: - critic_input_size = self.input_size + action_shape + critic_input_size = encoder_output_size + action_shape if self.twin_critic: - self.critic = nn.ModuleList() + self.critic_head = nn.ModuleList() for _ in range(2): - self.critic.append( + self.critic_head.append( nn.Sequential( nn.Linear(critic_input_size, critic_head_hidden_size), activation, RegressionHead( @@ -169,7 +177,7 @@ def setup_conv_encoder(): ) ) else: - self.critic = nn.Sequential( + self.critic_head = nn.Sequential( nn.Linear(critic_input_size, critic_head_hidden_size), activation, RegressionHead( critic_head_hidden_size, @@ -181,24 +189,41 @@ def setup_conv_encoder(): ) ) + # Convenient for calling some apis (e.g. self.critic.parameters()), + # but may cause misunderstanding when `print(self)` + self.actor = nn.ModuleList([self.actor_encoder, self.actor_head]) + self.critic = nn.ModuleList([self.critic_encoder, self.critic_head]) + def forward(self, inputs: Union[torch.Tensor, Dict[str, torch.Tensor]], mode: str) -> Dict[str, torch.Tensor]: """ Overview: - The unique execution (forward) method of QAC method, and one can indicate different modes to implement \ - different computation graph, including ``compute_actor`` and ``compute_critic`` in QAC. - Mode compute_actor: - Arguments: - - inputs (:obj:`torch.Tensor`): Observation data, defaults to tensor. - Returns: - - output (:obj:`Dict`): Output dict data, including differnet key-values among distinct action_space. - Mode compute_critic: - Arguments: - - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. - Returns: - - output (:obj:`Dict`): Output dict data, including q_value tensor. + QAC forward computation graph, input observation tensor to predict Q-value or action logit. Different \ + ``mode`` will forward with different network modules to get different outputs and save computation. + Arguments: + - inputs (:obj:`Union[torch.Tensor, Dict[str, torch.Tensor]]`): The input data for forward computation \ + graph, for ``compute_actor``, it is the observation tensor, for ``compute_critic``, it is the \ + dict data including obs and action tensor. + - mode (:obj:`str`): The forward mode, all the modes are defined in the beginning of this class. + Returns: + - output (:obj:`Dict[str, torch.Tensor]`): The output dict of QAC forward computation graph, whose \ + key-values vary in different forward modes. 
+ Examples (Actor): + >>> # Regression mode + >>> model = ContinuousQAC(64, 6, 'regression') + >>> obs = torch.randn(4, 64) + >>> actor_outputs = model(obs,'compute_actor') + >>> assert actor_outputs['action'].shape == torch.Size([4, 6]) + >>> # Reparameterization Mode + >>> model = ContinuousQAC(64, 6, 'reparameterization') + >>> obs = torch.randn(4, 64) + >>> actor_outputs = model(obs,'compute_actor') + >>> assert actor_outputs['logit'][0].shape == torch.Size([4, 6]) # mu + >>> actor_outputs['logit'][1].shape == torch.Size([4, 6]) # sigma - .. note:: - For specific examples, one can refer to API doc of ``compute_actor`` and ``compute_critic`` respectively. + Examples (Critic): + >>> inputs = {'obs': torch.randn(4, 8), 'action': torch.randn(4, 1)} + >>> model = ContinuousQAC(obs_shape=(8, ),action_shape=1, action_space='regression') + >>> assert model(inputs, mode='compute_critic')['q_value'].shape == (4, ) # q value """ assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) return getattr(self, mode)(inputs) @@ -206,26 +231,22 @@ def forward(self, inputs: Union[torch.Tensor, Dict[str, torch.Tensor]], mode: st def compute_actor(self, obs: torch.Tensor) -> Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]: """ Overview: - The forward computation graph of compute_actor mode, uses observation tensor to produce actor output, - such as ``action``, ``logit`` and so on. + QAC forward computation graph for actor part, input observation tensor to predict action or action logit. Arguments: - - obs (:obj:`torch.Tensor`): Observation tensor data, now supports a batch of 1-dim vector data, \ - i.e. ``(B, obs_shape)``. + - obs (:obj:`torch.Tensor`): The input observation tensor data. Returns: - - outputs (:obj:`Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]`): Actor output varying \ + - outputs (:obj:`Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]`): Actor output dict varying \ from action_space: ``regression``, ``reparameterization``, ``hybrid``. - - ReturnsKeys (either): - - regression action_space - - action (:obj:`torch.Tensor`): Continuous action with same size as ``action_shape``, usually in DDPG. - - reparameterization action_space - - logit (:obj:`Dict[str, torch.Tensor]`): Reparameterization logit, usually in SAC. - - - mu (:obj:`torch.Tensor`): Mean of parameterization gaussion distribution. - - sigma (:obj:`torch.Tensor`): Standard variation of parameterization gaussion distribution. - - hybrid action_space - - logit (:obj:`torch.Tensor`): Discrete action type logit. - - action_args (:obj:`torch.Tensor`): Continuous action arguments. + ReturnsKeys (regression): + - action (:obj:`torch.Tensor`): Continuous action with same size as ``action_shape``, usually in DDPG/TD3. + ReturnsKeys (reparameterization): + - logit (:obj:`Dict[str, torch.Tensor]`): The predicted reparameterization action logit, usually in SAC. \ + It is a list containing two tensors: ``mu`` and ``sigma``. The former is the mean of the gaussian \ + distribution, the latter is the standard deviation of the gaussian distribution. + ReturnsKeys (hybrid): + - logit (:obj:`torch.Tensor`): The predicted discrete action type logit, it will be the same dimension \ + as ``action_type_shape``, i.e., all the possible discrete action types. + - action_args (:obj:`torch.Tensor`): Continuous action arguments with same size as ``action_args_shape``. Shapes: - obs (:obj:`torch.Tensor`): :math:`(B, N0)`, B is batch size and N0 corresponds to ``obs_shape``.
- action (:obj:`torch.Tensor`): :math:`(B, N1)`, B is batch size and N1 corresponds to ``action_shape``. @@ -237,49 +258,44 @@ def compute_actor(self, obs: torch.Tensor) -> Dict[str, Union[torch.Tensor, Dict ``action_shape.action_args_shape``. Examples: >>> # Regression mode - >>> model = QAC(64, 64, 'regression') + >>> model = ContinuousQAC(64, 6, 'regression') >>> obs = torch.randn(4, 64) >>> actor_outputs = model(obs,'compute_actor') - >>> assert actor_outputs['action'].shape == torch.Size([4, 64]) + >>> assert actor_outputs['action'].shape == torch.Size([4, 6]) >>> # Reparameterization Mode - >>> model = QAC(64, 64, 'reparameterization') + >>> model = ContinuousQAC(64, 6, 'reparameterization') >>> obs = torch.randn(4, 64) >>> actor_outputs = model(obs,'compute_actor') - >>> assert actor_outputs['logit'][0].shape == torch.Size([4, 64]) # mu - >>> actor_outputs['logit'][1].shape == torch.Size([4, 64]) # sigma + >>> assert actor_outputs['logit'][0].shape == torch.Size([4, 6]) # mu + >>> actor_outputs['logit'][1].shape == torch.Size([4, 6]) # sigma """ - if self.encoder is not None: - if self.share_encoder: - obs = self.encoder(obs) - else: - obs = self.encoder['actor'](obs) + obs = self.actor_encoder(obs) if self.action_space == 'regression': - x = self.actor(obs) + x = self.actor_head(obs) return {'action': x['pred']} elif self.action_space == 'reparameterization': - x = self.actor(obs) + x = self.actor_head(obs) return {'logit': [x['mu'], x['sigma']]} elif self.action_space == 'hybrid': - logit = self.actor[0](obs) - action_args = self.actor[1](obs) + logit = self.actor_head[0](obs) + action_args = self.actor_head[1](obs) return {'logit': logit['logit'], 'action_args': action_args['pred']} def compute_critic(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: """ Overview: - The forward computation graph of compute_critic mode, uses observation and action tensor to produce critic - output, such as ``q_value``. + QAC forward computation graph for critic part, input observation and action tensor to predict Q-value. Arguments: - - inputs (:obj:`Dict[str, torch.Tensor]`): Dict strcture of input data, including ``obs`` and ``action`` \ - tensor, also contains ``logit`` tensor in hybrid action_space. - Returns: - - outputs (:obj:`Dict[str, torch.Tensor]`): Critic output, such as ``q_value``. - + - inputs (:obj:`Dict[str, torch.Tensor]`): The dict of input data, including ``obs`` and ``action`` \ + tensor, also contains ``logit`` and ``action_args`` tensor in hybrid action_space. ArgumentsKeys: - obs: (:obj:`torch.Tensor`): Observation tensor data, now supports a batch of 1-dim vector data. - action (:obj:`Union[torch.Tensor, Dict]`): Continuous action with same size as ``action_shape``. - logit (:obj:`torch.Tensor`): Discrete action logit, only in hybrid action_space. - action_args (:obj:`torch.Tensor`): Continuous action arguments, only in hybrid action_space. + Returns: + - outputs (:obj:`Dict[str, torch.Tensor]`): The output dict of QAC's forward computation graph for critic, \ + including ``q_value``. ReturnKeys: - q_value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. Shapes: @@ -293,17 +309,12 @@ def compute_critic(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Ten Examples: >>> inputs = {'obs': torch.randn(4, 8), 'action': torch.randn(4, 1)} - >>> model = QAC(obs_shape=(8, ),action_shape=1, action_space='regression') - >>> model(inputs, mode='compute_critic')['q_value'] # q value - ... 
tensor([0.0773, 0.1639, 0.0917, 0.0370], grad_fn=) + >>> model = ContinuousQAC(obs_shape=(8, ),action_shape=1, action_space='regression') + >>> assert model(inputs, mode='compute_critic')['q_value'].shape == (4, ) # q value """ obs, action = inputs['obs'], inputs['action'] - if self.encoder is not None: - if self.share_encoder: - obs = self.encoder(obs) - else: - obs = self.encoder['critic'](obs) + obs = self.critic_encoder(obs) assert len(obs.shape) == 2 if self.action_space == 'hybrid': action_type_logit = inputs['logit'] @@ -317,80 +328,100 @@ def compute_critic(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Ten action = action.unsqueeze(1) x = torch.cat([obs, action], dim=1) if self.twin_critic: - x = [m(x)['pred'] for m in self.critic] + x = [m(x)['pred'] for m in self.critic_head] else: - x = self.critic(x)['pred'] + x = self.critic_head(x)['pred'] return {'q_value': x} @MODEL_REGISTRY.register('discrete_qac') class DiscreteQAC(nn.Module): - r""" + """ Overview: - The Discrete QAC model, used in DiscreteSAC. + The neural network and computation graph of algorithms related to discrete action Q-value Actor-Critic (QAC), \ + such as DiscreteSAC. This model now supports only discrete action space. The DiscreteQAC is composed of \ + four parts: ``actor_encoder``, ``critic_encoder``, ``actor_head`` and ``critic_head``. Encoders are used to \ + extract the feature from various observation. Heads are used to predict corresponding Q-value or action logit. \ + In high-dimensional observation space like 2D image, we often use a shared encoder for both ``actor_encoder`` \ + and ``critic_encoder``. In low-dimensional observation space like 1D vector, we often use different encoders. Interfaces: ``__init__``, ``forward``, ``compute_actor``, ``compute_critic`` """ mode = ['compute_actor', 'compute_critic'] def __init__( - self, - agent_obs_shape: Union[int, SequenceType], - global_obs_shape: Union[int, SequenceType], - action_shape: Union[int, SequenceType], - encoder_hidden_size_list: SequenceType = [64], - twin_critic: bool = False, - actor_head_hidden_size: int = 64, - actor_head_layer_num: int = 1, - critic_head_hidden_size: int = 64, - critic_head_layer_num: int = 1, - activation: Optional[nn.Module] = nn.ReLU(), - norm_type: Optional[str] = None, + self, + obs_shape: Union[int, SequenceType], + action_shape: Union[int, SequenceType], + twin_critic: bool = False, + actor_head_hidden_size: int = 64, + actor_head_layer_num: int = 1, + critic_head_hidden_size: int = 64, + critic_head_layer_num: int = 1, + activation: Optional[nn.Module] = nn.ReLU(), + norm_type: Optional[str] = None, + encoder_hidden_size_list: SequenceType = None, + share_encoder: Optional[bool] = False, ) -> None: - r""" + """ Overview: - Init the QAC Model according to arguments. + Initialize the DiscreteQAC Model according to input arguments. Arguments: - - obs_shape (:obj:`Union[int, SequenceType]`): Observation's space. - - action_shape (:obj:`Union[int, SequenceType]`): Action's space. - - twin_critic (:obj:`bool`): Whether include twin critic. - - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor-nn's ``Head``. - - actor_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for actor's nn. - - critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to critic-nn's ``Head``. - - critic_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for critic's nn.
- - activation (:obj:`Optional[nn.Module]`): - The type of activation function to use in ``MLP`` the after ``layer_fn``, - if ``None`` then default set to ``nn.ReLU()`` - - norm_type (:obj:`Optional[str]`): - The type of normalization to use, see ``ding.torch_utils.fc_block`` for more details. + - obs_shape (:obj:`Union[int, SequenceType]`): Observation's shape, such as 128, (156, ). + - action_shape (:obj:`Union[int, SequenceType, EasyDict]`): Action's shape, such as 4, (3, ). + - twin_critic (:obj:`bool`): Whether to use twin critic. + - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor head. + - actor_head_layer_num (:obj:`int`): The num of layers used in the actor network to compute action. + - critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to critic head. + - critic_head_layer_num (:obj:`int`): The num of layers used in the critic network to compute Q-value. + - activation (:obj:`Optional[nn.Module]`): The type of activation function to use in ``MLP`` \ + after each FC layer, if ``None`` then default set to ``nn.ReLU()``. + - norm_type (:obj:`Optional[str]`): The type of normalization to use after network layer (FC, Conv), \ + see ``ding.torch_utils.network`` for more details. + - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder``, \ + the last element must match ``head_hidden_size``, this argument is only used in image observation. + - share_encoder (:obj:`Optional[bool]`): Whether to share encoder between actor and critic. """ super(DiscreteQAC, self).__init__() - agent_obs_shape: int = squeeze(agent_obs_shape) + obs_shape: int = squeeze(obs_shape) action_shape: int = squeeze(action_shape) + # encoder + self.share_encoder = share_encoder + if np.isscalar(obs_shape) or len(obs_shape) == 1: + assert not self.share_encoder, "Vector observation doesn't need share encoder." + assert encoder_hidden_size_list is None, "Vector obs encoder only uses one layer nn.Linear" + # There is already a nn.Linear layer in the head, so we use nn.Identity here to keep + # compatible with the image observation and avoid adding an extra layer nn.Linear. + self.actor_encoder = nn.Identity() + self.critic_encoder = nn.Identity() + encoder_output_size = obs_shape + elif len(obs_shape) == 3: - if isinstance(agent_obs_shape, int) or len(agent_obs_shape) == 1: - encoder_cls = FCEncoder - elif len(agent_obs_shape) == 3: - encoder_cls = ConvEncoder - else: - raise RuntimeError( - "not support obs_shape for pre-defined encoder: {}, please customize your own DQN". - format(agent_obs_shape) - ) - if isinstance(global_obs_shape, int) or len(global_obs_shape) == 1: - global_encoder_cls = FCEncoder - elif len(global_obs_shape) == 3: - global_encoder_cls = ConvEncoder + def setup_conv_encoder(): + kernel_size = [3 for _ in range(len(encoder_hidden_size_list))] + stride = [2] + [1 for _ in range(len(encoder_hidden_size_list) - 1)] + return ConvEncoder( + obs_shape, + encoder_hidden_size_list, + activation=activation, + norm_type=norm_type, + kernel_size=kernel_size, + stride=stride + ) + + if self.share_encoder: + encoder = setup_conv_encoder() + self.actor_encoder = self.critic_encoder = encoder + else: + self.actor_encoder = setup_conv_encoder() + self.critic_encoder = setup_conv_encoder() + encoder_output_size = self.actor_encoder.output_size else: - raise RuntimeError( - "not support obs_shape for pre-defined encoder: {}, please customize your own DQN".
- format(global_obs_shape) - ) + raise RuntimeError("not support observation shape: {}".format(obs_shape)) - self.actor = nn.Sequential( - encoder_cls(agent_obs_shape, encoder_hidden_size_list, activation=activation, norm_type=norm_type), + # head + self.actor_head = nn.Sequential( + nn.Linear(encoder_output_size, actor_head_hidden_size), activation, DiscreteHead( actor_head_hidden_size, action_shape, actor_head_layer_num, activation=activation, norm_type=norm_type ) @@ -398,13 +429,11 @@ def __init__( self.twin_critic = twin_critic if self.twin_critic: - self.critic = nn.ModuleList() + self.critic_head = nn.ModuleList() for _ in range(2): - self.critic.append( + self.critic_head.append( nn.Sequential( - global_encoder_cls( - agent_obs_shape, encoder_hidden_size_list, activation=activation, norm_type=norm_type - ), + nn.Linear(encoder_output_size, critic_head_hidden_size), activation, DiscreteHead( critic_head_hidden_size, action_shape, @@ -415,10 +444,8 @@ def __init__( ) ) else: - self.critic = nn.Sequential( - global_encoder_cls( - agent_obs_shape, encoder_hidden_size_list, activation=activation, norm_type=norm_type - ), + self.critic_head = nn.Sequential( + nn.Linear(encoder_output_size, critic_head_hidden_size), activation, DiscreteHead( critic_head_hidden_size, action_shape, @@ -427,136 +454,88 @@ def __init__( norm_type=norm_type ) ) + # Convenient for calling some apis (e.g. self.critic.parameters()), + # but may cause misunderstanding when `print(self)` + self.actor = nn.ModuleList([self.actor_encoder, self.actor_head]) + self.critic = nn.ModuleList([self.critic_encoder, self.critic_head]) - def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: - r""" + def forward(self, inputs: torch.Tensor, mode: str) -> Dict[str, torch.Tensor]: + """ Overview: - Use bbservation and action tensor to predict output. - Parameter updates with QAC's MLPs forward setup. + QAC forward computation graph, input observation tensor to predict Q-value or action logit. Different \ + ``mode`` will forward with different network modules to get different outputs and save computation. Arguments: - Forward with ``'compute_actor'``: - - inputs (:obj:`torch.Tensor`): - The encoded embedding tensor, determined with given ``hidden_size``, i.e. ``(B, N=hidden_size)``. - Whether ``actor_head_hidden_size`` or ``critic_head_hidden_size`` depend on ``mode``. - - Forward with ``'compute_critic'``, inputs (`Dict`) Necessary Keys: - - ``obs``, ``action`` encoded tensors. - - - mode (:obj:`str`): Name of the forward mode. + - inputs (:obj:`torch.Tensor`): The input observation tensor data. + - mode (:obj:`str`): The forward mode, all the modes are defined in the beginning of this class. Returns: - - outputs (:obj:`Dict`): Outputs of network forward. - - Forward with ``'compute_actor'``, Necessary Keys (either): - - action (:obj:`torch.Tensor`): Action tensor with same size as input ``x``. - - logit (:obj:`torch.Tensor`): - Logit tensor encoding ``mu`` and ``sigma``, both with same size as input ``x``. - - Forward with ``'compute_critic'``, Necessary Keys: - - q_value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. - Actor Shapes: - - inputs (:obj:`torch.Tensor`): :math:`(B, N0)`, B is batch size and N0 corresponds to ``hidden_size`` - - action (:obj:`torch.Tensor`): :math:`(B, N0)` - - q_value (:obj:`torch.FloatTensor`): :math:`(B, )`, where B is batch size. 
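A minimal usage sketch of the refactored ``DiscreteQAC`` interface (not part of the patch itself): the shapes and hidden sizes below are illustrative, and the import path mirrors the updated test files.

import torch
from ding.model.template import DiscreteQAC

# 1D vector observation: both encoders degenerate to nn.Identity and cannot be shared.
vec_model = DiscreteQAC(obs_shape=4, action_shape=2, twin_critic=True)
logit = vec_model(torch.randn(3, 4), mode='compute_actor')['logit']  # shape (3, 2)

# Image observation: a ConvEncoder is built from encoder_hidden_size_list and may be shared.
img_model = DiscreteQAC(
    obs_shape=[4, 84, 84],
    action_shape=6,
    twin_critic=False,
    encoder_hidden_size_list=[32, 32, 64],
    share_encoder=True,
)
q_value = img_model(torch.randn(3, 4, 84, 84), mode='compute_critic')['q_value']  # shape (3, 6)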
- - Critic Shapes: - - obs (:obj:`torch.Tensor`): :math:`(B, N1)`, where B is batch size and N1 is ``obs_shape`` - - action (:obj:`torch.Tensor`): :math:`(B, N2)`, where B is batch size and N2 is``action_shape`` - - logit (:obj:`torch.FloatTensor`): :math:`(B, N2)`, where B is batch size and N3 is ``action_shape`` - - Actor Examples: - >>> # Regression mode - >>> model = QAC(64, 64, 'regression') - >>> inputs = torch.randn(4, 64) - >>> actor_outputs = model(inputs,'compute_actor') - >>> assert actor_outputs['action'].shape == torch.Size([4, 64]) - >>> # Reparameterization Mode - >>> model = QAC(64, 64, 'reparameterization') - >>> inputs = torch.randn(4, 64) - >>> actor_outputs = model(inputs,'compute_actor') - >>> actor_outputs['logit'][0].shape # mu - >>> torch.Size([4, 64]) - >>> actor_outputs['logit'][1].shape # sigma - >>> torch.Size([4, 64]) - - Critic Examples: - >>> inputs = {'obs': torch.randn(4,N), 'action': torch.randn(4,1)} - >>> model = QAC(obs_shape=(N, ), action_shape=1, action_space='regression') - >>> model(inputs, mode='compute_critic')['q_value'] # q value - tensor([0.0773, 0.1639, 0.0917, 0.0370], grad_fn=) + - output (:obj:`Dict[str, torch.Tensor]`): The output dict of QAC forward computation graph, whose \ + key-values vary in different forward modes. + Examples (Actor): + >>> model = DiscreteQAC(64, 6) + >>> obs = torch.randn(4, 64) + >>> actor_outputs = model(obs,'compute_actor') + >>> assert actor_outputs['logit'].shape == torch.Size([4, 6]) + Examples(Critic): + >>> model = DiscreteQAC(64, 6, twin_critic=False) + >>> obs = torch.randn(4, 64) + >>> actor_outputs = model(obs,'compute_critic') + >>> assert actor_outputs['q_value'].shape == torch.Size([4, 6]) """ assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) return getattr(self, mode)(inputs) - def compute_actor(self, inputs: torch.Tensor) -> Dict: - r""" + def compute_actor(self, inputs: torch.Tensor) -> Dict[str, torch.Tensor]: + """ Overview: - Use encoded embedding tensor to predict output. - Execute parameter updates with ``'compute_actor'`` mode - Use encoded embedding tensor to predict output. + QAC forward computation graph for actor part, input observation tensor to predict action or action logit. Arguments: - - inputs (:obj:`torch.Tensor`): - The encoded embedding tensor, determined with given ``hidden_size``, i.e. ``(B, N=hidden_size)``. - ``hidden_size = actor_head_hidden_size`` - - mode (:obj:`str`): Name of the forward mode. + - inputs (:obj:`torch.Tensor`): The input observation tensor data. Returns: - - outputs (:obj:`Dict`): Outputs of forward pass encoder and head. - - ReturnsKeys (either): - - action (:obj:`torch.Tensor`): Continuous action tensor with same size as ``action_shape``. - - logit (:obj:`torch.Tensor`): - Logit tensor encoding ``mu`` and ``sigma``, both with same size as input ``x``. + - outputs (:obj:`Dict[str, torch.Tensor]`): The output dict of QAC forward computation graph for actor, \ + including discrete action ``logit``. + ReturnsKeys: + - logit (:obj:`torch.Tensor`): The predicted discrete action type logit, it will be the same dimension \ + as ``action_shape``, i.e., all the possible discrete action choices. Shapes: - - inputs (:obj:`torch.Tensor`): :math:`(B, N0)`, B is batch size and N0 corresponds to ``hidden_size`` - - action (:obj:`torch.Tensor`): :math:`(B, N0)` - - logit (:obj:`list`): 2 elements, mu and sigma, each is the shape of :math:`(B, N0)`. - - q_value (:obj:`torch.FloatTensor`): :math:`(B, )`, B is batch size. 
+ - inputs (:obj:`torch.Tensor`): :math:`(B, N0)`, B is batch size and N0 corresponds to ``obs_shape``. + - logit (:obj:`torch.Tensor`): :math:`(B, N2)`, B is batch size and N2 corresponds to \ + ``action_shape``. Examples: - >>> # Regression mode - >>> model = QAC(64, 64, 'regression') - >>> inputs = torch.randn(4, 64) - >>> actor_outputs = model(inputs,'compute_actor') - >>> assert actor_outputs['action'].shape == torch.Size([4, 64]) - >>> # Reparameterization Mode - >>> model = QAC(64, 64, 'reparameterization') - >>> inputs = torch.randn(4, 64) - >>> actor_outputs = model(inputs,'compute_actor') - >>> actor_outputs['logit'][0].shape # mu - >>> torch.Size([4, 64]) - >>> actor_outputs['logit'][1].shape # sigma - >>> torch.Size([4, 64]) + >>> model = DiscreteQAC(64, 6) + >>> obs = torch.randn(4, 64) + >>> actor_outputs = model(obs,'compute_actor') + >>> assert actor_outputs['logit'].shape == torch.Size([4, 6]) """ - x = self.actor(inputs['obs']) + x = self.actor_encoder(inputs) + x = self.actor_head(x) return {'logit': x['logit']} - def compute_critic(self, inputs: Dict) -> Dict: - r""" + def compute_critic(self, inputs: torch.Tensor) -> Dict[str, torch.Tensor]: + """ Overview: - Execute parameter updates with ``'compute_critic'`` mode - Use encoded embedding tensor to predict output. + QAC forward computation graph for critic part, input observation to predict Q-value for each possible \ + discrete action choices. Arguments: - - ``obs``, ``action`` encoded tensors. - - mode (:obj:`str`): Name of the forward mode. + - inputs (:obj:`torch.Tensor`): The input observation tensor data. Returns: - - outputs (:obj:`Dict`): Q-value output. - + - outputs (:obj:`Dict[str, torch.Tensor]`): The output dict of QAC forward computation graph for critic, \ + including ``q_value`` for each possible discrete action choices. ReturnKeys: - - q_value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. + - q_value (:obj:`torch.Tensor`): The predicted Q-value for each possible discrete action choices, it will \ + be the same dimension as ``action_shape`` and used to calculate the loss. Shapes: - - obs (:obj:`torch.Tensor`): :math:`(B, N1)`, where B is batch size and N1 is ``obs_shape`` - - action (:obj:`torch.Tensor`): :math:`(B, N2)`, where B is batch size and N2 is ``action_shape`` - - q_value (:obj:`torch.FloatTensor`): :math:`(B, )`, where B is batch size. - + - obs (:obj:`torch.Tensor`): :math:`(B, N1)`, where B is batch size and N1 is ``obs_shape``. + - q_value (:obj:`torch.Tensor`): :math:`(B, N2)`, where B is batch size and N2 is ``action_shape``. 
Examples: - >>> inputs = {'obs': torch.randn(4, N), 'action': torch.randn(4, 1)} - >>> model = QAC(obs_shape=(N, ),action_shape=1, action_space='regression') - >>> model(inputs, mode='compute_critic')['q_value'] # q value - tensor([0.0773, 0.1639, 0.0917, 0.0370], grad_fn=) - + >>> model = DiscreteQAC(64, 6, twin_critic=False) + >>> obs = torch.randn(4, 64) + >>> actor_outputs = model(obs,'compute_critic') + >>> assert actor_outputs['q_value'].shape == torch.Size([4, 6]) """ - + inputs = self.critic_encoder(inputs) if self.twin_critic: - x = [m(inputs['obs'])['logit'] for m in self.critic] + x = [m(inputs)['logit'] for m in self.critic_head] else: - x = self.critic(inputs['obs'])['logit'] + x = self.critic_head(inputs)['logit'] return {'q_value': x} diff --git a/ding/model/template/tests/test_hybrid_qac.py b/ding/model/template/tests/test_hybrid_qac.py index 018c3f2d36..3a81d55350 100644 --- a/ding/model/template/tests/test_hybrid_qac.py +++ b/ding/model/template/tests/test_hybrid_qac.py @@ -3,7 +3,7 @@ import pytest from itertools import product -from ding.model.template import QAC +from ding.model.template import ContinuousQAC from ding.torch_utils import is_differentiable from ding.utils import squeeze from easydict import EasyDict @@ -21,7 +21,7 @@ @pytest.mark.unittest -class TestHybridQAC: +class TestHybridContinuousQAC: def test_hybrid_qac( self, @@ -39,7 +39,7 @@ def test_hybrid_qac( }, 'logit': torch.randn(B, squeeze(action_shape.action_type_shape)) } - model = QAC( + model = ContinuousQAC( obs_shape=(N, ), action_shape=action_shape, action_space=action_space, @@ -50,8 +50,8 @@ def test_hybrid_qac( # compute_q q = model(inputs, mode='compute_critic')['q_value'] if twin: - is_differentiable(q[0].sum(), model.critic[0]) - is_differentiable(q[1].sum(), model.critic[1]) + is_differentiable(q[0].sum(), model.critic[1][0]) + is_differentiable(q[1].sum(), model.critic[1][1]) else: is_differentiable(q.sum(), model.critic) diff --git a/ding/model/template/tests/test_maqac.py b/ding/model/template/tests/test_maqac.py index 4b6f40e69a..fa917e7ebc 100644 --- a/ding/model/template/tests/test_maqac.py +++ b/ding/model/template/tests/test_maqac.py @@ -3,7 +3,7 @@ import pytest from itertools import product -from ding.model.template import MAQAC, ContinuousMAQAC +from ding.model.template import DiscreteMAQAC, ContinuousMAQAC from ding.torch_utils import is_differentiable from ding.utils.default_helper import squeeze @@ -17,7 +17,7 @@ @pytest.mark.unittest @pytest.mark.parametrize('agent_obs_shape, global_obs_shape, twin_critic', args) -class TestMAQAC: +class TestDiscreteMAQAC: def output_check(self, model, outputs, action_shape): if isinstance(action_shape, tuple): @@ -34,7 +34,7 @@ def test_maqac(self, agent_obs_shape, global_obs_shape, twin_critic): 'action_mask': torch.randint(0, 2, size=(B, agent_num, action_shape)) } } - model = MAQAC(agent_obs_shape, global_obs_shape, action_shape, twin_critic=twin_critic) + model = DiscreteMAQAC(agent_obs_shape, global_obs_shape, action_shape, twin_critic=twin_critic) logit = model(data, mode='compute_actor')['logit'] value = model(data, mode='compute_critic')['q_value'] diff --git a/ding/model/template/tests/test_qac.py b/ding/model/template/tests/test_qac.py index 4bcc27cc1a..7ddbf9d511 100644 --- a/ding/model/template/tests/test_qac.py +++ b/ding/model/template/tests/test_qac.py @@ -3,7 +3,7 @@ import pytest from itertools import product -from ding.model.template import QAC, MAQAC, DiscreteQAC +from ding.model.template import ContinuousQAC, 
DiscreteMAQAC, DiscreteQAC from ding.torch_utils import is_differentiable from ding.utils import squeeze @@ -18,12 +18,12 @@ @pytest.mark.unittest @pytest.mark.parametrize('action_shape, twin, action_space', args) -class TestQAC: +class TestContinuousQAC: def test_fcqac(self, action_shape, twin, action_space): N = 32 inputs = {'obs': torch.randn(B, N), 'action': torch.randn(B, squeeze(action_shape))} - model = QAC( + model = ContinuousQAC( obs_shape=(N, ), action_shape=action_shape, action_space=action_space, @@ -34,8 +34,8 @@ def test_fcqac(self, action_shape, twin, action_space): # compute_q q = model(inputs, mode='compute_critic')['q_value'] if twin: - is_differentiable(q[0].sum(), model.critic[0]) - is_differentiable(q[1].sum(), model.critic[1]) + is_differentiable(q[0].sum(), model.critic[1][0]) + is_differentiable(q[1].sum(), model.critic[1][1]) else: is_differentiable(q.sum(), model.critic) @@ -56,83 +56,38 @@ def test_fcqac(self, action_shape, twin, action_space): is_differentiable(mu.sum() + sigma.sum(), model.actor) -args = list(product(*[[True, False]])) +args = list(product(*[[True, False], [(13, ), [4, 84, 84]]])) @pytest.mark.unittest -@pytest.mark.parametrize('twin', args) +@pytest.mark.parametrize('twin, obs_shape', args) class TestDiscreteQAC: - def test_discreteqac(self, twin): - N = 32 - A = 6 - inputs = {'obs': torch.randn(B, N)} + def test_discreteqac(self, twin, obs_shape): + action_shape = 6 + inputs = torch.randn(B, *obs_shape) model = DiscreteQAC( - agent_obs_shape=N, - global_obs_shape=N, - action_shape=A, + obs_shape=obs_shape, + action_shape=action_shape, twin_critic=twin, + encoder_hidden_size_list=[32, 32, 64] if len(obs_shape) > 1 else None, ) - # compute_q + # compute_critic q = model(inputs, mode='compute_critic')['q_value'] if twin: - is_differentiable(q[0].sum(), model.critic[0]) - is_differentiable(q[1].sum(), model.critic[1]) + is_differentiable(q[0].sum(), model.critic[1][0]) + # is_differentiable(q[1].sum(), model.critic[1][1]) # backward encoder twice + assert q[0].shape == (B, action_shape) + assert q[1].shape == (B, action_shape) else: - is_differentiable(q.sum(), model.critic) + is_differentiable(q.sum(), model.critic[1]) + assert q.shape == (B, action_shape) - # compute_action + # compute_actor print(model) logit = model(inputs, mode='compute_actor')['logit'] - assert logit.shape[0] == B - assert logit.shape[1] == A - - -B = 32 -agent_obs_shape = [216, 265] -global_obs_shape = [264, 324] -agent_num = 8 -action_shape = 14 -args = list(product(*[agent_obs_shape, global_obs_shape])) - - -@pytest.mark.unittest -@pytest.mark.parametrize('agent_obs_shape, global_obs_shape', args) -class TestMAQAC: - - def output_check(self, model, outputs, action_shape): - if isinstance(action_shape, tuple): - loss = sum([t.sum() for t in outputs]) - elif np.isscalar(action_shape): - loss = outputs.sum() - is_differentiable(loss, model) - - def test_maqac(self, agent_obs_shape, global_obs_shape): - data = { - 'obs': { - 'agent_state': torch.randn(B, agent_num, agent_obs_shape), - 'global_state': torch.randn(B, agent_num, global_obs_shape), - 'action_mask': torch.randint(0, 2, size=(B, agent_num, action_shape)) - } - } - model = MAQAC(agent_obs_shape, global_obs_shape, action_shape) - - logit = model(data, mode='compute_actor')['logit'] - value = model(data, mode='compute_critic')['q_value'] - - outputs = value.sum() + logit.sum() - self.output_check(model, outputs, action_shape) - - for p in model.parameters(): - p.grad = None - logit = model(data, 
mode='compute_actor')['logit'] - self.output_check(model.actor, logit, action_shape) - - for p in model.parameters(): - p.grad = None - value = model(data, mode='compute_critic')['q_value'] - assert value.shape == (B, agent_num, action_shape) - self.output_check(model.critic, value, action_shape) + assert logit.shape == (B, action_shape) + is_differentiable(logit.sum(), model.actor) B = 4 @@ -143,11 +98,11 @@ def test_maqac(self, agent_obs_shape, global_obs_shape): @pytest.mark.unittest @pytest.mark.parametrize('action_shape, twin, share_encoder', args) -class TestQACPixel: +class TestContinuousQACPixel: def test_qacpixel(self, action_shape, twin, share_encoder): inputs = {'obs': torch.randn(B, 3, 84, 84), 'action': torch.randn(B, squeeze(action_shape))} - model = QAC( + model = ContinuousQAC( obs_shape=(3, 84, 84), action_shape=action_shape, action_space='reparameterization', @@ -169,4 +124,7 @@ def test_qacpixel(self, action_shape, twin, share_encoder): action_shape = squeeze(action_shape) assert mu.shape == (B, action_shape) assert sigma.shape == (B, action_shape) - is_differentiable(mu.sum() + sigma.sum(), model.actor) + if share_encoder: # if share_encoder, actor_encoder's grad is not None + is_differentiable(mu.sum() + sigma.sum(), model.actor_head) + else: + is_differentiable(mu.sum() + sigma.sum(), model.actor) diff --git a/ding/model/template/vac.py b/ding/model/template/vac.py index 24fe845b94..2a2a56b881 100644 --- a/ding/model/template/vac.py +++ b/ding/model/template/vac.py @@ -11,11 +11,16 @@ @MODEL_REGISTRY.register('vac') class VAC(nn.Module): - r""" + """ Overview: - The VAC model. + The neural network and computation graph of algorithms related to (state) Value Actor-Critic (VAC), such as \ + A2C/PPO/IMPALA. This model now supports discrete, continuous and hybrid action space. The VAC is composed of \ + four parts: ``actor_encoder``, ``critic_encoder``, ``actor_head`` and ``critic_head``. Encoders are used to \ + extract the feature from various observation. Heads are used to predict corresponding value or action logit. \ + In high-dimensional observation space like 2D image, we often use a shared encoder for both ``actor_encoder`` \ + and ``critic_encoder``. In low-dimensional observation space like 1D vector, we often use different encoders. Interfaces: - ``__init__``, ``forward``, ``compute_actor``, ``compute_critic`` + ``__init__``, ``forward``, ``compute_actor``, ``compute_critic``, ``compute_actor_critic``. """ mode = ['compute_actor', 'compute_critic', 'compute_actor_critic'] @@ -38,26 +43,37 @@ def __init__( encoder: Optional[torch.nn.Module] = None, impala_cnn_encoder: bool = False, ) -> None: - r""" + """ Overview: - Init the VAC Model according to arguments. + Initialize the VAC model according to corresponding input arguments. Arguments: - - obs_shape (:obj:`Union[int, SequenceType]`): Observation's space. - - action_shape (:obj:`Union[int, SequenceType]`): Action's space. - - action_space (:obj:`str`): Choose action head in ['discrete', 'continuous', 'hybrid'] - - share_encoder (:obj:`bool`): Whether share encoder. - - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder`` - - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor-nn's ``Head``. - - actor_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for actor's nn. - - critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to critic-nn's ``Head``. 
- - critic_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for critic's nn. - - activation (:obj:`Optional[nn.Module]`): - The type of activation function to use in ``MLP`` the after ``layer_fn``, - if ``None`` then default set to ``nn.ReLU()`` - - norm_type (:obj:`Optional[str]`): - The type of normalization to use, see ``ding.torch_utils.fc_block`` for more details` + - obs_shape (:obj:`Union[int, SequenceType]`): Observation space shape, such as 8 or [4, 84, 84]. + - action_shape (:obj:`Union[int, SequenceType]`): Action space shape, such as 6 or [2, 3, 3]. + - action_space (:obj:`str`): The type of different action spaces, including ['discrete', 'continuous', \ + 'hybrid'], then will instantiate corresponding head, including ``DiscreteHead``, \ + ``ReparameterizationHead``, and hybrid heads. + - share_encoder (:obj:`bool`): Whether to share observation encoders between actor and critic. + - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder``, \ + the last element must match ``head_hidden_size``. + - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` of ``actor_head`` network, defaults \ + to 64, it must match the last element of ``encoder_hidden_size_list``. + - actor_head_layer_num (:obj:`int`): The num of layers used in the ``actor_head`` network to compute action. + - critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` of ``critic_head`` network, defaults \ + to 64, it must match the last element of ``encoder_hidden_size_list``. + - critic_head_layer_num (:obj:`int`): The num of layers used in the ``critic_head`` network. + - activation (:obj:`Optional[nn.Module]`): The type of activation function in networks, \ + if ``None`` then default set it to ``nn.ReLU()``. + - norm_type (:obj:`Optional[str]`): The type of normalization in networks, see \ + ``ding.torch_utils.fc_block`` for more details. You can choose one of ['BN', 'IN', 'SyncBN', 'LN'] + - sigma_type (:obj:`Optional[str]`): The type of sigma in continuous action space, see \ + ``ding.torch_utils.network.dreamer.ReparameterizationHead`` for more details, in A2C/PPO, it defaults \ + to ``independent``, which means state-independent sigma parameters. + - fixed_sigma_value (:obj:`Optional[int]`): If ``sigma_type`` is ``fixed``, then use this value as sigma. + - bound_type (:obj:`Optional[str]`): The type of action bound methods in continuous action space, defaults \ + to ``None``, which means no bound. + - encoder (:obj:`Optional[torch.nn.Module]`): The encoder module, defaults to ``None``, you can define \ + your own encoder module and pass it into VAC to deal with different observation space. + - impala_cnn_encoder (:obj:`bool`): Whether to use IMPALA CNN encoder, defaults to ``False``. """ super(VAC, self).__init__() obs_shape: int = squeeze(obs_shape) @@ -174,7 +190,6 @@ def new_encoder(outsize): ) self.actor_head = nn.ModuleList([actor_action_type, actor_action_args]) - # must use list, not nn.ModuleList if self.share_encoder: self.actor = [self.encoder, self.actor_head] self.critic = [self.encoder, self.critic_head] @@ -186,78 +201,63 @@ def new_encoder(outsize): self.actor = nn.ModuleList(self.actor) self.critic = nn.ModuleList(self.critic) - def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: - r""" + def forward(self, x: torch.Tensor, mode: str) -> Dict: + """ Overview: - Use encoded embedding tensor to predict output. - Parameter updates with VAC's MLPs forward setup.
+ VAC forward computation graph, input observation tensor to predict state value or action logit. Different \ + ``mode`` will forward with different network modules to get different outputs and save computation. Arguments: - Forward with ``'compute_actor'`` or ``'compute_critic'``: - - inputs (:obj:`torch.Tensor`): - The encoded embedding tensor, determined with given ``hidden_size``, i.e. ``(B, N=hidden_size)``. - Whether ``actor_head_hidden_size`` or ``critic_head_hidden_size`` depend on ``mode``. + - x (:obj:`torch.Tensor`): The input observation tensor data. + - mode (:obj:`str`): The forward mode, all the modes are defined in the beginning of this class. Returns: - - outputs (:obj:`Dict`): - Run with encoder and head. + - outputs (:obj:`Dict`): The output dict of VAC's forward computation graph, whose key-values vary from \ + different ``mode``. - Forward with ``'compute_actor'``, Necessary Keys: - - logit (:obj:`torch.Tensor`): Logit encoding tensor, with same size as input ``x``. - - Forward with ``'compute_critic'``, Necessary Keys: - - value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. - Shapes: - - inputs (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N corresponding ``hidden_size`` - - logit (:obj:`torch.FloatTensor`): :math:`(B, N)`, where B is batch size and N is ``action_shape`` - - value (:obj:`torch.FloatTensor`): :math:`(B, )`, where B is batch size. - - Actor Examples: - >>> model = VAC(64,128) + Examples (Actor): + >>> model = VAC(64, 128) >>> inputs = torch.randn(4, 64) >>> actor_outputs = model(inputs,'compute_actor') >>> assert actor_outputs['logit'].shape == torch.Size([4, 128]) - Critic Examples: - >>> model = VAC(64,64) + Examples (Critic): + >>> model = VAC(64, 64) >>> inputs = torch.randn(4, 64) >>> critic_outputs = model(inputs,'compute_critic') - >>> critic_outputs['value'] - tensor([0.0252, 0.0235, 0.0201, 0.0072], grad_fn=) + >>> assert critic_outputs['value'].shape == torch.Size([4]) - Actor-Critic Examples: - >>> model = VAC(64,64) + Examples (Actor-Critic): + >>> model = VAC(64, 64) >>> inputs = torch.randn(4, 64) >>> outputs = model(inputs,'compute_actor_critic') - >>> outputs['value'] - tensor([0.0252, 0.0235, 0.0201, 0.0072], grad_fn=) + >>> assert outputs['value'].shape == torch.Size([4]) >>> assert outputs['logit'].shape == torch.Size([4, 64]) """ assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) - return getattr(self, mode)(inputs) + return getattr(self, mode)(x) def compute_actor(self, x: torch.Tensor) -> Dict: - r""" + """ Overview: - Execute parameter updates with ``'compute_actor'`` mode - Use encoded embedding tensor to predict output. + VAC forward computation graph for actor part, input observation tensor to predict action logit. Arguments: - - inputs (:obj:`torch.Tensor`): - The encoded embedding tensor, determined with given ``hidden_size``, i.e. ``(B, N=hidden_size)``. - ``hidden_size = actor_head_hidden_size`` + - x (:obj:`torch.Tensor`): The input observation tensor data. Returns: - - outputs (:obj:`Dict`): - Run with encoder and head. - + - outputs (:obj:`Dict`): The output dict of VAC's forward computation graph for actor, including ``logit``. ReturnsKeys: - - logit (:obj:`torch.Tensor`): Logit encoding tensor, with same size as input ``x``.
+ - logit (:obj:`torch.Tensor`): The predicted action logit tensor, for discrete action space, it will be \ + the same dimension real-value ranged tensor of possible action choices, and for continuous action \ + space, it will be the mu and sigma of the Gaussian distribution, and the number of mu and sigma is the \ + same as the number of continuous actions. Hybrid action space is a kind of combination of discrete \ + and continuous action space, so the logit will be a dict with ``action_type`` and ``action_args``. Shapes: - - logit (:obj:`torch.FloatTensor`): :math:`(B, N)`, where B is batch size and N is ``action_shape`` + - logit (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is ``action_shape`` Examples: - >>> model = VAC(64,64) + >>> model = VAC(64, 64) >>> inputs = torch.randn(4, 64) >>> actor_outputs = model(inputs,'compute_actor') - >>> assert actor_outputs['action'].shape == torch.Size([4, 64]) + >>> assert actor_outputs['logit'].shape == torch.Size([4, 64]) """ if self.share_encoder: x = self.encoder(x) @@ -275,29 +275,23 @@ def compute_actor(self, x: torch.Tensor) -> Dict: return {'logit': {'action_type': action_type['logit'], 'action_args': action_args}} def compute_critic(self, x: torch.Tensor) -> Dict: - r""" + """ Overview: - Execute parameter updates with ``'compute_critic'`` mode - Use encoded embedding tensor to predict output. + VAC forward computation graph for critic part, input observation tensor to predict state value. Arguments: - - inputs (:obj:`torch.Tensor`): - The encoded embedding tensor, determined with given ``hidden_size``, i.e. ``(B, N=hidden_size)``. - ``hidden_size = critic_head_hidden_size`` + - x (:obj:`torch.Tensor`): The input observation tensor data. Returns: - - outputs (:obj:`Dict`): - Run with encoder and head. - - Necessary Keys: - - value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. + - outputs (:obj:`Dict`): The output dict of VAC's forward computation graph for critic, including ``value``. + ReturnsKeys: + - value (:obj:`torch.Tensor`): The predicted state value tensor. Shapes: - - value (:obj:`torch.FloatTensor`): :math:`(B, )`, where B is batch size. + - value (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch size, (B, 1) is squeezed to (B, ). Examples: - >>> model = VAC(64,64) + >>> model = VAC(64, 64) >>> inputs = torch.randn(4, 64) >>> critic_outputs = model(inputs,'compute_critic') - >>> critic_outputs['value'] - tensor([0.0252, 0.0235, 0.0201, 0.0072], grad_fn=) + >>> assert critic_outputs['value'].shape == torch.Size([4]) """ if self.share_encoder: x = self.encoder(x) @@ -307,37 +301,37 @@ def compute_critic(self, x: torch.Tensor) -> Dict: return {'value': x['pred']} def compute_actor_critic(self, x: torch.Tensor) -> Dict: - r""" + """ Overview: - Execute parameter updates with ``'compute_actor_critic'`` mode - Use encoded embedding tensor to predict output. + VAC forward computation graph for both actor and critic part, input observation tensor to predict action \ + logit and state value. Arguments: - - inputs (:obj:`torch.Tensor`): The encoded embedding tensor. - + - x (:obj:`torch.Tensor`): The input observation tensor data. Returns: - - outputs (:obj:`Dict`): - Run with encoder and head. - + - outputs (:obj:`Dict`): The output dict of VAC's forward computation graph for both actor and critic, \ + including ``logit`` and ``value``. ReturnsKeys: - - logit (:obj:`torch.Tensor`): Logit encoding tensor, with same size as input ``x``. 
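A minimal sketch of the shared-encoder usage that ``compute_actor_critic`` targets (illustrative, not part of the patch): the constructor values are examples, and the import path and the default discrete action space are assumptions.

import torch
from ding.model.template import VAC

model = VAC(obs_shape=64, action_shape=6, share_encoder=True)
obs = torch.randn(4, 64)
# One encoder pass produces both the policy logit and the state value.
outputs = model(obs, mode='compute_actor_critic')
assert outputs['logit'].shape == torch.Size([4, 6])
assert outputs['value'].shape == torch.Size([4])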
- - value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. + - logit (:obj:`torch.Tensor`): The predicted action logit tensor, for discrete action space, it will be \ + the same dimension real-value ranged tensor of possible action choices, and for continuous action \ + space, it will be the mu and sigma of the Gaussian distribution, and the number of mu and sigma is the \ + same as the number of continuous actions. Hybrid action space is a kind of combination of discrete \ + and continuous action space, so the logit will be a dict with ``action_type`` and ``action_args``. + - value (:obj:`torch.Tensor`): The predicted state value tensor. Shapes: - - logit (:obj:`torch.FloatTensor`): :math:`(B, N)`, where B is batch size and N is ``action_shape`` - - value (:obj:`torch.FloatTensor`): :math:`(B, )`, where B is batch size. + - logit (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is ``action_shape`` + - value (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch size, (B, 1) is squeezed to (B, ). Examples: - >>> model = VAC(64,64) + >>> model = VAC(64, 64) >>> inputs = torch.randn(4, 64) >>> outputs = model(inputs,'compute_actor_critic') - >>> outputs['value'] - tensor([0.0252, 0.0235, 0.0201, 0.0072], grad_fn=) + >>> assert outputs['value'].shape == torch.Size([4]) >>> assert outputs['logit'].shape == torch.Size([4, 64]) .. note:: - ``compute_actor_critic`` interface aims to save computation when shares encoder. - Returning the combination dictionry. - + ``compute_actor_critic`` interface aims to save computation when the encoder is shared and returns the combined \ + dict output. """ if self.share_encoder: actor_embedding = critic_embedding = self.encoder(x) @@ -361,11 +355,12 @@ def compute_actor_critic(self, x: torch.Tensor) -> Dict: @MODEL_REGISTRY.register('dreamervac') class DREAMERVAC(nn.Module): - r""" + """ Overview: - The VAC model. + The neural network and computation graph of DreamerV3 (state) Value Actor-Critic (VAC). + This model now supports discrete and continuous action spaces. Interfaces: - ``__init__``, ``forward``, ``compute_actor``, ``compute_critic`` + ``__init__``, ``forward``. """ mode = ['compute_actor', 'compute_critic', 'compute_actor_critic'] @@ -388,26 +383,12 @@ def __init__( actor_temp=0.1, action_unimix_ratio=0.01, ) -> None: - r""" + """ Overview: - Init the VAC Model according to arguments. + Initialize the ``DREAMERVAC`` model according to arguments. Arguments: - - obs_shape (:obj:`Union[int, SequenceType]`): Observation's space. - - action_shape (:obj:`Union[int, SequenceType]`): Action's space. - - action_space (:obj:`str`): Choose action head in ['discrete', 'continuous', 'hybrid'] - - share_encoder (:obj:`bool`): Whether share encoder. - - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder`` - - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor-nn's ``Head``. - - actor_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for actor's nn. - - critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to critic-nn's ``Head``. - - critic_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for critic's nn.
- - activation (:obj:`Optional[nn.Module]`): - The type of activation function to use in ``MLP`` the after ``layer_fn``, - if ``None`` then default set to ``nn.ReLU()`` - - norm_type (:obj:`Optional[str]`): - The type of normalization to use, see ``ding.torch_utils.fc_block`` for more details` + - obs_shape (:obj:`Union[int, SequenceType]`): Observation space shape, such as 8 or [4, 84, 84]. + - action_shape (:obj:`Union[int, SequenceType]`): Action space shape, such as 6 or [2, 3, 3]. """ super(DREAMERVAC, self).__init__() obs_shape: int = squeeze(obs_shape) diff --git a/ding/model/wrapper/__init__.py b/ding/model/wrapper/__init__.py index f74d3f2668..24d621e973 100644 --- a/ding/model/wrapper/__init__.py +++ b/ding/model/wrapper/__init__.py @@ -1 +1 @@ -from .model_wrappers import model_wrap, register_wrapper, IModelWrapper, BaseModelWrapper +from .model_wrappers import model_wrap, register_wrapper, IModelWrapper diff --git a/ding/model/wrapper/model_wrappers.py b/ding/model/wrapper/model_wrappers.py index 38c6346f2e..e3d57d1ce1 100644 --- a/ding/model/wrapper/model_wrappers.py +++ b/ding/model/wrapper/model_wrappers.py @@ -1,40 +1,52 @@ -from typing import Any, Tuple, Callable, Optional, List, Dict +from typing import Any, Tuple, Callable, Optional, List, Dict, Union from abc import ABC import numpy as np import torch +import torch.nn as nn import torch.nn.functional as F from torch.distributions import Categorical, Independent, Normal -from ding.torch_utils import get_tensor_data +from ding.torch_utils import get_tensor_data, zeros_like from ding.rl_utils import create_noise_generator from ding.utils.data import default_collate class IModelWrapper(ABC): - r""" + """ Overview: - the base class of Model Wrappers + The basic interface class of model wrappers. Model wrapper is a wrapper class of torch.nn.Module model, which \ + is used to add some extra operations for the wrapped model, such as hidden state maintenance for RNN-based models, \ + argmax action selection for discrete action space, etc. Interfaces: - register + ``__init__``, ``__getattr__``, ``info``, ``reset``, ``forward``. """ - def __init__(self, model: Any) -> None: + def __init__(self, model: nn.Module) -> None: + """ + Overview: + Initialize model and other necessary member variables in the model wrapper. + """ self._model = model def __getattr__(self, key: str) -> Any: - r""" + """ Overview: - Get the attrbute in model. + Get original attributes of torch.nn.Module model, such as variables and methods defined in model. Arguments: - - key (:obj:`str`): The key to query. + - key (:obj:`str`): The string key to query. Returns: - ret (:obj:`Any`): The queried attribute. """ return getattr(self._model, key) - def info(self, attr_name): - r""" + def info(self, attr_name: str) -> str: + """ Overview: - get info of attr_name + Get some string information of the indicated ``attr_name``, which is used for debugging wrappers. + This method will recursively search for the indicated ``attr_name``. + Arguments: + - attr_name (:obj:`str`): The string key to query information. + Returns: + - info_string (:obj:`str`): The information string of the indicated ``attr_name``.
""" if attr_name in dir(self): if isinstance(self._model, IModelWrapper): @@ -50,36 +62,46 @@ def info(self, attr_name): else: return '{}'.format(self._model.__class__.__name__) - -class BaseModelWrapper(IModelWrapper): - r""" - Overview: - the base class of Model Wrappers - Interfaces: - register - """ - - def reset(self, data_id: List[int] = None) -> None: - r""" + def reset(self, data_id: List[int] = None, **kwargs) -> None: + """ Overview - the reset function that the Model Wrappers with states should implement - used to reset the stored states + Basic interface, reset some stateful varaibles in the model wrapper, such as hidden state of RNN. + Here we do nothing and just implement this interface method. + Other derived model wrappers can override this method to add some extra operations. + Arguments: + - data_id (:obj:`List[int]`): The data id list to reset. If None, reset all data. In practice, \ + model wrappers often needs to maintain some stateful variables for each data trajectory, \ + so we leave this ``data_id`` argument to reset the stateful variables of the indicated data. """ pass + def forward(self, *args, **kwargs) -> Any: + """ + Overview: + Basic interface, call the wrapped model's forward method. Other derived model wrappers can override this \ + method to add some extra operations. + """ + return self._model.forward(*args, **kwargs) -def zeros_like(h): - if isinstance(h, torch.Tensor): - return torch.zeros_like(h) - elif isinstance(h, (list, tuple)): - return [zeros_like(t) for t in h] - elif isinstance(h, dict): - return {k: zeros_like(v) for k, v in h.items()} - else: - raise TypeError("not support type: {}".format(h)) + +class BaseModelWrapper(IModelWrapper): + """ + Overview: + Placeholder class for the model wrapper. This class is used to wrap the model without any extra operations, \ + including a empty ``reset`` method and a ``forward`` method which directly call the wrapped model's forward. + To keep the consistency of the model wrapper interface, we use this class to wrap the model without specific \ + operations in the implementation of DI-engine's policy. + """ + pass class HiddenStateWrapper(IModelWrapper): + """ + Overview: + Maintain the hidden state for RNN-base model. Each sample in a batch has its own state. + Interfaces: + ``__init__``, ``reset``, ``forward``. + """ def __init__( self, @@ -387,12 +409,18 @@ def sample_action(logit=None, prob=None): class ArgmaxSampleWrapper(IModelWrapper): - r""" + """ Overview: - Used to help the model to sample argmax action + Used to help the model to sample argmax action. + Interfaces: + ``forward``. """ def forward(self, *args, **kwargs): + """ + Overview: + Employ model forward computation graph, and use the output logit to greedily select max action (argmax). + """ output = self._model.forward(*args, **kwargs) assert isinstance(output, dict), "model output must be dict, but find {}".format(type(output)) logit = output['logit'] @@ -415,6 +443,8 @@ class CombinationArgmaxSampleWrapper(IModelWrapper): r""" Overview: Used to help the model to sample combination argmax action. + Interfaces: + ``forward``. """ def forward(self, shot_number, *args, **kwargs): @@ -438,6 +468,8 @@ class CombinationMultinomialSampleWrapper(IModelWrapper): r""" Overview: Used to help the model to sample combination multinomial action. + Interfaces: + ``forward``. 
""" def forward(self, shot_number, *args, **kwargs): @@ -463,6 +495,8 @@ class HybridArgmaxSampleWrapper(IModelWrapper): Overview: Used to help the model to sample argmax action in hybrid action space, i.e.{'action_type': discrete, 'action_args', continuous} + Interfaces: + ``forward``. """ def forward(self, *args, **kwargs): @@ -487,11 +521,11 @@ def forward(self, *args, **kwargs): class MultinomialSampleWrapper(IModelWrapper): - r""" + """ Overview: - Used to help the model get the corresponding action from the output['logits'] + Used to help the model get the corresponding action from the output['logits']self. Interfaces: - register + ``forward``. """ def forward(self, *args, **kwargs): @@ -529,7 +563,7 @@ class EpsGreedySampleWrapper(IModelWrapper): - float (i.e. python native scalar): for almost normal case - Dict[str, float]: for algorithm NGU Interfaces: - register + ``forward``. """ def forward(self, *args, **kwargs): @@ -583,7 +617,7 @@ class EpsGreedyMultinomialSampleWrapper(IModelWrapper): Epsilon greedy sampler coupled with multinomial sample used in collector_model to help balance exploration and exploitation. Interfaces: - register + ``forward``. """ def forward(self, *args, **kwargs): @@ -630,7 +664,7 @@ class HybridEpsGreedySampleWrapper(IModelWrapper): Epsilon greedy sampler used in collector_model to help balance exploration and exploitation. In hybrid action space, i.e.{'action_type': discrete, 'action_args', continuous} Interfaces: - register, forward + ``forward``. """ def forward(self, *args, **kwargs): @@ -670,7 +704,7 @@ class HybridEpsGreedyMultinomialSampleWrapper(IModelWrapper): to help balance exploration and exploitation. In hybrid action space, i.e.{'action_type': discrete, 'action_args', continuous} Interfaces: - register + ``forward``. """ def forward(self, *args, **kwargs): @@ -759,7 +793,7 @@ def forward(self, *args, **kwargs): return output -class DeterministicSample(IModelWrapper): +class DeterministicSampleWrapper(IModelWrapper): """ Overview: Deterministic sampler (just use mu directly) used in eval_model. @@ -774,7 +808,7 @@ def forward(self, *args, **kwargs): return output -class ReparamSample(IModelWrapper): +class ReparamSampleWrapper(IModelWrapper): """ Overview: Reparameterization gaussian sampler used in collector_model. @@ -796,7 +830,7 @@ class ActionNoiseWrapper(IModelWrapper): Overview: Add noise to collector's action output; Do clips on both generated noise and action after adding noise. Interfaces: - register, __init__, add_noise, reset + ``__init__``, ``forward``. Arguments: - model (:obj:`Any`): Wrapped model class. Should contain ``forward`` method. - noise_type (:obj:`str`): The type of noise that should be generated, support ['gauss', 'ou']. @@ -854,13 +888,6 @@ def add_noise(self, action: torch.Tensor) -> torch.Tensor: action = action.clamp(self.action_range['min'], self.action_range['max']) return action - def reset(self) -> None: - r""" - Overview: - Reset noise generator. - """ - pass - class TargetNetworkWrapper(IModelWrapper): r""" @@ -919,17 +946,15 @@ def reset_state(self, target_update_count: int = None) -> None: class TeacherNetworkWrapper(IModelWrapper): - r""" + """ Overview: Set the teacher Network. 
Set the model's model.teacher_cfg to the input teacher_cfg - - Interfaces: - register """ def __init__(self, model, teacher_cfg): super().__init__(model) self._model._teacher_cfg = teacher_cfg + raise NotImplementedError wrapper_name_map = { @@ -939,8 +964,8 @@ def __init__(self, model, teacher_cfg): 'hybrid_argmax_sample': HybridArgmaxSampleWrapper, 'eps_greedy_sample': EpsGreedySampleWrapper, 'eps_greedy_multinomial_sample': EpsGreedyMultinomialSampleWrapper, - 'deterministic_sample': DeterministicSample, - 'reparam_sample': ReparamSample, + 'deterministic_sample': DeterministicSampleWrapper, + 'reparam_sample': ReparamSampleWrapper, 'hybrid_eps_greedy_sample': HybridEpsGreedySampleWrapper, 'hybrid_eps_greedy_multinomial_sample': HybridEpsGreedyMultinomialSampleWrapper, 'hybrid_reparam_multinomial_sample': HybridReparamMultinomialSampleWrapper, @@ -958,8 +983,19 @@ def __init__(self, model, teacher_cfg): } -def model_wrap(model, wrapper_name: str = None, **kwargs): +def model_wrap(model: Union[nn.Module, IModelWrapper], wrapper_name: str = None, **kwargs): + """ + Overview: + Wrap the model with the specified wrapper and return the wrapped model. + Arguments: + - model (:obj:`Any`): The model to be wrapped. + - wrapper_name (:obj:`str`): The name of the wrapper to be used. + + .. note:: + The arguments of the wrapper should be passed in as kwargs. + """ if wrapper_name in wrapper_name_map: + # TODO test whether to remove this if branch if not isinstance(model, IModelWrapper): model = wrapper_name_map['base'](model) model = wrapper_name_map[wrapper_name](model, **kwargs) @@ -968,13 +1004,15 @@ def __init__(self, model, teacher_cfg): return model -def register_wrapper(name: str, wrapper_type: type): - r""" +def register_wrapper(name: str, wrapper_type: type) -> None: + """ Overview: - Register new wrapper to wrapper_name_map + Register a new wrapper to ``wrapper_name_map``. When a user implements a new wrapper, they must call this function \ + to complete the registration. Then the wrapper can be called by ``model_wrap``. Arguments: - - name (:obj:`str`): the name of the wrapper - - wrapper_type (subclass of :obj:`IModelWrapper`): the wrapper class added to the plguin_name_map + - name (:obj:`str`): The name of the new wrapper to be registered. + - wrapper_type (:obj:`type`): The wrapper class to be added to ``wrapper_name_map``. This argument \ + should be a subclass of ``IModelWrapper``. """ assert isinstance(name, str) assert issubclass(wrapper_type, IModelWrapper)
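A minimal sketch of the registration flow documented above (illustrative, not part of the patch): ``ClipActionWrapper`` is a hypothetical wrapper, and the wrapped policy network is assumed to return a dict containing an ``action`` tensor.

from ding.model import model_wrap, register_wrapper, IModelWrapper

class ClipActionWrapper(IModelWrapper):
    # Hypothetical wrapper: clamp the wrapped model's continuous action into [-1, 1].
    def forward(self, *args, **kwargs):
        output = self._model.forward(*args, **kwargs)
        output['action'] = output['action'].clamp(-1, 1)
        return output

register_wrapper('clip_action', ClipActionWrapper)
# After registration the wrapper can be requested by name, e.g.:
# wrapped_model = model_wrap(policy_net, wrapper_name='clip_action')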
""" assert isinstance(name, str) assert issubclass(wrapper_type, IModelWrapper) diff --git a/ding/model/wrapper/test_model_wrappers.py b/ding/model/wrapper/test_model_wrappers.py index 93334bee00..890d1eb1fc 100644 --- a/ding/model/wrapper/test_model_wrappers.py +++ b/ding/model/wrapper/test_model_wrappers.py @@ -9,7 +9,8 @@ from ding.torch_utils import get_lstm from ding.torch_utils.network.gtrxl import GTrXL -from ding.model import model_wrap, register_wrapper, IModelWrapper, BaseModelWrapper +from ding.model import model_wrap, register_wrapper, IModelWrapper +from ding.model.wrapper.model_wrappers import BaseModelWrapper class TempMLP(torch.nn.Module): @@ -38,7 +39,7 @@ def __init__(self): self.bn1 = nn.BatchNorm1d(4) self.fc2 = nn.Linear(4, 6) self.act = nn.ReLU() - self.out = nn.Softmax() + self.out = nn.Softmax(dim=-1) def forward(self, inputs, tmp=0): x = self.fc1(inputs['obs']) @@ -61,7 +62,7 @@ def __init__(self): self.bn1 = nn.BatchNorm1d(4) self.fc2 = nn.Linear(4, 6) self.act = nn.ReLU() - self.out = nn.Softmax() + self.out = nn.Softmax(dim=-1) self.fc2_cont = nn.Linear(4, 6) self.act_cont = nn.ReLU() @@ -93,7 +94,7 @@ def __init__(self): self.bn1 = nn.BatchNorm1d(4) self.fc2 = nn.Linear(4, 6) self.act = nn.ReLU() - self.out = nn.Softmax() + self.out = nn.Softmax(dim=-1) self.fc2_cont_mu = nn.Linear(4, 6) self.act_cont_mu = nn.ReLU() @@ -131,7 +132,6 @@ def __init__(self): self.bn1 = nn.BatchNorm1d(4) self.fc2 = nn.Linear(4, 6) self.act = nn.ReLU() - self.out = nn.Softmax() self.fc2_cont_mu = nn.Linear(4, 6) self.fc2_cont_sigma = nn.Linear(4, 6) @@ -553,13 +553,15 @@ def test_transformer_memory_wrapper(self): def test_combination_argmax_sample_wrapper(self): model = model_wrap(ActorMLP(), wrapper_name='combination_argmax_sample') data = {'obs': torch.randn(4, 3)} - output = model.forward(shot_number=2, inputs=data) - assert output['action'].shape == (4, ) + shot_number = 2 + output = model.forward(shot_number=shot_number, inputs=data) + assert output['action'].shape == (4, shot_number) assert (output['action'] >= 0).all() and (output['action'] < 64).all() def test_combination_multinomial_sample_wrapper(self): model = model_wrap(ActorMLP(), wrapper_name='combination_multinomial_sample') data = {'obs': torch.randn(4, 3)} - output = model.forward(shot_number=2, inputs=data) - assert output['action'].shape == (4, ) + shot_number = 2 + output = model.forward(shot_number=shot_number, inputs=data) + assert output['action'].shape == (4, shot_number) assert (output['action'] >= 0).all() and (output['action'] < 64).all() diff --git a/ding/policy/cql.py b/ding/policy/cql.py index 21dd28d21c..1622c184ad 100644 --- a/ding/policy/cql.py +++ b/ding/policy/cql.py @@ -221,18 +221,18 @@ def _init_learn(self) -> None: # Weight Init init_w = self._cfg.learn.init_w - self._model.actor[2].mu.weight.data.uniform_(-init_w, init_w) - self._model.actor[2].mu.bias.data.uniform_(-init_w, init_w) - self._model.actor[2].log_sigma_layer.weight.data.uniform_(-init_w, init_w) - self._model.actor[2].log_sigma_layer.bias.data.uniform_(-init_w, init_w) + self._model.actor_head[-1].mu.weight.data.uniform_(-init_w, init_w) + self._model.actor_head[-1].mu.bias.data.uniform_(-init_w, init_w) + self._model.actor_head[-1].log_sigma_layer.weight.data.uniform_(-init_w, init_w) + self._model.actor_head[-1].log_sigma_layer.bias.data.uniform_(-init_w, init_w) if self._twin_critic: - self._model.critic[0][2].last.weight.data.uniform_(-init_w, init_w) - self._model.critic[0][2].last.bias.data.uniform_(-init_w, init_w) - 
self._model.critic[1][2].last.weight.data.uniform_(-init_w, init_w) - self._model.critic[1][2].last.bias.data.uniform_(-init_w, init_w) + self._model.critic_head[0][-1].last.weight.data.uniform_(-init_w, init_w) + self._model.critic_head[0][-1].last.bias.data.uniform_(-init_w, init_w) + self._model.critic_head[1][-1].last.weight.data.uniform_(-init_w, init_w) + self._model.critic_head[1][-1].last.bias.data.uniform_(-init_w, init_w) else: - self._model.critic[2].last.weight.data.uniform_(-init_w, init_w) - self._model.critic[2].last.bias.data.uniform_(-init_w, init_w) + self._model.critic_head[2].last.weight.data.uniform_(-init_w, init_w) + self._model.critic_head[-1].last.bias.data.uniform_(-init_w, init_w) # Optimizers if self._value_network: diff --git a/ding/policy/ddpg.py b/ding/policy/ddpg.py index 2887b7480d..8629cca4af 100644 --- a/ding/policy/ddpg.py +++ b/ding/policy/ddpg.py @@ -148,9 +148,9 @@ class DDPGPolicy(Policy): def default_model(self) -> Tuple[str, List[str]]: if self._cfg.multi_agent: - return 'maqac_continuous', ['ding.model.template.maqac'] + return 'continuous_maqac', ['ding.model.template.maqac'] else: - return 'qac', ['ding.model.template.qac'] + return 'continuous_qac', ['ding.model.template.qac'] def _init_learn(self) -> None: r""" diff --git a/ding/policy/r2d2_gtrxl.py b/ding/policy/r2d2_gtrxl.py index 660f0aaef4..73b89239f3 100644 --- a/ding/policy/r2d2_gtrxl.py +++ b/ding/policy/r2d2_gtrxl.py @@ -1,5 +1,5 @@ import copy -import sys +import torch from collections import namedtuple from typing import List, Dict, Any, Tuple, Union, Optional @@ -10,9 +10,6 @@ from ding.utils import POLICY_REGISTRY from ding.utils.data import timestep_collate, default_collate, default_decollate from .base_policy import Policy -import torch - -from ding.model.common.head import * @POLICY_REGISTRY.register('r2d2_gtrxl') @@ -130,7 +127,7 @@ class R2D2GTrXLPolicy(Policy): ) def default_model(self) -> Tuple[str, List[str]]: - return 'gtrxl_discrete', ['ding.model.template.q_learning'] + return 'gtrxldqn', ['ding.model.template.q_learning'] def _init_learn(self) -> None: """ diff --git a/ding/policy/sac.py b/ding/policy/sac.py index ca0263305a..ebf2845e51 100644 --- a/ding/policy/sac.py +++ b/ding/policy/sac.py @@ -143,7 +143,7 @@ class SACDiscretePolicy(Policy): def default_model(self) -> Tuple[str, List[str]]: if self._cfg.multi_agent: - return 'maqac', ['ding.model.template.maqac'] + return 'discrete_maqac', ['ding.model.template.maqac'] else: return 'discrete_qac', ['ding.model.template.qac'] @@ -227,7 +227,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: action = data['action'] # 1. predict q value - q_value = self._learn_model.forward({'obs': obs}, mode='compute_critic')['q_value'] + q_value = self._learn_model.forward(obs, mode='compute_critic')['q_value'] dist = torch.distributions.categorical.Categorical(logits=logit) dist_entropy = dist.entropy() entropy = dist_entropy.mean() @@ -236,12 +236,12 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: # target q value. 
SARSA: first predict next action, then calculate next q value with torch.no_grad(): - policy_output_next = self._learn_model.forward({'obs': next_obs}, mode='compute_actor') + policy_output_next = self._learn_model.forward(next_obs, mode='compute_actor') if self._cfg.multi_agent: policy_output_next['logit'][policy_output_next['action_mask'] == 0.0] = -1e8 prob = F.softmax(policy_output_next['logit'], dim=-1) log_prob = torch.log(prob + 1e-8) - target_q_value = self._target_model.forward({'obs': next_obs}, mode='compute_critic')['q_value'] + target_q_value = self._target_model.forward(next_obs, mode='compute_critic')['q_value'] # the value of a policy according to the maximum entropy objective if self._twin_critic: # find min one as target q value @@ -270,7 +270,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: self._optimizer_q.step() # 5. evaluate to get action distribution - policy_output = self._learn_model.forward({'obs': data['obs']}, mode='compute_actor') + policy_output = self._learn_model.forward(obs, mode='compute_actor') # 6. apply discrete action mask in multi_agent setting if self._cfg.multi_agent: policy_output['logit'][policy_output['action_mask'] == 0.0] = -1e8 @@ -279,7 +279,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: log_prob = F.log_softmax(logit, dim=-1) with torch.no_grad(): - new_q_value = self._learn_model.forward({'obs': data['obs']}, mode='compute_critic')['q_value'] + new_q_value = self._learn_model.forward(obs, mode='compute_critic')['q_value'] if self._twin_critic: new_q_value = torch.min(new_q_value[0], new_q_value[1]) # 7. compute policy loss @@ -363,7 +363,7 @@ def _forward_collect(self, data: dict, eps: float) -> dict: data = to_device(data, self._device) self._collect_model.eval() with torch.no_grad(): - output = self._collect_model.forward({'obs': data}, mode='compute_actor', eps=eps) + output = self._collect_model.forward(data, mode='compute_actor', eps=eps) if self._cuda: output = to_device(output, 'cpu') output = default_decollate(output) @@ -394,7 +394,7 @@ def _forward_eval(self, data: dict) -> dict: data = to_device(data, self._device) self._eval_model.eval() with torch.no_grad(): - output = self._eval_model.forward({'obs': data}, mode='compute_actor') + output = self._eval_model.forward(data, mode='compute_actor') if self._cuda: output = to_device(output, 'cpu') output = default_decollate(output) @@ -543,9 +543,9 @@ class SACPolicy(Policy): def default_model(self) -> Tuple[str, List[str]]: if self._cfg.multi_agent: - return 'maqac_continuous', ['ding.model.template.maqac'] + return 'continuous_maqac', ['ding.model.template.maqac'] else: - return 'qac', ['ding.model.template.qac'] + return 'continuous_qac', ['ding.model.template.qac'] def _init_learn(self) -> None: self._priority = self._cfg.priority @@ -554,10 +554,10 @@ def _init_learn(self) -> None: # Weight Init for the last output layer init_w = self._cfg.learn.init_w - self._model.actor[-1].mu.weight.data.uniform_(-init_w, init_w) - self._model.actor[-1].mu.bias.data.uniform_(-init_w, init_w) - self._model.actor[-1].log_sigma_layer.weight.data.uniform_(-init_w, init_w) - self._model.actor[-1].log_sigma_layer.bias.data.uniform_(-init_w, init_w) + self._model.actor_head[-1].mu.weight.data.uniform_(-init_w, init_w) + self._model.actor_head[-1].mu.bias.data.uniform_(-init_w, init_w) + self._model.actor_head[-1].log_sigma_layer.weight.data.uniform_(-init_w, init_w) + self._model.actor_head[-1].log_sigma_layer.bias.data.uniform_(-init_w, init_w) self._optimizer_q = Adam( 
             self._model.critic.parameters(),
@@ -838,10 +838,10 @@ def _init_learn(self) -> None:
 
         # Weight Init for the last output layer
        init_w = self._cfg.learn.init_w
-        self._model.actor[2].mu.weight.data.uniform_(-init_w, init_w)
-        self._model.actor[2].mu.bias.data.uniform_(-init_w, init_w)
-        self._model.actor[2].log_sigma_layer.weight.data.uniform_(-init_w, init_w)
-        self._model.actor[2].log_sigma_layer.bias.data.uniform_(-init_w, init_w)
+        self._model.actor_head[-1].mu.weight.data.uniform_(-init_w, init_w)
+        self._model.actor_head[-1].mu.bias.data.uniform_(-init_w, init_w)
+        self._model.actor_head[-1].log_sigma_layer.weight.data.uniform_(-init_w, init_w)
+        self._model.actor_head[-1].log_sigma_layer.bias.data.uniform_(-init_w, init_w)
 
         self._optimizer_q = Adam(
             self._model.critic.parameters(),
diff --git a/ding/policy/td3_bc.py b/ding/policy/td3_bc.py
index c3295d70d0..e30b6bfc07 100644
--- a/ding/policy/td3_bc.py
+++ b/ding/policy/td3_bc.py
@@ -174,13 +174,12 @@ class from DDPG class by changing ``_actor_update_freq``, ``_twin_critic`` and n
     )
 
     def default_model(self) -> Tuple[str, List[str]]:
-        return 'qac', ['ding.model.template.qac']
+        return 'continuous_qac', ['ding.model.template.qac']
 
     def _init_learn(self) -> None:
-        r"""
+        """
         Overview:
-            Learn mode init method. Called by ``self.__init__``.
-            Init actor and critic optimizers, algorithm config.
+            Learn mode init method. Called by ``self.__init__``. Init actor and critic optimizers, algorithm config.
         """
         super(TD3BCPolicy, self)._init_learn()
         self._alpha = self._cfg.learn.alpha
diff --git a/ding/policy/td3_vae.py b/ding/policy/td3_vae.py
index 7a192d5a6b..7d029c0a91 100644
--- a/ding/policy/td3_vae.py
+++ b/ding/policy/td3_vae.py
@@ -168,7 +168,7 @@ class from DDPG class by changing ``_actor_update_freq``, ``_twin_critic`` and n
     )
 
     def default_model(self) -> Tuple[str, List[str]]:
-        return 'qac', ['ding.model.template.qac']
+        return 'continuous_qac', ['ding.model.template.qac']
 
     def _init_learn(self) -> None:
         r"""
diff --git a/ding/torch_utils/__init__.py b/ding/torch_utils/__init__.py
index 9c7b677143..c98eb3bab4 100755
--- a/ding/torch_utils/__init__.py
+++ b/ding/torch_utils/__init__.py
@@ -1,6 +1,7 @@
 from .checkpoint_helper import build_checkpoint_helper, CountVar, auto_checkpoint
 from .data_helper import to_device, to_tensor, to_ndarray, to_list, to_dtype, same_shape, tensor_to_list, \
-    build_log_buffer, CudaFetcher, get_tensor_data, unsqueeze, squeeze, get_null_data, get_shape0, to_item
+    build_log_buffer, CudaFetcher, get_tensor_data, unsqueeze, squeeze, get_null_data, get_shape0, to_item, \
+    zeros_like
 from .distribution import CategoricalPd, CategoricalPdPytorch
 from .metric import levenshtein_distance, hamming_distance
 from .network import *
diff --git a/ding/torch_utils/data_helper.py b/ding/torch_utils/data_helper.py
index e34df5308a..a985ef0345 100644
--- a/ding/torch_utils/data_helper.py
+++ b/ding/torch_utils/data_helper.py
@@ -461,3 +461,14 @@ def get_null_data(template: Any, num: int) -> List[Any]:
         data['reward'].zero_()
         ret.append(data)
     return ret
+
+
+def zeros_like(h):
+    if isinstance(h, torch.Tensor):
+        return torch.zeros_like(h)
+    elif isinstance(h, (list, tuple)):
+        return [zeros_like(t) for t in h]
+    elif isinstance(h, dict):
+        return {k: zeros_like(v) for k, v in h.items()}
+    else:
+        raise TypeError("not support type: {}".format(h))
diff --git a/ding/world_model/tests/test_world_model.py b/ding/world_model/tests/test_world_model.py
index ec5f0645ef..f8dd620c59 100644
--- a/ding/world_model/tests/test_world_model.py
+++ b/ding/world_model/tests/test_world_model.py
@@ -52,11 +52,11 @@ def step(self, obs, action):
             return (torch.zeros(B), torch.rand(B, O), obs.sum(-1) > 0)
 
     from ding.policy import SACPolicy
-    from ding.model import QAC
+    from ding.model import ContinuousQAC
     policy_config = SACPolicy.default_config()
     policy_config.model.update(dict(obs_shape=2, action_shape=2))
-    model = QAC(**policy_config.model)
+    model = ContinuousQAC(**policy_config.model)
     policy = SACPolicy(policy_config, model=model).collect_mode
 
     fake_model = FakeModel(fake_config, None, None)
diff --git a/dizoo/classic_control/cartpole/config/cartpole_sac_config.py b/dizoo/classic_control/cartpole/config/cartpole_sac_config.py
index 736c7ee930..36dcb53be6 100644
--- a/dizoo/classic_control/cartpole/config/cartpole_sac_config.py
+++ b/dizoo/classic_control/cartpole/config/cartpole_sac_config.py
@@ -13,8 +13,7 @@
         random_collect_size=0,
         multi_agent=False,
         model=dict(
-            agent_obs_shape=4,
-            global_obs_shape=4,
+            obs_shape=4,
             action_shape=2,
             twin_critic=True,
             actor_head_hidden_size=64,