From b9cee618247999c61bb6504ef1b63a7816896950 Mon Sep 17 00:00:00 2001 From: Cloud-Pku Date: Thu, 15 Dec 2022 16:45:27 +0800 Subject: [PATCH 01/10] add BDQ algrithm --- ding/model/common/__init__.py | 2 +- ding/model/common/head.py | 102 +++++ ding/model/template/q_learning.py | 88 +++- ding/policy/__init__.py | 2 + ding/policy/bdq.py | 387 ++++++++++++++++++ ding/policy/command_mode_policy_instance.py | 6 + ding/rl_utils/__init__.py | 2 +- ding/rl_utils/td.py | 56 +++ dizoo/mujoco/config/halfcheetah_bdq_config.py | 71 ++++ dizoo/mujoco/config/hopper_bdq_config.py | 73 ++++ dizoo/mujoco/envs/mujoco_env.py | 12 +- 11 files changed, 795 insertions(+), 6 deletions(-) create mode 100644 ding/policy/bdq.py create mode 100644 dizoo/mujoco/config/halfcheetah_bdq_config.py create mode 100644 dizoo/mujoco/config/hopper_bdq_config.py diff --git a/ding/model/common/__init__.py b/ding/model/common/__init__.py index fc904de2ac..2acbd3c8b7 100644 --- a/ding/model/common/__init__.py +++ b/ding/model/common/__init__.py @@ -1,5 +1,5 @@ from .head import DiscreteHead, DuelingHead, DistributionHead, RainbowHead, QRDQNHead, \ - QuantileHead, FQFHead, RegressionHead, ReparameterizationHead, MultiHead, head_cls_map, \ + QuantileHead, FQFHead, RegressionHead, ReparameterizationHead, MultiHead, BranchingHead, head_cls_map, \ independent_normal_dist from .encoder import ConvEncoder, FCEncoder, IMPALAConvEncoder from .utils import create_model diff --git a/ding/model/common/head.py b/ding/model/common/head.py index 9a83130d0b..94a6b2ba14 100644 --- a/ding/model/common/head.py +++ b/ding/model/common/head.py @@ -174,6 +174,108 @@ def forward(self, x: torch.Tensor) -> Dict: return {'logit': q, 'distribution': dist} +class BranchingHead(nn.Module): + + def __init__( + self, + hidden_size: int, + num_branches: int = 0, + action_per_branch: int = 2, + layer_num: int = 1, + a_layer_num: Optional[int] = None, + v_layer_num: Optional[int] = None, + norm_type: Optional[str] = None, + activation: Optional[nn.Module] = nn.ReLU(), + noise: Optional[bool] = False, + ) -> None: + """ + Overview: + Init the ``BranchingHead`` layers according to the provided arguments. + Arguments: + - hidden_size (:obj:`int`): The ``hidden_size`` of the MLP connected to ``BranchingHead``. + - num_branches (:obj:`int`): The number of branches, which is equivalent to the action dimension. + - action_per_branch (:obj:`int`): The number of actions in each dimension. + - layer_num (:obj:`int`): The number of layers used in the network to compute Advantage and Value output. + - a_layer_num (:obj:`int`): The number of layers used in the network to compute Advantage output. + - v_layer_num (:obj:`int`): The number of layers used in the network to compute Value output. + - output_size (:obj:`int`): The number of outputs. + - norm_type (:obj:`str`): The type of normalization to use. See ``ding.torch_utils.network.fc_block`` \ + for more details. Default ``None``. + - activation (:obj:`nn.Module`): The type of activation function to use in MLP. \ + If ``None``, then default set activation to ``nn.ReLU()``. Default ``None``. + - noise (:obj:`bool`): Whether use ``NoiseLinearLayer`` as ``layer_fn`` in Q networks' MLP. \ + Default ``False``. 
+ """ + super(BranchingHead, self).__init__() + if a_layer_num is None: + a_layer_num = layer_num + if v_layer_num is None: + v_layer_num = layer_num + self.num_branches = num_branches + self.action_per_branch = action_per_branch + + layer = NoiseLinearLayer if noise else nn.Linear + block = noise_block if noise else fc_block + # value network + + self.V = nn.Sequential( + MLP( + hidden_size, + hidden_size, + hidden_size, + v_layer_num, + layer_fn=layer, + activation=activation, + norm_type=norm_type + ), block(hidden_size, 1) + ) + # action branching network + action_output_dim = action_per_branch + self.branches = nn.ModuleList( + [ + nn.Sequential( + MLP( + hidden_size, + hidden_size, + hidden_size, + a_layer_num, + layer_fn=layer, + activation=activation, + norm_type=norm_type + ), block(hidden_size, action_output_dim) + ) for _ in range(self.num_branches) + ] + ) + + def forward(self, x: torch.Tensor) -> Dict: + """ + Overview: + Use encoded embedding tensor to run MLP with ``BranchingHead`` and return the prediction dictionary. + Arguments: + - x (:obj:`torch.Tensor`): Tensor containing input embedding. + Returns: + - outputs (:obj:`Dict`): Dict containing keyword ``logit`` (:obj:`torch.Tensor`). + Shapes: + - x: :math:`(B, N)`, where ``B = batch_size`` and ``N = hidden_size``. + - logit: :math:`(B, M)`, where ``M = output_size``. + + Examples: + >>> head = BranchingHead(64, 5, 2) + >>> inputs = torch.randn(4, 64) + >>> outputs = head(inputs) + >>> assert isinstance(outputs, dict) and outputs['logit'].shape == torch.Size([4, 10]) + """ + value_out = self.V(x) + value_out = torch.unsqueeze(value_out, 1) + action_out = [] + for b in self.branches: + action_out.append(b(x)) + action_scores = torch.stack(action_out, 1) + action_scores = action_scores - torch.mean(action_scores, 2, keepdim=True) + logits = value_out + action_scores + return {'logit': logits} + + class RainbowHead(nn.Module): """ Overview: diff --git a/ding/model/template/q_learning.py b/ding/model/template/q_learning.py index 54df388446..5177a87a7c 100644 --- a/ding/model/template/q_learning.py +++ b/ding/model/template/q_learning.py @@ -5,7 +5,7 @@ from ding.torch_utils import get_lstm from ding.utils import MODEL_REGISTRY, SequenceType, squeeze from ..common import FCEncoder, ConvEncoder, DiscreteHead, DuelingHead, MultiHead, RainbowHead, \ - QuantileHead, FQFHead, QRDQNHead, DistributionHead + QuantileHead, FQFHead, QRDQNHead, DistributionHead, BranchingHead from ding.torch_utils.network.gtrxl import GTrXL @@ -98,6 +98,92 @@ def forward(self, x: torch.Tensor) -> Dict: return x +@MODEL_REGISTRY.register('bdq') +class BDQ(nn.Module): + + def __init__( + self, + obs_shape: Union[int, SequenceType], + num_branches: int = 0, + action_per_branch: int = 2, + layer_num: int = 3, + a_layer_num: Optional[int] = None, + v_layer_num: Optional[int] = None, + encoder_hidden_size_list: SequenceType = [128, 128, 64], + head_hidden_size: Optional[int] = None, + norm_type: Optional[nn.Module] = None, + activation: Optional[nn.Module] = nn.ReLU(), + ) -> None: + """ + Overview: + Init the BDQ (encoder + head) Model according to input arguments. + Arguments: + - obs_shape (:obj:`Union[int, SequenceType]`): Observation space shape, such as 8 or [4, 84, 84]. + - num_branches (:obj:`int`): The number of branches, which is equivalent to the action dimension, such as 6. + - action_per_branch (:obj:`int`): The number of actions in each dimension. 
+ - layer_num (:obj:`int`): The number of layers used in the network to compute Advantage and Value output. + - a_layer_num (:obj:`int`): The number of layers used in the network to compute Advantage output. + - v_layer_num (:obj:`int`): The number of layers used in the network to compute Value output. + - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder``, \ + the last element must match ``head_hidden_size``. + - head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` of head network. + - norm_type (:obj:`Optional[str]`): The type of normalization in networks, see \ + ``ding.torch_utils.fc_block`` for more details. + - activation (:obj:`Optional[nn.Module]`): The type of activation function in networks \ + if ``None`` then default set it to ``nn.ReLU()`` + """ + super(BDQ, self).__init__() + # For compatibility: 1, (1, ), [4, 32, 32] + obs_shape = squeeze(obs_shape) + if head_hidden_size is None: + head_hidden_size = encoder_hidden_size_list[-1] + + # backbone + # FC Encoder + if isinstance(obs_shape, int) or len(obs_shape) == 1: + self.encoder = FCEncoder(obs_shape, encoder_hidden_size_list, activation=activation, norm_type=norm_type) + # Conv Encoder + elif len(obs_shape) == 3: + self.encoder = ConvEncoder(obs_shape, encoder_hidden_size_list, activation=activation, norm_type=norm_type) + else: + raise RuntimeError( + "not support obs_shape for pre-defined encoder: {}, please customize your own DQN".format(obs_shape) + ) + + self.num_branches = num_branches + self.action_per_branch = action_per_branch + + # head + self.head = BranchingHead( + head_hidden_size, num_branches=self.num_branches, action_per_branch=action_per_branch, + layer_num=layer_num, a_layer_num=a_layer_num, v_layer_num=v_layer_num, activation=activation, + norm_type=norm_type) + + def forward(self, x: torch.Tensor) -> Dict: + r""" + Overview: + BDQ forward computation graph, input observation tensor to predict q_value. + Arguments: + - x (:obj:`torch.Tensor`): Observation inputs + Returns: + - outputs (:obj:`Dict`): BDQ forward outputs, such as q_value. + ReturnsKeys: + - logit (:obj:`torch.Tensor`): Discrete Q-value output of each action dimension. + Shapes: + - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is ``obs_shape`` + - logit (:obj:`torch.FloatTensor`): :math:`(B, M)`, where B is batch size and M is + ``num_branches * action_per_branch`` + Examples: + >>> model = BDQ(8, 5, 2) # arguments: 'obs_shape', 'num_branches' and 'action_per_branch'. 
+ >>> inputs = torch.randn(4, 8) + >>> outputs = model(inputs) + >>> assert isinstance(outputs, dict) and outputs['logit'].shape == torch.Size([4, 10]) + """ + x = self.encoder(x) + x = self.head(x) + return x + + @MODEL_REGISTRY.register('c51dqn') class C51DQN(nn.Module): diff --git a/ding/policy/__init__.py b/ding/policy/__init__.py index e71bd4dd19..84f571a7a6 100644 --- a/ding/policy/__init__.py +++ b/ding/policy/__init__.py @@ -43,3 +43,5 @@ from .bc import BehaviourCloningPolicy from .ibc import IBCPolicy + +from .bdq import BDQPolicy diff --git a/ding/policy/bdq.py b/ding/policy/bdq.py new file mode 100644 index 0000000000..920f8b7c07 --- /dev/null +++ b/ding/policy/bdq.py @@ -0,0 +1,387 @@ +from typing import List, Dict, Any, Tuple +from collections import namedtuple +import copy +import torch + +from ding.torch_utils import Adam, to_device, ContrastiveLoss +from ding.rl_utils import q_nstep_td_data, bdq_nstep_td_error, get_nstep_return_data, get_train_sample +from ding.model import model_wrap +from ding.utils import POLICY_REGISTRY +from ding.utils.data import default_collate, default_decollate + +from .base_policy import Policy +from .common_utils import default_preprocess_learn + + +@POLICY_REGISTRY.register('bdq') +class BDQPolicy(Policy): + r""" + Overview: + Policy class of BDQ algorithm, extended by PER/multi-step TD. + + Config: + == ==================== ======== ============== ======================================== ======================= + ID Symbol Type Default Value Description Other(Shape) + == ==================== ======== ============== ======================================== ======================= + 1 ``type`` str bdq | RL policy register name, refer to | This arg is optional, + | registry ``POLICY_REGISTRY`` | a placeholder + 2 ``cuda`` bool False | Whether to use cuda for network | This arg can be diff- + | erent from modes + 3 ``on_policy`` bool False | Whether the RL algorithm is on-policy + | or off-policy + 4 ``priority`` bool False | Whether use priority(PER) | Priority sample, + | update priority + 5 | ``priority_IS`` bool False | Whether use Importance Sampling Weight + | ``_weight`` | to correct biased update. If True, + | priority must be True. + 6 | ``discount_`` float 0.97, | Reward's future discount factor, aka. | May be 1 when sparse + | ``factor`` [0.95, 0.999] | gamma | reward env + 7 ``nstep`` int 1, | N-step reward discount sum for target + [3, 5] | q_value estimation + 8 | ``learn.update`` int 3 | How many updates(iterations) to train | This args can be vary + | ``per_collect`` | after collector's one collection. Only | from envs. Bigger val + | valid in serial training | means more off-policy + 9 | ``learn.multi`` bool False | whether to use multi gpu during + | ``_gpu`` + 10 | ``learn.batch_`` int 64 | The number of samples of an iteration + | ``size`` + 11 | ``learn.learning`` float 0.001 | Gradient step length of an iteration. + | ``_rate`` + 12 | ``learn.target_`` int 100 | Frequence of target network update. | Hard(assign) update + | ``update_freq`` + 13 | ``learn.ignore_`` bool False | Whether ignore done for target value | Enable it for some + | ``done`` | calculation. | fake termination env + 14 ``collect.n_sample`` int [8, 128] | The number of training samples of a | It varies from + | call of collector. | different envs + 15 | ``collect.unroll`` int 1 | unroll length of an iteration | In RNN, unroll_len>1 + | ``_len`` + 16 | ``other.eps.type`` str exp | exploration rate decay type | Support ['exp', + | 'linear']. 
+ 17 | ``other.eps.`` float 0.95 | start value of exploration rate | [0,1] + | ``start`` + 18 | ``other.eps.`` float 0.1 | end value of exploration rate | [0,1] + | ``end`` + 19 | ``other.eps.`` int 10000 | decay length of exploration | greater than 0. set + | ``decay`` | decay=10000 means + | the exploration rate + | decay from start + | value to end value + | during decay length. + == ==================== ======== ============== ======================================== ======================= + """ + + config = dict( + type='bdq', + # (bool) Whether use cuda in policy + cuda=False, + # (bool) Whether learning policy is the same as collecting data policy(on-policy) + on_policy=False, + # (bool) Whether enable priority experience sample + priority=False, + # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. + priority_IS_weight=False, + # (float) Discount factor(gamma) for returns + discount_factor=0.97, + # (int) The number of step for calculating target q_value + nstep=1, + learn=dict( + # (bool) Whether to use multi gpu + multi_gpu=False, + # How many updates(iterations) to train after collector's one collection. + # Bigger "update_per_collect" means bigger off-policy. + # collect data -> update policy-> collect data -> ... + update_per_collect=3, + # (int) How many samples in a training batch + batch_size=64, + # (float) The step size of gradient descent + learning_rate=0.001, + # ============================================================== + # The following configs are algorithm-specific + # ============================================================== + # (int) Frequence of target network update. + target_update_freq=100, + # (bool) Whether ignore done(usually for max step termination env) + ignore_done=False, + ), + # collect_mode config + collect=dict( + # (int) Only one of [n_sample, n_episode] shoule be set + # n_sample=8, + # (int) Cut trajectories into pieces with length "unroll_len". + unroll_len=1, + ), + eval=dict(), + # other config + other=dict( + # Epsilon greedy with decay. + eps=dict( + # (str) Decay type. Support ['exp', 'linear']. + type='exp', + # (float) Epsilon start value + start=0.95, + # (float) Epsilon end value + end=0.1, + # (int) Decay length(env step) + decay=10000, + ), + replay_buffer=dict(replay_buffer_size=10000, ), + ), + ) + + def default_model(self) -> Tuple[str, List[str]]: + """ + Overview: + Return this algorithm default model setting for demonstration. + Returns: + - model_info (:obj:`Tuple[str, List[str]]`): model name and mode import_names + + .. note:: + The user can define and use customized network model but must obey the same inferface definition indicated \ + by import_names path. For BDQ, ``ding.model.template.q_learning.BDQ`` + """ + return 'bdq', ['ding.model.template.q_learning'] + + def _init_learn(self) -> None: + """ + Overview: + Learn mode init method. Called by ``self.__init__``, initialize the optimizer, algorithm arguments, main \ + and target model. 
+ """ + self._priority = self._cfg.priority + self._priority_IS_weight = self._cfg.priority_IS_weight + # Optimizer + self._optimizer = Adam(self._model.parameters(), lr=self._cfg.learn.learning_rate) + + self._gamma = self._cfg.discount_factor + self._nstep = self._cfg.nstep + + # use model_wrapper for specialized demands of different modes + self._target_model = copy.deepcopy(self._model) + self._target_model = model_wrap( + self._target_model, + wrapper_name='target', + update_type='assign', + update_kwargs={'freq': self._cfg.learn.target_update_freq} + ) + self._learn_model = model_wrap(self._model, wrapper_name='argmax_sample') + self._learn_model.reset() + self._target_model.reset() + + def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: + """ + Overview: + Forward computation graph of learn mode(updating policy). + Arguments: + - data (:obj:`Dict[str, Any]`): Dict type data, a batch of data for training, values are torch.Tensor or \ + np.ndarray or dict/list combinations. + Returns: + - info_dict (:obj:`Dict[str, Any]`): Dict type data, a info dict indicated training result, which will be \ + recorded in text log and tensorboard, values are python scalar or a list of scalars. + ArgumentsKeys: + - necessary: ``obs``, ``action``, ``reward``, ``next_obs``, ``done`` + - optional: ``value_gamma``, ``IS`` + ReturnsKeys: + - necessary: ``cur_lr``, ``total_loss``, ``priority`` + - optional: ``action_distribution`` + """ + data = default_preprocess_learn( + data, + use_priority=self._priority, + use_priority_IS_weight=self._cfg.priority_IS_weight, + ignore_done=self._cfg.learn.ignore_done, + use_nstep=True + ) + + if self._cuda: + data = to_device(data, self._device) + # ==================== + # Q-learning forward + # ==================== + self._learn_model.train() + self._target_model.train() + # Current q value (main model) + q_value = self._learn_model.forward(data['obs'])['logit'] + # Target q value + with torch.no_grad(): + target_q_value = self._target_model.forward(data['next_obs'])['logit'] + # Max q value action (main model) + target_q_action = self._learn_model.forward(data['next_obs'])['action'] + if data['action'].shape != target_q_action.shape: + data['action'] = data['action'].unsqueeze(-1) + + data_n = q_nstep_td_data( + q_value, target_q_value, data['action'], target_q_action, data['reward'], data['done'], data['weight'] + ) + value_gamma = data.get('value_gamma') + loss, td_error_per_sample = bdq_nstep_td_error(data_n, self._gamma, nstep=self._nstep, value_gamma=value_gamma) + + # ==================== + # Q-learning update + # ==================== + self._optimizer.zero_grad() + loss.backward() + if self._cfg.learn.multi_gpu: + self.sync_gradients(self._learn_model) + self._optimizer.step() + + # ============= + # after update + # ============= + self._target_model.update(self._learn_model.state_dict()) + return { + 'cur_lr': self._optimizer.defaults['lr'], + 'total_loss': loss.item(), + 'q_value': q_value.mean().item(), + 'target_q_value': target_q_value.mean().item(), + 'priority': td_error_per_sample.abs().tolist(), + # Only discrete action satisfying len(data['action'])==1 can return this and draw histogram on tensorboard. + # '[histogram]action_distribution': data['action'], + } + + def _monitor_vars_learn(self) -> List[str]: + return ['cur_lr', 'total_loss', 'q_value'] + + def _state_dict_learn(self) -> Dict[str, Any]: + """ + Overview: + Return the state_dict of learn mode, usually including model and optimizer. 
+ Returns: + - state_dict (:obj:`Dict[str, Any]`): the dict of current policy learn state, for saving and restoring. + """ + return { + 'model': self._learn_model.state_dict(), + 'target_model': self._target_model.state_dict(), + 'optimizer': self._optimizer.state_dict(), + } + + def _load_state_dict_learn(self, state_dict: Dict[str, Any]) -> None: + """ + Overview: + Load the state_dict variable into policy learn mode. + Arguments: + - state_dict (:obj:`Dict[str, Any]`): the dict of policy learn state saved before. + + .. tip:: + If you want to only load some parts of model, you can simply set the ``strict`` argument in \ + load_state_dict to ``False``, or refer to ``ding.torch_utils.checkpoint_helper`` for more \ + complicated operation. + """ + self._learn_model.load_state_dict(state_dict['model']) + self._target_model.load_state_dict(state_dict['target_model']) + self._optimizer.load_state_dict(state_dict['optimizer']) + + def _init_collect(self) -> None: + """ + Overview: + Collect mode init method. Called by ``self.__init__``, initialize algorithm arguments and collect_model, \ + enable the eps_greedy_sample for exploration. + """ + self._unroll_len = self._cfg.collect.unroll_len + self._gamma = self._cfg.discount_factor # necessary for parallel + self._nstep = self._cfg.nstep # necessary for parallel + self._collect_model = model_wrap(self._model, wrapper_name='eps_greedy_sample') + self._collect_model.reset() + + def _forward_collect(self, data: Dict[int, Any], eps: float) -> Dict[int, Any]: + """ + Overview: + Forward computation graph of collect mode(collect training data), with eps_greedy for exploration. + Arguments: + - data (:obj:`Dict[str, Any]`): Dict type data, stacked env data for predicting policy_output(action), \ + values are torch.Tensor or np.ndarray or dict/list combinations, keys are env_id indicated by integer. + - eps (:obj:`float`): epsilon value for exploration, which is decayed by collected env step. + Returns: + - output (:obj:`Dict[int, Any]`): The dict of predicting policy_output(action) for the interaction with \ + env and the constructing of transition. + ArgumentsKeys: + - necessary: ``obs`` + ReturnsKeys + - necessary: ``logit``, ``action`` + """ + data_id = list(data.keys()) + data = default_collate(list(data.values())) + if self._cuda: + data = to_device(data, self._device) + self._collect_model.eval() + with torch.no_grad(): + output = self._collect_model.forward(data, eps=eps) + if self._cuda: + output = to_device(output, 'cpu') + output = default_decollate(output) + return {i: d for i, d in zip(data_id, output)} + + def _get_train_sample(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + Overview: + For a given trajectory(transitions, a list of transition) data, process it into a list of sample that \ + can be used for training directly. A train sample can be a processed transition(BDQ with nstep TD). + Arguments: + - data (:obj:`List[Dict[str, Any]`): The trajectory data(a list of transition), each element is the same \ + format as the return value of ``self._process_transition`` method. + Returns: + - samples (:obj:`dict`): The list of training samples. + + .. note:: + We will vectorize ``process_transition`` and ``get_train_sample`` method in the following release version. \ + And the user can customize the this data processing procecure by overriding this two methods and collector \ + itself. 
+ """ + data = get_nstep_return_data(data, self._nstep, gamma=self._gamma) + return get_train_sample(data, self._unroll_len) + + def _process_transition(self, obs: Any, policy_output: Dict[str, Any], timestep: namedtuple) -> Dict[str, Any]: + """ + Overview: + Generate a transition(e.g.: ) for this algorithm training. + Arguments: + - obs (:obj:`Any`): Env observation. + - policy_output (:obj:`Dict[str, Any]`): The output of policy collect mode(``self._forward_collect``),\ + including at least ``action``. + - timestep (:obj:`namedtuple`): The output after env step(execute policy output action), including at \ + least ``obs``, ``reward``, ``done``, (here obs indicates obs after env step). + Returns: + - transition (:obj:`dict`): Dict type transition data. + """ + transition = { + 'obs': obs, + 'next_obs': timestep.obs, + 'action': policy_output['action'], + 'reward': timestep.reward, + 'done': timestep.done, + } + return transition + + def _init_eval(self) -> None: + r""" + Overview: + Evaluate mode init method. Called by ``self.__init__``, initialize eval_model. + """ + self._eval_model = model_wrap(self._model, wrapper_name='argmax_sample') + self._eval_model.reset() + + def _forward_eval(self, data: Dict[int, Any]) -> Dict[int, Any]: + """ + Overview: + Forward computation graph of eval mode(evaluate policy performance), at most cases, it is similar to \ + ``self._forward_collect``. + Arguments: + - data (:obj:`Dict[str, Any]`): Dict type data, stacked env data for predicting policy_output(action), \ + values are torch.Tensor or np.ndarray or dict/list combinations, keys are env_id indicated by integer. + Returns: + - output (:obj:`Dict[int, Any]`): The dict of predicting action for the interaction with env. + ArgumentsKeys: + - necessary: ``obs`` + ReturnsKeys + - necessary: ``action`` + """ + data_id = list(data.keys()) + data = default_collate(list(data.values())) + if self._cuda: + data = to_device(data, self._device) + self._eval_model.eval() + with torch.no_grad(): + output = self._eval_model.forward(data) + if self._cuda: + output = to_device(output, 'cpu') + output = default_decollate(output) + return {i: d for i, d in zip(data_id, output)} diff --git a/ding/policy/command_mode_policy_instance.py b/ding/policy/command_mode_policy_instance.py index 13d7032744..f370b8f197 100644 --- a/ding/policy/command_mode_policy_instance.py +++ b/ding/policy/command_mode_policy_instance.py @@ -45,6 +45,7 @@ from .pdqn import PDQNPolicy from .sac import SQILSACPolicy from .madqn import MADQNPolicy +from .bdq import BDQPolicy class EpsCommandModePolicy(CommandModePolicy): @@ -95,6 +96,11 @@ def _get_setting_eval(self, command_info: dict) -> dict: return {} +@POLICY_REGISTRY.register('bdq_command') +class BDQCommandModePolicy(BDQPolicy, EpsCommandModePolicy): + pass + + @POLICY_REGISTRY.register('dqn_command') class DQNCommandModePolicy(DQNPolicy, EpsCommandModePolicy): pass diff --git a/ding/rl_utils/__init__.py b/ding/rl_utils/__init__.py index ee39b8a318..2ced88ccef 100644 --- a/ding/rl_utils/__init__.py +++ b/ding/rl_utils/__init__.py @@ -11,7 +11,7 @@ nstep_return_data, nstep_return, iqn_nstep_td_data, iqn_nstep_td_error, qrdqn_nstep_td_data, qrdqn_nstep_td_error,\ fqf_nstep_td_data, fqf_nstep_td_error, fqf_calculate_fraction_loss, evaluate_quantile_at_action, \ q_nstep_sql_td_error, dqfd_nstep_td_error, dqfd_nstep_td_data, q_v_1step_td_error, q_v_1step_td_data,\ - dqfd_nstep_td_error_with_rescale, discount_cumsum + dqfd_nstep_td_error_with_rescale, discount_cumsum, bdq_nstep_td_error from 
.vtrace import vtrace_loss, compute_importance_weights from .upgo import upgo_loss from .adder import get_gae, get_gae_with_default_last_value, get_nstep_return_data, get_train_sample diff --git a/ding/rl_utils/td.py b/ding/rl_utils/td.py index 3b01115dbe..f8dd6521c6 100644 --- a/ding/rl_utils/td.py +++ b/ding/rl_utils/td.py @@ -453,6 +453,62 @@ def q_nstep_td_error( return (td_error_per_sample * weight).mean(), td_error_per_sample +def bdq_nstep_td_error( + data: namedtuple, + gamma: Union[float, list], + nstep: int = 1, + cum_reward: bool = False, + value_gamma: Optional[torch.Tensor] = None, + criterion: torch.nn.modules = nn.MSELoss(reduction='none'), +) -> torch.Tensor: + """ + Overview: + Multistep (1 step or n step) td_error for BDQ algorithm + Arguments: + - data (:obj:`q_nstep_td_data`): the input data, q_nstep_td_data to calculate loss + - gamma (:obj:`float`): discount factor + - cum_reward (:obj:`bool`): whether to use cumulative nstep reward, which is figured out when collecting data + - value_gamma (:obj:`torch.Tensor`): gamma discount value for target q_value + - criterion (:obj:`torch.nn.modules`): loss function criterion + - nstep (:obj:`int`): nstep num, default set to 1 + Returns: + - loss (:obj:`torch.Tensor`): nstep td error, 0-dim tensor + - td_error_per_sample (:obj:`torch.Tensor`): nstep td error, 1-dim tensor + Shapes: + - data (:obj:`q_nstep_td_data`): the q_nstep_td_data containing\ + ['q', 'next_n_q', 'action', 'reward', 'done'] + - q (:obj:`torch.FloatTensor`): :math:`(B, N)` i.e. [batch_size, action_dim] + - next_n_q (:obj:`torch.FloatTensor`): :math:`(B, N)` + - action (:obj:`torch.LongTensor`): :math:`(B, )` + - next_n_action (:obj:`torch.LongTensor`): :math:`(B, )` + - reward (:obj:`torch.FloatTensor`): :math:`(T, B)`, where T is timestep(nstep) + - done (:obj:`torch.BoolTensor`) :math:`(B, )`, whether done in last timestep + - td_error_per_sample (:obj:`torch.FloatTensor`): :math:`(B, )` + """ + q, next_n_q, action, next_n_action, reward, done, weight = data + if weight is None: + weight = torch.ones_like(reward) + reward = reward.unsqueeze(-1) + weight = weight.unsqueeze(-1) + done = done.unsqueeze(-1) + if value_gamma is not None: + value_gamma = value_gamma.unsqueeze(-1) + + q_s_a = q.gather(-1, action.unsqueeze(-1)).squeeze(-1) + target_q_s_a = next_n_q.gather(-1, next_n_action.unsqueeze(-1)).squeeze(-1) + + if cum_reward: + if value_gamma is None: + target_q_s_a = reward + (gamma ** nstep) * target_q_s_a * (1 - done) + else: + target_q_s_a = reward + value_gamma * target_q_s_a * (1 - done) + else: + target_q_s_a = nstep_return(nstep_return_data(reward, target_q_s_a, done), gamma, nstep, value_gamma) + td_error_per_sample = criterion(q_s_a, target_q_s_a.detach()) + td_error_per_sample = td_error_per_sample.mean(-1) + return (td_error_per_sample * weight).mean(), td_error_per_sample + + def shape_fn_qntd_rescale(args, kwargs): r""" Overview: diff --git a/dizoo/mujoco/config/halfcheetah_bdq_config.py b/dizoo/mujoco/config/halfcheetah_bdq_config.py new file mode 100644 index 0000000000..3a4cbfd388 --- /dev/null +++ b/dizoo/mujoco/config/halfcheetah_bdq_config.py @@ -0,0 +1,71 @@ +from easydict import EasyDict + +halfcheetah_bdq_config = dict( + exp_name='halfcheetah_bdq_seed0', + env=dict( + env_id='HalfCheetah-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=12000, + action_per_branch=2, + ), + policy=dict( + cuda=False, + priority=False, + 
discount_factor=0.99, + nstep=1, + model=dict( + obs_shape=17, + num_branches=6, + action_per_branch=2, # mean the action shape is 11, 11 discrete actions + encoder_hidden_size_list=[256, 256, 128], + ), + + learn=dict( + batch_size=512, + learning_rate=3e-4, + ignore_done=True, + target_update_freq=500, + target_update_theta=0.001, + update_per_collect=20, + ), + collect=dict( + n_sample=256, + unroll_len=1, + ), + eval=dict(evaluator=dict(eval_freq=1000, )), + other=dict( + # Epsilon greedy with decay. + eps=dict( + # Decay type. Support ['exp', 'linear']. + type='exp', + start=1, + end=0.05, + decay=int(1e5), + ), + replay_buffer=dict(replay_buffer_size=int(1e6), ) + ), + ), +) +halfcheetah_bdq_config = EasyDict(halfcheetah_bdq_config) +main_config = halfcheetah_bdq_config + +halfcheetah_bdq_create_config = dict( + env=dict( + type='mujoco', + import_names=['dizoo.mujoco.envs.mujoco_env'], + ), + env_manager=dict(type='subprocess'), + # env_manager=dict(type='subprocess'), + policy=dict(type='bdq', ), +) +halfcheetah_bdq_create_config = EasyDict(halfcheetah_bdq_create_config) +create_config = halfcheetah_bdq_create_config + +if __name__ == "__main__": + # or you can enter `ding -m serial_onpolicy -c halfcheetah_onbdq_config.py -s 0` + from ding.entry import serial_pipeline + serial_pipeline((main_config, create_config), seed=0) \ No newline at end of file diff --git a/dizoo/mujoco/config/hopper_bdq_config.py b/dizoo/mujoco/config/hopper_bdq_config.py new file mode 100644 index 0000000000..c5b9b50155 --- /dev/null +++ b/dizoo/mujoco/config/hopper_bdq_config.py @@ -0,0 +1,73 @@ +from easydict import EasyDict + +hopper_bdq_config = dict( + exp_name='hopper_bdq_seed0', + env=dict( + env_id='Hopper-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=int(1e6), + action_per_branch=4, + ), + policy=dict( + cuda=False, + priority=False, + discount_factor=0.99, + model=dict( + obs_shape=11, + num_branches=3, + action_per_branch=4, # mean the action shape is 11, 11 discrete actions + encoder_hidden_size_list=[256, 256, 128], + ), + nstep=1, + learn=dict( + ignore_done=False, + batch_size=512, + learning_rate=3e-4, + # Frequency of target network update. + target_update_freq=500, + target_update_theta=0.001, + update_per_collect=20, + ), + collect=dict( + # You can use either "n_sample" or "n_episode" in collector.collect. + # Get "n_sample" samples per collect. + n_sample=256, + # Cut trajectories into pieces with length "unroll_len". + unroll_len=1, + ), + eval=dict(evaluator=dict(eval_freq=1000, )), + other=dict( + # Epsilon greedy with decay. + eps=dict( + # Decay type. Support ['exp', 'linear']. 
+ type='exp', + start=1, + end=0.05, + decay=int(1e5), + ), + replay_buffer=dict(replay_buffer_size=int(1e6), ) + ), + ), +) +hopper_bdq_config = EasyDict(hopper_bdq_config) +main_config = hopper_bdq_config + +hopper_bdq_create_config = dict( + env=dict( + type='mujoco', + import_names=['dizoo.mujoco.envs.mujoco_env'], + ), + env_manager=dict(type='subprocess'), + policy=dict(type='bdq', ), +) +hopper_bdq_create_config = EasyDict(hopper_bdq_create_config) +create_config = hopper_bdq_create_config + +if __name__ == "__main__": + # or you can enter `ding -m serial_onpolicy -c hopper_bdq_config.py -s 0` + from ding.entry import serial_pipeline + serial_pipeline([main_config, create_config], seed=0) diff --git a/dizoo/mujoco/envs/mujoco_env.py b/dizoo/mujoco/envs/mujoco_env.py index 1c6373f51d..3db13f1468 100644 --- a/dizoo/mujoco/envs/mujoco_env.py +++ b/dizoo/mujoco/envs/mujoco_env.py @@ -39,6 +39,10 @@ def __init__(self, cfg: dict) -> None: self._replay_path = None self._replay_path_gif = cfg.replay_path_gif self._save_replay_gif = cfg.save_replay_gif + self._action_per_branch = cfg.action_per_branch if 'action_per_branch' in cfg else None + + def map_action(self, action: Union[np.ndarray, list]) -> Union[np.ndarray, list]: + return [2 * x / (self._action_per_branch - 1) - 1 for x in action] def reset(self) -> np.ndarray: if not self._init_flag: @@ -65,7 +69,7 @@ def reset(self) -> np.ndarray: self._env.seed(self._seed) obs = self._env.reset() obs = to_ndarray(obs).astype('float32') - self._eval_episode_return = 0. + self._final_eval_reward = 0. return obs @@ -80,13 +84,15 @@ def seed(self, seed: int, dynamic_seed: bool = True) -> None: np.random.seed(self._seed) def step(self, action: Union[np.ndarray, list]) -> BaseEnvTimestep: + if self._action_per_branch: + action = self.map_action(action) action = to_ndarray(action) if self._save_replay_gif: self._frames.append(self._env.render(mode='rgb_array')) if self._action_clip: action = np.clip(action, -1, 1) obs, rew, done, info = self._env.step(action) - self._eval_episode_return += rew + self._final_eval_reward += rew if done: if self._save_replay_gif: path = os.path.join( @@ -94,7 +100,7 @@ def step(self, action: Union[np.ndarray, list]) -> BaseEnvTimestep: ) save_frames_as_gif(self._frames, path) self._save_replay_count += 1 - info['eval_episode_return'] = self._eval_episode_return + info['final_eval_reward'] = self._final_eval_reward obs = to_ndarray(obs).astype(np.float32) rew = to_ndarray([rew]).astype(np.float32) From ebe403ec786eb899fab7f0584da7e045b9c53fe9 Mon Sep 17 00:00:00 2001 From: Cloud-Pku Date: Thu, 15 Dec 2022 17:22:51 +0800 Subject: [PATCH 02/10] after run reformat --- ding/model/template/q_learning.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/ding/model/template/q_learning.py b/ding/model/template/q_learning.py index 5177a87a7c..bc8a0bf152 100644 --- a/ding/model/template/q_learning.py +++ b/ding/model/template/q_learning.py @@ -155,9 +155,15 @@ def __init__( # head self.head = BranchingHead( - head_hidden_size, num_branches=self.num_branches, action_per_branch=action_per_branch, - layer_num=layer_num, a_layer_num=a_layer_num, v_layer_num=v_layer_num, activation=activation, - norm_type=norm_type) + head_hidden_size, + num_branches=self.num_branches, + action_per_branch=action_per_branch, + layer_num=layer_num, + a_layer_num=a_layer_num, + v_layer_num=v_layer_num, + activation=activation, + norm_type=norm_type + ) def forward(self, x: torch.Tensor) -> Dict: r""" From 
a2f30c2010058cfa867df3ef52f41faa5e5a6280 Mon Sep 17 00:00:00 2001 From: Cloud-Pku Date: Thu, 15 Dec 2022 17:25:58 +0800 Subject: [PATCH 03/10] update mujoco_env --- dizoo/mujoco/envs/mujoco_env.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dizoo/mujoco/envs/mujoco_env.py b/dizoo/mujoco/envs/mujoco_env.py index 3db13f1468..2bb2e7d2ff 100644 --- a/dizoo/mujoco/envs/mujoco_env.py +++ b/dizoo/mujoco/envs/mujoco_env.py @@ -69,7 +69,7 @@ def reset(self) -> np.ndarray: self._env.seed(self._seed) obs = self._env.reset() obs = to_ndarray(obs).astype('float32') - self._final_eval_reward = 0. + self._eval_episode_return = 0. return obs @@ -92,7 +92,7 @@ def step(self, action: Union[np.ndarray, list]) -> BaseEnvTimestep: if self._action_clip: action = np.clip(action, -1, 1) obs, rew, done, info = self._env.step(action) - self._final_eval_reward += rew + self._eval_episode_return += rew if done: if self._save_replay_gif: path = os.path.join( @@ -100,7 +100,7 @@ def step(self, action: Union[np.ndarray, list]) -> BaseEnvTimestep: ) save_frames_as_gif(self._frames, path) self._save_replay_count += 1 - info['final_eval_reward'] = self._final_eval_reward + info['eval_episode_return'] = self._eval_episode_return obs = to_ndarray(obs).astype(np.float32) rew = to_ndarray([rew]).astype(np.float32) From da07a6a725c27076776478264b737ebb58449b82 Mon Sep 17 00:00:00 2001 From: Cloud-Pku Date: Tue, 27 Dec 2022 18:22:11 +0800 Subject: [PATCH 04/10] add unittest; extend n-step TD; polished; --- ding/model/common/head.py | 20 +++++++++---- ding/model/template/__init__.py | 2 +- ding/model/template/q_learning.py | 25 +++++++++------- ding/model/template/tests/test_q_learning.py | 21 ++++++++++++- ding/policy/bdq.py | 16 +++++++--- ding/rl_utils/td.py | 18 +++++++---- ding/rl_utils/tests/test_td.py | 30 ++++++++++++++++++- dizoo/mujoco/config/halfcheetah_bdq_config.py | 9 ++---- dizoo/mujoco/config/hopper_bdq_config.py | 10 +++---- dizoo/mujoco/envs/mujoco_env.py | 22 ++++++++++++-- 10 files changed, 129 insertions(+), 44 deletions(-) diff --git a/ding/model/common/head.py b/ding/model/common/head.py index 94a6b2ba14..e60ed0fd50 100644 --- a/ding/model/common/head.py +++ b/ding/model/common/head.py @@ -180,7 +180,7 @@ def __init__( self, hidden_size: int, num_branches: int = 0, - action_per_branch: int = 2, + action_bins_per_branch: int = 2, layer_num: int = 1, a_layer_num: Optional[int] = None, v_layer_num: Optional[int] = None, @@ -190,11 +190,15 @@ def __init__( ) -> None: """ Overview: - Init the ``BranchingHead`` layers according to the provided arguments. + Init the ``BranchingHead`` layers according to the provided arguments. \ + This head achieves a linear increase of the number of network outputs \ + with the number of degrees of freedom by allowing a level of independence \ + for each individual action dimension. + Therefore, this head is suitable for high dimensional action Spaces. Arguments: - hidden_size (:obj:`int`): The ``hidden_size`` of the MLP connected to ``BranchingHead``. - num_branches (:obj:`int`): The number of branches, which is equivalent to the action dimension. - - action_per_branch (:obj:`int`): The number of actions in each dimension. + - action_bins_per_branch (:obj:int): The number of action bins in each dimension. - layer_num (:obj:`int`): The number of layers used in the network to compute Advantage and Value output. - a_layer_num (:obj:`int`): The number of layers used in the network to compute Advantage output. 
- v_layer_num (:obj:`int`): The number of layers used in the network to compute Value output. @@ -212,7 +216,7 @@ def __init__( if v_layer_num is None: v_layer_num = layer_num self.num_branches = num_branches - self.action_per_branch = action_per_branch + self.action_bins_per_branch = action_bins_per_branch layer = NoiseLinearLayer if noise else nn.Linear block = noise_block if noise else fc_block @@ -230,7 +234,7 @@ def __init__( ), block(hidden_size, 1) ) # action branching network - action_output_dim = action_per_branch + action_output_dim = action_bins_per_branch self.branches = nn.ModuleList( [ nn.Sequential( @@ -263,7 +267,7 @@ def forward(self, x: torch.Tensor) -> Dict: >>> head = BranchingHead(64, 5, 2) >>> inputs = torch.randn(4, 64) >>> outputs = head(inputs) - >>> assert isinstance(outputs, dict) and outputs['logit'].shape == torch.Size([4, 10]) + >>> assert isinstance(outputs, dict) and outputs['logit'].shape == torch.Size([4, 5, 2]) """ value_out = self.V(x) value_out = torch.unsqueeze(value_out, 1) @@ -271,6 +275,10 @@ def forward(self, x: torch.Tensor) -> Dict: for b in self.branches: action_out.append(b(x)) action_scores = torch.stack(action_out, 1) + ''' + From the paper, this implementation performs better than both the naive alternative (Q = V + A) \ + and the local maximum reduction method (Q = V + max(A)). + ''' action_scores = action_scores - torch.mean(action_scores, 2, keepdim=True) logits = value_out + action_scores return {'logit': logits} diff --git a/ding/model/template/__init__.py b/ding/model/template/__init__.py index 88ee115373..d4907a510d 100644 --- a/ding/model/template/__init__.py +++ b/ding/model/template/__init__.py @@ -1,5 +1,5 @@ # general -from .q_learning import DQN, RainbowDQN, QRDQN, IQN, FQF, DRQN, C51DQN +from .q_learning import DQN, RainbowDQN, QRDQN, IQN, FQF, DRQN, C51DQN, BDQ from .qac import QAC, DiscreteQAC from .pdqn import PDQN from .vac import VAC diff --git a/ding/model/template/q_learning.py b/ding/model/template/q_learning.py index bc8a0bf152..587ec4899a 100644 --- a/ding/model/template/q_learning.py +++ b/ding/model/template/q_learning.py @@ -105,7 +105,7 @@ def __init__( self, obs_shape: Union[int, SequenceType], num_branches: int = 0, - action_per_branch: int = 2, + action_bins_per_branch: int = 2, layer_num: int = 3, a_layer_num: Optional[int] = None, v_layer_num: Optional[int] = None, @@ -116,11 +116,14 @@ def __init__( ) -> None: """ Overview: - Init the BDQ (encoder + head) Model according to input arguments. + Init the BDQ (encoder + head) Model according to input arguments. \ + referenced paper Action Branching Architectures for Deep Reinforcement Learning \ + Arguments: - obs_shape (:obj:`Union[int, SequenceType]`): Observation space shape, such as 8 or [4, 84, 84]. - - num_branches (:obj:`int`): The number of branches, which is equivalent to the action dimension, such as 6. - - action_per_branch (:obj:`int`): The number of actions in each dimension. + - num_branches (:obj:`int`): The number of branches, which is equivalent to the action dimension, \ + such as 6 in mujoco's halfcheetah environment. + - action_bins_per_branch (:obj:`int`): The number of actions in each dimension. - layer_num (:obj:`int`): The number of layers used in the network to compute Advantage and Value output. - a_layer_num (:obj:`int`): The number of layers used in the network to compute Advantage output. - v_layer_num (:obj:`int`): The number of layers used in the network to compute Value output. 
@@ -134,7 +137,7 @@ def __init__( """ super(BDQ, self).__init__() # For compatibility: 1, (1, ), [4, 32, 32] - obs_shape = squeeze(obs_shape) + obs_shape, num_branches = squeeze(obs_shape), squeeze(num_branches) if head_hidden_size is None: head_hidden_size = encoder_hidden_size_list[-1] @@ -151,13 +154,13 @@ def __init__( ) self.num_branches = num_branches - self.action_per_branch = action_per_branch + self.action_bins_per_branch = action_bins_per_branch # head self.head = BranchingHead( head_hidden_size, num_branches=self.num_branches, - action_per_branch=action_per_branch, + action_bins_per_branch=self.action_bins_per_branch, layer_num=layer_num, a_layer_num=a_layer_num, v_layer_num=v_layer_num, @@ -178,14 +181,14 @@ def forward(self, x: torch.Tensor) -> Dict: Shapes: - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is ``obs_shape`` - logit (:obj:`torch.FloatTensor`): :math:`(B, M)`, where B is batch size and M is - ``num_branches * action_per_branch`` + ``num_branches * action_bins_per_branch`` Examples: - >>> model = BDQ(8, 5, 2) # arguments: 'obs_shape', 'num_branches' and 'action_per_branch'. + >>> model = BDQ(8, 5, 2) # arguments: 'obs_shape', 'num_branches' and 'action_bins_per_branch'. >>> inputs = torch.randn(4, 8) >>> outputs = model(inputs) - >>> assert isinstance(outputs, dict) and outputs['logit'].shape == torch.Size([4, 10]) + >>> assert isinstance(outputs, dict) and outputs['logit'].shape == torch.Size([4, 5, 2]) """ - x = self.encoder(x) + x = self.encoder(x) / (self.num_branches + 1) # corresponds to the "Gradient Rescaling" in the paper x = self.head(x) return x diff --git a/ding/model/template/tests/test_q_learning.py b/ding/model/template/tests/test_q_learning.py index b444becdf6..303481cb1c 100644 --- a/ding/model/template/tests/test_q_learning.py +++ b/ding/model/template/tests/test_q_learning.py @@ -1,7 +1,7 @@ import pytest from itertools import product import torch -from ding.model.template import DQN, RainbowDQN, QRDQN, IQN, FQF, DRQN, C51DQN +from ding.model.template import DQN, RainbowDQN, QRDQN, IQN, FQF, DRQN, C51DQN, BDQ from ding.torch_utils import is_differentiable T, B = 3, 4 @@ -40,6 +40,25 @@ def test_dqn(self, obs_shape, act_shape): assert outputs['logit'][i].shape == (B, s) self.output_check(model, outputs['logit']) + @pytest.mark.parametrize('obs_shape, act_shape', args) + def test_bdq(self, obs_shape, act_shape): + if isinstance(obs_shape, int): + inputs = torch.randn(B, obs_shape) + else: + inputs = torch.randn(B, *obs_shape) + if not isinstance(act_shape, int) and len(act_shape) > 1: + return + num_branches = act_shape + for action_bins_per_branch in range(1, 10): + model = BDQ(obs_shape, num_branches, action_bins_per_branch) + outputs = model(inputs) + assert isinstance(outputs, dict) + if isinstance(act_shape, int): + assert outputs['logit'].shape == (B, act_shape, action_bins_per_branch) + else: + assert outputs['logit'].shape == (B, *act_shape, action_bins_per_branch) + self.output_check(model, outputs['logit']) + @pytest.mark.parametrize('obs_shape, act_shape', args) def test_rainbowdqn(self, obs_shape, act_shape): if isinstance(obs_shape, int): diff --git a/ding/policy/bdq.py b/ding/policy/bdq.py index 920f8b7c07..e1366d38ce 100644 --- a/ding/policy/bdq.py +++ b/ding/policy/bdq.py @@ -17,8 +17,12 @@ class BDQPolicy(Policy): r""" Overview: - Policy class of BDQ algorithm, extended by PER/multi-step TD. - + Policy class of BDQ algorithm, extended by PER/multi-step TD. 
\ + referenced paper Action Branching Architectures for Deep Reinforcement Learning \ + + .. note:: + BDQ algorithm contains a neural architecture featuring a shared decision module \ + followed by several network branches, one for each action dimension. Config: == ==================== ======== ============== ======================================== ======================= ID Symbol Type Default Value Description Other(Shape) @@ -228,7 +232,7 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: # after update # ============= self._target_model.update(self._learn_model.state_dict()) - return { + update_info = { 'cur_lr': self._optimizer.defaults['lr'], 'total_loss': loss.item(), 'q_value': q_value.mean().item(), @@ -237,9 +241,13 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: # Only discrete action satisfying len(data['action'])==1 can return this and draw histogram on tensorboard. # '[histogram]action_distribution': data['action'], } + q_value_per_branch = torch.mean(q_value, 2, keepdim=False) + for i in range(self._model.num_branches): + update_info['q_value_b_' + str(i)] = q_value_per_branch[:, 0].mean().item() + return update_info def _monitor_vars_learn(self) -> List[str]: - return ['cur_lr', 'total_loss', 'q_value'] + return ['cur_lr', 'total_loss', 'q_value'] + ['q_value_b_' + str(i) for i in range(self._model.num_branches)] def _state_dict_learn(self) -> Dict[str, Any]: """ diff --git a/ding/rl_utils/td.py b/ding/rl_utils/td.py index f8dd6521c6..11c9263d1a 100644 --- a/ding/rl_utils/td.py +++ b/ding/rl_utils/td.py @@ -463,7 +463,15 @@ def bdq_nstep_td_error( ) -> torch.Tensor: """ Overview: - Multistep (1 step or n step) td_error for BDQ algorithm + Multistep (1 step or n step) td_error for BDQ algorithm, \ + referenced paper Action Branching Architectures for Deep Reinforcement Learning \ + + In fact, the original paper only provides the 1-step TD-error calculation method, \ + and here we extend the calculation method of n-step. + TD-error: + y_d = \sigma_{t=0}^{nstep} \gamma^t * r_t + \gamma^{nstep} * Q_d'(s', argmax Q_d(s', a_d)) + TD-error = \frac{1}{D} * (y_d - Q_d(s, a_d))^2 + Loss = mean(TD-error) Arguments: - data (:obj:`q_nstep_td_data`): the input data, q_nstep_td_data to calculate loss - gamma (:obj:`float`): discount factor @@ -477,10 +485,10 @@ def bdq_nstep_td_error( Shapes: - data (:obj:`q_nstep_td_data`): the q_nstep_td_data containing\ ['q', 'next_n_q', 'action', 'reward', 'done'] - - q (:obj:`torch.FloatTensor`): :math:`(B, N)` i.e. [batch_size, action_dim] - - next_n_q (:obj:`torch.FloatTensor`): :math:`(B, N)` - - action (:obj:`torch.LongTensor`): :math:`(B, )` - - next_n_action (:obj:`torch.LongTensor`): :math:`(B, )` + - q (:obj:`torch.FloatTensor`): :math:`(B, D, N)` i.e. 
[batch_size, branch_num, action_bins_per_branch] + - next_n_q (:obj:`torch.FloatTensor`): :math:`(B, D, N)` + - action (:obj:`torch.LongTensor`): :math:`(B, D)` + - next_n_action (:obj:`torch.LongTensor`): :math:`(B, D)` - reward (:obj:`torch.FloatTensor`): :math:`(T, B)`, where T is timestep(nstep) - done (:obj:`torch.BoolTensor`) :math:`(B, )`, whether done in last timestep - td_error_per_sample (:obj:`torch.FloatTensor`): :math:`(B, )` diff --git a/ding/rl_utils/tests/test_td.py b/ding/rl_utils/tests/test_td.py index 3b792ddc35..e96ca37fab 100644 --- a/ding/rl_utils/tests/test_td.py +++ b/ding/rl_utils/tests/test_td.py @@ -4,7 +4,7 @@ td_lambda_error, q_nstep_td_error_with_rescale, dist_1step_td_data, dist_1step_td_error, dist_nstep_td_data,\ dqfd_nstep_td_data, dqfd_nstep_td_error, dist_nstep_td_error, v_1step_td_data, v_1step_td_error, v_nstep_td_data,\ v_nstep_td_error, q_nstep_sql_td_error, iqn_nstep_td_data, iqn_nstep_td_error,\ - fqf_nstep_td_data, fqf_nstep_td_error, qrdqn_nstep_td_data, qrdqn_nstep_td_error + fqf_nstep_td_data, fqf_nstep_td_error, qrdqn_nstep_td_data, qrdqn_nstep_td_error, bdq_nstep_td_error from ding.rl_utils.td import shape_fn_dntd, shape_fn_qntd, shape_fn_td_lambda, shape_fn_qntd_rescale @@ -35,6 +35,34 @@ def test_q_nstep_td(): assert isinstance(q.grad, torch.Tensor) +@pytest.mark.unittest +def test_bdq_nstep_td(): + batch_size = 8 + branch_num = 6 + action_per_branch = 3 + next_q = torch.randn(batch_size, branch_num, action_per_branch) + done = torch.randn(batch_size) + action = torch.randint(0, action_per_branch, size=(batch_size, branch_num)) + next_action = torch.randint(0, action_per_branch, size=(batch_size, branch_num)) + for nstep in range(1, 10): + q = torch.randn(batch_size, branch_num, action_per_branch).requires_grad_(True) + reward = torch.rand(nstep, batch_size) + data = q_nstep_td_data(q, next_q, action, next_action, reward, done, None) + loss, td_error_per_sample = bdq_nstep_td_error(data, 0.95, nstep=nstep) + assert td_error_per_sample.shape == (batch_size, ) + assert loss.shape == () + assert q.grad is None + loss.backward() + assert isinstance(q.grad, torch.Tensor) + data = q_nstep_td_data(q, next_q, action, next_action, reward, done, None) + loss, td_error_per_sample = q_nstep_td_error(data, 0.95, nstep=nstep, cum_reward=True) + value_gamma = torch.tensor(0.9) + data = q_nstep_td_data(q, next_q, action, next_action, reward, done, None) + loss, td_error_per_sample = q_nstep_td_error(data, 0.95, nstep=nstep, cum_reward=True, value_gamma=value_gamma) + loss.backward() + assert isinstance(q.grad, torch.Tensor) + + @pytest.mark.unittest def test_q_nstep_td_ngu(): batch_size = 4 diff --git a/dizoo/mujoco/config/halfcheetah_bdq_config.py b/dizoo/mujoco/config/halfcheetah_bdq_config.py index 3a4cbfd388..145bf8062e 100644 --- a/dizoo/mujoco/config/halfcheetah_bdq_config.py +++ b/dizoo/mujoco/config/halfcheetah_bdq_config.py @@ -4,13 +4,12 @@ exp_name='halfcheetah_bdq_seed0', env=dict( env_id='HalfCheetah-v3', - norm_obs=dict(use_norm=False, ), norm_reward=dict(use_norm=False, ), collector_env_num=8, evaluator_env_num=8, n_evaluator_episode=8, stop_value=12000, - action_per_branch=2, + action_bins_per_branch=2, ), policy=dict( cuda=False, @@ -20,7 +19,7 @@ model=dict( obs_shape=17, num_branches=6, - action_per_branch=2, # mean the action shape is 11, 11 discrete actions + action_bins_per_branch=2, # mean the action shape is 6, 2 discrete actions for each action dimension encoder_hidden_size_list=[256, 256, 128], ), @@ -29,7 +28,6 @@ 
learning_rate=3e-4, ignore_done=True, target_update_freq=500, - target_update_theta=0.001, update_per_collect=20, ), collect=dict( @@ -59,7 +57,6 @@ import_names=['dizoo.mujoco.envs.mujoco_env'], ), env_manager=dict(type='subprocess'), - # env_manager=dict(type='subprocess'), policy=dict(type='bdq', ), ) halfcheetah_bdq_create_config = EasyDict(halfcheetah_bdq_create_config) @@ -68,4 +65,4 @@ if __name__ == "__main__": # or you can enter `ding -m serial_onpolicy -c halfcheetah_onbdq_config.py -s 0` from ding.entry import serial_pipeline - serial_pipeline((main_config, create_config), seed=0) \ No newline at end of file + serial_pipeline((main_config, create_config), seed=0, max_env_step=10000000,) \ No newline at end of file diff --git a/dizoo/mujoco/config/hopper_bdq_config.py b/dizoo/mujoco/config/hopper_bdq_config.py index c5b9b50155..de08da2a7a 100644 --- a/dizoo/mujoco/config/hopper_bdq_config.py +++ b/dizoo/mujoco/config/hopper_bdq_config.py @@ -4,32 +4,30 @@ exp_name='hopper_bdq_seed0', env=dict( env_id='Hopper-v3', - norm_obs=dict(use_norm=False, ), norm_reward=dict(use_norm=False, ), collector_env_num=8, evaluator_env_num=8, n_evaluator_episode=8, stop_value=int(1e6), - action_per_branch=4, + action_bins_per_branch=4, ), policy=dict( cuda=False, priority=False, discount_factor=0.99, + nstep=3, model=dict( obs_shape=11, num_branches=3, - action_per_branch=4, # mean the action shape is 11, 11 discrete actions + action_bins_per_branch=4, # mean the action shape is 3, 4 discrete actions for each action dimension encoder_hidden_size_list=[256, 256, 128], ), - nstep=1, learn=dict( ignore_done=False, batch_size=512, learning_rate=3e-4, # Frequency of target network update. target_update_freq=500, - target_update_theta=0.001, update_per_collect=20, ), collect=dict( @@ -70,4 +68,4 @@ if __name__ == "__main__": # or you can enter `ding -m serial_onpolicy -c hopper_bdq_config.py -s 0` from ding.entry import serial_pipeline - serial_pipeline([main_config, create_config], seed=0) + serial_pipeline([main_config, create_config], seed=0, max_env_step=10000000,) diff --git a/dizoo/mujoco/envs/mujoco_env.py b/dizoo/mujoco/envs/mujoco_env.py index 2bb2e7d2ff..c150581a5b 100644 --- a/dizoo/mujoco/envs/mujoco_env.py +++ b/dizoo/mujoco/envs/mujoco_env.py @@ -29,6 +29,7 @@ def default_config(cls: type) -> EasyDict: replay_path=None, save_replay_gif=False, replay_path_gif=None, + action_bins_per_branch=None, ) def __init__(self, cfg: dict) -> None: @@ -39,10 +40,25 @@ def __init__(self, cfg: dict) -> None: self._replay_path = None self._replay_path_gif = cfg.replay_path_gif self._save_replay_gif = cfg.save_replay_gif - self._action_per_branch = cfg.action_per_branch if 'action_per_branch' in cfg else None + self._action_bins_per_branch = cfg.action_bins_per_branch def map_action(self, action: Union[np.ndarray, list]) -> Union[np.ndarray, list]: - return [2 * x / (self._action_per_branch - 1) - 1 for x in action] + """ + Overview: + Map the discretized action index to the action in the original action space. + Arguments: + - action (:obj:`np.ndarray or list`): The discretized action index. \ + The value ranges is {0, 1, ..., self._action_bins_per_branch - 1}. + Returns: + - outputs (:obj:`list`): The action in the original action space. \ + The value ranges is [-1, 1]. 
+ Examples: + >>> inputs = [2, 0, 4] + >>> self._action_bins_per_branch = 5 + >>> outputs = map_action(inputs) + >>> assert isinstance(outputs, list) and outputs == [0.0, -1.0, 1.0] + """ + return [2 * x / (self._action_bins_per_branch - 1) - 1 for x in action] def reset(self) -> np.ndarray: if not self._init_flag: @@ -84,7 +100,7 @@ def seed(self, seed: int, dynamic_seed: bool = True) -> None: np.random.seed(self._seed) def step(self, action: Union[np.ndarray, list]) -> BaseEnvTimestep: - if self._action_per_branch: + if self._action_bins_per_branch: action = self.map_action(action) action = to_ndarray(action) if self._save_replay_gif: From 506ec0a92c6a00d35c5ba6b152b7606207e29a50 Mon Sep 17 00:00:00 2001 From: Cloud-Pku Date: Tue, 27 Dec 2022 19:19:08 +0800 Subject: [PATCH 05/10] fix one error --- ding/policy/bdq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ding/policy/bdq.py b/ding/policy/bdq.py index e1366d38ce..618078a717 100644 --- a/ding/policy/bdq.py +++ b/ding/policy/bdq.py @@ -243,7 +243,7 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: } q_value_per_branch = torch.mean(q_value, 2, keepdim=False) for i in range(self._model.num_branches): - update_info['q_value_b_' + str(i)] = q_value_per_branch[:, 0].mean().item() + update_info['q_value_b_' + str(i)] = q_value_per_branch[:, i].mean().item() return update_info def _monitor_vars_learn(self) -> List[str]: From e41b361601848cd59869e2220f89d54156ecc256 Mon Sep 17 00:00:00 2001 From: Cloud-Pku Date: Thu, 29 Dec 2022 11:27:08 +0800 Subject: [PATCH 06/10] fixed one error --- ding/rl_utils/tests/test_td.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ding/rl_utils/tests/test_td.py b/ding/rl_utils/tests/test_td.py index e96ca37fab..34a6fb4ae0 100644 --- a/ding/rl_utils/tests/test_td.py +++ b/ding/rl_utils/tests/test_td.py @@ -55,10 +55,10 @@ def test_bdq_nstep_td(): loss.backward() assert isinstance(q.grad, torch.Tensor) data = q_nstep_td_data(q, next_q, action, next_action, reward, done, None) - loss, td_error_per_sample = q_nstep_td_error(data, 0.95, nstep=nstep, cum_reward=True) + loss, td_error_per_sample = bdq_nstep_td_error(data, 0.95, nstep=nstep, cum_reward=True) value_gamma = torch.tensor(0.9) data = q_nstep_td_data(q, next_q, action, next_action, reward, done, None) - loss, td_error_per_sample = q_nstep_td_error(data, 0.95, nstep=nstep, cum_reward=True, value_gamma=value_gamma) + loss, td_error_per_sample = bdq_nstep_td_error(data, 0.95, nstep=nstep, cum_reward=True, value_gamma=value_gamma) loss.backward() assert isinstance(q.grad, torch.Tensor) From 3b02f6f43f84ec59f5a624ed3db23c7dcaa9444f Mon Sep 17 00:00:00 2001 From: Cloud-Pku Date: Thu, 29 Dec 2022 12:06:10 +0800 Subject: [PATCH 07/10] fixed one error --- ding/rl_utils/td.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ding/rl_utils/td.py b/ding/rl_utils/td.py index 11c9263d1a..824dc34e45 100644 --- a/ding/rl_utils/td.py +++ b/ding/rl_utils/td.py @@ -497,7 +497,6 @@ def bdq_nstep_td_error( if weight is None: weight = torch.ones_like(reward) reward = reward.unsqueeze(-1) - weight = weight.unsqueeze(-1) done = done.unsqueeze(-1) if value_gamma is not None: value_gamma = value_gamma.unsqueeze(-1) From 75e93e96c86a5a53a7bfbbb46caf15898b5e6c8e Mon Sep 17 00:00:00 2001 From: Cloud-Pku Date: Tue, 3 Jan 2023 11:20:27 +0800 Subject: [PATCH 08/10] add test_bdq.py --- ding/policy/tests/test_bdq.py | 61 ++++++++++++++++++++++++++++++++++ ding/rl_utils/tests/test_td.py | 4 ++- 2 files changed, 64 insertions(+), 1 
From 75e93e96c86a5a53a7bfbbb46caf15898b5e6c8e Mon Sep 17 00:00:00 2001
From: Cloud-Pku
Date: Tue, 3 Jan 2023 11:20:27 +0800
Subject: [PATCH 08/10] add test_bdq.py

---
 ding/policy/tests/test_bdq.py  | 61 ++++++++++++++++++++++++++++++++++
 ding/rl_utils/tests/test_td.py |  4 ++-
 2 files changed, 64 insertions(+), 1 deletion(-)
 create mode 100644 ding/policy/tests/test_bdq.py

diff --git a/ding/policy/tests/test_bdq.py b/ding/policy/tests/test_bdq.py
new file mode 100644
index 0000000000..09be75cc8b
--- /dev/null
+++ b/ding/policy/tests/test_bdq.py
@@ -0,0 +1,61 @@
+import pytest
+import torch
+from easydict import EasyDict
+from ding.model.wrapper.model_wrappers import ArgmaxSampleWrapper, EpsGreedySampleWrapper, TargetNetworkWrapper
+from ding.policy.bdq import BDQPolicy
+from dizoo.classic_control.pendulum.envs import PendulumEnv
+
+obs_space = 3
+num_branches = 1
+action_bins_per_branch = 5
+
+cfg1 = EasyDict(BDQPolicy.config)
+cfg1.model = {}
+cfg1.model.obs_shape = obs_space
+cfg1.model.num_branches = num_branches
+cfg1.model.action_bins_per_branch = action_bins_per_branch
+
+
+def get_batch(size=8):
+    data = {}
+    for i in range(size):
+        obs = torch.zeros(obs_space)
+        data[i] = obs
+    return data
+
+
+def get_transition(size=20):
+    data = []
+    for i in range(size):
+        sample = {}
+        sample['obs'] = torch.zeros(obs_space)
+        sample['action'] = torch.randint(0, action_bins_per_branch, (num_branches, ))
+        sample['done'] = False
+        sample['next_obs'] = torch.zeros(obs_space)
+        sample['reward'] = torch.Tensor([1.])
+        data.append(sample)
+    return data
+
+
+@pytest.mark.parametrize('cfg', [cfg1])
+@pytest.mark.unittest
+def test_bdq(cfg):
+    policy = BDQPolicy(cfg, enable_field=['collect', 'eval', 'learn'])
+    assert type(policy._learn_model) == ArgmaxSampleWrapper
+    assert type(policy._target_model) == TargetNetworkWrapper
+    assert type(policy._collect_model) == EpsGreedySampleWrapper
+    batch_obs = get_batch()
+    policy._forward_eval(batch_obs)
+    policy._forward_collect(batch_obs, 0.5)
+
+    sample = get_transition(size=20)
+    policy._forward_learn(sample)
+    policy._get_train_sample(sample)
+
+    env = PendulumEnv(EasyDict({'act_scale': True, 'continuous': False}))
+    env.seed(314)
+    obs = env.reset()
+    b_obs = {0: obs}
+    raw_out = policy._forward_collect(b_obs, 0.5)[0]
+    timestep = env.step(raw_out['action'].numpy())
+    transition = policy._process_transition(obs, raw_out, timestep)
diff --git a/ding/rl_utils/tests/test_td.py b/ding/rl_utils/tests/test_td.py
index 34a6fb4ae0..bcc2291e16 100644
--- a/ding/rl_utils/tests/test_td.py
+++ b/ding/rl_utils/tests/test_td.py
@@ -58,7 +58,9 @@ def test_bdq_nstep_td():
     loss, td_error_per_sample = bdq_nstep_td_error(data, 0.95, nstep=nstep, cum_reward=True)
     value_gamma = torch.tensor(0.9)
     data = q_nstep_td_data(q, next_q, action, next_action, reward, done, None)
-    loss, td_error_per_sample = bdq_nstep_td_error(data, 0.95, nstep=nstep, cum_reward=True, value_gamma=value_gamma)
+    loss, td_error_per_sample = bdq_nstep_td_error(
+        data, 0.95, nstep=nstep, cum_reward=True, value_gamma=value_gamma
+    )
     loss.backward()
     assert isinstance(q.grad, torch.Tensor)
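In the new test, a BDQ action is a vector of bin indices, one per branch, which is why `get_transition` samples `torch.randint(0, action_bins_per_branch, (num_branches, ))`. The sketch below shows how that layout relates to an assumed `(batch, num_branches, action_bins_per_branch)` logit tensor; it illustrates the convention only and is not the sample wrapper's actual code.

```python
import torch

batch, num_branches, action_bins_per_branch = 4, 1, 5

# Assumed logit layout for a branching Q-network: one row of bin scores per branch.
logit = torch.randn(batch, num_branches, action_bins_per_branch)

# Greedy selection picks the best bin independently in every branch, giving one
# integer index per action dimension, matching the transitions the test builds.
greedy_action = logit.argmax(dim=-1)
assert greedy_action.shape == (batch, num_branches)
assert int(greedy_action.max()) < action_bins_per_branch
```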
From 1598f778abed7ed295d1ab54758e25602572331a Mon Sep 17 00:00:00 2001
From: Cloud-Pku
Date: Tue, 3 Jan 2023 11:45:12 +0800
Subject: [PATCH 09/10] add readme

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 9985802416..3601d9e77c 100644
--- a/README.md
+++ b/README.md
@@ -240,6 +240,7 @@ P.S: The `.py` file in `Runnable Demo` can be found in `dizoo`
 | 48 | [ST-DIM](https://arxiv.org/pdf/1906.08226.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [torch_utils/loss/contrastive_loss](https://github.com/opendilab/DI-engine/blob/main/ding/torch_utils/loss/contrastive_loss.py) | ding -m serial -c cartpole_dqn_stdim_config.py -s 0 |
 | 49 | [PLR](https://arxiv.org/pdf/2010.03934.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [PLR doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/plr.html)<br>[data/level_replay/level_sampler](https://github.com/opendilab/DI-engine/blob/main/ding/data/level_replay/level_sampler.py) | python3 -u bigfish_plr_config.py -s 0 |
 | 50 | [PCGrad](https://arxiv.org/pdf/2001.06782.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [torch_utils/optimizer_helper/PCGrad](https://github.com/opendilab/DI-engine/blob/main/ding/data/torch_utils/optimizer_helper.py) | python3 -u multi_mnist_pcgrad_main.py -s 0 |
+| 51 | [BDQ](https://arxiv.org/pdf/1711.08946.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [policy/bdq](https://github.com/opendilab/DI-engine/blob/main/ding/policy/bdq.py) | python3 -u hopper_bdq_config.py |

From 7138bc43170e7d8bb6df7925b34b2c27345f77ef Mon Sep 17 00:00:00 2001
From: Cloud-Pku
Date: Tue, 3 Jan 2023 13:36:27 +0800
Subject: [PATCH 10/10] add pendulum_bdq test

---
 ding/entry/tests/test_serial_entry.py      | 15 +++
 ding/policy/tests/test_bdq.py              | 61 ------------------
 .../pendulum/config/pendulum_bdq_config.py | 62 +++++++++++++++++++
 3 files changed, 77 insertions(+), 61 deletions(-)
 delete mode 100644 ding/policy/tests/test_bdq.py
 create mode 100644 dizoo/classic_control/pendulum/config/pendulum_bdq_config.py

diff --git a/ding/entry/tests/test_serial_entry.py b/ding/entry/tests/test_serial_entry.py
index 0fc9c5aae6..5d83f0557f 100644
--- a/ding/entry/tests/test_serial_entry.py
+++ b/ding/entry/tests/test_serial_entry.py
@@ -51,6 +51,7 @@
 from dizoo.gym_hybrid.config.gym_hybrid_ddpg_config import gym_hybrid_ddpg_config, gym_hybrid_ddpg_create_config
 from dizoo.gym_hybrid.config.gym_hybrid_pdqn_config import gym_hybrid_pdqn_config, gym_hybrid_pdqn_create_config
 from dizoo.gym_hybrid.config.gym_hybrid_mpdqn_config import gym_hybrid_mpdqn_config, gym_hybrid_mpdqn_create_config
+from dizoo.classic_control.pendulum.config.pendulum_bdq_config import pendulum_bdq_config, pendulum_bdq_create_config  # noqa

 @pytest.mark.platformtest
 @pytest.mark.unittest
 def test_dqn():
@@ -67,6 +68,20 @@ def test_dqn():
         os.popen('rm -rf cartpole_dqn_unittest')


+@pytest.mark.platformtest
+@pytest.mark.unittest
+def test_bdq():
+    config = [deepcopy(pendulum_bdq_config), deepcopy(pendulum_bdq_create_config)]
+    config[0].policy.learn.update_per_collect = 1
+    config[0].exp_name = 'pendulum_bdq_unittest'
+    try:
+        serial_pipeline(config, seed=0, max_train_iter=1)
+    except Exception:
+        assert False, "pipeline fail"
+    finally:
+        os.popen('rm -rf pendulum_bdq_unittest')
+
+
 @pytest.mark.platformtest
 @pytest.mark.unittest
 def test_ddpg():
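For context on the README row added above: the branching architecture of the linked paper keeps the number of Q outputs linear in the action dimension, whereas a flat discretization of the joint action space grows exponentially. A back-of-the-envelope check with illustrative sizes (the D and K values below are not taken from the patch):

```python
# Outputs needed for a D-dimensional continuous action space discretized into K bins:
D, K = 6, 11
branching_outputs = D * K   # one Q-vector of length K per branch
flat_outputs = K ** D       # one Q-value per joint discrete action
print(branching_outputs, flat_outputs)  # 66 1771561
```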
diff --git a/ding/policy/tests/test_bdq.py b/ding/policy/tests/test_bdq.py
deleted file mode 100644
index 09be75cc8b..0000000000
--- a/ding/policy/tests/test_bdq.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import pytest
-import torch
-from easydict import EasyDict
-from ding.model.wrapper.model_wrappers import ArgmaxSampleWrapper, EpsGreedySampleWrapper, TargetNetworkWrapper
-from ding.policy.bdq import BDQPolicy
-from dizoo.classic_control.pendulum.envs import PendulumEnv
-
-obs_space = 3
-num_branches = 1
-action_bins_per_branch = 5
-
-cfg1 = EasyDict(BDQPolicy.config)
-cfg1.model = {}
-cfg1.model.obs_shape = obs_space
-cfg1.model.num_branches = num_branches
-cfg1.model.action_bins_per_branch = action_bins_per_branch
-
-
-def get_batch(size=8):
-    data = {}
-    for i in range(size):
-        obs = torch.zeros(obs_space)
-        data[i] = obs
-    return data
-
-
-def get_transition(size=20):
-    data = []
-    for i in range(size):
-        sample = {}
-        sample['obs'] = torch.zeros(obs_space)
-        sample['action'] = torch.randint(0, action_bins_per_branch, (num_branches, ))
-        sample['done'] = False
-        sample['next_obs'] = torch.zeros(obs_space)
-        sample['reward'] = torch.Tensor([1.])
-        data.append(sample)
-    return data
-
-
-@pytest.mark.parametrize('cfg', [cfg1])
-@pytest.mark.unittest
-def test_bdq(cfg):
-    policy = BDQPolicy(cfg, enable_field=['collect', 'eval', 'learn'])
-    assert type(policy._learn_model) == ArgmaxSampleWrapper
-    assert type(policy._target_model) == TargetNetworkWrapper
-    assert type(policy._collect_model) == EpsGreedySampleWrapper
-    batch_obs = get_batch()
-    policy._forward_eval(batch_obs)
-    policy._forward_collect(batch_obs, 0.5)
-
-    sample = get_transition(size=20)
-    policy._forward_learn(sample)
-    policy._get_train_sample(sample)
-
-    env = PendulumEnv(EasyDict({'act_scale': True, 'continuous': False}))
-    env.seed(314)
-    obs = env.reset()
-    b_obs = {0: obs}
-    raw_out = policy._forward_collect(b_obs, 0.5)[0]
-    timestep = env.step(raw_out['action'].numpy())
-    transition = policy._process_transition(obs, raw_out, timestep)
diff --git a/dizoo/classic_control/pendulum/config/pendulum_bdq_config.py b/dizoo/classic_control/pendulum/config/pendulum_bdq_config.py
new file mode 100644
index 0000000000..59bffb05b9
--- /dev/null
+++ b/dizoo/classic_control/pendulum/config/pendulum_bdq_config.py
@@ -0,0 +1,62 @@
+from easydict import EasyDict
+import sys
+sys.path.insert(0, "/mnt/lustre/chenyun/bdq_implement1/DI-engine")
+pendulum_bdq_config = dict(
+    exp_name='pendulum_bdq_seed0',
+    env=dict(
+        collector_env_num=10,
+        evaluator_env_num=5,
+        # (bool) Scale output action into legal range.
+        act_scale=True,
+        n_evaluator_episode=5,
+        stop_value=-250,
+        continuous=False,
+        # The path to save the game replay
+        # replay_path='./pendulum_bdq_seed0/video',
+    ),
+    policy=dict(
+        cuda=False,
+        load_path='pendulum_bdq_seed0/ckpt/ckpt_best.pth.tar',  # necessary for eval
+        model=dict(
+            obs_shape=3,
+            num_branches=1,
+            action_bins_per_branch=11,
+            encoder_hidden_size_list=[128, 128, 64],
+        ),
+        nstep=1,
+        discount_factor=0.97,
+        learn=dict(
+            batch_size=64,
+            learning_rate=0.001,
+        ),
+        collect=dict(n_sample=8),
+        eval=dict(evaluator=dict(eval_freq=40, )),
+        other=dict(
+            eps=dict(
+                type='exp',
+                start=0.95,
+                end=0.1,
+                decay=10000,
+            ),
+            replay_buffer=dict(replay_buffer_size=20000, ),
+        ),
+    ),
+)
+pendulum_bdq_config = EasyDict(pendulum_bdq_config)
+main_config = pendulum_bdq_config
+pendulum_bdq_create_config = dict(
+    env=dict(
+        type='pendulum',
+        import_names=['dizoo.classic_control.pendulum.envs.pendulum_env'],
+    ),
+    env_manager=dict(type='base'),
+    policy=dict(type='bdq'),
+    replay_buffer=dict(type='deque', import_names=['ding.data.buffer.deque_buffer_wrapper']),
+)
+pendulum_bdq_create_config = EasyDict(pendulum_bdq_create_config)
+create_config = pendulum_bdq_create_config
+
+if __name__ == "__main__":
+    # or you can enter `ding -m serial -c pendulum_bdq_config.py -s 0`
+    from ding.entry import serial_pipeline
+    serial_pipeline((main_config, create_config), seed=0)
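With `action_bins_per_branch=11` in this config, each branch chooses among 11 bins. Assuming the same `2 * k / (K - 1) - 1` convention as the `map_action` helper added to `mujoco_env.py` earlier in this series (whether `PendulumEnv` discretizes identically is not shown in the patch), the bins correspond to an evenly spaced action grid over `[-1, 1]`, which `act_scale=True` then rescales into the environment's native action range:

```python
# Evaluate the bin-to-action mapping for K = 11 bins per branch.
K = 11
grid = [2 * k / (K - 1) - 1 for k in range(K)]
print(grid)  # [-1.0, -0.8, ..., 0.8, 1.0]: 11 evenly spaced actions per branch
assert len(grid) == K and grid[0] == -1.0 and grid[-1] == 1.0
```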