From b9cee618247999c61bb6504ef1b63a7816896950 Mon Sep 17 00:00:00 2001 From: Cloud-Pku Date: Thu, 15 Dec 2022 16:45:27 +0800 Subject: [PATCH 01/10] add BDQ algrithm --- ding/model/common/__init__.py | 2 +- ding/model/common/head.py | 102 +++++ ding/model/template/q_learning.py | 88 +++- ding/policy/__init__.py | 2 + ding/policy/bdq.py | 387 ++++++++++++++++++ ding/policy/command_mode_policy_instance.py | 6 + ding/rl_utils/__init__.py | 2 +- ding/rl_utils/td.py | 56 +++ dizoo/mujoco/config/halfcheetah_bdq_config.py | 71 ++++ dizoo/mujoco/config/hopper_bdq_config.py | 73 ++++ dizoo/mujoco/envs/mujoco_env.py | 12 +- 11 files changed, 795 insertions(+), 6 deletions(-) create mode 100644 ding/policy/bdq.py create mode 100644 dizoo/mujoco/config/halfcheetah_bdq_config.py create mode 100644 dizoo/mujoco/config/hopper_bdq_config.py diff --git a/ding/model/common/__init__.py b/ding/model/common/__init__.py index fc904de2ac..2acbd3c8b7 100644 --- a/ding/model/common/__init__.py +++ b/ding/model/common/__init__.py @@ -1,5 +1,5 @@ from .head import DiscreteHead, DuelingHead, DistributionHead, RainbowHead, QRDQNHead, \ - QuantileHead, FQFHead, RegressionHead, ReparameterizationHead, MultiHead, head_cls_map, \ + QuantileHead, FQFHead, RegressionHead, ReparameterizationHead, MultiHead, BranchingHead, head_cls_map, \ independent_normal_dist from .encoder import ConvEncoder, FCEncoder, IMPALAConvEncoder from .utils import create_model diff --git a/ding/model/common/head.py b/ding/model/common/head.py index 9a83130d0b..94a6b2ba14 100644 --- a/ding/model/common/head.py +++ b/ding/model/common/head.py @@ -174,6 +174,108 @@ def forward(self, x: torch.Tensor) -> Dict: return {'logit': q, 'distribution': dist} +class BranchingHead(nn.Module): + + def __init__( + self, + hidden_size: int, + num_branches: int = 0, + action_per_branch: int = 2, + layer_num: int = 1, + a_layer_num: Optional[int] = None, + v_layer_num: Optional[int] = None, + norm_type: Optional[str] = None, + activation: Optional[nn.Module] = nn.ReLU(), + noise: Optional[bool] = False, + ) -> None: + """ + Overview: + Init the ``BranchingHead`` layers according to the provided arguments. + Arguments: + - hidden_size (:obj:`int`): The ``hidden_size`` of the MLP connected to ``BranchingHead``. + - num_branches (:obj:`int`): The number of branches, which is equivalent to the action dimension. + - action_per_branch (:obj:`int`): The number of actions in each dimension. + - layer_num (:obj:`int`): The number of layers used in the network to compute Advantage and Value output. + - a_layer_num (:obj:`int`): The number of layers used in the network to compute Advantage output. + - v_layer_num (:obj:`int`): The number of layers used in the network to compute Value output. + - output_size (:obj:`int`): The number of outputs. + - norm_type (:obj:`str`): The type of normalization to use. See ``ding.torch_utils.network.fc_block`` \ + for more details. Default ``None``. + - activation (:obj:`nn.Module`): The type of activation function to use in MLP. \ + If ``None``, then default set activation to ``nn.ReLU()``. Default ``None``. + - noise (:obj:`bool`): Whether use ``NoiseLinearLayer`` as ``layer_fn`` in Q networks' MLP. \ + Default ``False``. 
+ """ + super(BranchingHead, self).__init__() + if a_layer_num is None: + a_layer_num = layer_num + if v_layer_num is None: + v_layer_num = layer_num + self.num_branches = num_branches + self.action_per_branch = action_per_branch + + layer = NoiseLinearLayer if noise else nn.Linear + block = noise_block if noise else fc_block + # value network + + self.V = nn.Sequential( + MLP( + hidden_size, + hidden_size, + hidden_size, + v_layer_num, + layer_fn=layer, + activation=activation, + norm_type=norm_type + ), block(hidden_size, 1) + ) + # action branching network + action_output_dim = action_per_branch + self.branches = nn.ModuleList( + [ + nn.Sequential( + MLP( + hidden_size, + hidden_size, + hidden_size, + a_layer_num, + layer_fn=layer, + activation=activation, + norm_type=norm_type + ), block(hidden_size, action_output_dim) + ) for _ in range(self.num_branches) + ] + ) + + def forward(self, x: torch.Tensor) -> Dict: + """ + Overview: + Use encoded embedding tensor to run MLP with ``BranchingHead`` and return the prediction dictionary. + Arguments: + - x (:obj:`torch.Tensor`): Tensor containing input embedding. + Returns: + - outputs (:obj:`Dict`): Dict containing keyword ``logit`` (:obj:`torch.Tensor`). + Shapes: + - x: :math:`(B, N)`, where ``B = batch_size`` and ``N = hidden_size``. + - logit: :math:`(B, M)`, where ``M = output_size``. + + Examples: + >>> head = BranchingHead(64, 5, 2) + >>> inputs = torch.randn(4, 64) + >>> outputs = head(inputs) + >>> assert isinstance(outputs, dict) and outputs['logit'].shape == torch.Size([4, 10]) + """ + value_out = self.V(x) + value_out = torch.unsqueeze(value_out, 1) + action_out = [] + for b in self.branches: + action_out.append(b(x)) + action_scores = torch.stack(action_out, 1) + action_scores = action_scores - torch.mean(action_scores, 2, keepdim=True) + logits = value_out + action_scores + return {'logit': logits} + + class RainbowHead(nn.Module): """ Overview: diff --git a/ding/model/template/q_learning.py b/ding/model/template/q_learning.py index 54df388446..5177a87a7c 100644 --- a/ding/model/template/q_learning.py +++ b/ding/model/template/q_learning.py @@ -5,7 +5,7 @@ from ding.torch_utils import get_lstm from ding.utils import MODEL_REGISTRY, SequenceType, squeeze from ..common import FCEncoder, ConvEncoder, DiscreteHead, DuelingHead, MultiHead, RainbowHead, \ - QuantileHead, FQFHead, QRDQNHead, DistributionHead + QuantileHead, FQFHead, QRDQNHead, DistributionHead, BranchingHead from ding.torch_utils.network.gtrxl import GTrXL @@ -98,6 +98,92 @@ def forward(self, x: torch.Tensor) -> Dict: return x +@MODEL_REGISTRY.register('bdq') +class BDQ(nn.Module): + + def __init__( + self, + obs_shape: Union[int, SequenceType], + num_branches: int = 0, + action_per_branch: int = 2, + layer_num: int = 3, + a_layer_num: Optional[int] = None, + v_layer_num: Optional[int] = None, + encoder_hidden_size_list: SequenceType = [128, 128, 64], + head_hidden_size: Optional[int] = None, + norm_type: Optional[nn.Module] = None, + activation: Optional[nn.Module] = nn.ReLU(), + ) -> None: + """ + Overview: + Init the BDQ (encoder + head) Model according to input arguments. + Arguments: + - obs_shape (:obj:`Union[int, SequenceType]`): Observation space shape, such as 8 or [4, 84, 84]. + - num_branches (:obj:`int`): The number of branches, which is equivalent to the action dimension, such as 6. + - action_per_branch (:obj:`int`): The number of actions in each dimension. 
+ - layer_num (:obj:`int`): The number of layers used in the network to compute Advantage and Value output. + - a_layer_num (:obj:`int`): The number of layers used in the network to compute Advantage output. + - v_layer_num (:obj:`int`): The number of layers used in the network to compute Value output. + - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder``, \ + the last element must match ``head_hidden_size``. + - head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` of head network. + - norm_type (:obj:`Optional[str]`): The type of normalization in networks, see \ + ``ding.torch_utils.fc_block`` for more details. + - activation (:obj:`Optional[nn.Module]`): The type of activation function in networks \ + if ``None`` then default set it to ``nn.ReLU()`` + """ + super(BDQ, self).__init__() + # For compatibility: 1, (1, ), [4, 32, 32] + obs_shape = squeeze(obs_shape) + if head_hidden_size is None: + head_hidden_size = encoder_hidden_size_list[-1] + + # backbone + # FC Encoder + if isinstance(obs_shape, int) or len(obs_shape) == 1: + self.encoder = FCEncoder(obs_shape, encoder_hidden_size_list, activation=activation, norm_type=norm_type) + # Conv Encoder + elif len(obs_shape) == 3: + self.encoder = ConvEncoder(obs_shape, encoder_hidden_size_list, activation=activation, norm_type=norm_type) + else: + raise RuntimeError( + "not support obs_shape for pre-defined encoder: {}, please customize your own DQN".format(obs_shape) + ) + + self.num_branches = num_branches + self.action_per_branch = action_per_branch + + # head + self.head = BranchingHead( + head_hidden_size, num_branches=self.num_branches, action_per_branch=action_per_branch, + layer_num=layer_num, a_layer_num=a_layer_num, v_layer_num=v_layer_num, activation=activation, + norm_type=norm_type) + + def forward(self, x: torch.Tensor) -> Dict: + r""" + Overview: + BDQ forward computation graph, input observation tensor to predict q_value. + Arguments: + - x (:obj:`torch.Tensor`): Observation inputs + Returns: + - outputs (:obj:`Dict`): BDQ forward outputs, such as q_value. + ReturnsKeys: + - logit (:obj:`torch.Tensor`): Discrete Q-value output of each action dimension. + Shapes: + - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is ``obs_shape`` + - logit (:obj:`torch.FloatTensor`): :math:`(B, M)`, where B is batch size and M is + ``num_branches * action_per_branch`` + Examples: + >>> model = BDQ(8, 5, 2) # arguments: 'obs_shape', 'num_branches' and 'action_per_branch'. 
+ >>> inputs = torch.randn(4, 8) + >>> outputs = model(inputs) + >>> assert isinstance(outputs, dict) and outputs['logit'].shape == torch.Size([4, 10]) + """ + x = self.encoder(x) + x = self.head(x) + return x + + @MODEL_REGISTRY.register('c51dqn') class C51DQN(nn.Module): diff --git a/ding/policy/__init__.py b/ding/policy/__init__.py index e71bd4dd19..84f571a7a6 100644 --- a/ding/policy/__init__.py +++ b/ding/policy/__init__.py @@ -43,3 +43,5 @@ from .bc import BehaviourCloningPolicy from .ibc import IBCPolicy + +from .bdq import BDQPolicy diff --git a/ding/policy/bdq.py b/ding/policy/bdq.py new file mode 100644 index 0000000000..920f8b7c07 --- /dev/null +++ b/ding/policy/bdq.py @@ -0,0 +1,387 @@ +from typing import List, Dict, Any, Tuple +from collections import namedtuple +import copy +import torch + +from ding.torch_utils import Adam, to_device, ContrastiveLoss +from ding.rl_utils import q_nstep_td_data, bdq_nstep_td_error, get_nstep_return_data, get_train_sample +from ding.model import model_wrap +from ding.utils import POLICY_REGISTRY +from ding.utils.data import default_collate, default_decollate + +from .base_policy import Policy +from .common_utils import default_preprocess_learn + + +@POLICY_REGISTRY.register('bdq') +class BDQPolicy(Policy): + r""" + Overview: + Policy class of BDQ algorithm, extended by PER/multi-step TD. + + Config: + == ==================== ======== ============== ======================================== ======================= + ID Symbol Type Default Value Description Other(Shape) + == ==================== ======== ============== ======================================== ======================= + 1 ``type`` str bdq | RL policy register name, refer to | This arg is optional, + | registry ``POLICY_REGISTRY`` | a placeholder + 2 ``cuda`` bool False | Whether to use cuda for network | This arg can be diff- + | erent from modes + 3 ``on_policy`` bool False | Whether the RL algorithm is on-policy + | or off-policy + 4 ``priority`` bool False | Whether use priority(PER) | Priority sample, + | update priority + 5 | ``priority_IS`` bool False | Whether use Importance Sampling Weight + | ``_weight`` | to correct biased update. If True, + | priority must be True. + 6 | ``discount_`` float 0.97, | Reward's future discount factor, aka. | May be 1 when sparse + | ``factor`` [0.95, 0.999] | gamma | reward env + 7 ``nstep`` int 1, | N-step reward discount sum for target + [3, 5] | q_value estimation + 8 | ``learn.update`` int 3 | How many updates(iterations) to train | This args can be vary + | ``per_collect`` | after collector's one collection. Only | from envs. Bigger val + | valid in serial training | means more off-policy + 9 | ``learn.multi`` bool False | whether to use multi gpu during + | ``_gpu`` + 10 | ``learn.batch_`` int 64 | The number of samples of an iteration + | ``size`` + 11 | ``learn.learning`` float 0.001 | Gradient step length of an iteration. + | ``_rate`` + 12 | ``learn.target_`` int 100 | Frequence of target network update. | Hard(assign) update + | ``update_freq`` + 13 | ``learn.ignore_`` bool False | Whether ignore done for target value | Enable it for some + | ``done`` | calculation. | fake termination env + 14 ``collect.n_sample`` int [8, 128] | The number of training samples of a | It varies from + | call of collector. | different envs + 15 | ``collect.unroll`` int 1 | unroll length of an iteration | In RNN, unroll_len>1 + | ``_len`` + 16 | ``other.eps.type`` str exp | exploration rate decay type | Support ['exp', + | 'linear']. 
+ 17 | ``other.eps.`` float 0.95 | start value of exploration rate | [0,1] + | ``start`` + 18 | ``other.eps.`` float 0.1 | end value of exploration rate | [0,1] + | ``end`` + 19 | ``other.eps.`` int 10000 | decay length of exploration | greater than 0. set + | ``decay`` | decay=10000 means + | the exploration rate + | decay from start + | value to end value + | during decay length. + == ==================== ======== ============== ======================================== ======================= + """ + + config = dict( + type='bdq', + # (bool) Whether use cuda in policy + cuda=False, + # (bool) Whether learning policy is the same as collecting data policy(on-policy) + on_policy=False, + # (bool) Whether enable priority experience sample + priority=False, + # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. + priority_IS_weight=False, + # (float) Discount factor(gamma) for returns + discount_factor=0.97, + # (int) The number of step for calculating target q_value + nstep=1, + learn=dict( + # (bool) Whether to use multi gpu + multi_gpu=False, + # How many updates(iterations) to train after collector's one collection. + # Bigger "update_per_collect" means bigger off-policy. + # collect data -> update policy-> collect data -> ... + update_per_collect=3, + # (int) How many samples in a training batch + batch_size=64, + # (float) The step size of gradient descent + learning_rate=0.001, + # ============================================================== + # The following configs are algorithm-specific + # ============================================================== + # (int) Frequence of target network update. + target_update_freq=100, + # (bool) Whether ignore done(usually for max step termination env) + ignore_done=False, + ), + # collect_mode config + collect=dict( + # (int) Only one of [n_sample, n_episode] shoule be set + # n_sample=8, + # (int) Cut trajectories into pieces with length "unroll_len". + unroll_len=1, + ), + eval=dict(), + # other config + other=dict( + # Epsilon greedy with decay. + eps=dict( + # (str) Decay type. Support ['exp', 'linear']. + type='exp', + # (float) Epsilon start value + start=0.95, + # (float) Epsilon end value + end=0.1, + # (int) Decay length(env step) + decay=10000, + ), + replay_buffer=dict(replay_buffer_size=10000, ), + ), + ) + + def default_model(self) -> Tuple[str, List[str]]: + """ + Overview: + Return this algorithm default model setting for demonstration. + Returns: + - model_info (:obj:`Tuple[str, List[str]]`): model name and mode import_names + + .. note:: + The user can define and use customized network model but must obey the same inferface definition indicated \ + by import_names path. For BDQ, ``ding.model.template.q_learning.BDQ`` + """ + return 'bdq', ['ding.model.template.q_learning'] + + def _init_learn(self) -> None: + """ + Overview: + Learn mode init method. Called by ``self.__init__``, initialize the optimizer, algorithm arguments, main \ + and target model. 
+ """ + self._priority = self._cfg.priority + self._priority_IS_weight = self._cfg.priority_IS_weight + # Optimizer + self._optimizer = Adam(self._model.parameters(), lr=self._cfg.learn.learning_rate) + + self._gamma = self._cfg.discount_factor + self._nstep = self._cfg.nstep + + # use model_wrapper for specialized demands of different modes + self._target_model = copy.deepcopy(self._model) + self._target_model = model_wrap( + self._target_model, + wrapper_name='target', + update_type='assign', + update_kwargs={'freq': self._cfg.learn.target_update_freq} + ) + self._learn_model = model_wrap(self._model, wrapper_name='argmax_sample') + self._learn_model.reset() + self._target_model.reset() + + def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: + """ + Overview: + Forward computation graph of learn mode(updating policy). + Arguments: + - data (:obj:`Dict[str, Any]`): Dict type data, a batch of data for training, values are torch.Tensor or \ + np.ndarray or dict/list combinations. + Returns: + - info_dict (:obj:`Dict[str, Any]`): Dict type data, a info dict indicated training result, which will be \ + recorded in text log and tensorboard, values are python scalar or a list of scalars. + ArgumentsKeys: + - necessary: ``obs``, ``action``, ``reward``, ``next_obs``, ``done`` + - optional: ``value_gamma``, ``IS`` + ReturnsKeys: + - necessary: ``cur_lr``, ``total_loss``, ``priority`` + - optional: ``action_distribution`` + """ + data = default_preprocess_learn( + data, + use_priority=self._priority, + use_priority_IS_weight=self._cfg.priority_IS_weight, + ignore_done=self._cfg.learn.ignore_done, + use_nstep=True + ) + + if self._cuda: + data = to_device(data, self._device) + # ==================== + # Q-learning forward + # ==================== + self._learn_model.train() + self._target_model.train() + # Current q value (main model) + q_value = self._learn_model.forward(data['obs'])['logit'] + # Target q value + with torch.no_grad(): + target_q_value = self._target_model.forward(data['next_obs'])['logit'] + # Max q value action (main model) + target_q_action = self._learn_model.forward(data['next_obs'])['action'] + if data['action'].shape != target_q_action.shape: + data['action'] = data['action'].unsqueeze(-1) + + data_n = q_nstep_td_data( + q_value, target_q_value, data['action'], target_q_action, data['reward'], data['done'], data['weight'] + ) + value_gamma = data.get('value_gamma') + loss, td_error_per_sample = bdq_nstep_td_error(data_n, self._gamma, nstep=self._nstep, value_gamma=value_gamma) + + # ==================== + # Q-learning update + # ==================== + self._optimizer.zero_grad() + loss.backward() + if self._cfg.learn.multi_gpu: + self.sync_gradients(self._learn_model) + self._optimizer.step() + + # ============= + # after update + # ============= + self._target_model.update(self._learn_model.state_dict()) + return { + 'cur_lr': self._optimizer.defaults['lr'], + 'total_loss': loss.item(), + 'q_value': q_value.mean().item(), + 'target_q_value': target_q_value.mean().item(), + 'priority': td_error_per_sample.abs().tolist(), + # Only discrete action satisfying len(data['action'])==1 can return this and draw histogram on tensorboard. + # '[histogram]action_distribution': data['action'], + } + + def _monitor_vars_learn(self) -> List[str]: + return ['cur_lr', 'total_loss', 'q_value'] + + def _state_dict_learn(self) -> Dict[str, Any]: + """ + Overview: + Return the state_dict of learn mode, usually including model and optimizer. 
+ Returns: + - state_dict (:obj:`Dict[str, Any]`): the dict of current policy learn state, for saving and restoring. + """ + return { + 'model': self._learn_model.state_dict(), + 'target_model': self._target_model.state_dict(), + 'optimizer': self._optimizer.state_dict(), + } + + def _load_state_dict_learn(self, state_dict: Dict[str, Any]) -> None: + """ + Overview: + Load the state_dict variable into policy learn mode. + Arguments: + - state_dict (:obj:`Dict[str, Any]`): the dict of policy learn state saved before. + + .. tip:: + If you want to only load some parts of model, you can simply set the ``strict`` argument in \ + load_state_dict to ``False``, or refer to ``ding.torch_utils.checkpoint_helper`` for more \ + complicated operation. + """ + self._learn_model.load_state_dict(state_dict['model']) + self._target_model.load_state_dict(state_dict['target_model']) + self._optimizer.load_state_dict(state_dict['optimizer']) + + def _init_collect(self) -> None: + """ + Overview: + Collect mode init method. Called by ``self.__init__``, initialize algorithm arguments and collect_model, \ + enable the eps_greedy_sample for exploration. + """ + self._unroll_len = self._cfg.collect.unroll_len + self._gamma = self._cfg.discount_factor # necessary for parallel + self._nstep = self._cfg.nstep # necessary for parallel + self._collect_model = model_wrap(self._model, wrapper_name='eps_greedy_sample') + self._collect_model.reset() + + def _forward_collect(self, data: Dict[int, Any], eps: float) -> Dict[int, Any]: + """ + Overview: + Forward computation graph of collect mode(collect training data), with eps_greedy for exploration. + Arguments: + - data (:obj:`Dict[str, Any]`): Dict type data, stacked env data for predicting policy_output(action), \ + values are torch.Tensor or np.ndarray or dict/list combinations, keys are env_id indicated by integer. + - eps (:obj:`float`): epsilon value for exploration, which is decayed by collected env step. + Returns: + - output (:obj:`Dict[int, Any]`): The dict of predicting policy_output(action) for the interaction with \ + env and the constructing of transition. + ArgumentsKeys: + - necessary: ``obs`` + ReturnsKeys + - necessary: ``logit``, ``action`` + """ + data_id = list(data.keys()) + data = default_collate(list(data.values())) + if self._cuda: + data = to_device(data, self._device) + self._collect_model.eval() + with torch.no_grad(): + output = self._collect_model.forward(data, eps=eps) + if self._cuda: + output = to_device(output, 'cpu') + output = default_decollate(output) + return {i: d for i, d in zip(data_id, output)} + + def _get_train_sample(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + Overview: + For a given trajectory(transitions, a list of transition) data, process it into a list of sample that \ + can be used for training directly. A train sample can be a processed transition(BDQ with nstep TD). + Arguments: + - data (:obj:`List[Dict[str, Any]`): The trajectory data(a list of transition), each element is the same \ + format as the return value of ``self._process_transition`` method. + Returns: + - samples (:obj:`dict`): The list of training samples. + + .. note:: + We will vectorize ``process_transition`` and ``get_train_sample`` method in the following release version. \ + And the user can customize the this data processing procecure by overriding this two methods and collector \ + itself. 
+ """ + data = get_nstep_return_data(data, self._nstep, gamma=self._gamma) + return get_train_sample(data, self._unroll_len) + + def _process_transition(self, obs: Any, policy_output: Dict[str, Any], timestep: namedtuple) -> Dict[str, Any]: + """ + Overview: + Generate a transition(e.g.: ) for this algorithm training. + Arguments: + - obs (:obj:`Any`): Env observation. + - policy_output (:obj:`Dict[str, Any]`): The output of policy collect mode(``self._forward_collect``),\ + including at least ``action``. + - timestep (:obj:`namedtuple`): The output after env step(execute policy output action), including at \ + least ``obs``, ``reward``, ``done``, (here obs indicates obs after env step). + Returns: + - transition (:obj:`dict`): Dict type transition data. + """ + transition = { + 'obs': obs, + 'next_obs': timestep.obs, + 'action': policy_output['action'], + 'reward': timestep.reward, + 'done': timestep.done, + } + return transition + + def _init_eval(self) -> None: + r""" + Overview: + Evaluate mode init method. Called by ``self.__init__``, initialize eval_model. + """ + self._eval_model = model_wrap(self._model, wrapper_name='argmax_sample') + self._eval_model.reset() + + def _forward_eval(self, data: Dict[int, Any]) -> Dict[int, Any]: + """ + Overview: + Forward computation graph of eval mode(evaluate policy performance), at most cases, it is similar to \ + ``self._forward_collect``. + Arguments: + - data (:obj:`Dict[str, Any]`): Dict type data, stacked env data for predicting policy_output(action), \ + values are torch.Tensor or np.ndarray or dict/list combinations, keys are env_id indicated by integer. + Returns: + - output (:obj:`Dict[int, Any]`): The dict of predicting action for the interaction with env. + ArgumentsKeys: + - necessary: ``obs`` + ReturnsKeys + - necessary: ``action`` + """ + data_id = list(data.keys()) + data = default_collate(list(data.values())) + if self._cuda: + data = to_device(data, self._device) + self._eval_model.eval() + with torch.no_grad(): + output = self._eval_model.forward(data) + if self._cuda: + output = to_device(output, 'cpu') + output = default_decollate(output) + return {i: d for i, d in zip(data_id, output)} diff --git a/ding/policy/command_mode_policy_instance.py b/ding/policy/command_mode_policy_instance.py index 13d7032744..f370b8f197 100644 --- a/ding/policy/command_mode_policy_instance.py +++ b/ding/policy/command_mode_policy_instance.py @@ -45,6 +45,7 @@ from .pdqn import PDQNPolicy from .sac import SQILSACPolicy from .madqn import MADQNPolicy +from .bdq import BDQPolicy class EpsCommandModePolicy(CommandModePolicy): @@ -95,6 +96,11 @@ def _get_setting_eval(self, command_info: dict) -> dict: return {} +@POLICY_REGISTRY.register('bdq_command') +class BDQCommandModePolicy(BDQPolicy, EpsCommandModePolicy): + pass + + @POLICY_REGISTRY.register('dqn_command') class DQNCommandModePolicy(DQNPolicy, EpsCommandModePolicy): pass diff --git a/ding/rl_utils/__init__.py b/ding/rl_utils/__init__.py index ee39b8a318..2ced88ccef 100644 --- a/ding/rl_utils/__init__.py +++ b/ding/rl_utils/__init__.py @@ -11,7 +11,7 @@ nstep_return_data, nstep_return, iqn_nstep_td_data, iqn_nstep_td_error, qrdqn_nstep_td_data, qrdqn_nstep_td_error,\ fqf_nstep_td_data, fqf_nstep_td_error, fqf_calculate_fraction_loss, evaluate_quantile_at_action, \ q_nstep_sql_td_error, dqfd_nstep_td_error, dqfd_nstep_td_data, q_v_1step_td_error, q_v_1step_td_data,\ - dqfd_nstep_td_error_with_rescale, discount_cumsum + dqfd_nstep_td_error_with_rescale, discount_cumsum, bdq_nstep_td_error from 
.vtrace import vtrace_loss, compute_importance_weights from .upgo import upgo_loss from .adder import get_gae, get_gae_with_default_last_value, get_nstep_return_data, get_train_sample diff --git a/ding/rl_utils/td.py b/ding/rl_utils/td.py index 3b01115dbe..f8dd6521c6 100644 --- a/ding/rl_utils/td.py +++ b/ding/rl_utils/td.py @@ -453,6 +453,62 @@ def q_nstep_td_error( return (td_error_per_sample * weight).mean(), td_error_per_sample +def bdq_nstep_td_error( + data: namedtuple, + gamma: Union[float, list], + nstep: int = 1, + cum_reward: bool = False, + value_gamma: Optional[torch.Tensor] = None, + criterion: torch.nn.modules = nn.MSELoss(reduction='none'), +) -> torch.Tensor: + """ + Overview: + Multistep (1 step or n step) td_error for BDQ algorithm + Arguments: + - data (:obj:`q_nstep_td_data`): the input data, q_nstep_td_data to calculate loss + - gamma (:obj:`float`): discount factor + - cum_reward (:obj:`bool`): whether to use cumulative nstep reward, which is figured out when collecting data + - value_gamma (:obj:`torch.Tensor`): gamma discount value for target q_value + - criterion (:obj:`torch.nn.modules`): loss function criterion + - nstep (:obj:`int`): nstep num, default set to 1 + Returns: + - loss (:obj:`torch.Tensor`): nstep td error, 0-dim tensor + - td_error_per_sample (:obj:`torch.Tensor`): nstep td error, 1-dim tensor + Shapes: + - data (:obj:`q_nstep_td_data`): the q_nstep_td_data containing\ + ['q', 'next_n_q', 'action', 'reward', 'done'] + - q (:obj:`torch.FloatTensor`): :math:`(B, N)` i.e. [batch_size, action_dim] + - next_n_q (:obj:`torch.FloatTensor`): :math:`(B, N)` + - action (:obj:`torch.LongTensor`): :math:`(B, )` + - next_n_action (:obj:`torch.LongTensor`): :math:`(B, )` + - reward (:obj:`torch.FloatTensor`): :math:`(T, B)`, where T is timestep(nstep) + - done (:obj:`torch.BoolTensor`) :math:`(B, )`, whether done in last timestep + - td_error_per_sample (:obj:`torch.FloatTensor`): :math:`(B, )` + """ + q, next_n_q, action, next_n_action, reward, done, weight = data + if weight is None: + weight = torch.ones_like(reward) + reward = reward.unsqueeze(-1) + weight = weight.unsqueeze(-1) + done = done.unsqueeze(-1) + if value_gamma is not None: + value_gamma = value_gamma.unsqueeze(-1) + + q_s_a = q.gather(-1, action.unsqueeze(-1)).squeeze(-1) + target_q_s_a = next_n_q.gather(-1, next_n_action.unsqueeze(-1)).squeeze(-1) + + if cum_reward: + if value_gamma is None: + target_q_s_a = reward + (gamma ** nstep) * target_q_s_a * (1 - done) + else: + target_q_s_a = reward + value_gamma * target_q_s_a * (1 - done) + else: + target_q_s_a = nstep_return(nstep_return_data(reward, target_q_s_a, done), gamma, nstep, value_gamma) + td_error_per_sample = criterion(q_s_a, target_q_s_a.detach()) + td_error_per_sample = td_error_per_sample.mean(-1) + return (td_error_per_sample * weight).mean(), td_error_per_sample + + def shape_fn_qntd_rescale(args, kwargs): r""" Overview: diff --git a/dizoo/mujoco/config/halfcheetah_bdq_config.py b/dizoo/mujoco/config/halfcheetah_bdq_config.py new file mode 100644 index 0000000000..3a4cbfd388 --- /dev/null +++ b/dizoo/mujoco/config/halfcheetah_bdq_config.py @@ -0,0 +1,71 @@ +from easydict import EasyDict + +halfcheetah_bdq_config = dict( + exp_name='halfcheetah_bdq_seed0', + env=dict( + env_id='HalfCheetah-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=12000, + action_per_branch=2, + ), + policy=dict( + cuda=False, + priority=False, + 
discount_factor=0.99, + nstep=1, + model=dict( + obs_shape=17, + num_branches=6, + action_per_branch=2, # mean the action shape is 11, 11 discrete actions + encoder_hidden_size_list=[256, 256, 128], + ), + + learn=dict( + batch_size=512, + learning_rate=3e-4, + ignore_done=True, + target_update_freq=500, + target_update_theta=0.001, + update_per_collect=20, + ), + collect=dict( + n_sample=256, + unroll_len=1, + ), + eval=dict(evaluator=dict(eval_freq=1000, )), + other=dict( + # Epsilon greedy with decay. + eps=dict( + # Decay type. Support ['exp', 'linear']. + type='exp', + start=1, + end=0.05, + decay=int(1e5), + ), + replay_buffer=dict(replay_buffer_size=int(1e6), ) + ), + ), +) +halfcheetah_bdq_config = EasyDict(halfcheetah_bdq_config) +main_config = halfcheetah_bdq_config + +halfcheetah_bdq_create_config = dict( + env=dict( + type='mujoco', + import_names=['dizoo.mujoco.envs.mujoco_env'], + ), + env_manager=dict(type='subprocess'), + # env_manager=dict(type='subprocess'), + policy=dict(type='bdq', ), +) +halfcheetah_bdq_create_config = EasyDict(halfcheetah_bdq_create_config) +create_config = halfcheetah_bdq_create_config + +if __name__ == "__main__": + # or you can enter `ding -m serial_onpolicy -c halfcheetah_onbdq_config.py -s 0` + from ding.entry import serial_pipeline + serial_pipeline((main_config, create_config), seed=0) \ No newline at end of file diff --git a/dizoo/mujoco/config/hopper_bdq_config.py b/dizoo/mujoco/config/hopper_bdq_config.py new file mode 100644 index 0000000000..c5b9b50155 --- /dev/null +++ b/dizoo/mujoco/config/hopper_bdq_config.py @@ -0,0 +1,73 @@ +from easydict import EasyDict + +hopper_bdq_config = dict( + exp_name='hopper_bdq_seed0', + env=dict( + env_id='Hopper-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=int(1e6), + action_per_branch=4, + ), + policy=dict( + cuda=False, + priority=False, + discount_factor=0.99, + model=dict( + obs_shape=11, + num_branches=3, + action_per_branch=4, # mean the action shape is 11, 11 discrete actions + encoder_hidden_size_list=[256, 256, 128], + ), + nstep=1, + learn=dict( + ignore_done=False, + batch_size=512, + learning_rate=3e-4, + # Frequency of target network update. + target_update_freq=500, + target_update_theta=0.001, + update_per_collect=20, + ), + collect=dict( + # You can use either "n_sample" or "n_episode" in collector.collect. + # Get "n_sample" samples per collect. + n_sample=256, + # Cut trajectories into pieces with length "unroll_len". + unroll_len=1, + ), + eval=dict(evaluator=dict(eval_freq=1000, )), + other=dict( + # Epsilon greedy with decay. + eps=dict( + # Decay type. Support ['exp', 'linear']. 
+ type='exp', + start=1, + end=0.05, + decay=int(1e5), + ), + replay_buffer=dict(replay_buffer_size=int(1e6), ) + ), + ), +) +hopper_bdq_config = EasyDict(hopper_bdq_config) +main_config = hopper_bdq_config + +hopper_bdq_create_config = dict( + env=dict( + type='mujoco', + import_names=['dizoo.mujoco.envs.mujoco_env'], + ), + env_manager=dict(type='subprocess'), + policy=dict(type='bdq', ), +) +hopper_bdq_create_config = EasyDict(hopper_bdq_create_config) +create_config = hopper_bdq_create_config + +if __name__ == "__main__": + # or you can enter `ding -m serial_onpolicy -c hopper_bdq_config.py -s 0` + from ding.entry import serial_pipeline + serial_pipeline([main_config, create_config], seed=0) diff --git a/dizoo/mujoco/envs/mujoco_env.py b/dizoo/mujoco/envs/mujoco_env.py index 1c6373f51d..3db13f1468 100644 --- a/dizoo/mujoco/envs/mujoco_env.py +++ b/dizoo/mujoco/envs/mujoco_env.py @@ -39,6 +39,10 @@ def __init__(self, cfg: dict) -> None: self._replay_path = None self._replay_path_gif = cfg.replay_path_gif self._save_replay_gif = cfg.save_replay_gif + self._action_per_branch = cfg.action_per_branch if 'action_per_branch' in cfg else None + + def map_action(self, action: Union[np.ndarray, list]) -> Union[np.ndarray, list]: + return [2 * x / (self._action_per_branch - 1) - 1 for x in action] def reset(self) -> np.ndarray: if not self._init_flag: @@ -65,7 +69,7 @@ def reset(self) -> np.ndarray: self._env.seed(self._seed) obs = self._env.reset() obs = to_ndarray(obs).astype('float32') - self._eval_episode_return = 0. + self._final_eval_reward = 0. return obs @@ -80,13 +84,15 @@ def seed(self, seed: int, dynamic_seed: bool = True) -> None: np.random.seed(self._seed) def step(self, action: Union[np.ndarray, list]) -> BaseEnvTimestep: + if self._action_per_branch: + action = self.map_action(action) action = to_ndarray(action) if self._save_replay_gif: self._frames.append(self._env.render(mode='rgb_array')) if self._action_clip: action = np.clip(action, -1, 1) obs, rew, done, info = self._env.step(action) - self._eval_episode_return += rew + self._final_eval_reward += rew if done: if self._save_replay_gif: path = os.path.join( @@ -94,7 +100,7 @@ def step(self, action: Union[np.ndarray, list]) -> BaseEnvTimestep: ) save_frames_as_gif(self._frames, path) self._save_replay_count += 1 - info['eval_episode_return'] = self._eval_episode_return + info['final_eval_reward'] = self._final_eval_reward obs = to_ndarray(obs).astype(np.float32) rew = to_ndarray([rew]).astype(np.float32) From ebe403ec786eb899fab7f0584da7e045b9c53fe9 Mon Sep 17 00:00:00 2001 From: Cloud-Pku Date: Thu, 15 Dec 2022 17:22:51 +0800 Subject: [PATCH 02/10] after run reformat --- ding/model/template/q_learning.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/ding/model/template/q_learning.py b/ding/model/template/q_learning.py index 5177a87a7c..bc8a0bf152 100644 --- a/ding/model/template/q_learning.py +++ b/ding/model/template/q_learning.py @@ -155,9 +155,15 @@ def __init__( # head self.head = BranchingHead( - head_hidden_size, num_branches=self.num_branches, action_per_branch=action_per_branch, - layer_num=layer_num, a_layer_num=a_layer_num, v_layer_num=v_layer_num, activation=activation, - norm_type=norm_type) + head_hidden_size, + num_branches=self.num_branches, + action_per_branch=action_per_branch, + layer_num=layer_num, + a_layer_num=a_layer_num, + v_layer_num=v_layer_num, + activation=activation, + norm_type=norm_type + ) def forward(self, x: torch.Tensor) -> Dict: r""" From 
a2f30c2010058cfa867df3ef52f41faa5e5a6280 Mon Sep 17 00:00:00 2001 From: Cloud-Pku Date: Thu, 15 Dec 2022 17:25:58 +0800 Subject: [PATCH 03/10] update mujoco_env --- dizoo/mujoco/envs/mujoco_env.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dizoo/mujoco/envs/mujoco_env.py b/dizoo/mujoco/envs/mujoco_env.py index 3db13f1468..2bb2e7d2ff 100644 --- a/dizoo/mujoco/envs/mujoco_env.py +++ b/dizoo/mujoco/envs/mujoco_env.py @@ -69,7 +69,7 @@ def reset(self) -> np.ndarray: self._env.seed(self._seed) obs = self._env.reset() obs = to_ndarray(obs).astype('float32') - self._final_eval_reward = 0. + self._eval_episode_return = 0. return obs @@ -92,7 +92,7 @@ def step(self, action: Union[np.ndarray, list]) -> BaseEnvTimestep: if self._action_clip: action = np.clip(action, -1, 1) obs, rew, done, info = self._env.step(action) - self._final_eval_reward += rew + self._eval_episode_return += rew if done: if self._save_replay_gif: path = os.path.join( @@ -100,7 +100,7 @@ def step(self, action: Union[np.ndarray, list]) -> BaseEnvTimestep: ) save_frames_as_gif(self._frames, path) self._save_replay_count += 1 - info['final_eval_reward'] = self._final_eval_reward + info['eval_episode_return'] = self._eval_episode_return obs = to_ndarray(obs).astype(np.float32) rew = to_ndarray([rew]).astype(np.float32) From da07a6a725c27076776478264b737ebb58449b82 Mon Sep 17 00:00:00 2001 From: Cloud-Pku Date: Tue, 27 Dec 2022 18:22:11 +0800 Subject: [PATCH 04/10] add unittest; extend n-step TD; polished; --- ding/model/common/head.py | 20 +++++++++---- ding/model/template/__init__.py | 2 +- ding/model/template/q_learning.py | 25 +++++++++------- ding/model/template/tests/test_q_learning.py | 21 ++++++++++++- ding/policy/bdq.py | 16 +++++++--- ding/rl_utils/td.py | 18 +++++++---- ding/rl_utils/tests/test_td.py | 30 ++++++++++++++++++- dizoo/mujoco/config/halfcheetah_bdq_config.py | 9 ++---- dizoo/mujoco/config/hopper_bdq_config.py | 10 +++---- dizoo/mujoco/envs/mujoco_env.py | 22 ++++++++++++-- 10 files changed, 129 insertions(+), 44 deletions(-) diff --git a/ding/model/common/head.py b/ding/model/common/head.py index 94a6b2ba14..e60ed0fd50 100644 --- a/ding/model/common/head.py +++ b/ding/model/common/head.py @@ -180,7 +180,7 @@ def __init__( self, hidden_size: int, num_branches: int = 0, - action_per_branch: int = 2, + action_bins_per_branch: int = 2, layer_num: int = 1, a_layer_num: Optional[int] = None, v_layer_num: Optional[int] = None, @@ -190,11 +190,15 @@ def __init__( ) -> None: """ Overview: - Init the ``BranchingHead`` layers according to the provided arguments. + Init the ``BranchingHead`` layers according to the provided arguments. \ + This head achieves a linear increase of the number of network outputs \ + with the number of degrees of freedom by allowing a level of independence \ + for each individual action dimension. + Therefore, this head is suitable for high dimensional action Spaces. Arguments: - hidden_size (:obj:`int`): The ``hidden_size`` of the MLP connected to ``BranchingHead``. - num_branches (:obj:`int`): The number of branches, which is equivalent to the action dimension. - - action_per_branch (:obj:`int`): The number of actions in each dimension. + - action_bins_per_branch (:obj:int): The number of action bins in each dimension. - layer_num (:obj:`int`): The number of layers used in the network to compute Advantage and Value output. - a_layer_num (:obj:`int`): The number of layers used in the network to compute Advantage output. 
- v_layer_num (:obj:`int`): The number of layers used in the network to compute Value output. @@ -212,7 +216,7 @@ def __init__( if v_layer_num is None: v_layer_num = layer_num self.num_branches = num_branches - self.action_per_branch = action_per_branch + self.action_bins_per_branch = action_bins_per_branch layer = NoiseLinearLayer if noise else nn.Linear block = noise_block if noise else fc_block @@ -230,7 +234,7 @@ def __init__( ), block(hidden_size, 1) ) # action branching network - action_output_dim = action_per_branch + action_output_dim = action_bins_per_branch self.branches = nn.ModuleList( [ nn.Sequential( @@ -263,7 +267,7 @@ def forward(self, x: torch.Tensor) -> Dict: >>> head = BranchingHead(64, 5, 2) >>> inputs = torch.randn(4, 64) >>> outputs = head(inputs) - >>> assert isinstance(outputs, dict) and outputs['logit'].shape == torch.Size([4, 10]) + >>> assert isinstance(outputs, dict) and outputs['logit'].shape == torch.Size([4, 5, 2]) """ value_out = self.V(x) value_out = torch.unsqueeze(value_out, 1) @@ -271,6 +275,10 @@ def forward(self, x: torch.Tensor) -> Dict: for b in self.branches: action_out.append(b(x)) action_scores = torch.stack(action_out, 1) + ''' + From the paper, this implementation performs better than both the naive alternative (Q = V + A) \ + and the local maximum reduction method (Q = V + max(A)). + ''' action_scores = action_scores - torch.mean(action_scores, 2, keepdim=True) logits = value_out + action_scores return {'logit': logits} diff --git a/ding/model/template/__init__.py b/ding/model/template/__init__.py index 88ee115373..d4907a510d 100644 --- a/ding/model/template/__init__.py +++ b/ding/model/template/__init__.py @@ -1,5 +1,5 @@ # general -from .q_learning import DQN, RainbowDQN, QRDQN, IQN, FQF, DRQN, C51DQN +from .q_learning import DQN, RainbowDQN, QRDQN, IQN, FQF, DRQN, C51DQN, BDQ from .qac import QAC, DiscreteQAC from .pdqn import PDQN from .vac import VAC diff --git a/ding/model/template/q_learning.py b/ding/model/template/q_learning.py index bc8a0bf152..587ec4899a 100644 --- a/ding/model/template/q_learning.py +++ b/ding/model/template/q_learning.py @@ -105,7 +105,7 @@ def __init__( self, obs_shape: Union[int, SequenceType], num_branches: int = 0, - action_per_branch: int = 2, + action_bins_per_branch: int = 2, layer_num: int = 3, a_layer_num: Optional[int] = None, v_layer_num: Optional[int] = None, @@ -116,11 +116,14 @@ def __init__( ) -> None: """ Overview: - Init the BDQ (encoder + head) Model according to input arguments. + Init the BDQ (encoder + head) Model according to input arguments. \ + referenced paper Action Branching Architectures for Deep Reinforcement Learning \ + Arguments: - obs_shape (:obj:`Union[int, SequenceType]`): Observation space shape, such as 8 or [4, 84, 84]. - - num_branches (:obj:`int`): The number of branches, which is equivalent to the action dimension, such as 6. - - action_per_branch (:obj:`int`): The number of actions in each dimension. + - num_branches (:obj:`int`): The number of branches, which is equivalent to the action dimension, \ + such as 6 in mujoco's halfcheetah environment. + - action_bins_per_branch (:obj:`int`): The number of actions in each dimension. - layer_num (:obj:`int`): The number of layers used in the network to compute Advantage and Value output. - a_layer_num (:obj:`int`): The number of layers used in the network to compute Advantage output. - v_layer_num (:obj:`int`): The number of layers used in the network to compute Value output. 
@@ -134,7 +137,7 @@ def __init__( """ super(BDQ, self).__init__() # For compatibility: 1, (1, ), [4, 32, 32] - obs_shape = squeeze(obs_shape) + obs_shape, num_branches = squeeze(obs_shape), squeeze(num_branches) if head_hidden_size is None: head_hidden_size = encoder_hidden_size_list[-1] @@ -151,13 +154,13 @@ def __init__( ) self.num_branches = num_branches - self.action_per_branch = action_per_branch + self.action_bins_per_branch = action_bins_per_branch # head self.head = BranchingHead( head_hidden_size, num_branches=self.num_branches, - action_per_branch=action_per_branch, + action_bins_per_branch=self.action_bins_per_branch, layer_num=layer_num, a_layer_num=a_layer_num, v_layer_num=v_layer_num, @@ -178,14 +181,14 @@ def forward(self, x: torch.Tensor) -> Dict: Shapes: - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is ``obs_shape`` - logit (:obj:`torch.FloatTensor`): :math:`(B, M)`, where B is batch size and M is - ``num_branches * action_per_branch`` + ``num_branches * action_bins_per_branch`` Examples: - >>> model = BDQ(8, 5, 2) # arguments: 'obs_shape', 'num_branches' and 'action_per_branch'. + >>> model = BDQ(8, 5, 2) # arguments: 'obs_shape', 'num_branches' and 'action_bins_per_branch'. >>> inputs = torch.randn(4, 8) >>> outputs = model(inputs) - >>> assert isinstance(outputs, dict) and outputs['logit'].shape == torch.Size([4, 10]) + >>> assert isinstance(outputs, dict) and outputs['logit'].shape == torch.Size([4, 5, 2]) """ - x = self.encoder(x) + x = self.encoder(x) / (self.num_branches + 1) # corresponds to the "Gradient Rescaling" in the paper x = self.head(x) return x diff --git a/ding/model/template/tests/test_q_learning.py b/ding/model/template/tests/test_q_learning.py index b444becdf6..303481cb1c 100644 --- a/ding/model/template/tests/test_q_learning.py +++ b/ding/model/template/tests/test_q_learning.py @@ -1,7 +1,7 @@ import pytest from itertools import product import torch -from ding.model.template import DQN, RainbowDQN, QRDQN, IQN, FQF, DRQN, C51DQN +from ding.model.template import DQN, RainbowDQN, QRDQN, IQN, FQF, DRQN, C51DQN, BDQ from ding.torch_utils import is_differentiable T, B = 3, 4 @@ -40,6 +40,25 @@ def test_dqn(self, obs_shape, act_shape): assert outputs['logit'][i].shape == (B, s) self.output_check(model, outputs['logit']) + @pytest.mark.parametrize('obs_shape, act_shape', args) + def test_bdq(self, obs_shape, act_shape): + if isinstance(obs_shape, int): + inputs = torch.randn(B, obs_shape) + else: + inputs = torch.randn(B, *obs_shape) + if not isinstance(act_shape, int) and len(act_shape) > 1: + return + num_branches = act_shape + for action_bins_per_branch in range(1, 10): + model = BDQ(obs_shape, num_branches, action_bins_per_branch) + outputs = model(inputs) + assert isinstance(outputs, dict) + if isinstance(act_shape, int): + assert outputs['logit'].shape == (B, act_shape, action_bins_per_branch) + else: + assert outputs['logit'].shape == (B, *act_shape, action_bins_per_branch) + self.output_check(model, outputs['logit']) + @pytest.mark.parametrize('obs_shape, act_shape', args) def test_rainbowdqn(self, obs_shape, act_shape): if isinstance(obs_shape, int): diff --git a/ding/policy/bdq.py b/ding/policy/bdq.py index 920f8b7c07..e1366d38ce 100644 --- a/ding/policy/bdq.py +++ b/ding/policy/bdq.py @@ -17,8 +17,12 @@ class BDQPolicy(Policy): r""" Overview: - Policy class of BDQ algorithm, extended by PER/multi-step TD. - + Policy class of BDQ algorithm, extended by PER/multi-step TD. 
\ + referenced paper Action Branching Architectures for Deep Reinforcement Learning \ + + .. note:: + BDQ algorithm contains a neural architecture featuring a shared decision module \ + followed by several network branches, one for each action dimension. Config: == ==================== ======== ============== ======================================== ======================= ID Symbol Type Default Value Description Other(Shape) @@ -228,7 +232,7 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: # after update # ============= self._target_model.update(self._learn_model.state_dict()) - return { + update_info = { 'cur_lr': self._optimizer.defaults['lr'], 'total_loss': loss.item(), 'q_value': q_value.mean().item(), @@ -237,9 +241,13 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: # Only discrete action satisfying len(data['action'])==1 can return this and draw histogram on tensorboard. # '[histogram]action_distribution': data['action'], } + q_value_per_branch = torch.mean(q_value, 2, keepdim=False) + for i in range(self._model.num_branches): + update_info['q_value_b_' + str(i)] = q_value_per_branch[:, 0].mean().item() + return update_info def _monitor_vars_learn(self) -> List[str]: - return ['cur_lr', 'total_loss', 'q_value'] + return ['cur_lr', 'total_loss', 'q_value'] + ['q_value_b_' + str(i) for i in range(self._model.num_branches)] def _state_dict_learn(self) -> Dict[str, Any]: """ diff --git a/ding/rl_utils/td.py b/ding/rl_utils/td.py index f8dd6521c6..11c9263d1a 100644 --- a/ding/rl_utils/td.py +++ b/ding/rl_utils/td.py @@ -463,7 +463,15 @@ def bdq_nstep_td_error( ) -> torch.Tensor: """ Overview: - Multistep (1 step or n step) td_error for BDQ algorithm + Multistep (1 step or n step) td_error for BDQ algorithm, \ + referenced paper Action Branching Architectures for Deep Reinforcement Learning \ + + In fact, the original paper only provides the 1-step TD-error calculation method, \ + and here we extend the calculation method of n-step. + TD-error: + y_d = \sigma_{t=0}^{nstep} \gamma^t * r_t + \gamma^{nstep} * Q_d'(s', argmax Q_d(s', a_d)) + TD-error = \frac{1}{D} * (y_d - Q_d(s, a_d))^2 + Loss = mean(TD-error) Arguments: - data (:obj:`q_nstep_td_data`): the input data, q_nstep_td_data to calculate loss - gamma (:obj:`float`): discount factor @@ -477,10 +485,10 @@ def bdq_nstep_td_error( Shapes: - data (:obj:`q_nstep_td_data`): the q_nstep_td_data containing\ ['q', 'next_n_q', 'action', 'reward', 'done'] - - q (:obj:`torch.FloatTensor`): :math:`(B, N)` i.e. [batch_size, action_dim] - - next_n_q (:obj:`torch.FloatTensor`): :math:`(B, N)` - - action (:obj:`torch.LongTensor`): :math:`(B, )` - - next_n_action (:obj:`torch.LongTensor`): :math:`(B, )` + - q (:obj:`torch.FloatTensor`): :math:`(B, D, N)` i.e. 
[batch_size, branch_num, action_bins_per_branch] + - next_n_q (:obj:`torch.FloatTensor`): :math:`(B, D, N)` + - action (:obj:`torch.LongTensor`): :math:`(B, D)` + - next_n_action (:obj:`torch.LongTensor`): :math:`(B, D)` - reward (:obj:`torch.FloatTensor`): :math:`(T, B)`, where T is timestep(nstep) - done (:obj:`torch.BoolTensor`) :math:`(B, )`, whether done in last timestep - td_error_per_sample (:obj:`torch.FloatTensor`): :math:`(B, )` diff --git a/ding/rl_utils/tests/test_td.py b/ding/rl_utils/tests/test_td.py index 3b792ddc35..e96ca37fab 100644 --- a/ding/rl_utils/tests/test_td.py +++ b/ding/rl_utils/tests/test_td.py @@ -4,7 +4,7 @@ td_lambda_error, q_nstep_td_error_with_rescale, dist_1step_td_data, dist_1step_td_error, dist_nstep_td_data,\ dqfd_nstep_td_data, dqfd_nstep_td_error, dist_nstep_td_error, v_1step_td_data, v_1step_td_error, v_nstep_td_data,\ v_nstep_td_error, q_nstep_sql_td_error, iqn_nstep_td_data, iqn_nstep_td_error,\ - fqf_nstep_td_data, fqf_nstep_td_error, qrdqn_nstep_td_data, qrdqn_nstep_td_error + fqf_nstep_td_data, fqf_nstep_td_error, qrdqn_nstep_td_data, qrdqn_nstep_td_error, bdq_nstep_td_error from ding.rl_utils.td import shape_fn_dntd, shape_fn_qntd, shape_fn_td_lambda, shape_fn_qntd_rescale @@ -35,6 +35,34 @@ def test_q_nstep_td(): assert isinstance(q.grad, torch.Tensor) +@pytest.mark.unittest +def test_bdq_nstep_td(): + batch_size = 8 + branch_num = 6 + action_per_branch = 3 + next_q = torch.randn(batch_size, branch_num, action_per_branch) + done = torch.randn(batch_size) + action = torch.randint(0, action_per_branch, size=(batch_size, branch_num)) + next_action = torch.randint(0, action_per_branch, size=(batch_size, branch_num)) + for nstep in range(1, 10): + q = torch.randn(batch_size, branch_num, action_per_branch).requires_grad_(True) + reward = torch.rand(nstep, batch_size) + data = q_nstep_td_data(q, next_q, action, next_action, reward, done, None) + loss, td_error_per_sample = bdq_nstep_td_error(data, 0.95, nstep=nstep) + assert td_error_per_sample.shape == (batch_size, ) + assert loss.shape == () + assert q.grad is None + loss.backward() + assert isinstance(q.grad, torch.Tensor) + data = q_nstep_td_data(q, next_q, action, next_action, reward, done, None) + loss, td_error_per_sample = q_nstep_td_error(data, 0.95, nstep=nstep, cum_reward=True) + value_gamma = torch.tensor(0.9) + data = q_nstep_td_data(q, next_q, action, next_action, reward, done, None) + loss, td_error_per_sample = q_nstep_td_error(data, 0.95, nstep=nstep, cum_reward=True, value_gamma=value_gamma) + loss.backward() + assert isinstance(q.grad, torch.Tensor) + + @pytest.mark.unittest def test_q_nstep_td_ngu(): batch_size = 4 diff --git a/dizoo/mujoco/config/halfcheetah_bdq_config.py b/dizoo/mujoco/config/halfcheetah_bdq_config.py index 3a4cbfd388..145bf8062e 100644 --- a/dizoo/mujoco/config/halfcheetah_bdq_config.py +++ b/dizoo/mujoco/config/halfcheetah_bdq_config.py @@ -4,13 +4,12 @@ exp_name='halfcheetah_bdq_seed0', env=dict( env_id='HalfCheetah-v3', - norm_obs=dict(use_norm=False, ), norm_reward=dict(use_norm=False, ), collector_env_num=8, evaluator_env_num=8, n_evaluator_episode=8, stop_value=12000, - action_per_branch=2, + action_bins_per_branch=2, ), policy=dict( cuda=False, @@ -20,7 +19,7 @@ model=dict( obs_shape=17, num_branches=6, - action_per_branch=2, # mean the action shape is 11, 11 discrete actions + action_bins_per_branch=2, # mean the action shape is 6, 2 discrete actions for each action dimension encoder_hidden_size_list=[256, 256, 128], ), @@ -29,7 +28,6 @@ 
learning_rate=3e-4, ignore_done=True, target_update_freq=500, - target_update_theta=0.001, update_per_collect=20, ), collect=dict( @@ -59,7 +57,6 @@ import_names=['dizoo.mujoco.envs.mujoco_env'], ), env_manager=dict(type='subprocess'), - # env_manager=dict(type='subprocess'), policy=dict(type='bdq', ), ) halfcheetah_bdq_create_config = EasyDict(halfcheetah_bdq_create_config) @@ -68,4 +65,4 @@ if __name__ == "__main__": # or you can enter `ding -m serial_onpolicy -c halfcheetah_onbdq_config.py -s 0` from ding.entry import serial_pipeline - serial_pipeline((main_config, create_config), seed=0) \ No newline at end of file + serial_pipeline((main_config, create_config), seed=0, max_env_step=10000000,) \ No newline at end of file diff --git a/dizoo/mujoco/config/hopper_bdq_config.py b/dizoo/mujoco/config/hopper_bdq_config.py index c5b9b50155..de08da2a7a 100644 --- a/dizoo/mujoco/config/hopper_bdq_config.py +++ b/dizoo/mujoco/config/hopper_bdq_config.py @@ -4,32 +4,30 @@ exp_name='hopper_bdq_seed0', env=dict( env_id='Hopper-v3', - norm_obs=dict(use_norm=False, ), norm_reward=dict(use_norm=False, ), collector_env_num=8, evaluator_env_num=8, n_evaluator_episode=8, stop_value=int(1e6), - action_per_branch=4, + action_bins_per_branch=4, ), policy=dict( cuda=False, priority=False, discount_factor=0.99, + nstep=3, model=dict( obs_shape=11, num_branches=3, - action_per_branch=4, # mean the action shape is 11, 11 discrete actions + action_bins_per_branch=4, # mean the action shape is 3, 4 discrete actions for each action dimension encoder_hidden_size_list=[256, 256, 128], ), - nstep=1, learn=dict( ignore_done=False, batch_size=512, learning_rate=3e-4, # Frequency of target network update. target_update_freq=500, - target_update_theta=0.001, update_per_collect=20, ), collect=dict( @@ -70,4 +68,4 @@ if __name__ == "__main__": # or you can enter `ding -m serial_onpolicy -c hopper_bdq_config.py -s 0` from ding.entry import serial_pipeline - serial_pipeline([main_config, create_config], seed=0) + serial_pipeline([main_config, create_config], seed=0, max_env_step=10000000,) diff --git a/dizoo/mujoco/envs/mujoco_env.py b/dizoo/mujoco/envs/mujoco_env.py index 2bb2e7d2ff..c150581a5b 100644 --- a/dizoo/mujoco/envs/mujoco_env.py +++ b/dizoo/mujoco/envs/mujoco_env.py @@ -29,6 +29,7 @@ def default_config(cls: type) -> EasyDict: replay_path=None, save_replay_gif=False, replay_path_gif=None, + action_bins_per_branch=None, ) def __init__(self, cfg: dict) -> None: @@ -39,10 +40,25 @@ def __init__(self, cfg: dict) -> None: self._replay_path = None self._replay_path_gif = cfg.replay_path_gif self._save_replay_gif = cfg.save_replay_gif - self._action_per_branch = cfg.action_per_branch if 'action_per_branch' in cfg else None + self._action_bins_per_branch = cfg.action_bins_per_branch def map_action(self, action: Union[np.ndarray, list]) -> Union[np.ndarray, list]: - return [2 * x / (self._action_per_branch - 1) - 1 for x in action] + """ + Overview: + Map the discretized action index to the action in the original action space. + Arguments: + - action (:obj:`np.ndarray or list`): The discretized action index. \ + The value ranges is {0, 1, ..., self._action_bins_per_branch - 1}. + Returns: + - outputs (:obj:`list`): The action in the original action space. \ + The value ranges is [-1, 1]. 
+ Examples: + >>> inputs = [2, 0, 4] + >>> self._action_bins_per_branch = 5 + >>> outputs = map_action(inputs) + >>> assert isinstance(outputs, list) and outputs == [0.0, -1.0, 1.0] + """ + return [2 * x / (self._action_bins_per_branch - 1) - 1 for x in action] def reset(self) -> np.ndarray: if not self._init_flag: @@ -84,7 +100,7 @@ def seed(self, seed: int, dynamic_seed: bool = True) -> None: np.random.seed(self._seed) def step(self, action: Union[np.ndarray, list]) -> BaseEnvTimestep: - if self._action_per_branch: + if self._action_bins_per_branch: action = self.map_action(action) action = to_ndarray(action) if self._save_replay_gif: From 506ec0a92c6a00d35c5ba6b152b7606207e29a50 Mon Sep 17 00:00:00 2001 From: Cloud-Pku Date: Tue, 27 Dec 2022 19:19:08 +0800 Subject: [PATCH 05/10] fix one error --- ding/policy/bdq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ding/policy/bdq.py b/ding/policy/bdq.py index e1366d38ce..618078a717 100644 --- a/ding/policy/bdq.py +++ b/ding/policy/bdq.py @@ -243,7 +243,7 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: } q_value_per_branch = torch.mean(q_value, 2, keepdim=False) for i in range(self._model.num_branches): - update_info['q_value_b_' + str(i)] = q_value_per_branch[:, 0].mean().item() + update_info['q_value_b_' + str(i)] = q_value_per_branch[:, i].mean().item() return update_info def _monitor_vars_learn(self) -> List[str]: From e41b361601848cd59869e2220f89d54156ecc256 Mon Sep 17 00:00:00 2001 From: Cloud-Pku Date: Thu, 29 Dec 2022 11:27:08 +0800 Subject: [PATCH 06/10] fixed one error --- ding/rl_utils/tests/test_td.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ding/rl_utils/tests/test_td.py b/ding/rl_utils/tests/test_td.py index e96ca37fab..34a6fb4ae0 100644 --- a/ding/rl_utils/tests/test_td.py +++ b/ding/rl_utils/tests/test_td.py @@ -55,10 +55,10 @@ def test_bdq_nstep_td(): loss.backward() assert isinstance(q.grad, torch.Tensor) data = q_nstep_td_data(q, next_q, action, next_action, reward, done, None) - loss, td_error_per_sample = q_nstep_td_error(data, 0.95, nstep=nstep, cum_reward=True) + loss, td_error_per_sample = bdq_nstep_td_error(data, 0.95, nstep=nstep, cum_reward=True) value_gamma = torch.tensor(0.9) data = q_nstep_td_data(q, next_q, action, next_action, reward, done, None) - loss, td_error_per_sample = q_nstep_td_error(data, 0.95, nstep=nstep, cum_reward=True, value_gamma=value_gamma) + loss, td_error_per_sample = bdq_nstep_td_error(data, 0.95, nstep=nstep, cum_reward=True, value_gamma=value_gamma) loss.backward() assert isinstance(q.grad, torch.Tensor) From 3b02f6f43f84ec59f5a624ed3db23c7dcaa9444f Mon Sep 17 00:00:00 2001 From: Cloud-Pku Date: Thu, 29 Dec 2022 12:06:10 +0800 Subject: [PATCH 07/10] fixed one error --- ding/rl_utils/td.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ding/rl_utils/td.py b/ding/rl_utils/td.py index 11c9263d1a..824dc34e45 100644 --- a/ding/rl_utils/td.py +++ b/ding/rl_utils/td.py @@ -497,7 +497,6 @@ def bdq_nstep_td_error( if weight is None: weight = torch.ones_like(reward) reward = reward.unsqueeze(-1) - weight = weight.unsqueeze(-1) done = done.unsqueeze(-1) if value_gamma is not None: value_gamma = value_gamma.unsqueeze(-1) From 75e93e96c86a5a53a7bfbbb46caf15898b5e6c8e Mon Sep 17 00:00:00 2001 From: Cloud-Pku Date: Tue, 3 Jan 2023 11:20:27 +0800 Subject: [PATCH 08/10] add test_bdq.py --- ding/policy/tests/test_bdq.py | 61 ++++++++++++++++++++++++++++++++++ ding/rl_utils/tests/test_td.py | 4 ++- 2 files changed, 64 insertions(+), 1 
From 75e93e96c86a5a53a7bfbbb46caf15898b5e6c8e Mon Sep 17 00:00:00 2001
From: Cloud-Pku
Date: Tue, 3 Jan 2023 11:20:27 +0800
Subject: [PATCH 08/10] add test_bdq.py

---
 ding/policy/tests/test_bdq.py  | 61 ++++++++++++++++++++++++++++++++++
 ding/rl_utils/tests/test_td.py |  4 ++-
 2 files changed, 64 insertions(+), 1 deletion(-)
 create mode 100644 ding/policy/tests/test_bdq.py

diff --git a/ding/policy/tests/test_bdq.py b/ding/policy/tests/test_bdq.py
new file mode 100644
index 0000000000..09be75cc8b
--- /dev/null
+++ b/ding/policy/tests/test_bdq.py
@@ -0,0 +1,61 @@
+import pytest
+import torch
+from easydict import EasyDict
+from ding.model.wrapper.model_wrappers import ArgmaxSampleWrapper, EpsGreedySampleWrapper, TargetNetworkWrapper
+from ding.policy.bdq import BDQPolicy
+from dizoo.classic_control.pendulum.envs import PendulumEnv
+
+obs_space = 3
+num_branches = 1
+action_bins_per_branch = 5
+
+cfg1 = EasyDict(BDQPolicy.config)
+cfg1.model = {}
+cfg1.model.obs_shape = obs_space
+cfg1.model.num_branches = num_branches
+cfg1.model.action_bins_per_branch = action_bins_per_branch
+
+
+def get_batch(size=8):
+    data = {}
+    for i in range(size):
+        obs = torch.zeros(obs_space)
+        data[i] = obs
+    return data
+
+
+def get_transition(size=20):
+    data = []
+    for i in range(size):
+        sample = {}
+        sample['obs'] = torch.zeros(obs_space)
+        sample['action'] = torch.randint(0, action_bins_per_branch, (num_branches, ))
+        sample['done'] = False
+        sample['next_obs'] = torch.zeros(obs_space)
+        sample['reward'] = torch.Tensor([1.])
+        data.append(sample)
+    return data
+
+
+@pytest.mark.parametrize('cfg', [cfg1])
+@pytest.mark.unittest
+def test_bdq(cfg):
+    policy = BDQPolicy(cfg, enable_field=['collect', 'eval', 'learn'])
+    assert type(policy._learn_model) == ArgmaxSampleWrapper
+    assert type(policy._target_model) == TargetNetworkWrapper
+    assert type(policy._collect_model) == EpsGreedySampleWrapper
+    batch_obs = get_batch()
+    policy._forward_eval(batch_obs)
+    policy._forward_collect(batch_obs, 0.5)
+
+    sample = get_transition(size=20)
+    policy._forward_learn(sample)
+    policy._get_train_sample(sample)
+
+    env = PendulumEnv(EasyDict({'act_scale': True, 'continuous': False}))
+    env.seed(314)
+    obs = env.reset()
+    b_obs = {0: obs}
+    raw_out = policy._forward_collect(b_obs, 0.5)[0]
+    timestep = env.step(raw_out['action'].numpy())
+    transition = policy._process_transition(obs, raw_out, timestep)
diff --git a/ding/rl_utils/tests/test_td.py b/ding/rl_utils/tests/test_td.py
index 34a6fb4ae0..bcc2291e16 100644
--- a/ding/rl_utils/tests/test_td.py
+++ b/ding/rl_utils/tests/test_td.py
@@ -58,7 +58,9 @@ def test_bdq_nstep_td():
     loss, td_error_per_sample = bdq_nstep_td_error(data, 0.95, nstep=nstep, cum_reward=True)
     value_gamma = torch.tensor(0.9)
     data = q_nstep_td_data(q, next_q, action, next_action, reward, done, None)
-    loss, td_error_per_sample = bdq_nstep_td_error(data, 0.95, nstep=nstep, cum_reward=True, value_gamma=value_gamma)
+    loss, td_error_per_sample = bdq_nstep_td_error(
+        data, 0.95, nstep=nstep, cum_reward=True, value_gamma=value_gamma
+    )
     loss.backward()
     assert isinstance(q.grad, torch.Tensor)
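In the new test, a BDQ action is a vector of bin indices, one per branch, which is why `get_transition` samples `torch.randint(0, action_bins_per_branch, (num_branches, ))`. The sketch below shows how that layout relates to an assumed `(batch, num_branches, action_bins_per_branch)` logit tensor; it illustrates the convention only and is not the sample wrapper's actual code.

```python
import torch

batch, num_branches, action_bins_per_branch = 4, 1, 5

# Assumed logit layout for a branching Q-network: one row of bin scores per branch.
logit = torch.randn(batch, num_branches, action_bins_per_branch)

# Greedy selection picks the best bin independently in every branch, giving one
# integer index per action dimension, matching the transitions the test builds.
greedy_action = logit.argmax(dim=-1)
assert greedy_action.shape == (batch, num_branches)
assert int(greedy_action.max()) < action_bins_per_branch
```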
From 1598f778abed7ed295d1ab54758e25602572331a Mon Sep 17 00:00:00 2001
From: Cloud-Pku
Date: Tue, 3 Jan 2023 11:45:12 +0800
Subject: [PATCH 09/10] add readme

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 9985802416..3601d9e77c 100644
--- a/README.md
+++ b/README.md
@@ -240,6 +240,7 @@ P.S: The `.py` file in `Runnable Demo` can be found in `dizoo`
 | 48 | [ST-DIM](https://arxiv.org/pdf/1906.08226.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [torch_utils/loss/contrastive_loss](https://github.com/opendilab/DI-engine/blob/main/ding/torch_utils/loss/contrastive_loss.py) | ding -m serial -c cartpole_dqn_stdim_config.py -s 0 |
 | 49 | [PLR](https://arxiv.org/pdf/2010.03934.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [PLR doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/plr.html)<br>[data/level_replay/level_sampler](https://github.com/opendilab/DI-engine/blob/main/ding/data/level_replay/level_sampler.py) | python3 -u bigfish_plr_config.py -s 0 |
 | 50 | [PCGrad](https://arxiv.org/pdf/2001.06782.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [torch_utils/optimizer_helper/PCGrad](https://github.com/opendilab/DI-engine/blob/main/ding/data/torch_utils/optimizer_helper.py) | python3 -u multi_mnist_pcgrad_main.py -s 0 |
+| 51 | [BDQ](https://arxiv.org/pdf/1711.08946.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [policy/bdq](https://github.com/opendilab/DI-engine/blob/main/ding/policy/bdq.py) | python3 -u hopper_bdq_config.py |

From 7138bc43170e7d8bb6df7925b34b2c27345f77ef Mon Sep 17 00:00:00 2001
From: Cloud-Pku
Date: Tue, 3 Jan 2023 13:36:27 +0800
Subject: [PATCH 10/10] add pendulum_bdq test

---
 ding/entry/tests/test_serial_entry.py      | 15 +++
 ding/policy/tests/test_bdq.py              | 61 ------------------
 .../pendulum/config/pendulum_bdq_config.py | 62 +++++++++++++++++++
 3 files changed, 77 insertions(+), 61 deletions(-)
 delete mode 100644 ding/policy/tests/test_bdq.py
 create mode 100644 dizoo/classic_control/pendulum/config/pendulum_bdq_config.py

diff --git a/ding/entry/tests/test_serial_entry.py b/ding/entry/tests/test_serial_entry.py
index 0fc9c5aae6..5d83f0557f 100644
--- a/ding/entry/tests/test_serial_entry.py
+++ b/ding/entry/tests/test_serial_entry.py
@@ -51,6 +51,7 @@
 from dizoo.gym_hybrid.config.gym_hybrid_ddpg_config import gym_hybrid_ddpg_config, gym_hybrid_ddpg_create_config
 from dizoo.gym_hybrid.config.gym_hybrid_pdqn_config import gym_hybrid_pdqn_config, gym_hybrid_pdqn_create_config
 from dizoo.gym_hybrid.config.gym_hybrid_mpdqn_config import gym_hybrid_mpdqn_config, gym_hybrid_mpdqn_create_config
+from dizoo.classic_control.pendulum.config.pendulum_bdq_config import pendulum_bdq_config, pendulum_bdq_create_config  # noqa

 @pytest.mark.platformtest
 @pytest.mark.unittest
 def test_dqn():
@@ -67,6 +68,20 @@ def test_dqn():
         os.popen('rm -rf cartpole_dqn_unittest')


+@pytest.mark.platformtest
+@pytest.mark.unittest
+def test_bdq():
+    config = [deepcopy(pendulum_bdq_config), deepcopy(pendulum_bdq_create_config)]
+    config[0].policy.learn.update_per_collect = 1
+    config[0].exp_name = 'pendulum_bdq_unittest'
+    try:
+        serial_pipeline(config, seed=0, max_train_iter=1)
+    except Exception:
+        assert False, "pipeline fail"
+    finally:
+        os.popen('rm -rf pendulum_bdq_unittest')
+
+
 @pytest.mark.platformtest
 @pytest.mark.unittest
 def test_ddpg():
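For context on the README row added above: the branching architecture of the linked paper keeps the number of Q outputs linear in the action dimension, whereas a flat discretization of the joint action space grows exponentially. A back-of-the-envelope check with illustrative sizes (the D and K values below are not taken from the patch):

```python
# Outputs needed for a D-dimensional continuous action space discretized into K bins:
D, K = 6, 11
branching_outputs = D * K   # one Q-vector of length K per branch
flat_outputs = K ** D       # one Q-value per joint discrete action
print(branching_outputs, flat_outputs)  # 66 1771561
```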
diff --git a/ding/policy/tests/test_bdq.py b/ding/policy/tests/test_bdq.py
deleted file mode 100644
index 09be75cc8b..0000000000
--- a/ding/policy/tests/test_bdq.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import pytest
-import torch
-from easydict import EasyDict
-from ding.model.wrapper.model_wrappers import ArgmaxSampleWrapper, EpsGreedySampleWrapper, TargetNetworkWrapper
-from ding.policy.bdq import BDQPolicy
-from dizoo.classic_control.pendulum.envs import PendulumEnv
-
-obs_space = 3
-num_branches = 1
-action_bins_per_branch = 5
-
-cfg1 = EasyDict(BDQPolicy.config)
-cfg1.model = {}
-cfg1.model.obs_shape = obs_space
-cfg1.model.num_branches = num_branches
-cfg1.model.action_bins_per_branch = action_bins_per_branch
-
-
-def get_batch(size=8):
-    data = {}
-    for i in range(size):
-        obs = torch.zeros(obs_space)
-        data[i] = obs
-    return data
-
-
-def get_transition(size=20):
-    data = []
-    for i in range(size):
-        sample = {}
-        sample['obs'] = torch.zeros(obs_space)
-        sample['action'] = torch.randint(0, action_bins_per_branch, (num_branches, ))
-        sample['done'] = False
-        sample['next_obs'] = torch.zeros(obs_space)
-        sample['reward'] = torch.Tensor([1.])
-        data.append(sample)
-    return data
-
-
-@pytest.mark.parametrize('cfg', [cfg1])
-@pytest.mark.unittest
-def test_bdq(cfg):
-    policy = BDQPolicy(cfg, enable_field=['collect', 'eval', 'learn'])
-    assert type(policy._learn_model) == ArgmaxSampleWrapper
-    assert type(policy._target_model) == TargetNetworkWrapper
-    assert type(policy._collect_model) == EpsGreedySampleWrapper
-    batch_obs = get_batch()
-    policy._forward_eval(batch_obs)
-    policy._forward_collect(batch_obs, 0.5)
-
-    sample = get_transition(size=20)
-    policy._forward_learn(sample)
-    policy._get_train_sample(sample)
-
-    env = PendulumEnv(EasyDict({'act_scale': True, 'continuous': False}))
-    env.seed(314)
-    obs = env.reset()
-    b_obs = {0: obs}
-    raw_out = policy._forward_collect(b_obs, 0.5)[0]
-    timestep = env.step(raw_out['action'].numpy())
-    transition = policy._process_transition(obs, raw_out, timestep)
diff --git a/dizoo/classic_control/pendulum/config/pendulum_bdq_config.py b/dizoo/classic_control/pendulum/config/pendulum_bdq_config.py
new file mode 100644
index 0000000000..59bffb05b9
--- /dev/null
+++ b/dizoo/classic_control/pendulum/config/pendulum_bdq_config.py
@@ -0,0 +1,62 @@
+from easydict import EasyDict
+import sys
+sys.path.insert(0, "/mnt/lustre/chenyun/bdq_implement1/DI-engine")
+pendulum_bdq_config = dict(
+    exp_name='pendulum_bdq_seed0',
+    env=dict(
+        collector_env_num=10,
+        evaluator_env_num=5,
+        # (bool) Scale output action into legal range.
+        act_scale=True,
+        n_evaluator_episode=5,
+        stop_value=-250,
+        continuous=False,
+        # The path to save the game replay
+        # replay_path='./pendulum_bdq_seed0/video',
+    ),
+    policy=dict(
+        cuda=False,
+        load_path='pendulum_bdq_seed0/ckpt/ckpt_best.pth.tar',  # necessary for eval
+        model=dict(
+            obs_shape=3,
+            num_branches=1,
+            action_bins_per_branch=11,
+            encoder_hidden_size_list=[128, 128, 64],
+        ),
+        nstep=1,
+        discount_factor=0.97,
+        learn=dict(
+            batch_size=64,
+            learning_rate=0.001,
+        ),
+        collect=dict(n_sample=8),
+        eval=dict(evaluator=dict(eval_freq=40, )),
+        other=dict(
+            eps=dict(
+                type='exp',
+                start=0.95,
+                end=0.1,
+                decay=10000,
+            ),
+            replay_buffer=dict(replay_buffer_size=20000, ),
+        ),
+    ),
+)
+pendulum_bdq_config = EasyDict(pendulum_bdq_config)
+main_config = pendulum_bdq_config
+pendulum_bdq_create_config = dict(
+    env=dict(
+        type='pendulum',
+        import_names=['dizoo.classic_control.pendulum.envs.pendulum_env'],
+    ),
+    env_manager=dict(type='base'),
+    policy=dict(type='bdq'),
+    replay_buffer=dict(type='deque', import_names=['ding.data.buffer.deque_buffer_wrapper']),
+)
+pendulum_bdq_create_config = EasyDict(pendulum_bdq_create_config)
+create_config = pendulum_bdq_create_config
+
+if __name__ == "__main__":
+    # or you can enter `ding -m serial -c pendulum_bdq_config.py -s 0`
+    from ding.entry import serial_pipeline
+    serial_pipeline((main_config, create_config), seed=0)
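With `action_bins_per_branch=11` in this config, each branch chooses among 11 bins. Assuming the same `2 * k / (K - 1) - 1` convention as the `map_action` helper added to `mujoco_env.py` earlier in this series (whether `PendulumEnv` discretizes identically is not shown in the patch), the bins correspond to an evenly spaced action grid over `[-1, 1]`, which `act_scale=True` then rescales into the environment's native action range:

```python
# Evaluate the bin-to-action mapping for K = 11 bins per branch.
K = 11
grid = [2 * k / (K - 1) - 1 for k in range(K)]
print(grid)  # [-1.0, -0.8, ..., 0.8, 1.0]: 11 evenly spaced actions per branch
assert len(grid) == K and grid[0] == -1.0 and grid[-1] == 1.0
```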