diff --git a/dizoo/common/policy/md_dqn.py b/dizoo/common/policy/md_dqn.py
index cad59a7062..e52a47e626 100644
--- a/dizoo/common/policy/md_dqn.py
+++ b/dizoo/common/policy/md_dqn.py
@@ -3,43 +3,94 @@
 from ding.rl_utils import q_nstep_td_data, q_nstep_td_error
 from ding.policy import DQNPolicy
 from ding.utils import POLICY_REGISTRY
+from ding.policy.common_utils import default_preprocess_learn
+from ding.torch_utils import to_device
 
 
 @POLICY_REGISTRY.register('md_dqn')
 class MultiDiscreteDQNPolicy(DQNPolicy):
+    r"""
+    Overview:
+        Policy class of multi-discrete action space DQN algorithm.
+    """
 
     def _forward_learn(self, data: dict) -> Dict[str, Any]:
-        reward = data['reward']
-        if len(reward.shape) == 1:
-            reward = reward.unsqueeze(1)
-        assert reward.shape == (self._cfg.learn.batch_size, self._nstep), reward.shape
-        reward = reward.permute(1, 0).contiguous()
-        q_value = self._armor.forward(data['obs'])['logit']
-        # target_q_value = self._armor.target_forward(data['next_obs'])['logit']
-        target = self._armor.forward(data['next_obs'])
-        target_q_value = target['logit']
-        next_act = target['action']
-        if isinstance(q_value, torch.Tensor):
-            td_data = q_nstep_td_data(  # 'q', 'next_q', 'act', 'next_act', 'reward', 'done', 'weight'
-                q_value, target_q_value, data['action'][0], next_act, reward, data['done'], data['weight']
-            )
-            loss, td_error_per_sample = q_nstep_td_error(td_data, self._gamma, nstep=self._nstep)
-        else:
+        """
+        Overview:
+            Forward computation of learn mode (updating policy). It supports both single and multi-discrete action \
+            space, depending on whether ``q_value`` is a list.
+        Arguments:
+            - data (:obj:`Dict[str, Any]`): Dict type data, a batch of data for training, values are torch.Tensor or \
+                np.ndarray or dict/list combinations.
+        Returns:
+            - info_dict (:obj:`Dict[str, Any]`): Dict type data, an info dict indicating the training result, which will be \
+                recorded in text log and tensorboard, values are python scalar or a list of scalars.
+        ArgumentsKeys:
+            - necessary: ``obs``, ``action``, ``reward``, ``next_obs``, ``done``
+            - optional: ``value_gamma``, ``IS``
+        ReturnsKeys:
+            - necessary: ``cur_lr``, ``total_loss``, ``priority``
+            - optional: ``action_distribution``
+        """
+        data = default_preprocess_learn(
+            data,
+            use_priority=self._priority,
+            use_priority_IS_weight=self._cfg.priority_IS_weight,
+            ignore_done=self._cfg.learn.ignore_done,
+            use_nstep=True
+        )
+        if self._cuda:
+            data = to_device(data, self._device)
+        # ====================
+        # Q-learning forward
+        # ====================
+        self._learn_model.train()
+        self._target_model.train()
+        # Current q value (main model)
+        q_value = self._learn_model.forward(data['obs'])['logit']
+        # Target q value
+        with torch.no_grad():
+            target_q_value = self._target_model.forward(data['next_obs'])['logit']
+            # Max q value action (main model)
+            target_q_action = self._learn_model.forward(data['next_obs'])['action']
+
+        value_gamma = data.get('value_gamma')
+        if isinstance(q_value, list):
             tl_num = len(q_value)
             loss, td_error_per_sample = [], []
             for i in range(tl_num):
                 td_data = q_nstep_td_data(
-                    q_value[i], target_q_value[i], data['action'][i], next_act[i], reward, data['done'], data['weight']
+                    q_value[i], target_q_value[i], data['action'][i], target_q_action[i], data['reward'], data['done'],
+                    data['weight']
+                )
+                loss_, td_error_per_sample_ = q_nstep_td_error(
+                    td_data, self._gamma, nstep=self._nstep, value_gamma=value_gamma
                 )
-                loss_, td_error_per_sample_ = q_nstep_td_error(td_data, self._gamma, nstep=self._nstep)
                 loss.append(loss_)
                 td_error_per_sample.append(td_error_per_sample_.abs())
             loss = sum(loss) / (len(loss) + 1e-8)
             td_error_per_sample = sum(td_error_per_sample) / (len(td_error_per_sample) + 1e-8)
+        else:
+            data_n = q_nstep_td_data(
+                q_value, target_q_value, data['action'], target_q_action, data['reward'], data['done'], data['weight']
+            )
+            loss, td_error_per_sample = q_nstep_td_error(
+                data_n, self._gamma, nstep=self._nstep, value_gamma=value_gamma
+            )
+
+        # ====================
+        # Q-learning update
+        # ====================
         self._optimizer.zero_grad()
         loss.backward()
+        if self._cfg.learn.multi_gpu:
+            self.sync_gradients(self._learn_model)
         self._optimizer.step()
-        self._armor.target_update(self._armor.state_dict()['model'])
+
+        # =============
+        # after update
+        # =============
+        self._target_model.update(self._learn_model.state_dict())
         return {
             'cur_lr': self._optimizer.defaults['lr'],
             'total_loss': loss.item(),
diff --git a/dizoo/common/policy/md_ppo.py b/dizoo/common/policy/md_ppo.py
deleted file mode 100644
index 8ec3124d6b..0000000000
--- a/dizoo/common/policy/md_ppo.py
+++ /dev/null
@@ -1,57 +0,0 @@
-from typing import List, Dict, Any, Tuple, Union, Optional
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from ding.rl_utils import ppo_data, ppo_error
-from ding.policy import PPOPolicy
-from ding.utils import POLICY_REGISTRY
-from ding.model.actor_critic.value_ac import ValueAC
-
-
-class SepValueAC(ValueAC):
-
-    def _setup_encoder(self) -> torch.nn.Module:
-        return nn.Identity()
-
-
-@POLICY_REGISTRY.register('md_ppo')
-class MultiDiscretePPOPolicy(PPOPolicy):
-
-    def _forward_learn(self, data: dict) -> Dict[str, Any]:
-        output = self._armor.forward(data['obs'], param={'mode': 'compute_action_value'})
-        adv = data['adv']
-        if self._use_adv_norm:
-            adv = (adv - adv.mean()) / (adv.std() + 1e-8)
-        return_ = data['value'] + adv
-        action_num = len(data['logit'])
-        loss, info = [], []
-        for i in range(action_num):
-            data_ = ppo_data(
-                output['logit'][i], data['logit'][i], data['action'][i], output['value'], data['value'], adv, return_,
-                data['weight']
-            )
-            ppo_loss, ppo_info = ppo_error(data_, self._clip_ratio)
-            loss.append(ppo_loss)
-            info.append(ppo_info)
-        policy_loss = sum([item.policy_loss for item in loss]) / action_num
-        value_loss = sum([item.value_loss for item in loss]) / action_num
-        entropy_loss = sum([item.entropy_loss for item in loss]) / action_num
-        wv, we = self._value_weight, self._entropy_weight
-        total_loss = ppo_loss.policy_loss + wv * ppo_loss.value_loss - we * ppo_loss.entropy_loss
-
-        approx_kl = sum([item.approx_kl for item in info]) / action_num
-        clipfrac = sum([item.clipfrac for item in info]) / action_num
-
-        self._optimizer.zero_grad()
-        total_loss.backward()
-        self._optimizer.step()
-        return {
-            'cur_lr': self._optimizer.defaults['lr'],
-            'total_loss': total_loss.item(),
-            'adv_abs_max': adv.abs().max().item(),
-            'policy_loss': policy_loss.item(),
-            'value_loss': value_loss.item(),
-            'entropy_loss': entropy_loss.item(),
-            'approx_kl': approx_kl,
-            'clipfrac': clipfrac,
-        }
diff --git a/dizoo/common/policy/md_rainbow_dqn.py b/dizoo/common/policy/md_rainbow_dqn.py
index c5d639c44e..0cc8113080 100644
--- a/dizoo/common/policy/md_rainbow_dqn.py
+++ b/dizoo/common/policy/md_rainbow_dqn.py
@@ -1,49 +1,70 @@
 from typing import Dict, Any
 import torch
+from ding.torch_utils import to_device
 from ding.rl_utils import dist_nstep_td_data, dist_nstep_td_error, dist_1step_td_data, dist_1step_td_error
 from ding.policy import RainbowDQNPolicy
 from ding.utils import POLICY_REGISTRY
+from ding.policy.common_utils import default_preprocess_learn
 
 
 @POLICY_REGISTRY.register('md_rainbow_dqn')
 class MultiDiscreteRainbowDQNPolicy(RainbowDQNPolicy):
+    r"""
+    Overview:
+        Policy class of multi-discrete action space Rainbow DQN algorithm.
+ """ def _forward_learn(self, data: dict) -> Dict[str, Any]: """ Overview: - Forward and backward function of learn mode, acquire the data and calculate the loss and\ + Forward and backward function of learn mode, acquire the data and calculate the loss and \ optimize learner model Arguments: - data (:obj:`dict`): Dict type data, including at least ['obs', 'next_obs', 'reward', 'action'] Returns: - - info_dict (:obj:`Dict[str, Any]`): Including cur_lr and total_loss + - info_dict (:obj:`Dict[str, Any]`): Including cur_lr, total_loss and priority - cur_lr (:obj:`float`): current learning rate - total_loss (:obj:`float`): the calculated loss + - priority (:obj:`list`): the priority of samples """ + data = default_preprocess_learn( + data, + use_priority=self._priority, + use_priority_IS_weight=self._cfg.priority_IS_weight, + ignore_done=self._cfg.learn.ignore_done, + use_nstep=True + ) + if self._cuda: + data = to_device(data, self._device) # ==================== # Rainbow forward # ==================== - reward = data['reward'] - if len(reward.shape) == 1: - reward = reward.unsqueeze(1) - assert reward.shape == (self._cfg.learn.batch_size, self._nstep), reward.shape - reward = reward.permute(1, 0).contiguous() - # reset noise of noisenet for both main armor and target armor - self._reset_noise(self._armor.model) - self._reset_noise(self._armor.target_model) - q_dist = self._armor.forward(data['obs'])['distribution'] + self._learn_model.train() + self._target_model.train() + # reset noise of noisenet for both main model and target model + self._reset_noise(self._learn_model) + self._reset_noise(self._target_model) + q_dist = self._learn_model.forward(data['obs'])['distribution'] with torch.no_grad(): - target_q_dist = self._armor.target_forward(data['next_obs'])['distribution'] - self._reset_noise(self._armor.model) - target_q_action = self._armor.forward(data['next_obs'])['action'] + target_q_dist = self._target_model.forward(data['next_obs'])['distribution'] + self._reset_noise(self._learn_model) + target_q_action = self._learn_model.forward(data['next_obs'])['action'] + + value_gamma = data.get('value_gamma', None) if isinstance(q_dist, torch.Tensor): td_data = dist_nstep_td_data( - q_dist, target_q_dist, data['action'], target_q_action, reward, data['done'], data['weight'] + q_dist, target_q_dist, data['action'], target_q_action, data['reward'], data['done'], data['weight'] ) loss, td_error_per_sample = dist_nstep_td_error( - td_data, self._gamma, self._v_min, self._v_max, self._n_atom, nstep=self._nstep + td_data, + self._gamma, + self._v_min, + self._v_max, + self._n_atom, + nstep=self._nstep, + value_gamma=value_gamma ) else: tl_num = len(q_dist) @@ -51,11 +72,17 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: td_error_per_samples = [] for i in range(tl_num): td_data = dist_nstep_td_data( - q_dist[i], target_q_dist[i], data['action'][i], target_q_action[i], reward, data['done'], + q_dist[i], target_q_dist[i], data['action'][i], target_q_action[i], data['reward'], data['done'], data['weight'] ) td_loss, td_error_per_sample = dist_nstep_td_error( - td_data, self._gamma, self._v_min, self._v_max, self._n_atom, nstep=self._nstep + td_data, + self._gamma, + self._v_min, + self._v_max, + self._n_atom, + nstep=self._nstep, + value_gamma=value_gamma ) losses.append(td_loss) td_error_per_samples.append(td_error_per_sample) @@ -70,7 +97,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: # ============= # after update # ============= - 
-        self._armor.target_update(self._armor.state_dict()['model'])
+        self._target_model.update(self._learn_model.state_dict())
         return {
             'cur_lr': self._optimizer.defaults['lr'],
             'total_loss': loss.item(),
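
For reference, both policies above reduce a multi-discrete action space to the same pattern: the network returns a list of per-branch Q outputs, a TD loss is built for each branch (the online model selects the next action, the target model evaluates it), and the per-branch losses are averaged into one scalar. The sketch below illustrates that pattern in plain PyTorch with 1-step double-DQN targets instead of ding's n-step q_nstep_td_data/q_nstep_td_error helpers; every name in it (multi_discrete_double_dqn_loss, q_values, target_next_q_values, ...) is hypothetical and it is not the ding implementation.

# Minimal sketch of the multi-discrete TD-loss pattern used above (plain PyTorch,
# 1-step double-DQN targets; hypothetical names, NOT the ding API).
from typing import List
import torch
import torch.nn.functional as F


def multi_discrete_double_dqn_loss(
    q_values: List[torch.Tensor],              # one (B, A_i) tensor per action branch, Q(s, .)
    target_next_q_values: List[torch.Tensor],  # target net Q(s', .) per branch, (B, A_i)
    online_next_q_values: List[torch.Tensor],  # online net Q(s', .) per branch, (B, A_i)
    actions: List[torch.Tensor],               # one (B,) LongTensor per branch
    reward: torch.Tensor,                      # (B,)
    done: torch.Tensor,                        # (B,) float in {0., 1.}
    gamma: float = 0.99,
) -> torch.Tensor:
    losses = []
    for q, target_next_q, online_next_q, act in zip(
        q_values, target_next_q_values, online_next_q_values, actions
    ):
        # Q(s, a) for the action actually taken in this branch.
        q_taken = q.gather(1, act.unsqueeze(1)).squeeze(1)
        with torch.no_grad():
            # Double DQN: the online net picks the next action, the target net evaluates it,
            # mirroring target_q_action coming from self._learn_model in the patch above.
            next_act = online_next_q.argmax(dim=1)
            next_q = target_next_q.gather(1, next_act.unsqueeze(1)).squeeze(1)
            target = reward + gamma * (1. - done) * next_q
        losses.append(F.mse_loss(q_taken, target))
    # Average the per-branch losses into one scalar, as both policies do.
    return sum(losses) / len(losses)

With a single action branch (lists of length 1) this reduces to ordinary double DQN, which is why the patched policies only need an isinstance check on q_value/q_dist to support both cases.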