From 146f615429625ea053f5682b32b48fdbf27ac0d6 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Wed, 29 Mar 2023 17:19:27 +0800 Subject: [PATCH 01/25] add EDAC and modify config of td3bc --- ding/example/edac.py | 45 +++ ding/model/common/__init__.py | 4 +- ding/model/common/head.py | 50 ++- ding/model/template/__init__.py | 1 + ding/model/template/edac.py | 180 ++++++++++ ding/policy/__init__.py | 1 + ding/policy/command_mode_policy_instance.py | 5 + ding/policy/edac.py | 340 ++++++++++++++++++ ding/torch_utils/__init__.py | 0 .../config/halfcheetah_expert_cql_config.py | 6 +- .../config/halfcheetah_expert_td3bc_config.py | 8 +- .../config/halfcheetah_medium_cql_config.py | 6 +- .../config/halfcheetah_medium_edac_config.py | 60 ++++ .../halfcheetah_medium_expert_cql_config.py | 6 +- .../halfcheetah_medium_expert_edac_config.py | 60 ++++ .../halfcheetah_medium_expert_td3bc_config.py | 8 +- .../halfcheetah_medium_replay_cql_config.py | 6 +- .../halfcheetah_medium_replay_td3bc_config.py | 6 +- .../config/halfcheetah_medium_td3bc_config.py | 8 +- .../config/halfcheetah_random_cql_config.py | 6 +- .../config/halfcheetah_random_td3bc_config.py | 6 +- .../d4rl/config/hopper_medium_edac_config.py | 60 ++++ .../hopper_medium_expert_edac_config.py | 60 ++++ .../d4rl/config/walker2d_expert_cql_config.py | 6 +- .../config/walker2d_expert_td3bc_config.py | 6 +- .../d4rl/config/walker2d_medium_cql_config.py | 6 +- .../walker2d_medium_expert_cql_config.py | 6 +- .../walker2d_medium_expert_td3bc_config.py | 6 +- .../walker2d_medium_replay_cql_config.py | 6 +- .../walker2d_medium_replay_td3bc_config.py | 6 +- .../config/walker2d_medium_td3bc_config.py | 6 +- .../d4rl/config/walker2d_random_cql_config.py | 6 +- .../config/walker2d_random_td3bc_config.py | 6 +- dizoo/d4rl/entry/d4rl_edac_main.py | 21 ++ 34 files changed, 947 insertions(+), 66 deletions(-) create mode 100755 ding/example/edac.py mode change 100644 => 100755 ding/model/common/__init__.py mode change 100644 => 100755 ding/model/common/head.py mode change 100644 => 100755 ding/model/template/__init__.py create mode 100755 ding/model/template/edac.py mode change 100644 => 100755 ding/policy/__init__.py mode change 100644 => 100755 ding/policy/command_mode_policy_instance.py create mode 100755 ding/policy/edac.py mode change 100644 => 100755 ding/torch_utils/__init__.py mode change 100644 => 100755 dizoo/d4rl/config/halfcheetah_expert_cql_config.py mode change 100644 => 100755 dizoo/d4rl/config/halfcheetah_expert_td3bc_config.py mode change 100644 => 100755 dizoo/d4rl/config/halfcheetah_medium_cql_config.py create mode 100755 dizoo/d4rl/config/halfcheetah_medium_edac_config.py mode change 100644 => 100755 dizoo/d4rl/config/halfcheetah_medium_expert_cql_config.py create mode 100755 dizoo/d4rl/config/halfcheetah_medium_expert_edac_config.py mode change 100644 => 100755 dizoo/d4rl/config/halfcheetah_medium_expert_td3bc_config.py mode change 100644 => 100755 dizoo/d4rl/config/halfcheetah_medium_replay_cql_config.py mode change 100644 => 100755 dizoo/d4rl/config/halfcheetah_medium_replay_td3bc_config.py mode change 100644 => 100755 dizoo/d4rl/config/halfcheetah_medium_td3bc_config.py mode change 100644 => 100755 dizoo/d4rl/config/halfcheetah_random_cql_config.py mode change 100644 => 100755 dizoo/d4rl/config/halfcheetah_random_td3bc_config.py create mode 100755 dizoo/d4rl/config/hopper_medium_edac_config.py create mode 100755 dizoo/d4rl/config/hopper_medium_expert_edac_config.py mode change 100644 => 100755 
dizoo/d4rl/config/walker2d_expert_cql_config.py mode change 100644 => 100755 dizoo/d4rl/config/walker2d_expert_td3bc_config.py mode change 100644 => 100755 dizoo/d4rl/config/walker2d_medium_cql_config.py mode change 100644 => 100755 dizoo/d4rl/config/walker2d_medium_expert_cql_config.py mode change 100644 => 100755 dizoo/d4rl/config/walker2d_medium_expert_td3bc_config.py mode change 100644 => 100755 dizoo/d4rl/config/walker2d_medium_replay_cql_config.py mode change 100644 => 100755 dizoo/d4rl/config/walker2d_medium_replay_td3bc_config.py mode change 100644 => 100755 dizoo/d4rl/config/walker2d_medium_td3bc_config.py mode change 100644 => 100755 dizoo/d4rl/config/walker2d_random_cql_config.py mode change 100644 => 100755 dizoo/d4rl/config/walker2d_random_td3bc_config.py create mode 100755 dizoo/d4rl/entry/d4rl_edac_main.py diff --git a/ding/example/edac.py b/ding/example/edac.py new file mode 100755 index 0000000000..5ce16ee5a2 --- /dev/null +++ b/ding/example/edac.py @@ -0,0 +1,45 @@ +import gym +from ditk import logging +from ding.model import Q_ensemble +from ding.policy import EDACPolicy +from ding.envs import DingEnvWrapper, BaseEnvManagerV2 +from ding.data import create_dataset +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OfflineRLContext +from ding.framework.middleware import interaction_evaluator, trainer, CkptSaver, offline_data_fetcher, offline_logger +from ding.utils import set_pkg_seed +from dizoo.d4rl.envs import D4RLEnv +from dizoo.d4rl.config.halfcheetah_medium_edac_config import main_config,create_config +# from dizoo.d4rl.config.halfcheetah_medium_expert_edac_config import main_config,create_config +# from dizoo.d4rl.config.hopper_medium_expert_edac_config import main_config,create_config +# from dizoo.d4rl.config.hopper_medium_edac_config import main_config,create_config + + +def main(): + # If you don't have offline data, you need to prepare it first and set the data_path in config + # For demonstration, we can also train an RL policy (e.g.
SAC) and collect some data + logging.getLogger().setLevel(logging.INFO) + cfg = compile_config(main_config, create_cfg=create_config, auto=True) + ding_init(cfg) + with task.start(async_mode=False, ctx=OfflineRLContext()): + evaluator_env = BaseEnvManagerV2( + env_fn=[lambda: D4RLEnv(cfg.env) for _ in range(cfg.env.evaluator_env_num)], cfg=cfg.env.manager + ) + + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + dataset = create_dataset(cfg) + model = Q_ensemble(**cfg.policy.model) + policy = EDACPolicy(cfg.policy, model=model) + + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(offline_data_fetcher(cfg, dataset)) + task.use(trainer(cfg, policy.learn_mode)) + task.use(CkptSaver(policy, cfg.exp_name, train_freq=100)) + task.use(offline_logger()) + task.run() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/ding/model/common/__init__.py b/ding/model/common/__init__.py old mode 100644 new mode 100755 index edb22523ca..3ecff83748 --- a/ding/model/common/__init__.py +++ b/ding/model/common/__init__.py @@ -1,5 +1,5 @@ from .head import DiscreteHead, DuelingHead, DistributionHead, RainbowHead, QRDQNHead, \ - QuantileHead, FQFHead, RegressionHead, ReparameterizationHead, MultiHead, BranchingHead, head_cls_map, \ - independent_normal_dist, AttentionPolicyHead + QuantileHead, FQFHead, RegressionHead, ReparameterizationHead, MultiHead, BranchingHead, \ + EnsembleHead, head_cls_map, independent_normal_dist, AttentionPolicyHead from .encoder import ConvEncoder, FCEncoder, IMPALAConvEncoder from .utils import create_model diff --git a/ding/model/common/head.py b/ding/model/common/head.py old mode 100644 new mode 100755 index 29a3b77ca1..760ecd264b --- a/ding/model/common/head.py +++ b/ding/model/common/head.py @@ -6,7 +6,7 @@ import torch.nn.functional as F from torch.distributions import Normal, Independent -from ding.torch_utils import fc_block, noise_block, NoiseLinearLayer, MLP +from ding.torch_utils import fc_block, noise_block, NoiseLinearLayer, MLP, conv1d_block from ding.rl_utils import beta_function_map from ding.utils import lists_to_dicts, SequenceType @@ -1243,6 +1243,53 @@ def forward(self, x: torch.Tensor) -> Dict: """ return lists_to_dicts([m(x) for m in self.pred]) +class EnsembleHead(nn.Module): + """ + Overview: + The ``EnsembleHead`` used to output action Q-value for Q-ensemble. \ + Input is a (:obj:`torch.Tensor`) of shape ''(B, N * Ensemble_num, 1)'' and returns a (:obj:`Dict`) containing \ + output ``pred``. + Interfaces: + ``__init__``, ``forward``. + """ + def __init__(self, input_size: int, + output_size: int, + hidden_size: int, + layer_nun: int, + ensemble_num: int, + activation: Optional[nn.Module] = nn.ReLU(), + norm_type: Optional[str] = None) -> None: + super(EnsembleHead,self).__init__() + d = input_size + layers = [] + for _ in range(layer_nun): + layers.append(conv1d_block(d * ensemble_num, hidden_size * ensemble_num, kernel_size=1, + stride=1, groups=ensemble_num, activation=activation, norm_type=norm_type)) + d = hidden_size + layers.append(conv1d_block(hidden_size * ensemble_num, output_size * ensemble_num, kernel_size=1, + stride=1, groups=ensemble_num, activation=None, norm_type=None)) + self.pred = nn.Sequential(*layers) + + def forward(self, x: torch.Tensor) -> Dict: + """ + Overview: + Use encoded embedding tensor to run MLP with ``EnsembleHead`` and return the prediction dictionary. + Arguments: + - x (:obj:`torch.Tensor`): Tensor containing input embedding. 
+ Returns: + - outputs (:obj:`Dict`): Dict containing keyword ``pred`` (:obj:`torch.Tensor`). + Shapes: + - x: :math:`(B, N * Ensemble_num, 1)`, where ``B = batch_size`` and ``N = hidden_size``. + - pred: :math:`(B, M * Ensemble_num, 1)`, where ``M = output_size``. + Examples: + >>> head = EnsembleHead(64 * 10, 64 * 10) + >>> inputs = torch.randn(4, 64 * 10, 1) ` + >>> outputs = head(inputs) + >>> assert isinstance(outputs, dict) + >>> assert outputs['pred'].shape == torch.Size([10, 64 * 10]) + """ + x = self.pred(x).squeeze() + return {'pred': x} def independent_normal_dist(logits: Union[List, Dict]) -> torch.distributions.Distribution: if isinstance(logits, (list, tuple)): @@ -1268,4 +1315,5 @@ def independent_normal_dist(logits: Union[List, Dict]) -> torch.distributions.Di 'reparameterization': ReparameterizationHead, # multi 'multi': MultiHead, + 'ensemble': EnsembleHead, } diff --git a/ding/model/template/__init__.py b/ding/model/template/__init__.py old mode 100644 new mode 100755 index e994286ac3..b19bf46857 --- a/ding/model/template/__init__.py +++ b/ding/model/template/__init__.py @@ -23,3 +23,4 @@ from .vae import VanillaVAE from .decision_transformer import DecisionTransformer from .procedure_cloning import ProcedureCloningMCTS, ProcedureCloningBFS +from .edac import Q_ensemble diff --git a/ding/model/template/edac.py b/ding/model/template/edac.py new file mode 100755 index 0000000000..cb3d4048c3 --- /dev/null +++ b/ding/model/template/edac.py @@ -0,0 +1,180 @@ +from typing import Union, Optional, Dict +from easydict import EasyDict + +import torch +import torch.nn as nn +from ding.model.common import ReparameterizationHead, EnsembleHead +from ding.utils import SequenceType, squeeze + +from ding.utils import MODEL_REGISTRY + +@MODEL_REGISTRY.register('edac') +class Q_ensemble(nn.Module): + r""" + Overview: + The QAC network with ensemble, which is used in EDAC. + Interfaces: + ``__init__``, ``forward``, ``compute_actor``, ``compute_critic`` + """ + mode = ['compute_actor', 'compute_critic'] + + def __init__( + self, + obs_shape: Union[int, SequenceType], + action_shape: Union[int, SequenceType, EasyDict], + ensemble_num: int = 2, + actor_head_hidden_size: int = 64, + actor_head_layer_num: int = 1, + critic_head_hidden_size: int = 64, + critic_head_layer_num: int = 1, + activation: Optional[nn.Module] = nn.ReLU(), + norm_type: Optional[str] = None, + **kwargs + ) -> None: + """ + Overview: + Initailize the EDAC Model according to input arguments. + Arguments: + - obs_shape (:obj:`Union[int, SequenceType]`): Observation's shape, such as 128, (156, ). + - action_shape (:obj:`Union[int, SequenceType, EasyDict]`): Action's shape, such as 4, (3, ), \ + EasyDict({'action_type_shape': 3, 'action_args_shape': 4}). + - ensemble_num (:obj:`bool`): Q-net numble. + - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor head. + - actor_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ + for actor head. + - critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to critic head. + - critic_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ + for critic head. + - activation (:obj:`Optional[nn.Module]`): The type of activation function to use in ``MLP`` \ + after each FC layer, if ``None`` then default set to ``nn.ReLU()``. 
+ - norm_type (:obj:`Optional[str]`): The type of normalization to after network layer (FC, Conv), \ + see ``ding.torch_utils.network`` for more details. + """ + super(Q_ensemble, self).__init__() + obs_shape: int = squeeze(obs_shape) + action_shape = squeeze(action_shape) + self.action_shape = action_shape + self.ensemble_num = ensemble_num + self.actor = nn.Sequential( + nn.Linear(obs_shape, actor_head_hidden_size), activation, + ReparameterizationHead( + actor_head_hidden_size, + action_shape, + actor_head_layer_num, + sigma_type='conditioned', + activation=activation, + norm_type=norm_type + ) + ) + + critic_input_size = obs_shape + action_shape + self.critic = nn.Sequential( + EnsembleHead( + critic_input_size, + 1, + critic_head_hidden_size, + critic_head_layer_num, + self.ensemble_num, + activation=activation, + norm_type=norm_type + ) + ) + + + def forward(self, inputs: Union[torch.Tensor, Dict[str, torch.Tensor]], mode: str) -> Dict[str, torch.Tensor]: + """ + Overview: + The unique execution (forward) method of EDAC method, and one can indicate different modes to implement \ + different computation graph, including ``compute_actor`` and ``compute_critic`` in EDAC. + Mode compute_actor: + Arguments: + - inputs (:obj:`torch.Tensor`): Observation data, defaults to tensor. + Returns: + - output (:obj:`Dict`): Output dict data, including differnet key-values among distinct action_space. + Mode compute_critic: + Arguments: + - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. + Returns: + - output (:obj:`Dict`): Output dict data, including q_value tensor. + .. note:: + For specific examples, one can refer to API doc of ``compute_actor`` and ``compute_critic`` respectively. + """ + assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) + return getattr(self, mode)(inputs) + + def compute_actor(self, obs: torch.Tensor) -> Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]: + """ + Overview: + The forward computation graph of compute_actor mode, uses observation tensor to produce actor output, + such as ``action``, ``logit`` and so on. + Arguments: + - obs (:obj:`torch.Tensor`): Observation tensor data, now supports a batch of 1-dim vector data, \ + i.e. ``(B, obs_shape)``. + Returns: + - outputs (:obj:`Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]`): Actor output varying \ + from action_space: ``reparameterization``. + ReturnsKeys (either): + - logit (:obj:`Dict[str, torch.Tensor]`): Reparameterization logit, usually in SAC. + - mu (:obj:`torch.Tensor`): Mean of parameterization gaussion distribution. + - sigma (:obj:`torch.Tensor`): Standard variation of parameterization gaussion distribution. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, N0)`, B is batch size and N0 corresponds to ``obs_shape``. + - action (:obj:`torch.Tensor`): :math:`(B, N1)`, B is batch size and N1 corresponds to ``action_shape``. + - logit.mu (:obj:`torch.Tensor`): :math:`(B, N1)`, B is batch size and N1 corresponds to ``action_shape``. + - logit.sigma (:obj:`torch.Tensor`): :math:`(B, N1)`, B is batch size. + - logit (:obj:`torch.Tensor`): :math:`(B, N2)`, B is batch size and N2 corresponds to \ + ``action_shape.action_type_shape``. + - action_args (:obj:`torch.Tensor`): :math:`(B, N3)`, B is batch size and N3 corresponds to \ + ``action_shape.action_args_shape``. 
+ Examples: + >>> model = EDAC(64, 64,) + >>> obs = torch.randn(4, 64) + >>> actor_outputs = model(obs,'compute_actor') + >>> assert actor_outputs['logit'][0].shape == torch.Size([4, 64]) # mu + >>> actor_outputs['logit'][1].shape == torch.Size([4, 64]) # sigma + """ + x = self.actor(obs) + return {'logit': [x['mu'], x['sigma']]} + + def compute_critic(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + """ + Overview: + The forward computation graph of compute_critic mode, uses observation and action tensor to produce critic + output, such as ``q_value``. + Arguments: + - inputs (:obj:`Dict[str, torch.Tensor]`): Dict strcture of input data, including ``obs`` and ``action`` tensor + Returns: + - outputs (:obj:`Dict[str, torch.Tensor]`): Critic output, such as ``q_value``. + ArgumentsKeys: + - obs: (:obj:`torch.Tensor`): Observation tensor data, now supports a batch of 1-dim vector data. + - action (:obj:`Union[torch.Tensor, Dict]`): Continuous action with same size as ``action_shape``. + ReturnKeys: + - q_value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, N1)` or '(Ensemble_num, B, N1)', where B is batch size and N1 is ``obs_shape``. + - action (:obj:`torch.Tensor`): :math:`(B, N2)` or '(Ensemble_num, B, N2)', where B is batch size and N4 is ``action_shape``. + - q_value (:obj:`torch.Tensor`): :math:`(Ensemble_num, B)`, where B is batch size. + Examples: + >>> inputs = {'obs': torch.randn(4, 8), 'action': torch.randn(4, 1)} + >>> model = EDAC(obs_shape=(8, ),action_shape=1) + >>> model(inputs, mode='compute_critic')['q_value'] # q value + ... tensor([0.0773, 0.1639, 0.0917, 0.0370], grad_fn=) + """ + + obs, action = inputs['obs'], inputs['action'] + if len(action.shape) == 1: # (B, ) -> (B, 1) + action = action.unsqueeze(1) + x = torch.cat([obs, action], dim=-1) + if len(obs.shape) < 3: + # [batch_size,dim] -> [batch_size,Ensemble_num * dim,1] + x = x.repeat(1,self.ensemble_num).unsqueeze(-1) + else: + # [Ensemble_num,batch_size,dim] -> [batch_size,Ensemble_num,dim] -> [batch_size,Ensemble_num * dim, 1] + x = x.transpose(0,1) + batch_size = obs.shape[1] + x = x.reshape(batch_size,-1,1) + # [Ensemble_num,batch_size,1] + x = self.critic(x)['pred'] + # [batch_size,1*Ensemble_num] -> [Ensemble_num,batch_size] + x = x.permute(1,0) + return {'q_value': x} \ No newline at end of file diff --git a/ding/policy/__init__.py b/ding/policy/__init__.py old mode 100644 new mode 100755 index 15575c7d30..2293432383 --- a/ding/policy/__init__.py +++ b/ding/policy/__init__.py @@ -18,6 +18,7 @@ from .ppo import PPOPolicy, PPOPGPolicy, PPOOffPolicy from .sac import SACPolicy, SACDiscretePolicy, SQILSACPolicy from .cql import CQLPolicy, CQLDiscretePolicy +from .edac import EDACPolicy from .impala import IMPALAPolicy from .ngu import NGUPolicy from .r2d2 import R2D2Policy diff --git a/ding/policy/command_mode_policy_instance.py b/ding/policy/command_mode_policy_instance.py old mode 100644 new mode 100755 index 36e8ba7185..7e93b1d4a9 --- a/ding/policy/command_mode_policy_instance.py +++ b/ding/policy/command_mode_policy_instance.py @@ -47,6 +47,7 @@ from .sac import SQILSACPolicy from .madqn import MADQNPolicy from .bdq import BDQPolicy +from .edac import EDACPolicy class EpsCommandModePolicy(CommandModePolicy): @@ -380,6 +381,10 @@ class SQILSACCommandModePolicy(SQILSACPolicy, DummyCommandModePolicy): class IBCCommandModePolicy(IBCPolicy, DummyCommandModePolicy): pass +@POLICY_REGISTRY.register('edac_command') +class 
EDACCommandModelPolicy(EDACPolicy, DummyCommandModePolicy): + pass + @POLICY_REGISTRY.register('bc_command') class BCCommandModePolicy(BehaviourCloningPolicy, DummyCommandModePolicy): diff --git a/ding/policy/edac.py b/ding/policy/edac.py new file mode 100755 index 0000000000..be5a9d23d1 --- /dev/null +++ b/ding/policy/edac.py @@ -0,0 +1,340 @@ +from typing import List, Dict, Any, Tuple, Union +import copy +import numpy as np +import torch +import torch.nn as nn +from torch.distributions import Normal, Independent + +from ding.torch_utils import Adam, to_device +from ding.rl_utils import v_1step_td_data, v_1step_td_error, get_train_sample, \ + qrdqn_nstep_td_data, qrdqn_nstep_td_error, get_nstep_return_data +from ding.model import model_wrap +from ding.utils import POLICY_REGISTRY +from ding.utils.data import default_collate, default_decollate +from .sac import SACPolicy +from .dqn import DQNPolicy +from .common_utils import default_preprocess_learn + +@POLICY_REGISTRY.register('edac') +class EDACPolicy(SACPolicy): + """ + Overview: + Policy class of EDAC algorithm. + + Config: + == ==================== ======== ============= ================================= ======================= + ID Symbol Type Default Value Description Other(Shape) + == ==================== ======== ============= ================================= ======================= + 1 ``type`` str td3 | RL policy register name, refer | this arg is optional, + | to registry ``POLICY_REGISTRY`` | a placeholder + 2 ``cuda`` bool True | Whether to use cuda for network | + 3 | ``random_`` int 10000 | Number of randomly collected | Default to 10000 for + | ``collect_size`` | training samples in replay | SAC, 25000 for DDPG/ + | | buffer when training starts. | TD3. + 4 | ``model.policy_`` int 256 | Linear layer size for policy | + | ``embedding_size`` | network. | + 5 | ``model.soft_q_`` int 256 | Linear layer size for soft q | + | ``embedding_size`` | network. | + 6 | ``model.emsemble_`` int 10 | Number of Q-ensemble network | + | ``num`` | | + | | | is False. + 7 | ``learn.learning`` float 3e-4 | Learning rate for soft q | Defalut to 1e-3, when + | ``_rate_q`` | network. | model.value_network + | | | is True. + 8 | ``learn.learning`` float 3e-4 | Learning rate for policy | Defalut to 1e-3, when + | ``_rate_policy`` | network. | model.value_network + | | | is True. + 9 | ``learn.learning`` float 3e-4 | Learning rate for policy | Defalut to None when + | ``_rate_value`` | network. | model.value_network + | | | is False. + 10 | ``learn.alpha`` float 1.0 | Entropy regularization | alpha is initiali- + | | coefficient. | zation for auto + | | | `alpha`, when + | | | auto_alpha is True + 11 | ``learn.eta`` bool True | Parameter of EDAC algorithm |Defalut to 1.0 + | | | + 12 | ``learn.`` bool True | Determine whether to use | Temperature parameter + | ``auto_alpha`` | auto temperature parameter | determines the + | | `alpha`. | relative importance + | | | of the entropy term + | | | against the reward. + 13 | ``learn.-`` bool False | Determine whether to ignore | Use ignore_done only + | ``ignore_done`` | done flag. | in halfcheetah env. + 14 | ``learn.-`` float 0.005 | Used for soft update of the | aka. Interpolation + | ``target_theta`` | target network. | factor in polyak aver + | | | aging for target + | | | networks. 
+ == ==================== ======== ============= ================================= ======================= + """ + config = dict( + type='sac', + cuda=False, + on_policy=False, + multi_agent=False, + priority=False, + priority_IS_weight=False, + random_collect_size=10000, + model=dict( + # (bool type) ensemble_num:num of Q-network. + ensemble_num=10, + # (bool type) value_network: Determine whether to use value network as the + # original SAC paper (arXiv 1801.01290). + # using value_network needs to set learning_rate_value, learning_rate_q, + # and learning_rate_policy in `cfg.policy.learn`. + # Default to False. + # value_network=False, + + # (int) Hidden size for actor network head. + actor_head_hidden_size=256, + + # (int) Hidden size for critic network head. + critic_head_hidden_size=256, + ), + learn=dict( + multi_gpu=False, + update_per_collect=1, + batch_size=256, + + learning_rate_q=3e-4, + learning_rate_policy=3e-4, + learning_rate_value=3e-4, + + learning_rate_alpha=3e-4, + target_theta=0.005, + discount_factor=0.99, + + alpha=1, + + auto_alpha=True, + # (bool type) log_space: Determine whether to use auto `\alpha` in log space. + log_space=True, + # (bool) Whether ignore done(usually for max step termination env. e.g. pendulum) + # Note: Gym wraps the MuJoCo envs by default with TimeLimit environment wrappers. + # These limit HalfCheetah, and several other MuJoCo envs, to max length of 1000. + # However, interaction with HalfCheetah always gets done with done is False, + # Since we inplace done==True with done==False to keep + # TD-error accurate computation(``gamma * (1 - done) * next_v + reward``), + # when the episode step is greater than max episode step. + ignore_done=False, + # (float) Weight uniform initialization range in the last output layer + init_w=3e-3, + # (float) Loss weight for conservative item. + min_q_weight=1.0, + # (bool) Whether to use entropy in target q. + with_q_entropy=False, + eta=0.1, + ), + collect=dict( + # (int) Cut trajectories into pieces with length "unroll_len". + unroll_len=1, + ), + eval=dict(), + other=dict( + replay_buffer=dict( + # (int type) replay_buffer_size: Max size of replay buffer. + replay_buffer_size=1000000, + # (int type) max_use: Max use times of one data in the buffer. + # Data will be removed once used for too many times. + # Default to infinite. + # max_use=256, + ), + ), + ) + + def _init_learn(self) -> None: + r""" + Overview: + Learn mode init method. Called by ``self.__init__``. + Init q, value and policy's optimizers, algorithm config, main and target models. 
+ """ + # Init + self._priority = self._cfg.priority + self._priority_IS_weight = self._cfg.priority_IS_weight + self._eta = self._cfg.learn.eta + self._with_q_entropy = self._cfg.learn.with_q_entropy + self._value_network = False + + self._optimizer_q = Adam( + self._model.critic.parameters(), + lr=self._cfg.learn.learning_rate_q, + ) + self._optimizer_policy = Adam( + self._model.actor.parameters(), + lr=self._cfg.learn.learning_rate_policy, + ) + + self._gamma = self._cfg.learn.discount_factor + + init_w = self._cfg.learn.init_w + self._model.actor[2].mu.weight.data.uniform_(-init_w, init_w) + self._model.actor[2].mu.bias.data.uniform_(-init_w, init_w) + self._model.actor[2].log_sigma_layer.weight.data.uniform_(-init_w, init_w) + self._model.actor[2].log_sigma_layer.bias.data.uniform_(-init_w, init_w) + + if self._cfg.learn.auto_alpha: + self._target_entropy = -np.prod(self._cfg.model.action_shape) + if self._cfg.learn.log_space: + self._log_alpha = torch.log(torch.FloatTensor([self._cfg.learn.alpha])) + self._log_alpha = self._log_alpha.to(self._device).requires_grad_() + self._alpha_optim = torch.optim.Adam([self._log_alpha], lr=self._cfg.learn.learning_rate_alpha) + assert self._log_alpha.shape == torch.Size([1]) and self._log_alpha.requires_grad + self._alpha = self._log_alpha.detach().exp() + self._auto_alpha = True + self._log_space = True + else: + self._alpha = torch.FloatTensor([self._cfg.learn.alpha]).to(self._device).requires_grad_() + self._alpha_optim = torch.optim.Adam([self._alpha], lr=self._cfg.learn.learning_rate_alpha) + self._auto_alpha = True + self._log_space = False + else: + self._alpha = torch.tensor( + [self._cfg.learn.alpha], requires_grad=False, device=self._device, dtype=torch.float32 + ) + self._auto_alpha = False + + self._target_model = copy.deepcopy(self._model) + self._target_model = model_wrap( + self._target_model, + wrapper_name='target', + update_type='momentum', + update_kwargs={'theta': self._cfg.learn.target_theta} + ) + self._learn_model = model_wrap(self._model, wrapper_name='base') + self._learn_model.reset() + self._target_model.reset() + + self._forward_learn_cnt = 0 + + def _forward_learn(self, data: dict) -> Dict[str, Any]: + loss_dict = {} + data = default_preprocess_learn( + data, + use_priority=self._priority, + use_priority_IS_weight=self._cfg.priority_IS_weight, + ignore_done=self._cfg.learn.ignore_done, + use_nstep=False + ) + if len(data.get('action').shape) == 1: + data['action'] = data['action'].reshape(-1, 1) + + if self._cuda: + data = to_device(data, self._device) + + self._learn_model.train() + self._target_model.train() + obs = data['obs'] + next_obs = data['next_obs'] + reward = data['reward'] + done = data['done'] + acs = data['action'] + + # 1. 
predict q value + q_value = self._learn_model.forward(data, mode='compute_critic')['q_value'] + with torch.no_grad(): + (mu, sigma) = self._learn_model.forward(next_obs, mode='compute_actor')['logit'] + + dist = Independent(Normal(mu, sigma), 1) + pred = dist.rsample() + next_action = torch.tanh(pred) + y = 1 - next_action.pow(2) + 1e-6 + next_log_prob = dist.log_prob(pred).unsqueeze(-1) + next_log_prob = next_log_prob - torch.log(y).sum(-1, keepdim=True) + + next_data = {'obs': next_obs, 'action': next_action} + target_q_value = self._target_model.forward(next_data, mode='compute_critic')['q_value'] + # the value of a policy according to the maximum entropy objective + + target_q_value,_ = torch.min(target_q_value,dim=0) + if self._with_q_entropy: + target_q_value -= self._alpha * next_log_prob.squeeze(-1) + target_q_value = self._gamma * (1 - done) * target_q_value + reward + + weight = data['weight'] + if weight is None: + weight = torch.ones_like(q_value) + td_error_per_sample = nn.MSELoss(reduction='none')(q_value,target_q_value).mean(dim=1).sum() + loss_dict['critic_loss'] = (td_error_per_sample * weight).mean() + + + if self._eta > 0: + # [batch_size,dim] -> [Ensemble_num,batch_size,dim] + pre_obs = obs.unsqueeze(0).repeat_interleave(self._cfg.model.ensemble_num,dim=0) + pre_acs = acs.unsqueeze(0).repeat_interleave(self._cfg.model.ensemble_num,dim=0).requires_grad_(True) + + # [Ensemble_num,batch_size] + q_pred_tile = self._learn_model.forward({'obs':pre_obs,'action':pre_acs}, mode='compute_critic')['q_value'].requires_grad_(True) + + q_pred_grads = torch.autograd.grad(q_pred_tile.sum(),pre_acs,retain_graph=True,create_graph=True)[0] + q_pred_grads = q_pred_grads / (torch.norm(q_pred_grads,p=2,dim=2).unsqueeze(-1) + 1e-10) + # [Ensemble_num,batch_size,act_dim] -> [batch_size,Ensemble_num,act_dim] + q_pred_grads = q_pred_grads.transpose(0,1) + + q_pred_grads = q_pred_grads @ q_pred_grads.permute(0,2,1) + masks = torch.eye(self._cfg.model.ensemble_num,device=obs.device).unsqueeze(dim=0).repeat(q_pred_grads.size(0),1,1) + q_pred_grads = (1 - masks) * q_pred_grads + grad_loss = torch.mean(torch.sum(q_pred_grads,dim=(1,2))) / (self._cfg.model.ensemble_num - 1) + loss_dict['critic_loss'] += grad_loss + + + self._optimizer_q.zero_grad() + loss_dict['critic_loss'].backward() + self._optimizer_q.step() + + (mu, sigma) = self._learn_model.forward(data['obs'], mode='compute_actor')['logit'] + dist = Independent(Normal(mu, sigma), 1) + pred = dist.rsample() + action = torch.tanh(pred) + y = 1 - action.pow(2) + 1e-6 + log_prob = dist.log_prob(pred).unsqueeze(-1) + log_prob = log_prob - torch.log(y).sum(-1, keepdim=True) + + eval_data = {'obs': obs, 'action': action} + new_q_value = self._learn_model.forward(eval_data, mode='compute_critic')['q_value'] + new_q_value,_ = torch.min(new_q_value,dim=0) + + # 8. compute policy loss + policy_loss = (self._alpha * log_prob - new_q_value.unsqueeze(-1)).mean() + + loss_dict['policy_loss'] = policy_loss + + # 9. update policy network + self._optimizer_policy.zero_grad() + loss_dict['policy_loss'].backward() + self._optimizer_policy.step() + + # 10. 
compute alpha loss + if self._auto_alpha: + if self._log_space: + log_prob = log_prob + self._target_entropy + loss_dict['alpha_loss'] = -(self._log_alpha * log_prob.detach()).mean() + + self._alpha_optim.zero_grad() + loss_dict['alpha_loss'].backward() + self._alpha_optim.step() + self._alpha = self._log_alpha.detach().exp() + else: + log_prob = log_prob + self._target_entropy + loss_dict['alpha_loss'] = -(self._alpha * log_prob.detach()).mean() + + self._alpha_optim.zero_grad() + loss_dict['alpha_loss'].backward() + self._alpha_optim.step() + self._alpha = max(0, self._alpha) + + loss_dict['total_loss'] = sum(loss_dict.values()) + + # ============= + # after update + # ============= + self._forward_learn_cnt += 1 + # target update + self._target_model.update(self._learn_model.state_dict()) + return { + 'cur_lr_q': self._optimizer_q.defaults['lr'], + 'cur_lr_p': self._optimizer_policy.defaults['lr'], + 'priority': td_error_per_sample.abs().tolist(), + 'td_error': td_error_per_sample.detach().mean().item(), + 'alpha': self._alpha.item(), + 'target_q_value': target_q_value.detach().mean().item(), + **loss_dict + } \ No newline at end of file diff --git a/ding/torch_utils/__init__.py b/ding/torch_utils/__init__.py old mode 100644 new mode 100755 diff --git a/dizoo/d4rl/config/halfcheetah_expert_cql_config.py b/dizoo/d4rl/config/halfcheetah_expert_cql_config.py old mode 100644 new mode 100755 index bb4b321d80..58ead98dc5 --- a/dizoo/d4rl/config/halfcheetah_expert_cql_config.py +++ b/dizoo/d4rl/config/halfcheetah_expert_cql_config.py @@ -5,7 +5,7 @@ main_config = dict( exp_name="halfcheetah_expert_cql_seed0", env=dict( - env_id='halfcheetah-expert-v0', + env_id='halfcheetah-expert-v2', collector_env_num=1, evaluator_env_num=8, use_act_scale=True, @@ -15,8 +15,8 @@ policy=dict( cuda=True, model=dict( - obs_shape=11, - action_shape=3, + obs_shape=17, + action_shape=6, ), learn=dict( data_path=None, diff --git a/dizoo/d4rl/config/halfcheetah_expert_td3bc_config.py b/dizoo/d4rl/config/halfcheetah_expert_td3bc_config.py old mode 100644 new mode 100755 index e798cf66e3..55e2ddab19 --- a/dizoo/d4rl/config/halfcheetah_expert_td3bc_config.py +++ b/dizoo/d4rl/config/halfcheetah_expert_td3bc_config.py @@ -3,9 +3,9 @@ from easydict import EasyDict main_config = dict( - exp_name='walker2d_expert_td3-bc_seed0', + exp_name='halfcheetah_expert_td3-bc_seed0', env=dict( - env_id='walker2d-expert-v0', + env_id='halfcheetah-expert-v2', norm_obs=dict( use_norm=True, offline_stats=dict(use_offline_stats=True, ), @@ -18,8 +18,8 @@ ), policy=dict( model=dict( - obs_shape=11, - action_shape=3, + obs_shape=17, + action_shape=6, ), learn=dict( train_epoch=30000, diff --git a/dizoo/d4rl/config/halfcheetah_medium_cql_config.py b/dizoo/d4rl/config/halfcheetah_medium_cql_config.py old mode 100644 new mode 100755 index a879599fcf..84a504cadb --- a/dizoo/d4rl/config/halfcheetah_medium_cql_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_cql_config.py @@ -5,7 +5,7 @@ main_config = dict( exp_name="halfcheetah_medium_cql_seed0", env=dict( - env_id='halfcheetah-medium-v0', + env_id='halfcheetah-medium-v2', collector_env_num=1, evaluator_env_num=8, use_act_scale=True, @@ -15,8 +15,8 @@ policy=dict( cuda=True, model=dict( - obs_shape=11, - action_shape=3, + obs_shape=17, + action_shape=6, ), learn=dict( data_path=None, diff --git a/dizoo/d4rl/config/halfcheetah_medium_edac_config.py b/dizoo/d4rl/config/halfcheetah_medium_edac_config.py new file mode 100755 index 0000000000..abce76c71e --- /dev/null +++ 
b/dizoo/d4rl/config/halfcheetah_medium_edac_config.py @@ -0,0 +1,60 @@ +from easydict import EasyDict + +main_config = dict( + exp_name="halfcheetah_medium_edac_seed0", + env=dict( + env_id='halfcheetah-medium-v2', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=7600, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=17, + action_shape=6, + ensemble_num=10, + actor_head_hidden_size=256, + actor_head_layer_num=3, + critic_head_hidden_size=256, + critic_head_layer_num=3, + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=256, + learning_rate_q=3e-4, + learning_rate_policy=3e-4, + learning_rate_alpha=3e-4, + alpha=1, + auto_alpha=True, + eta=1.0, + with_q_entropy=False, + learner=dict(hook=dict(save_ckpt_after_iter=100000, )), + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=500, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), + seed = 0, +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='edac', + import_names=['ding.policy.edac'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config \ No newline at end of file diff --git a/dizoo/d4rl/config/halfcheetah_medium_expert_cql_config.py b/dizoo/d4rl/config/halfcheetah_medium_expert_cql_config.py old mode 100644 new mode 100755 index 2e74c43ac2..05aa2d1752 --- a/dizoo/d4rl/config/halfcheetah_medium_expert_cql_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_expert_cql_config.py @@ -5,7 +5,7 @@ main_config = dict( exp_name="halfcheetah_medium_expert_cql_seed0", env=dict( - env_id='halfcheetah-medium-expert-v0', + env_id='halfcheetah-medium-expert-v2', collector_env_num=1, evaluator_env_num=8, use_act_scale=True, @@ -15,8 +15,8 @@ policy=dict( cuda=True, model=dict( - obs_shape=11, - action_shape=3, + obs_shape=17, + action_shape=6, ), learn=dict( data_path=None, diff --git a/dizoo/d4rl/config/halfcheetah_medium_expert_edac_config.py b/dizoo/d4rl/config/halfcheetah_medium_expert_edac_config.py new file mode 100755 index 0000000000..be066afae2 --- /dev/null +++ b/dizoo/d4rl/config/halfcheetah_medium_expert_edac_config.py @@ -0,0 +1,60 @@ +from easydict import EasyDict + +main_config = dict( + exp_name="halfcheetah_medium_expert_edac_seed123", + env=dict( + env_id='halfcheetah-medium-expert-v2', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=13000, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=17, + action_shape=6, + ensemble_num=10, + actor_head_hidden_size=256, + actor_head_layer_num=3, + critic_head_hidden_size=256, + critic_head_layer_num=3, + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=256, + learning_rate_q=3e-4, + learning_rate_policy=3e-4, + learning_rate_alpha=3e-4, + alpha=1, + auto_alpha=True, + eta=5.0, + with_q_entropy=False, + learner=dict(hook=dict(save_ckpt_after_iter=100000, )), + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=500, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), + seed = 123, +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + 
policy=dict( + type='edac', + import_names=['ding.policy.edac'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config \ No newline at end of file diff --git a/dizoo/d4rl/config/halfcheetah_medium_expert_td3bc_config.py b/dizoo/d4rl/config/halfcheetah_medium_expert_td3bc_config.py old mode 100644 new mode 100755 index 8d25289131..045e4decfe --- a/dizoo/d4rl/config/halfcheetah_medium_expert_td3bc_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_expert_td3bc_config.py @@ -5,7 +5,7 @@ main_config = dict( exp_name='halfcheetah_medium_expert_td3-bc_seed0', env=dict( - env_id='halfcheetah-medium-expert-v0', + env_id='halfcheetah-medium-expert-v2', norm_obs=dict( use_norm=True, offline_stats=dict(use_offline_stats=True, ), @@ -14,12 +14,12 @@ evaluator_env_num=8, use_act_scale=True, n_evaluator_episode=8, - stop_value=6000, + stop_value=13000, ), policy=dict( model=dict( - obs_shape=11, - action_shape=3, + obs_shape=17, + action_shape=6, ), learn=dict( train_epoch=30000, diff --git a/dizoo/d4rl/config/halfcheetah_medium_replay_cql_config.py b/dizoo/d4rl/config/halfcheetah_medium_replay_cql_config.py old mode 100644 new mode 100755 index 010c7963d2..823e08d370 --- a/dizoo/d4rl/config/halfcheetah_medium_replay_cql_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_replay_cql_config.py @@ -5,7 +5,7 @@ main_config = dict( exp_name="halfcheetah_medium_replay_cql_seed0", env=dict( - env_id='halfcheetah-medium-replay-v0', + env_id='halfcheetah-medium-replay-v2', collector_env_num=1, evaluator_env_num=8, use_act_scale=True, @@ -15,8 +15,8 @@ policy=dict( cuda=True, model=dict( - obs_shape=11, - action_shape=3, + obs_shape=17, + action_shape=6, ), learn=dict( data_path=None, diff --git a/dizoo/d4rl/config/halfcheetah_medium_replay_td3bc_config.py b/dizoo/d4rl/config/halfcheetah_medium_replay_td3bc_config.py old mode 100644 new mode 100755 index 3561f320fb..9f41ba8a5c --- a/dizoo/d4rl/config/halfcheetah_medium_replay_td3bc_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_replay_td3bc_config.py @@ -5,7 +5,7 @@ main_config = dict( exp_name='halfcheetah_medium_replay_td3-bc_seed0', env=dict( - env_id='halfcheetah-medium-replay-v0', + env_id='halfcheetah-medium-replay-v2', norm_obs=dict( use_norm=True, offline_stats=dict(use_offline_stats=True, ), @@ -18,8 +18,8 @@ ), policy=dict( model=dict( - obs_shape=11, - action_shape=3, + obs_shape=17, + action_shape=6, ), learn=dict( train_epoch=30000, diff --git a/dizoo/d4rl/config/halfcheetah_medium_td3bc_config.py b/dizoo/d4rl/config/halfcheetah_medium_td3bc_config.py old mode 100644 new mode 100755 index ef6e2d3f40..99cf4bc1ea --- a/dizoo/d4rl/config/halfcheetah_medium_td3bc_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_td3bc_config.py @@ -5,7 +5,7 @@ main_config = dict( exp_name='halfcheetah_medium_td3-bc_seed0', env=dict( - env_id='halfcheetah-medium-v0', + env_id='halfcheetah-medium-v2', norm_obs=dict( use_norm=True, offline_stats=dict(use_offline_stats=True, ), @@ -14,12 +14,12 @@ evaluator_env_num=8, use_act_scale=True, n_evaluator_episode=8, - stop_value=6000, + stop_value=7600, ), policy=dict( model=dict( - obs_shape=11, - action_shape=3, + obs_shape=17, + action_shape=6, ), learn=dict( train_epoch=30000, diff --git a/dizoo/d4rl/config/halfcheetah_random_cql_config.py b/dizoo/d4rl/config/halfcheetah_random_cql_config.py old mode 100644 new mode 100755 index bb4b321d80..58ead98dc5 --- a/dizoo/d4rl/config/halfcheetah_random_cql_config.py +++ 
b/dizoo/d4rl/config/halfcheetah_random_cql_config.py @@ -5,7 +5,7 @@ main_config = dict( exp_name="halfcheetah_expert_cql_seed0", env=dict( - env_id='halfcheetah-expert-v0', + env_id='halfcheetah-expert-v2', collector_env_num=1, evaluator_env_num=8, use_act_scale=True, @@ -15,8 +15,8 @@ policy=dict( cuda=True, model=dict( - obs_shape=11, - action_shape=3, + obs_shape=17, + action_shape=6, ), learn=dict( data_path=None, diff --git a/dizoo/d4rl/config/halfcheetah_random_td3bc_config.py b/dizoo/d4rl/config/halfcheetah_random_td3bc_config.py old mode 100644 new mode 100755 index dbe94d1a24..a823d4f4a5 --- a/dizoo/d4rl/config/halfcheetah_random_td3bc_config.py +++ b/dizoo/d4rl/config/halfcheetah_random_td3bc_config.py @@ -5,7 +5,7 @@ main_config = dict( exp_name='halfcheetah_random_td3-bc_seed0', env=dict( - env_id='halfcheetah-random-v0', + env_id='halfcheetah-random-v2', norm_obs=dict( use_norm=True, offline_stats=dict(use_offline_stats=True, ), @@ -18,8 +18,8 @@ ), policy=dict( model=dict( - obs_shape=11, - action_shape=3, + obs_shape=17, + action_shape=6, ), learn=dict( train_epoch=30000, diff --git a/dizoo/d4rl/config/hopper_medium_edac_config.py b/dizoo/d4rl/config/hopper_medium_edac_config.py new file mode 100755 index 0000000000..21b5c10eb6 --- /dev/null +++ b/dizoo/d4rl/config/hopper_medium_edac_config.py @@ -0,0 +1,60 @@ +from easydict import EasyDict + +main_config = dict( + exp_name="hopper_medium_edac_seed0", + env=dict( + env_id='hopper-medium-v2', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=3700, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=11, + action_shape=3, + ensemble_num=50, + actor_head_hidden_size=256, + actor_head_layer_num=3, + critic_head_hidden_size=256, + critic_head_layer_num=3, + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=256, + learning_rate_q=3e-4, + learning_rate_policy=3e-4, + learning_rate_alpha=3e-4, + alpha=1, + auto_alpha=True, + eta=1.0, + with_q_entropy=False, + learner=dict(hook=dict(save_ckpt_after_iter=100000, )), + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=500, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), + seed = 0, +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='edac', + import_names=['ding.policy.edac'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config \ No newline at end of file diff --git a/dizoo/d4rl/config/hopper_medium_expert_edac_config.py b/dizoo/d4rl/config/hopper_medium_expert_edac_config.py new file mode 100755 index 0000000000..22131544f6 --- /dev/null +++ b/dizoo/d4rl/config/hopper_medium_expert_edac_config.py @@ -0,0 +1,60 @@ +from easydict import EasyDict + +main_config = dict( + exp_name="hopper_medium_expert_edac_seed0", + env=dict( + env_id='hopper-medium-expert-v2', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=5000, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=11, + action_shape=3, + ensemble_num=50, + actor_head_hidden_size=256, + actor_head_layer_num=3, + critic_head_hidden_size=256, + critic_head_layer_num=3, + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=256, + learning_rate_q=3e-4, + learning_rate_policy=3e-4, + 
learning_rate_alpha=3e-4, + alpha=1, + auto_alpha=False, + eta=1.0, + with_q_entropy=False, + learner=dict(hook=dict(save_ckpt_after_iter=100000, )), + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=500, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), + seed = 0, +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='edac', + import_names=['ding.policy.edac'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config \ No newline at end of file diff --git a/dizoo/d4rl/config/walker2d_expert_cql_config.py b/dizoo/d4rl/config/walker2d_expert_cql_config.py old mode 100644 new mode 100755 index 271dbca013..346dd1a1f2 --- a/dizoo/d4rl/config/walker2d_expert_cql_config.py +++ b/dizoo/d4rl/config/walker2d_expert_cql_config.py @@ -5,7 +5,7 @@ main_config = dict( exp_name="walker2d_expert_cql_seed0", env=dict( - env_id='walker2d-expert-v0', + env_id='walker2d-expert-v2', collector_env_num=1, evaluator_env_num=8, use_act_scale=True, @@ -15,8 +15,8 @@ policy=dict( cuda=True, model=dict( - obs_shape=11, - action_shape=3, + obs_shape=17, + action_shape=6, ), learn=dict( data_path=None, diff --git a/dizoo/d4rl/config/walker2d_expert_td3bc_config.py b/dizoo/d4rl/config/walker2d_expert_td3bc_config.py old mode 100644 new mode 100755 index c12d58b230..55e2ddab19 --- a/dizoo/d4rl/config/walker2d_expert_td3bc_config.py +++ b/dizoo/d4rl/config/walker2d_expert_td3bc_config.py @@ -5,7 +5,7 @@ main_config = dict( exp_name='halfcheetah_expert_td3-bc_seed0', env=dict( - env_id='halfcheetah-expert-v0', + env_id='halfcheetah-expert-v2', norm_obs=dict( use_norm=True, offline_stats=dict(use_offline_stats=True, ), @@ -18,8 +18,8 @@ ), policy=dict( model=dict( - obs_shape=11, - action_shape=3, + obs_shape=17, + action_shape=6, ), learn=dict( train_epoch=30000, diff --git a/dizoo/d4rl/config/walker2d_medium_cql_config.py b/dizoo/d4rl/config/walker2d_medium_cql_config.py old mode 100644 new mode 100755 index 0ea01a8689..afacebae0b --- a/dizoo/d4rl/config/walker2d_medium_cql_config.py +++ b/dizoo/d4rl/config/walker2d_medium_cql_config.py @@ -5,7 +5,7 @@ main_config = dict( exp_name="walker2d_medium_cql_seed0", env=dict( - env_id='walker2d-medium-v0', + env_id='walker2d-medium-v2', collector_env_num=1, evaluator_env_num=8, use_act_scale=True, @@ -15,8 +15,8 @@ policy=dict( cuda=True, model=dict( - obs_shape=11, - action_shape=3, + obs_shape=17, + action_shape=6, ), learn=dict( data_path=None, diff --git a/dizoo/d4rl/config/walker2d_medium_expert_cql_config.py b/dizoo/d4rl/config/walker2d_medium_expert_cql_config.py old mode 100644 new mode 100755 index b15461ed5b..f05d15c346 --- a/dizoo/d4rl/config/walker2d_medium_expert_cql_config.py +++ b/dizoo/d4rl/config/walker2d_medium_expert_cql_config.py @@ -5,7 +5,7 @@ main_config = dict( exp_name="walker2d_medium_expert_cql_seed0", env=dict( - env_id='walker2d-medium-expert-v0', + env_id='walker2d-medium-expert-v2', collector_env_num=1, evaluator_env_num=8, use_act_scale=True, @@ -15,8 +15,8 @@ policy=dict( cuda=True, model=dict( - obs_shape=11, - action_shape=3, + obs_shape=17, + action_shape=6, ), learn=dict( data_path=None, diff --git a/dizoo/d4rl/config/walker2d_medium_expert_td3bc_config.py b/dizoo/d4rl/config/walker2d_medium_expert_td3bc_config.py old mode 100644 new mode 100755 index 
2aed878dd8..91389fdb31 --- a/dizoo/d4rl/config/walker2d_medium_expert_td3bc_config.py +++ b/dizoo/d4rl/config/walker2d_medium_expert_td3bc_config.py @@ -5,7 +5,7 @@ main_config = dict( exp_name='walker2d_medium_expert_td3-bc_seed0', env=dict( - env_id='walker2d-medium-expert-v0', + env_id='walker2d-medium-expert-v2', norm_obs=dict( use_norm=True, offline_stats=dict(use_offline_stats=True, ), @@ -18,8 +18,8 @@ ), policy=dict( model=dict( - obs_shape=11, - action_shape=3, + obs_shape=17, + action_shape=6, ), learn=dict( train_epoch=30000, diff --git a/dizoo/d4rl/config/walker2d_medium_replay_cql_config.py b/dizoo/d4rl/config/walker2d_medium_replay_cql_config.py old mode 100644 new mode 100755 index 5dc454d158..23437423b6 --- a/dizoo/d4rl/config/walker2d_medium_replay_cql_config.py +++ b/dizoo/d4rl/config/walker2d_medium_replay_cql_config.py @@ -5,7 +5,7 @@ main_config = dict( exp_name="walker2d_medium_replay_cql_seed0", env=dict( - env_id='walker2d-medium-replay-v0', + env_id='walker2d-medium-replay-v2', collector_env_num=1, evaluator_env_num=8, use_act_scale=True, @@ -15,8 +15,8 @@ policy=dict( cuda=True, model=dict( - obs_shape=11, - action_shape=3, + obs_shape=17, + action_shape=6, ), learn=dict( data_path=None, diff --git a/dizoo/d4rl/config/walker2d_medium_replay_td3bc_config.py b/dizoo/d4rl/config/walker2d_medium_replay_td3bc_config.py old mode 100644 new mode 100755 index 67cc95a1c2..e532676db2 --- a/dizoo/d4rl/config/walker2d_medium_replay_td3bc_config.py +++ b/dizoo/d4rl/config/walker2d_medium_replay_td3bc_config.py @@ -5,7 +5,7 @@ main_config = dict( exp_name='walker2d_medium_replay_td3-bc_seed0', env=dict( - env_id='walker2d-medium-replay-v0', + env_id='walker2d-medium-replay-v2', norm_obs=dict( use_norm=True, offline_stats=dict(use_offline_stats=True, ), @@ -18,8 +18,8 @@ ), policy=dict( model=dict( - obs_shape=11, - action_shape=3, + obs_shape=17, + action_shape=6, ), learn=dict( train_epoch=30000, diff --git a/dizoo/d4rl/config/walker2d_medium_td3bc_config.py b/dizoo/d4rl/config/walker2d_medium_td3bc_config.py old mode 100644 new mode 100755 index dc76b5c012..3f9adc89d7 --- a/dizoo/d4rl/config/walker2d_medium_td3bc_config.py +++ b/dizoo/d4rl/config/walker2d_medium_td3bc_config.py @@ -5,7 +5,7 @@ main_config = dict( exp_name='walker2d_medium_td3-bc_seed0', env=dict( - env_id='walker2d-medium-v0', + env_id='walker2d-medium-v2', norm_obs=dict( use_norm=True, offline_stats=dict(use_offline_stats=True, ), @@ -18,8 +18,8 @@ ), policy=dict( model=dict( - obs_shape=11, - action_shape=3, + obs_shape=17, + action_shape=6, ), learn=dict( train_epoch=30000, diff --git a/dizoo/d4rl/config/walker2d_random_cql_config.py b/dizoo/d4rl/config/walker2d_random_cql_config.py old mode 100644 new mode 100755 index 271dbca013..346dd1a1f2 --- a/dizoo/d4rl/config/walker2d_random_cql_config.py +++ b/dizoo/d4rl/config/walker2d_random_cql_config.py @@ -5,7 +5,7 @@ main_config = dict( exp_name="walker2d_expert_cql_seed0", env=dict( - env_id='walker2d-expert-v0', + env_id='walker2d-expert-v2', collector_env_num=1, evaluator_env_num=8, use_act_scale=True, @@ -15,8 +15,8 @@ policy=dict( cuda=True, model=dict( - obs_shape=11, - action_shape=3, + obs_shape=17, + action_shape=6, ), learn=dict( data_path=None, diff --git a/dizoo/d4rl/config/walker2d_random_td3bc_config.py b/dizoo/d4rl/config/walker2d_random_td3bc_config.py old mode 100644 new mode 100755 index f252c14dbd..091744acda --- a/dizoo/d4rl/config/walker2d_random_td3bc_config.py +++ b/dizoo/d4rl/config/walker2d_random_td3bc_config.py @@ -5,7 +5,7 @@ 
main_config = dict( exp_name='walker2d_random_td3-bc_seed0', env=dict( - env_id='walker2d-random-v0', + env_id='walker2d-random-v2', norm_obs=dict( use_norm=True, offline_stats=dict(use_offline_stats=True, ), @@ -18,8 +18,8 @@ ), policy=dict( model=dict( - obs_shape=11, - action_shape=3, + obs_shape=17, + action_shape=6, ), learn=dict( train_epoch=30000, diff --git a/dizoo/d4rl/entry/d4rl_edac_main.py b/dizoo/d4rl/entry/d4rl_edac_main.py new file mode 100755 index 0000000000..9ee28815ab --- /dev/null +++ b/dizoo/d4rl/entry/d4rl_edac_main.py @@ -0,0 +1,21 @@ +from ding.entry import serial_pipeline_offline +from ding.config import read_config +from pathlib import Path + + +def train(args): + # launch from anywhere + config = Path(__file__).absolute().parent.parent / 'config' / args.config + config = read_config(str(config)) + config[0].exp_name = config[0].exp_name.replace('0', str(args.seed)) + serial_pipeline_offline(config, seed=args.seed) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('--seed', '-s', type=int, default=0) + parser.add_argument('--config', '-c', type=str, default='halfcheetah_medium_edac_config.py') + args = parser.parse_args() + train(args) \ No newline at end of file From 63c357b2d453f282dbebf865dbca409036dc367c Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Thu, 6 Apr 2023 20:01:10 +0800 Subject: [PATCH 02/25] modify edac --- ding/example/edac.py | 3 --- ding/model/common/head.py | 48 +++++++++++++++++++++++++++---------- ding/model/template/edac.py | 12 +++++----- ding/policy/edac.py | 40 +++++++++++++++---------------- 4 files changed, 62 insertions(+), 41 deletions(-) diff --git a/ding/example/edac.py b/ding/example/edac.py index 5ce16ee5a2..c1da13947e 100755 --- a/ding/example/edac.py +++ b/ding/example/edac.py @@ -11,9 +11,6 @@ from ding.utils import set_pkg_seed from dizoo.d4rl.envs import D4RLEnv from dizoo.d4rl.config.halfcheetah_medium_edac_config import main_config,create_config -# from dizoo.d4rl.config.halfcheetah_medium_expert_edac_config import main_config,create_config -# from dizoo.d4rl.config.hopper_medium_expert_edac_config import main_config,create_config -# from dizoo.d4rl.config.hopper_medium_edac_config import main_config,create_config def main(): diff --git a/ding/model/common/head.py b/ding/model/common/head.py index 760ecd264b..3538340b5a 100755 --- a/ding/model/common/head.py +++ b/ding/model/common/head.py @@ -1243,6 +1243,7 @@ def forward(self, x: torch.Tensor) -> Dict: """ return lists_to_dicts([m(x) for m in self.pred]) + class EnsembleHead(nn.Module): """ Overview: @@ -1252,22 +1253,44 @@ class EnsembleHead(nn.Module): Interfaces: ``__init__``, ``forward``. 
""" - def __init__(self, input_size: int, - output_size: int, - hidden_size: int, - layer_nun: int, - ensemble_num: int, - activation: Optional[nn.Module] = nn.ReLU(), - norm_type: Optional[str] = None) -> None: - super(EnsembleHead,self).__init__() + + def __init__( + self, + input_size: int, + output_size: int, + hidden_size: int, + layer_nun: int, + ensemble_num: int, + activation: Optional[nn.Module] = nn.ReLU(), + norm_type: Optional[str] = None + ) -> None: + super(EnsembleHead, self).__init__() d = input_size layers = [] for _ in range(layer_nun): - layers.append(conv1d_block(d * ensemble_num, hidden_size * ensemble_num, kernel_size=1, - stride=1, groups=ensemble_num, activation=activation, norm_type=norm_type)) + layers.append( + conv1d_block( + d * ensemble_num, + hidden_size * ensemble_num, + kernel_size=1, + stride=1, + groups=ensemble_num, + activation=activation, + norm_type=norm_type + ) + ) d = hidden_size - layers.append(conv1d_block(hidden_size * ensemble_num, output_size * ensemble_num, kernel_size=1, - stride=1, groups=ensemble_num, activation=None, norm_type=None)) + layers.append( + conv1d_block( + hidden_size * ensemble_num, + output_size * ensemble_num, + kernel_size=1, + stride=1, + groups=ensemble_num, + activation=None, + norm_type=None + ) + ) self.pred = nn.Sequential(*layers) def forward(self, x: torch.Tensor) -> Dict: @@ -1291,6 +1314,7 @@ def forward(self, x: torch.Tensor) -> Dict: x = self.pred(x).squeeze() return {'pred': x} + def independent_normal_dist(logits: Union[List, Dict]) -> torch.distributions.Distribution: if isinstance(logits, (list, tuple)): return Independent(Normal(*logits), 1) diff --git a/ding/model/template/edac.py b/ding/model/template/edac.py index cb3d4048c3..fa49b2404e 100755 --- a/ding/model/template/edac.py +++ b/ding/model/template/edac.py @@ -8,6 +8,7 @@ from ding.utils import MODEL_REGISTRY + @MODEL_REGISTRY.register('edac') class Q_ensemble(nn.Module): r""" @@ -80,7 +81,6 @@ def __init__( ) ) - def forward(self, inputs: Union[torch.Tensor, Dict[str, torch.Tensor]], mode: str) -> Dict[str, torch.Tensor]: """ Overview: @@ -167,14 +167,14 @@ def compute_critic(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Ten x = torch.cat([obs, action], dim=-1) if len(obs.shape) < 3: # [batch_size,dim] -> [batch_size,Ensemble_num * dim,1] - x = x.repeat(1,self.ensemble_num).unsqueeze(-1) + x = x.repeat(1, self.ensemble_num).unsqueeze(-1) else: # [Ensemble_num,batch_size,dim] -> [batch_size,Ensemble_num,dim] -> [batch_size,Ensemble_num * dim, 1] - x = x.transpose(0,1) + x = x.transpose(0, 1) batch_size = obs.shape[1] - x = x.reshape(batch_size,-1,1) + x = x.reshape(batch_size, -1, 1) # [Ensemble_num,batch_size,1] x = self.critic(x)['pred'] # [batch_size,1*Ensemble_num] -> [Ensemble_num,batch_size] - x = x.permute(1,0) - return {'q_value': x} \ No newline at end of file + x = x.permute(1, 0) + return {'q_value': x} diff --git a/ding/policy/edac.py b/ding/policy/edac.py index be5a9d23d1..7b46b1dd17 100755 --- a/ding/policy/edac.py +++ b/ding/policy/edac.py @@ -15,6 +15,7 @@ from .dqn import DQNPolicy from .common_utils import default_preprocess_learn + @POLICY_REGISTRY.register('edac') class EDACPolicy(SACPolicy): """ @@ -94,17 +95,13 @@ class EDACPolicy(SACPolicy): multi_gpu=False, update_per_collect=1, batch_size=256, - learning_rate_q=3e-4, learning_rate_policy=3e-4, learning_rate_value=3e-4, - learning_rate_alpha=3e-4, target_theta=0.005, discount_factor=0.99, - alpha=1, - auto_alpha=True, # (bool type) log_space: Determine whether to 
use auto `\alpha` in log space. log_space=True, @@ -244,7 +241,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: target_q_value = self._target_model.forward(next_data, mode='compute_critic')['q_value'] # the value of a policy according to the maximum entropy objective - target_q_value,_ = torch.min(target_q_value,dim=0) + target_q_value, _ = torch.min(target_q_value, dim=0) if self._with_q_entropy: target_q_value -= self._alpha * next_log_prob.squeeze(-1) target_q_value = self._gamma * (1 - done) * target_q_value + reward @@ -252,29 +249,32 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: weight = data['weight'] if weight is None: weight = torch.ones_like(q_value) - td_error_per_sample = nn.MSELoss(reduction='none')(q_value,target_q_value).mean(dim=1).sum() + td_error_per_sample = nn.MSELoss(reduction='none')(q_value, target_q_value).mean(dim=1).sum() loss_dict['critic_loss'] = (td_error_per_sample * weight).mean() - if self._eta > 0: # [batch_size,dim] -> [Ensemble_num,batch_size,dim] - pre_obs = obs.unsqueeze(0).repeat_interleave(self._cfg.model.ensemble_num,dim=0) - pre_acs = acs.unsqueeze(0).repeat_interleave(self._cfg.model.ensemble_num,dim=0).requires_grad_(True) + pre_obs = obs.unsqueeze(0).repeat_interleave(self._cfg.model.ensemble_num, dim=0) + pre_acs = acs.unsqueeze(0).repeat_interleave(self._cfg.model.ensemble_num, dim=0).requires_grad_(True) # [Ensemble_num,batch_size] - q_pred_tile = self._learn_model.forward({'obs':pre_obs,'action':pre_acs}, mode='compute_critic')['q_value'].requires_grad_(True) + q_pred_tile = self._learn_model.forward({ + 'obs': pre_obs, + 'action': pre_acs + }, mode='compute_critic')['q_value'].requires_grad_(True) - q_pred_grads = torch.autograd.grad(q_pred_tile.sum(),pre_acs,retain_graph=True,create_graph=True)[0] - q_pred_grads = q_pred_grads / (torch.norm(q_pred_grads,p=2,dim=2).unsqueeze(-1) + 1e-10) + q_pred_grads = torch.autograd.grad(q_pred_tile.sum(), pre_acs, retain_graph=True, create_graph=True)[0] + q_pred_grads = q_pred_grads / (torch.norm(q_pred_grads, p=2, dim=2).unsqueeze(-1) + 1e-10) # [Ensemble_num,batch_size,act_dim] -> [batch_size,Ensemble_num,act_dim] - q_pred_grads = q_pred_grads.transpose(0,1) + q_pred_grads = q_pred_grads.transpose(0, 1) - q_pred_grads = q_pred_grads @ q_pred_grads.permute(0,2,1) - masks = torch.eye(self._cfg.model.ensemble_num,device=obs.device).unsqueeze(dim=0).repeat(q_pred_grads.size(0),1,1) + q_pred_grads = q_pred_grads @ q_pred_grads.permute(0, 2, 1) + masks = torch.eye( + self._cfg.model.ensemble_num, device=obs.device + ).unsqueeze(dim=0).repeat(q_pred_grads.size(0), 1, 1) q_pred_grads = (1 - masks) * q_pred_grads - grad_loss = torch.mean(torch.sum(q_pred_grads,dim=(1,2))) / (self._cfg.model.ensemble_num - 1) - loss_dict['critic_loss'] += grad_loss - + grad_loss = torch.mean(torch.sum(q_pred_grads, dim=(1, 2))) / (self._cfg.model.ensemble_num - 1) + loss_dict['critic_loss'] += grad_loss * self._eta self._optimizer_q.zero_grad() loss_dict['critic_loss'].backward() @@ -290,7 +290,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: eval_data = {'obs': obs, 'action': action} new_q_value = self._learn_model.forward(eval_data, mode='compute_critic')['q_value'] - new_q_value,_ = torch.min(new_q_value,dim=0) + new_q_value, _ = torch.min(new_q_value, dim=0) # 8. 
compute policy loss policy_loss = (self._alpha * log_prob - new_q_value.unsqueeze(-1)).mean() @@ -337,4 +337,4 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: 'alpha': self._alpha.item(), 'target_q_value': target_q_value.detach().mean().item(), **loss_dict - } \ No newline at end of file + } From 7c525d80013979e2a14e523b7999f32dcfbd550b Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Fri, 7 Apr 2023 10:13:09 +0800 Subject: [PATCH 03/25] add conv1d --- ding/torch_utils/network/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ding/torch_utils/network/__init__.py b/ding/torch_utils/network/__init__.py index 7f520f7d19..bca314d952 100644 --- a/ding/torch_utils/network/__init__.py +++ b/ding/torch_utils/network/__init__.py @@ -1,7 +1,7 @@ from .activation import build_activation, Swish from .res_block import ResBlock, ResFCBlock from .nn_module import fc_block, conv2d_block, one_hot, deconv2d_block, BilinearUpsample, NearestUpsample, \ - binary_encode, NoiseLinearLayer, noise_block, MLP, Flatten, normed_linear, normed_conv2d + binary_encode, NoiseLinearLayer, noise_block, MLP, Flatten, normed_linear, normed_conv2d, conv1d_block from .normalization import build_normalization from .rnn import get_lstm, sequence_mask from .soft_argmax import SoftArgmax From adf62378eeae07378d976fca70f7283d9517b163 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Fri, 7 Apr 2023 11:14:17 +0800 Subject: [PATCH 04/25] add test_ensemble --- ding/model/common/tests/test_head.py | 9 +- ding/model/template/qac.py | 173 ++++++++++++++++++++++++++- 2 files changed, 180 insertions(+), 2 deletions(-) diff --git a/ding/model/common/tests/test_head.py b/ding/model/common/tests/test_head.py index 3c36640047..0fa78a343c 100644 --- a/ding/model/common/tests/test_head.py +++ b/ding/model/common/tests/test_head.py @@ -2,7 +2,7 @@ import numpy as np import pytest -from ding.model.common.head import DuelingHead, ReparameterizationHead, MultiHead, StochasticDuelingHead +from ding.model.common.head import DuelingHead, ReparameterizationHead, MultiHead, StochasticDuelingHead, EnsembleHead from ding.torch_utils import is_differentiable B = 4 @@ -84,3 +84,10 @@ def test_stochastic_dueling(self): assert isinstance(sigma.grad, torch.Tensor) assert outputs['q_value'].shape == (B, 1) assert outputs['v_value'].shape == (B, 1) + + def test_ensemble(self): + inputs = torch.randn(B, embedding_dim * 3, 1) + model = EnsembleHead(embedding_dim, action_shape, 3, 3,3) + outputs = model(inputs)['pred'] + self.output_check(model, outputs) + assert outputs.shape == (B, action_shape * 3, 1) diff --git a/ding/model/template/qac.py b/ding/model/template/qac.py index aa0cc42b0e..4107961d2b 100644 --- a/ding/model/template/qac.py +++ b/ding/model/template/qac.py @@ -6,7 +6,7 @@ from ding.utils import SequenceType, squeeze, MODEL_REGISTRY from ..common import RegressionHead, ReparameterizationHead, DiscreteHead, MultiHead, \ - FCEncoder, ConvEncoder + FCEncoder, ConvEncoder, EnsembleHead @MODEL_REGISTRY.register('qac') @@ -560,3 +560,174 @@ def compute_critic(self, inputs: Dict) -> Dict: else: x = self.critic(inputs['obs'])['logit'] return {'q_value': x} + + +@MODEL_REGISTRY.register('edac') +class Q_ensemble(nn.Module): + r""" + Overview: + The QAC network with ensemble, which is used in EDAC. 
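Note on the critic update in ding/policy/edac.py above: it combines a min-over-ensemble target with a gradient-diversity penalty scaled by eta. The following is a minimal, self-contained sketch of that penalty on plain tensors; the toy ensemble critic, sizes, and variable names are illustrative and are not DI-engine API.

import torch
import torch.nn as nn

# Toy ensemble critic: N independent Q-networks (a stand-in for the grouped-conv EnsembleHead).
N, B, obs_dim, act_dim = 10, 32, 17, 6
critics = nn.ModuleList(
    [nn.Sequential(nn.Linear(obs_dim + act_dim, 64), nn.ReLU(), nn.Linear(64, 1)) for _ in range(N)]
)

obs = torch.randn(B, obs_dim)
acs = torch.randn(B, act_dim)

# [B, dim] -> [N, B, dim], tracking gradients w.r.t. the tiled actions.
pre_obs = obs.unsqueeze(0).repeat_interleave(N, dim=0)
pre_acs = acs.unsqueeze(0).repeat_interleave(N, dim=0).requires_grad_(True)

# [N, B]: Q_i(s, a) for every ensemble member i.
q_pred_tile = torch.stack(
    [c(torch.cat([pre_obs[i], pre_acs[i]], dim=-1)).squeeze(-1) for i, c in enumerate(critics)]
)

# dQ_i/da, normalized to unit length per (member, sample).
grads = torch.autograd.grad(q_pred_tile.sum(), pre_acs, retain_graph=True, create_graph=True)[0]
grads = grads / (torch.norm(grads, p=2, dim=2).unsqueeze(-1) + 1e-10)

# [N, B, act_dim] -> [B, N, act_dim]; pairwise cosine similarities between members.
grads = grads.transpose(0, 1)
cos_sim = grads @ grads.permute(0, 2, 1)  # [B, N, N]
mask = torch.eye(N).unsqueeze(0).repeat(cos_sim.size(0), 1, 1)
cos_sim = (1 - mask) * cos_sim  # drop self-similarity on the diagonal

grad_loss = torch.mean(torch.sum(cos_sim, dim=(1, 2))) / (N - 1)
# In the policy this term is multiplied by eta and added to the critic loss.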
+ Interfaces: + ``__init__``, ``forward``, ``compute_actor``, ``compute_critic`` + """ + mode = ['compute_actor', 'compute_critic'] + + def __init__( + self, + obs_shape: Union[int, SequenceType], + action_shape: Union[int, SequenceType, EasyDict], + ensemble_num: int = 2, + actor_head_hidden_size: int = 64, + actor_head_layer_num: int = 1, + critic_head_hidden_size: int = 64, + critic_head_layer_num: int = 1, + activation: Optional[nn.Module] = nn.ReLU(), + norm_type: Optional[str] = None, + **kwargs + ) -> None: + """ + Overview: + Initailize the EDAC Model according to input arguments. + Arguments: + - obs_shape (:obj:`Union[int, SequenceType]`): Observation's shape, such as 128, (156, ). + - action_shape (:obj:`Union[int, SequenceType, EasyDict]`): Action's shape, such as 4, (3, ), \ + EasyDict({'action_type_shape': 3, 'action_args_shape': 4}). + - ensemble_num (:obj:`bool`): Q-net numble. + - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor head. + - actor_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ + for actor head. + - critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to critic head. + - critic_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ + for critic head. + - activation (:obj:`Optional[nn.Module]`): The type of activation function to use in ``MLP`` \ + after each FC layer, if ``None`` then default set to ``nn.ReLU()``. + - norm_type (:obj:`Optional[str]`): The type of normalization to after network layer (FC, Conv), \ + see ``ding.torch_utils.network`` for more details. + """ + super(Q_ensemble, self).__init__() + obs_shape: int = squeeze(obs_shape) + action_shape = squeeze(action_shape) + self.action_shape = action_shape + self.ensemble_num = ensemble_num + self.actor = nn.Sequential( + nn.Linear(obs_shape, actor_head_hidden_size), activation, + ReparameterizationHead( + actor_head_hidden_size, + action_shape, + actor_head_layer_num, + sigma_type='conditioned', + activation=activation, + norm_type=norm_type + ) + ) + + critic_input_size = obs_shape + action_shape + self.critic = nn.Sequential( + EnsembleHead( + critic_input_size, + 1, + critic_head_hidden_size, + critic_head_layer_num, + self.ensemble_num, + activation=activation, + norm_type=norm_type + ) + ) + + def forward(self, inputs: Union[torch.Tensor, Dict[str, torch.Tensor]], mode: str) -> Dict[str, torch.Tensor]: + """ + Overview: + The unique execution (forward) method of EDAC method, and one can indicate different modes to implement \ + different computation graph, including ``compute_actor`` and ``compute_critic`` in EDAC. + Mode compute_actor: + Arguments: + - inputs (:obj:`torch.Tensor`): Observation data, defaults to tensor. + Returns: + - output (:obj:`Dict`): Output dict data, including differnet key-values among distinct action_space. + Mode compute_critic: + Arguments: + - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. + Returns: + - output (:obj:`Dict`): Output dict data, including q_value tensor. + .. note:: + For specific examples, one can refer to API doc of ``compute_actor`` and ``compute_critic`` respectively. 
+ """ + assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) + return getattr(self, mode)(inputs) + + def compute_actor(self, obs: torch.Tensor) -> Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]: + """ + Overview: + The forward computation graph of compute_actor mode, uses observation tensor to produce actor output, + such as ``action``, ``logit`` and so on. + Arguments: + - obs (:obj:`torch.Tensor`): Observation tensor data, now supports a batch of 1-dim vector data, \ + i.e. ``(B, obs_shape)``. + Returns: + - outputs (:obj:`Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]`): Actor output varying \ + from action_space: ``reparameterization``. + ReturnsKeys (either): + - logit (:obj:`Dict[str, torch.Tensor]`): Reparameterization logit, usually in SAC. + - mu (:obj:`torch.Tensor`): Mean of parameterization gaussion distribution. + - sigma (:obj:`torch.Tensor`): Standard variation of parameterization gaussion distribution. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, N0)`, B is batch size and N0 corresponds to ``obs_shape``. + - action (:obj:`torch.Tensor`): :math:`(B, N1)`, B is batch size and N1 corresponds to ``action_shape``. + - logit.mu (:obj:`torch.Tensor`): :math:`(B, N1)`, B is batch size and N1 corresponds to ``action_shape``. + - logit.sigma (:obj:`torch.Tensor`): :math:`(B, N1)`, B is batch size. + - logit (:obj:`torch.Tensor`): :math:`(B, N2)`, B is batch size and N2 corresponds to \ + ``action_shape.action_type_shape``. + - action_args (:obj:`torch.Tensor`): :math:`(B, N3)`, B is batch size and N3 corresponds to \ + ``action_shape.action_args_shape``. + Examples: + >>> model = EDAC(64, 64,) + >>> obs = torch.randn(4, 64) + >>> actor_outputs = model(obs,'compute_actor') + >>> assert actor_outputs['logit'][0].shape == torch.Size([4, 64]) # mu + >>> actor_outputs['logit'][1].shape == torch.Size([4, 64]) # sigma + """ + x = self.actor(obs) + return {'logit': [x['mu'], x['sigma']]} + + def compute_critic(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + """ + Overview: + The forward computation graph of compute_critic mode, uses observation and action tensor to produce critic + output, such as ``q_value``. + Arguments: + - inputs (:obj:`Dict[str, torch.Tensor]`): Dict strcture of input data, including ``obs`` and ``action`` tensor + Returns: + - outputs (:obj:`Dict[str, torch.Tensor]`): Critic output, such as ``q_value``. + ArgumentsKeys: + - obs: (:obj:`torch.Tensor`): Observation tensor data, now supports a batch of 1-dim vector data. + - action (:obj:`Union[torch.Tensor, Dict]`): Continuous action with same size as ``action_shape``. + ReturnKeys: + - q_value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. + Shapes: + - obs (:obj:`torch.Tensor`): :math:`(B, N1)` or '(Ensemble_num, B, N1)', where B is batch size and N1 is ``obs_shape``. + - action (:obj:`torch.Tensor`): :math:`(B, N2)` or '(Ensemble_num, B, N2)', where B is batch size and N4 is ``action_shape``. + - q_value (:obj:`torch.Tensor`): :math:`(Ensemble_num, B)`, where B is batch size. + Examples: + >>> inputs = {'obs': torch.randn(4, 8), 'action': torch.randn(4, 1)} + >>> model = EDAC(obs_shape=(8, ),action_shape=1) + >>> model(inputs, mode='compute_critic')['q_value'] # q value + ... 
tensor([0.0773, 0.1639, 0.0917, 0.0370], grad_fn=) + """ + + obs, action = inputs['obs'], inputs['action'] + if len(action.shape) == 1: # (B, ) -> (B, 1) + action = action.unsqueeze(1) + x = torch.cat([obs, action], dim=-1) + if len(obs.shape) < 3: + # [batch_size,dim] -> [batch_size,Ensemble_num * dim,1] + x = x.repeat(1, self.ensemble_num).unsqueeze(-1) + else: + # [Ensemble_num,batch_size,dim] -> [batch_size,Ensemble_num,dim] -> [batch_size,Ensemble_num * dim, 1] + x = x.transpose(0, 1) + batch_size = obs.shape[1] + x = x.reshape(batch_size, -1, 1) + # [Ensemble_num,batch_size,1] + x = self.critic(x)['pred'] + # [batch_size,1*Ensemble_num] -> [Ensemble_num,batch_size] + x = x.permute(1, 0) + return {'q_value': x} From 6830d7602cc2b1688b0f76dd4ca4184a1771e80f Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Sat, 8 Apr 2023 10:38:59 +0800 Subject: [PATCH 05/25] add encoder --- ding/model/template/qac.py | 43 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) mode change 100644 => 100755 ding/model/template/qac.py diff --git a/ding/model/template/qac.py b/ding/model/template/qac.py old mode 100644 new mode 100755 index 4107961d2b..138bd7bbc6 --- a/ding/model/template/qac.py +++ b/ding/model/template/qac.py @@ -583,6 +583,8 @@ def __init__( critic_head_layer_num: int = 1, activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, + encoder_hidden_size_list: Optional[SequenceType] = [32, 64, 256], + share_encoder: Optional[bool] = False, **kwargs ) -> None: """ @@ -609,6 +611,37 @@ def __init__( action_shape = squeeze(action_shape) self.action_shape = action_shape self.ensemble_num = ensemble_num + + if np.isscalar(obs_shape) or len(obs_shape) == 1: + assert not self.share_encoder, "Vector observation doesn't need share encoder." 
+ self.encoder = None + self.input_size = obs_shape + elif len(obs_shape) == 3: + + def setup_conv_encoder(): + kernel_size = [3 for _ in range(len(encoder_hidden_size_list))] + stride = [2] + [1 for _ in range(len(encoder_hidden_size_list) - 1)] + return ConvEncoder( + obs_shape, + encoder_hidden_size_list, + activation=activation, + norm_type=norm_type, + kernel_size=kernel_size, + stride=stride + ) + + if self.share_encoder: + self.encoder = setup_conv_encoder() + self.input_size = self.encoder.output_size + else: + self.encoder = nn.ModuleDict({ + 'actor': setup_conv_encoder(), + 'critic': setup_conv_encoder(), + }) + self.input_size = self.encoder['actor'].output_size + else: + raise RuntimeError("not support observation shape: {}".format(obs_shape)) + self.actor = nn.Sequential( nn.Linear(obs_shape, actor_head_hidden_size), activation, ReparameterizationHead( @@ -686,6 +719,11 @@ def compute_actor(self, obs: torch.Tensor) -> Dict[str, Union[torch.Tensor, Dict >>> assert actor_outputs['logit'][0].shape == torch.Size([4, 64]) # mu >>> actor_outputs['logit'][1].shape == torch.Size([4, 64]) # sigma """ + if self.encoder is not None: + if self.share_encoder: + obs = self.encoder(obs) + else: + obs = self.encoder['actor'](obs) x = self.actor(obs) return {'logit': [x['mu'], x['sigma']]} @@ -715,6 +753,11 @@ def compute_critic(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Ten """ obs, action = inputs['obs'], inputs['action'] + if self.encoder is not None: + if self.share_encoder: + obs = self.encoder(obs) + else: + obs = self.encoder['critic'](obs) if len(action.shape) == 1: # (B, ) -> (B, 1) action = action.unsqueeze(1) x = torch.cat([obs, action], dim=-1) From 9bf07f362ceb5fb5f50f50d99f6a1526d8f5e8d2 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Sat, 8 Apr 2023 11:25:40 +0800 Subject: [PATCH 06/25] add encoder --- ding/model/template/qac.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ding/model/template/qac.py b/ding/model/template/qac.py index 138bd7bbc6..1fa44963b8 100755 --- a/ding/model/template/qac.py +++ b/ding/model/template/qac.py @@ -611,6 +611,7 @@ def __init__( action_shape = squeeze(action_shape) self.action_shape = action_shape self.ensemble_num = ensemble_num + self.share_encoder = share_encoder if np.isscalar(obs_shape) or len(obs_shape) == 1: assert not self.share_encoder, "Vector observation doesn't need share encoder." 
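With the encoder handling above in place, the registered model can be exercised end to end. A short usage sketch consistent with the docstrings earlier in this series (the class is named Q_ensemble at this point and is renamed QACEnsemble in a later commit); the batch size and shapes below are illustrative:

import torch
from ding.model import Q_ensemble  # defined in ding/model/template/qac.py at this point in the series

model = Q_ensemble(obs_shape=8, action_shape=1, ensemble_num=10)
obs = torch.randn(4, 8)

# compute_actor: (mu, sigma) of the Gaussian policy head.
mu, sigma = model(obs, mode='compute_actor')['logit']
assert mu.shape == (4, 1) and sigma.shape == (4, 1)

# compute_critic: one Q-value per ensemble member.
inputs = {'obs': obs, 'action': torch.randn(4, 1)}
q_value = model(inputs, mode='compute_critic')['q_value']
assert q_value.shape == (10, 4)  # [ensemble_num, batch_size]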
@@ -643,7 +644,7 @@ def setup_conv_encoder(): raise RuntimeError("not support observation shape: {}".format(obs_shape)) self.actor = nn.Sequential( - nn.Linear(obs_shape, actor_head_hidden_size), activation, + nn.Linear(self.input_size, actor_head_hidden_size), activation, ReparameterizationHead( actor_head_hidden_size, action_shape, @@ -654,7 +655,7 @@ def setup_conv_encoder(): ) ) - critic_input_size = obs_shape + action_shape + critic_input_size = self.input_size + action_shape self.critic = nn.Sequential( EnsembleHead( critic_input_size, From 261820027a0629bdb23ef942121a6bd5b6af234c Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Sat, 8 Apr 2023 11:29:17 +0800 Subject: [PATCH 07/25] add encoder --- ding/model/template/qac.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ding/model/template/qac.py b/ding/model/template/qac.py index 1fa44963b8..fd504e0421 100755 --- a/ding/model/template/qac.py +++ b/ding/model/template/qac.py @@ -612,6 +612,7 @@ def __init__( self.action_shape = action_shape self.ensemble_num = ensemble_num self.share_encoder = share_encoder + encoder_hidden_size_list = encoder_hidden_size_list * ensemble_num if np.isscalar(obs_shape) or len(obs_shape) == 1: assert not self.share_encoder, "Vector observation doesn't need share encoder." From c55294ec6c2bf5a755787639f958b5929e580846 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Mon, 10 Apr 2023 15:02:35 +0800 Subject: [PATCH 08/25] modify policy_init --- ding/policy/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ding/policy/__init__.py b/ding/policy/__init__.py index 0d8924857b..02eac73eee 100755 --- a/ding/policy/__init__.py +++ b/ding/policy/__init__.py @@ -50,7 +50,6 @@ from .pc import ProcedureCloningBFSPolicy from .edac import EDACPolicy -from .bcq import BCQPolicy # new-type policy from .ppof import PPOFPolicy From febde4856d72c8e0be8e824e323cc224d6395eea Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Tue, 11 Apr 2023 15:37:39 +0800 Subject: [PATCH 09/25] modify edac --- README.md | 1 + ding/example/edac.py | 2 +- ding/model/common/head.py | 18 +- ding/model/template/__init__.py | 5 +- ding/model/template/qac.py | 218 +----------------- ding/policy/edac.py | 64 +---- .../config/halfcheetah_medium_edac_config.py | 2 - .../halfcheetah_medium_expert_edac_config.py | 2 - .../d4rl/config/hopper_medium_edac_config.py | 2 - .../hopper_medium_expert_edac_config.py | 2 - 10 files changed, 22 insertions(+), 294 deletions(-) diff --git a/README.md b/README.md index e2d5e10cc5..384bfc0bb1 100644 --- a/README.md +++ b/README.md @@ -248,6 +248,7 @@ P.S: The `.py` file in `Runnable Demo` can be found in `dizoo` | 50 | [ST-DIM](https://arxiv.org/pdf/1906.08226.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [torch_utils/loss/contrastive_loss](https://github.com/opendilab/DI-engine/blob/main/ding/torch_utils/loss/contrastive_loss.py) | ding -m serial -c cartpole_dqn_stdim_config.py -s 0 | | 51 | [PLR](https://arxiv.org/pdf/2010.03934.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [PLR doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/plr.html)
[data/level_replay/level_sampler](https://github.com/opendilab/DI-engine/blob/main/ding/data/level_replay/level_sampler.py) | python3 -u bigfish_plr_config.py -s 0 | | 52 | [PCGrad](https://arxiv.org/pdf/2001.06782.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [torch_utils/optimizer_helper/PCGrad](https://github.com/opendilab/DI-engine/blob/main/ding/data/torch_utils/optimizer_helper.py) | python3 -u multi_mnist_pcgrad_main.py -s 0 | +| 53 | [edac](https://arxiv.org/pdf/2110.01548.pdf) | ![offline](https://img.shields.io/badge/-offlineRL-darkblue) | [EDAC doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/edac.html)
[policy/cql](https://github.com/opendilab/DI-engine/blob/main/ding/policy/edac.py) | python3 -u d4rl_edac_main.py | diff --git a/ding/example/edac.py b/ding/example/edac.py index c1da13947e..53bdbdbe14 100755 --- a/ding/example/edac.py +++ b/ding/example/edac.py @@ -33,7 +33,7 @@ def main(): task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) task.use(offline_data_fetcher(cfg, dataset)) task.use(trainer(cfg, policy.learn_mode)) - task.use(CkptSaver(policy, cfg.exp_name, train_freq=100)) + task.use(CkptSaver(policy, cfg.exp_name, train_freq=1e4)) task.use(offline_logger()) task.run() diff --git a/ding/model/common/head.py b/ding/model/common/head.py index 1f376788ee..e8b6b052b9 100755 --- a/ding/model/common/head.py +++ b/ding/model/common/head.py @@ -1318,12 +1318,12 @@ def forward(self, x: torch.Tensor) -> Dict: class EnsembleHead(nn.Module): """ - Overview: - The ``EnsembleHead`` used to output action Q-value for Q-ensemble. \ - Input is a (:obj:`torch.Tensor`) of shape ''(B, N * Ensemble_num, 1)'' and returns a (:obj:`Dict`) containing \ - output ``pred``. - Interfaces: - ``__init__``, ``forward``. + Overview: + The ``EnsembleHead`` used to output action Q-value for Q-ensemble. \ + Input is a (:obj:`torch.Tensor`) of shape ''(B, N * Ensemble_num, 1)'' and returns a (:obj:`Dict`) containing \ + output ``pred``. + Interfaces: + ``__init__``, ``forward``. """ def __init__( @@ -1331,7 +1331,7 @@ def __init__( input_size: int, output_size: int, hidden_size: int, - layer_nun: int, + layer_num: int, ensemble_num: int, activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None @@ -1339,7 +1339,7 @@ def __init__( super(EnsembleHead, self).__init__() d = input_size layers = [] - for _ in range(layer_nun): + for _ in range(layer_num): layers.append( conv1d_block( d * ensemble_num, @@ -1352,6 +1352,8 @@ def __init__( ) ) d = hidden_size + + # Adding activation for last layer will lead to train fail layers.append( conv1d_block( hidden_size * ensemble_num, diff --git a/ding/model/template/__init__.py b/ding/model/template/__init__.py index 494d091f45..8cba175c1b 100755 --- a/ding/model/template/__init__.py +++ b/ding/model/template/__init__.py @@ -1,6 +1,6 @@ # general from .q_learning import DQN, RainbowDQN, QRDQN, IQN, FQF, DRQN, C51DQN, BDQ -from .qac import QAC, DiscreteQAC, Q_ensemble +from .qac import QAC, DiscreteQAC from .pdqn import PDQN from .vac import VAC from .bc import DiscreteBC, ContinuousBC @@ -22,4 +22,5 @@ from .madqn import MADQN from .vae import VanillaVAE from .decision_transformer import DecisionTransformer -from .procedure_cloning import ProcedureCloningMCTS, ProcedureCloningBFS \ No newline at end of file +from .procedure_cloning import ProcedureCloningMCTS, ProcedureCloningBFS +from .edac import Q_ensemble \ No newline at end of file diff --git a/ding/model/template/qac.py b/ding/model/template/qac.py index fd504e0421..f3f069d8b2 100755 --- a/ding/model/template/qac.py +++ b/ding/model/template/qac.py @@ -559,220 +559,4 @@ def compute_critic(self, inputs: Dict) -> Dict: x = [m(inputs['obs'])['logit'] for m in self.critic] else: x = self.critic(inputs['obs'])['logit'] - return {'q_value': x} - - -@MODEL_REGISTRY.register('edac') -class Q_ensemble(nn.Module): - r""" - Overview: - The QAC network with ensemble, which is used in EDAC. 
- Interfaces: - ``__init__``, ``forward``, ``compute_actor``, ``compute_critic`` - """ - mode = ['compute_actor', 'compute_critic'] - - def __init__( - self, - obs_shape: Union[int, SequenceType], - action_shape: Union[int, SequenceType, EasyDict], - ensemble_num: int = 2, - actor_head_hidden_size: int = 64, - actor_head_layer_num: int = 1, - critic_head_hidden_size: int = 64, - critic_head_layer_num: int = 1, - activation: Optional[nn.Module] = nn.ReLU(), - norm_type: Optional[str] = None, - encoder_hidden_size_list: Optional[SequenceType] = [32, 64, 256], - share_encoder: Optional[bool] = False, - **kwargs - ) -> None: - """ - Overview: - Initailize the EDAC Model according to input arguments. - Arguments: - - obs_shape (:obj:`Union[int, SequenceType]`): Observation's shape, such as 128, (156, ). - - action_shape (:obj:`Union[int, SequenceType, EasyDict]`): Action's shape, such as 4, (3, ), \ - EasyDict({'action_type_shape': 3, 'action_args_shape': 4}). - - ensemble_num (:obj:`bool`): Q-net numble. - - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor head. - - actor_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ - for actor head. - - critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to critic head. - - critic_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ - for critic head. - - activation (:obj:`Optional[nn.Module]`): The type of activation function to use in ``MLP`` \ - after each FC layer, if ``None`` then default set to ``nn.ReLU()``. - - norm_type (:obj:`Optional[str]`): The type of normalization to after network layer (FC, Conv), \ - see ``ding.torch_utils.network`` for more details. - """ - super(Q_ensemble, self).__init__() - obs_shape: int = squeeze(obs_shape) - action_shape = squeeze(action_shape) - self.action_shape = action_shape - self.ensemble_num = ensemble_num - self.share_encoder = share_encoder - encoder_hidden_size_list = encoder_hidden_size_list * ensemble_num - - if np.isscalar(obs_shape) or len(obs_shape) == 1: - assert not self.share_encoder, "Vector observation doesn't need share encoder." 
- self.encoder = None - self.input_size = obs_shape - elif len(obs_shape) == 3: - - def setup_conv_encoder(): - kernel_size = [3 for _ in range(len(encoder_hidden_size_list))] - stride = [2] + [1 for _ in range(len(encoder_hidden_size_list) - 1)] - return ConvEncoder( - obs_shape, - encoder_hidden_size_list, - activation=activation, - norm_type=norm_type, - kernel_size=kernel_size, - stride=stride - ) - - if self.share_encoder: - self.encoder = setup_conv_encoder() - self.input_size = self.encoder.output_size - else: - self.encoder = nn.ModuleDict({ - 'actor': setup_conv_encoder(), - 'critic': setup_conv_encoder(), - }) - self.input_size = self.encoder['actor'].output_size - else: - raise RuntimeError("not support observation shape: {}".format(obs_shape)) - - self.actor = nn.Sequential( - nn.Linear(self.input_size, actor_head_hidden_size), activation, - ReparameterizationHead( - actor_head_hidden_size, - action_shape, - actor_head_layer_num, - sigma_type='conditioned', - activation=activation, - norm_type=norm_type - ) - ) - - critic_input_size = self.input_size + action_shape - self.critic = nn.Sequential( - EnsembleHead( - critic_input_size, - 1, - critic_head_hidden_size, - critic_head_layer_num, - self.ensemble_num, - activation=activation, - norm_type=norm_type - ) - ) - - def forward(self, inputs: Union[torch.Tensor, Dict[str, torch.Tensor]], mode: str) -> Dict[str, torch.Tensor]: - """ - Overview: - The unique execution (forward) method of EDAC method, and one can indicate different modes to implement \ - different computation graph, including ``compute_actor`` and ``compute_critic`` in EDAC. - Mode compute_actor: - Arguments: - - inputs (:obj:`torch.Tensor`): Observation data, defaults to tensor. - Returns: - - output (:obj:`Dict`): Output dict data, including differnet key-values among distinct action_space. - Mode compute_critic: - Arguments: - - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. - Returns: - - output (:obj:`Dict`): Output dict data, including q_value tensor. - .. note:: - For specific examples, one can refer to API doc of ``compute_actor`` and ``compute_critic`` respectively. - """ - assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) - return getattr(self, mode)(inputs) - - def compute_actor(self, obs: torch.Tensor) -> Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]: - """ - Overview: - The forward computation graph of compute_actor mode, uses observation tensor to produce actor output, - such as ``action``, ``logit`` and so on. - Arguments: - - obs (:obj:`torch.Tensor`): Observation tensor data, now supports a batch of 1-dim vector data, \ - i.e. ``(B, obs_shape)``. - Returns: - - outputs (:obj:`Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]`): Actor output varying \ - from action_space: ``reparameterization``. - ReturnsKeys (either): - - logit (:obj:`Dict[str, torch.Tensor]`): Reparameterization logit, usually in SAC. - - mu (:obj:`torch.Tensor`): Mean of parameterization gaussion distribution. - - sigma (:obj:`torch.Tensor`): Standard variation of parameterization gaussion distribution. - Shapes: - - obs (:obj:`torch.Tensor`): :math:`(B, N0)`, B is batch size and N0 corresponds to ``obs_shape``. - - action (:obj:`torch.Tensor`): :math:`(B, N1)`, B is batch size and N1 corresponds to ``action_shape``. - - logit.mu (:obj:`torch.Tensor`): :math:`(B, N1)`, B is batch size and N1 corresponds to ``action_shape``. - - logit.sigma (:obj:`torch.Tensor`): :math:`(B, N1)`, B is batch size. 
- - logit (:obj:`torch.Tensor`): :math:`(B, N2)`, B is batch size and N2 corresponds to \ - ``action_shape.action_type_shape``. - - action_args (:obj:`torch.Tensor`): :math:`(B, N3)`, B is batch size and N3 corresponds to \ - ``action_shape.action_args_shape``. - Examples: - >>> model = EDAC(64, 64,) - >>> obs = torch.randn(4, 64) - >>> actor_outputs = model(obs,'compute_actor') - >>> assert actor_outputs['logit'][0].shape == torch.Size([4, 64]) # mu - >>> actor_outputs['logit'][1].shape == torch.Size([4, 64]) # sigma - """ - if self.encoder is not None: - if self.share_encoder: - obs = self.encoder(obs) - else: - obs = self.encoder['actor'](obs) - x = self.actor(obs) - return {'logit': [x['mu'], x['sigma']]} - - def compute_critic(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: - """ - Overview: - The forward computation graph of compute_critic mode, uses observation and action tensor to produce critic - output, such as ``q_value``. - Arguments: - - inputs (:obj:`Dict[str, torch.Tensor]`): Dict strcture of input data, including ``obs`` and ``action`` tensor - Returns: - - outputs (:obj:`Dict[str, torch.Tensor]`): Critic output, such as ``q_value``. - ArgumentsKeys: - - obs: (:obj:`torch.Tensor`): Observation tensor data, now supports a batch of 1-dim vector data. - - action (:obj:`Union[torch.Tensor, Dict]`): Continuous action with same size as ``action_shape``. - ReturnKeys: - - q_value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. - Shapes: - - obs (:obj:`torch.Tensor`): :math:`(B, N1)` or '(Ensemble_num, B, N1)', where B is batch size and N1 is ``obs_shape``. - - action (:obj:`torch.Tensor`): :math:`(B, N2)` or '(Ensemble_num, B, N2)', where B is batch size and N4 is ``action_shape``. - - q_value (:obj:`torch.Tensor`): :math:`(Ensemble_num, B)`, where B is batch size. - Examples: - >>> inputs = {'obs': torch.randn(4, 8), 'action': torch.randn(4, 1)} - >>> model = EDAC(obs_shape=(8, ),action_shape=1) - >>> model(inputs, mode='compute_critic')['q_value'] # q value - ... tensor([0.0773, 0.1639, 0.0917, 0.0370], grad_fn=) - """ - - obs, action = inputs['obs'], inputs['action'] - if self.encoder is not None: - if self.share_encoder: - obs = self.encoder(obs) - else: - obs = self.encoder['critic'](obs) - if len(action.shape) == 1: # (B, ) -> (B, 1) - action = action.unsqueeze(1) - x = torch.cat([obs, action], dim=-1) - if len(obs.shape) < 3: - # [batch_size,dim] -> [batch_size,Ensemble_num * dim,1] - x = x.repeat(1, self.ensemble_num).unsqueeze(-1) - else: - # [Ensemble_num,batch_size,dim] -> [batch_size,Ensemble_num,dim] -> [batch_size,Ensemble_num * dim, 1] - x = x.transpose(0, 1) - batch_size = obs.shape[1] - x = x.reshape(batch_size, -1, 1) - # [Ensemble_num,batch_size,1] - x = self.critic(x)['pred'] - # [batch_size,1*Ensemble_num] -> [Ensemble_num,batch_size] - x = x.permute(1, 0) - return {'q_value': x} + return {'q_value': x} \ No newline at end of file diff --git a/ding/policy/edac.py b/ding/policy/edac.py index 7b46b1dd17..eb64f8c5ca 100755 --- a/ding/policy/edac.py +++ b/ding/policy/edac.py @@ -20,7 +20,7 @@ class EDACPolicy(SACPolicy): """ Overview: - Policy class of EDAC algorithm. + Policy class of EDAC algorithm. 
https://arxiv.org/pdf/2110.01548.pdf Config: == ==================== ======== ============= ================================= ======================= @@ -68,7 +68,8 @@ class EDACPolicy(SACPolicy): == ==================== ======== ============= ================================= ======================= """ config = dict( - type='sac', + # (str) RL policy register name + type='edac', cuda=False, on_policy=False, multi_agent=False, @@ -79,7 +80,7 @@ class EDACPolicy(SACPolicy): # (bool type) ensemble_num:num of Q-network. ensemble_num=10, # (bool type) value_network: Determine whether to use value network as the - # original SAC paper (arXiv 1801.01290). + # original EDAC paper (arXiv 2110.01548). # using value_network needs to set learning_rate_value, learning_rate_q, # and learning_rate_policy in `cfg.policy.learn`. # Default to False. @@ -144,63 +145,9 @@ def _init_learn(self) -> None: Learn mode init method. Called by ``self.__init__``. Init q, value and policy's optimizers, algorithm config, main and target models. """ - # Init - self._priority = self._cfg.priority - self._priority_IS_weight = self._cfg.priority_IS_weight + # EDAC special implementation self._eta = self._cfg.learn.eta - self._with_q_entropy = self._cfg.learn.with_q_entropy - self._value_network = False - self._optimizer_q = Adam( - self._model.critic.parameters(), - lr=self._cfg.learn.learning_rate_q, - ) - self._optimizer_policy = Adam( - self._model.actor.parameters(), - lr=self._cfg.learn.learning_rate_policy, - ) - - self._gamma = self._cfg.learn.discount_factor - - init_w = self._cfg.learn.init_w - self._model.actor[2].mu.weight.data.uniform_(-init_w, init_w) - self._model.actor[2].mu.bias.data.uniform_(-init_w, init_w) - self._model.actor[2].log_sigma_layer.weight.data.uniform_(-init_w, init_w) - self._model.actor[2].log_sigma_layer.bias.data.uniform_(-init_w, init_w) - - if self._cfg.learn.auto_alpha: - self._target_entropy = -np.prod(self._cfg.model.action_shape) - if self._cfg.learn.log_space: - self._log_alpha = torch.log(torch.FloatTensor([self._cfg.learn.alpha])) - self._log_alpha = self._log_alpha.to(self._device).requires_grad_() - self._alpha_optim = torch.optim.Adam([self._log_alpha], lr=self._cfg.learn.learning_rate_alpha) - assert self._log_alpha.shape == torch.Size([1]) and self._log_alpha.requires_grad - self._alpha = self._log_alpha.detach().exp() - self._auto_alpha = True - self._log_space = True - else: - self._alpha = torch.FloatTensor([self._cfg.learn.alpha]).to(self._device).requires_grad_() - self._alpha_optim = torch.optim.Adam([self._alpha], lr=self._cfg.learn.learning_rate_alpha) - self._auto_alpha = True - self._log_space = False - else: - self._alpha = torch.tensor( - [self._cfg.learn.alpha], requires_grad=False, device=self._device, dtype=torch.float32 - ) - self._auto_alpha = False - - self._target_model = copy.deepcopy(self._model) - self._target_model = model_wrap( - self._target_model, - wrapper_name='target', - update_type='momentum', - update_kwargs={'theta': self._cfg.learn.target_theta} - ) - self._learn_model = model_wrap(self._model, wrapper_name='base') - self._learn_model.reset() - self._target_model.reset() - - self._forward_learn_cnt = 0 def _forward_learn(self, data: dict) -> Dict[str, Any]: loss_dict = {} @@ -252,6 +199,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: td_error_per_sample = nn.MSELoss(reduction='none')(q_value, target_q_value).mean(dim=1).sum() loss_dict['critic_loss'] = (td_error_per_sample * weight).mean() + # penalty term of EDAC if self._eta 
> 0: # [batch_size,dim] -> [Ensemble_num,batch_size,dim] pre_obs = obs.unsqueeze(0).repeat_interleave(self._cfg.model.ensemble_num, dim=0) diff --git a/dizoo/d4rl/config/halfcheetah_medium_edac_config.py b/dizoo/d4rl/config/halfcheetah_medium_edac_config.py index abce76c71e..b2774cea54 100755 --- a/dizoo/d4rl/config/halfcheetah_medium_edac_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_edac_config.py @@ -36,7 +36,6 @@ ), collect=dict(data_type='d4rl', ), eval=dict(evaluator=dict(eval_freq=500, )), - other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), ), seed = 0, ) @@ -54,7 +53,6 @@ type='edac', import_names=['ding.policy.edac'], ), - replay_buffer=dict(type='naive', ), ) create_config = EasyDict(create_config) create_config = create_config \ No newline at end of file diff --git a/dizoo/d4rl/config/halfcheetah_medium_expert_edac_config.py b/dizoo/d4rl/config/halfcheetah_medium_expert_edac_config.py index be066afae2..3e6839188e 100755 --- a/dizoo/d4rl/config/halfcheetah_medium_expert_edac_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_expert_edac_config.py @@ -36,7 +36,6 @@ ), collect=dict(data_type='d4rl', ), eval=dict(evaluator=dict(eval_freq=500, )), - other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), ), seed = 123, ) @@ -54,7 +53,6 @@ type='edac', import_names=['ding.policy.edac'], ), - replay_buffer=dict(type='naive', ), ) create_config = EasyDict(create_config) create_config = create_config \ No newline at end of file diff --git a/dizoo/d4rl/config/hopper_medium_edac_config.py b/dizoo/d4rl/config/hopper_medium_edac_config.py index 21b5c10eb6..15a78938d0 100755 --- a/dizoo/d4rl/config/hopper_medium_edac_config.py +++ b/dizoo/d4rl/config/hopper_medium_edac_config.py @@ -36,7 +36,6 @@ ), collect=dict(data_type='d4rl', ), eval=dict(evaluator=dict(eval_freq=500, )), - other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), ), seed = 0, ) @@ -54,7 +53,6 @@ type='edac', import_names=['ding.policy.edac'], ), - replay_buffer=dict(type='naive', ), ) create_config = EasyDict(create_config) create_config = create_config \ No newline at end of file diff --git a/dizoo/d4rl/config/hopper_medium_expert_edac_config.py b/dizoo/d4rl/config/hopper_medium_expert_edac_config.py index 22131544f6..a7b1902c6d 100755 --- a/dizoo/d4rl/config/hopper_medium_expert_edac_config.py +++ b/dizoo/d4rl/config/hopper_medium_expert_edac_config.py @@ -36,7 +36,6 @@ ), collect=dict(data_type='d4rl', ), eval=dict(evaluator=dict(eval_freq=500, )), - other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), ), seed = 0, ) @@ -54,7 +53,6 @@ type='edac', import_names=['ding.policy.edac'], ), - replay_buffer=dict(type='naive', ), ) create_config = EasyDict(create_config) create_config = create_config \ No newline at end of file From 5d0cb0fe6da1f493b52bac73cd3331718c31f63e Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Tue, 11 Apr 2023 15:45:57 +0800 Subject: [PATCH 10/25] add init --- ding/policy/edac.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ding/policy/edac.py b/ding/policy/edac.py index eb64f8c5ca..323a944378 100755 --- a/ding/policy/edac.py +++ b/ding/policy/edac.py @@ -145,6 +145,7 @@ def _init_learn(self) -> None: Learn mode init method. Called by ``self.__init__``. Init q, value and policy's optimizers, algorithm config, main and target models. 
""" + super()._init_learn() # EDAC special implementation self._eta = self._cfg.learn.eta From 6ef0ae2d92af865229b8ac1758f441294d9e6680 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Tue, 11 Apr 2023 22:09:07 +0800 Subject: [PATCH 11/25] modify td3_bc and readme --- README.md | 2 +- ding/policy/td3_bc.py | 7 +++++++ dizoo/d4rl/config/hopper_expert_td3bc_config.py | 6 +----- dizoo/d4rl/config/hopper_medium_expert_td3bc_config.py | 6 +----- dizoo/d4rl/config/hopper_medium_replay_td3bc_config.py | 6 +----- dizoo/d4rl/config/hopper_medium_td3bc_config.py | 6 +----- 6 files changed, 12 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 384bfc0bb1..6cbf35cb7d 100644 --- a/README.md +++ b/README.md @@ -248,7 +248,7 @@ P.S: The `.py` file in `Runnable Demo` can be found in `dizoo` | 50 | [ST-DIM](https://arxiv.org/pdf/1906.08226.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [torch_utils/loss/contrastive_loss](https://github.com/opendilab/DI-engine/blob/main/ding/torch_utils/loss/contrastive_loss.py) | ding -m serial -c cartpole_dqn_stdim_config.py -s 0 | | 51 | [PLR](https://arxiv.org/pdf/2010.03934.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [PLR doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/plr.html)
[data/level_replay/level_sampler](https://github.com/opendilab/DI-engine/blob/main/ding/data/level_replay/level_sampler.py) | python3 -u bigfish_plr_config.py -s 0 | | 52 | [PCGrad](https://arxiv.org/pdf/2001.06782.pdf) | ![other](https://img.shields.io/badge/-other-lightgrey) | [torch_utils/optimizer_helper/PCGrad](https://github.com/opendilab/DI-engine/blob/main/ding/data/torch_utils/optimizer_helper.py) | python3 -u multi_mnist_pcgrad_main.py -s 0 | -| 53 | [edac](https://arxiv.org/pdf/2110.01548.pdf) | ![offline](https://img.shields.io/badge/-offlineRL-darkblue) | [EDAC doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/edac.html)
[policy/cql](https://github.com/opendilab/DI-engine/blob/main/ding/policy/edac.py) | python3 -u d4rl_edac_main.py | +| 53 | [edac](https://arxiv.org/pdf/2110.01548.pdf) | ![offline](https://img.shields.io/badge/-offlineRL-darkblue) | [EDAC doc](https://di-engine-docs.readthedocs.io/en/latest/12_policies/edac.html)
[policy/edac](https://github.com/opendilab/DI-engine/blob/main/ding/policy/edac.py) | python3 -u d4rl_edac_main.py | diff --git a/ding/policy/td3_bc.py b/ding/policy/td3_bc.py index 6441990790..97c19a70d6 100644 --- a/ding/policy/td3_bc.py +++ b/ding/policy/td3_bc.py @@ -198,6 +198,9 @@ def _init_learn(self) -> None: clip_value=1.0, ) + self.noise_sigma = self._cfg.learn.noise_sigma + self.noise_range = self._cfg.learn.noise_range + def _forward_learn(self, data: dict) -> Dict[str, Any]: r""" Overview: @@ -237,6 +240,10 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: # target q value. with torch.no_grad(): next_action = self._target_model.forward(next_obs, mode='compute_actor')['action'] + noise = ( + torch.randn_like(next_action) * self.noise_sigma + ).clamp(self.noise_range['min'], self.noise_range['max']) + next_action = (next_action + noise).clamp(-1, 1) next_data = {'obs': next_obs, 'action': next_action} target_q_value = self._target_model.forward(next_data, mode='compute_critic')['q_value'] if self._twin_critic: diff --git a/dizoo/d4rl/config/hopper_expert_td3bc_config.py b/dizoo/d4rl/config/hopper_expert_td3bc_config.py index b0874a0018..446a897d1b 100644 --- a/dizoo/d4rl/config/hopper_expert_td3bc_config.py +++ b/dizoo/d4rl/config/hopper_expert_td3bc_config.py @@ -5,11 +5,7 @@ main_config = dict( exp_name='hopper_expert_td3-bc_seed0', env=dict( - env_id='hopper-expert-v0', - norm_obs=dict( - use_norm=True, - offline_stats=dict(use_offline_stats=True, ), - ), + env_id='hopper-expert-v2', collector_env_num=1, evaluator_env_num=8, use_act_scale=True, diff --git a/dizoo/d4rl/config/hopper_medium_expert_td3bc_config.py b/dizoo/d4rl/config/hopper_medium_expert_td3bc_config.py index 19531debad..23b2a88dec 100644 --- a/dizoo/d4rl/config/hopper_medium_expert_td3bc_config.py +++ b/dizoo/d4rl/config/hopper_medium_expert_td3bc_config.py @@ -5,11 +5,7 @@ main_config = dict( exp_name='hopper_medium_expert_td3-bc_seed0', env=dict( - env_id='hopper-medium-expert-v0', - norm_obs=dict( - use_norm=True, - offline_stats=dict(use_offline_stats=True, ), - ), + env_id='hopper-medium-expert-v2', collector_env_num=1, evaluator_env_num=8, use_act_scale=True, diff --git a/dizoo/d4rl/config/hopper_medium_replay_td3bc_config.py b/dizoo/d4rl/config/hopper_medium_replay_td3bc_config.py index 8f754781db..1cd21052a2 100644 --- a/dizoo/d4rl/config/hopper_medium_replay_td3bc_config.py +++ b/dizoo/d4rl/config/hopper_medium_replay_td3bc_config.py @@ -5,11 +5,7 @@ main_config = dict( exp_name='hopper_medium_replay_td3-bc_seed0', env=dict( - env_id='hopper-medium-replay-v0', - norm_obs=dict( - use_norm=True, - offline_stats=dict(use_offline_stats=True, ), - ), + env_id='hopper-medium-replay-v2', collector_env_num=1, evaluator_env_num=8, use_act_scale=True, diff --git a/dizoo/d4rl/config/hopper_medium_td3bc_config.py b/dizoo/d4rl/config/hopper_medium_td3bc_config.py index cbf5fcce19..3794dd21f8 100644 --- a/dizoo/d4rl/config/hopper_medium_td3bc_config.py +++ b/dizoo/d4rl/config/hopper_medium_td3bc_config.py @@ -5,11 +5,7 @@ main_config = dict( exp_name='hopper_medium_td3-bc_seed0', env=dict( - env_id='hopper-medium-v0', - norm_obs=dict( - use_norm=True, - offline_stats=dict(use_offline_stats=True, ), - ), + env_id='hopper-medium-v2', collector_env_num=1, evaluator_env_num=8, use_act_scale=True, From eb1578e6189de6a2d518a4bb7e890997304d729e Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Tue, 11 Apr 2023 22:11:07 +0800 Subject: [PATCH 12/25] remove head in qac --- ding/model/template/qac.py | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ding/model/template/qac.py b/ding/model/template/qac.py index f3f069d8b2..7618e7a1fa 100755 --- a/ding/model/template/qac.py +++ b/ding/model/template/qac.py @@ -6,7 +6,7 @@ from ding.utils import SequenceType, squeeze, MODEL_REGISTRY from ..common import RegressionHead, ReparameterizationHead, DiscreteHead, MultiHead, \ - FCEncoder, ConvEncoder, EnsembleHead + FCEncoder, ConvEncoder @MODEL_REGISTRY.register('qac') From 483319decd980bd67bba872ea08ac32c1bcdff64 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Sun, 16 Apr 2023 13:22:09 +0800 Subject: [PATCH 13/25] modify edac comment --- ding/model/template/edac.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ding/model/template/edac.py b/ding/model/template/edac.py index fa49b2404e..e8e8609917 100755 --- a/ding/model/template/edac.py +++ b/ding/model/template/edac.py @@ -39,7 +39,7 @@ def __init__( - obs_shape (:obj:`Union[int, SequenceType]`): Observation's shape, such as 128, (156, ). - action_shape (:obj:`Union[int, SequenceType, EasyDict]`): Action's shape, such as 4, (3, ), \ EasyDict({'action_type_shape': 3, 'action_args_shape': 4}). - - ensemble_num (:obj:`bool`): Q-net numble. + - ensemble_num (:obj:`int`): Q-net numble. - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor head. - actor_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ for actor head. From 4dabfb837935a77da790504241a3d34f908377ca Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Sun, 16 Apr 2023 13:45:19 +0800 Subject: [PATCH 14/25] modify edac comment --- ding/model/template/edac.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ding/model/template/edac.py b/ding/model/template/edac.py index e8e8609917..cedb469700 100755 --- a/ding/model/template/edac.py +++ b/ding/model/template/edac.py @@ -39,7 +39,7 @@ def __init__( - obs_shape (:obj:`Union[int, SequenceType]`): Observation's shape, such as 128, (156, ). - action_shape (:obj:`Union[int, SequenceType, EasyDict]`): Action's shape, such as 4, (3, ), \ EasyDict({'action_type_shape': 3, 'action_args_shape': 4}). - - ensemble_num (:obj:`int`): Q-net numble. + - ensemble_num (:obj:`int`): Q-net number. - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor head. - actor_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ for actor head. 
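The ensemble critic whose docstring is touched up in the two commits above relies on EnsembleHead, which evaluates all Q-networks in one pass by applying kernel-size-1 grouped Conv1d layers to an input laid out as (batch, ensemble_num * features, 1). A small self-contained check of that equivalence (a sketch with illustrative sizes, not the DI-engine implementation):

import torch
import torch.nn as nn

N, d_in, d_out, B = 4, 8, 16, 5
# One grouped 1x1 convolution == N independent d_in -> d_out linear layers.
conv = nn.Conv1d(N * d_in, N * d_out, kernel_size=1, groups=N)

x = torch.randn(B, N, d_in)  # N independent inputs per sample
y_conv = conv(x.reshape(B, N * d_in, 1)).reshape(B, N, d_out)

# Reference: N explicit nn.Linear layers sharing the convolution's weights.
y_ref = []
for i in range(N):
    lin = nn.Linear(d_in, d_out)
    lin.weight.data = conv.weight.data[i * d_out:(i + 1) * d_out, :, 0]
    lin.bias.data = conv.bias.data[i * d_out:(i + 1) * d_out]
    y_ref.append(lin(x[:, i]))
y_ref = torch.stack(y_ref, dim=1)

assert torch.allclose(y_conv, y_ref, atol=1e-5)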
From 99c8146611c08e27ff9a2002c6cc2f7ae2431889 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Wed, 19 Apr 2023 15:45:21 +0800 Subject: [PATCH 15/25] modif edac --- ding/policy/edac.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ding/policy/edac.py b/ding/policy/edac.py index 323a944378..de8d133cd8 100755 --- a/ding/policy/edac.py +++ b/ding/policy/edac.py @@ -148,6 +148,8 @@ def _init_learn(self) -> None: super()._init_learn() # EDAC special implementation self._eta = self._cfg.learn.eta + self._with_q_entropy = self._cfg.learn.with_q_entropy + self._forward_learn_cnt = 0 def _forward_learn(self, data: dict) -> Dict[str, Any]: From 2b1efe6997f95505be85ee5f3d0f1d51ce21e626 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Sun, 23 Apr 2023 12:46:48 +0800 Subject: [PATCH 16/25] modify edac --- ding/model/template/__init__.py | 2 +- ding/model/template/edac.py | 27 +++++++++++++-------------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/ding/model/template/__init__.py b/ding/model/template/__init__.py index 8cba175c1b..c4ecd9397e 100755 --- a/ding/model/template/__init__.py +++ b/ding/model/template/__init__.py @@ -23,4 +23,4 @@ from .vae import VanillaVAE from .decision_transformer import DecisionTransformer from .procedure_cloning import ProcedureCloningMCTS, ProcedureCloningBFS -from .edac import Q_ensemble \ No newline at end of file +from .edac import QACEnsemble \ No newline at end of file diff --git a/ding/model/template/edac.py b/ding/model/template/edac.py index cedb469700..649808669f 100755 --- a/ding/model/template/edac.py +++ b/ding/model/template/edac.py @@ -10,7 +10,7 @@ @MODEL_REGISTRY.register('edac') -class Q_ensemble(nn.Module): +class QACEnsemble(nn.Module): r""" Overview: The QAC network with ensemble, which is used in EDAC. @@ -51,7 +51,7 @@ def __init__( - norm_type (:obj:`Optional[str]`): The type of normalization to after network layer (FC, Conv), \ see ``ding.torch_utils.network`` for more details. """ - super(Q_ensemble, self).__init__() + super(QACEnsemble, self).__init__() obs_shape: int = squeeze(obs_shape) action_shape = squeeze(action_shape) self.action_shape = action_shape @@ -69,17 +69,16 @@ def __init__( ) critic_input_size = obs_shape + action_shape - self.critic = nn.Sequential( - EnsembleHead( - critic_input_size, - 1, - critic_head_hidden_size, - critic_head_layer_num, - self.ensemble_num, - activation=activation, - norm_type=norm_type - ) - ) + self.critic = EnsembleHead( + critic_input_size, + 1, + critic_head_hidden_size, + critic_head_layer_num, + self.ensemble_num, + activation=activation, + norm_type=norm_type + ) + def forward(self, inputs: Union[torch.Tensor, Dict[str, torch.Tensor]], mode: str) -> Dict[str, torch.Tensor]: """ @@ -127,7 +126,7 @@ def compute_actor(self, obs: torch.Tensor) -> Dict[str, Union[torch.Tensor, Dict - action_args (:obj:`torch.Tensor`): :math:`(B, N3)`, B is batch size and N3 corresponds to \ ``action_shape.action_args_shape``. 
Examples: - >>> model = EDAC(64, 64,) + >>> model = QACEnsemble(64, 64,) >>> obs = torch.randn(4, 64) >>> actor_outputs = model(obs,'compute_actor') >>> assert actor_outputs['logit'][0].shape == torch.Size([4, 64]) # mu From ca4e95c2da8beb108c8572d3931fdcc1f11f79b2 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Sun, 23 Apr 2023 12:50:00 +0800 Subject: [PATCH 17/25] modify head overview --- ding/model/common/head.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ding/model/common/head.py b/ding/model/common/head.py index e8b6b052b9..5ed3ca2384 100755 --- a/ding/model/common/head.py +++ b/ding/model/common/head.py @@ -1320,7 +1320,7 @@ class EnsembleHead(nn.Module): """ Overview: The ``EnsembleHead`` used to output action Q-value for Q-ensemble. \ - Input is a (:obj:`torch.Tensor`) of shape ''(B, N * Ensemble_num, 1)'' and returns a (:obj:`Dict`) containing \ + Input is a (:obj:`torch.Tensor`) of shape ''(B, N * ensemble_num, 1)'' and returns a (:obj:`Dict`) containing \ output ``pred``. Interfaces: ``__init__``, ``forward``. @@ -1376,8 +1376,8 @@ def forward(self, x: torch.Tensor) -> Dict: Returns: - outputs (:obj:`Dict`): Dict containing keyword ``pred`` (:obj:`torch.Tensor`). Shapes: - - x: :math:`(B, N * Ensemble_num, 1)`, where ``B = batch_size`` and ``N = hidden_size``. - - pred: :math:`(B, M * Ensemble_num, 1)`, where ``M = output_size``. + - x: :math:`(B, N * ensemble_num, 1)`, where ``B = batch_size`` and ``N = hidden_size``. + - pred: :math:`(B, M * ensemble_num, 1)`, where ``M = output_size``. Examples: >>> head = EnsembleHead(64 * 10, 64 * 10) >>> inputs = torch.randn(4, 64 * 10, 1) ` From bd7f3e1610ba6feca495a876716729e68921ebba Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Sun, 23 Apr 2023 14:18:05 +0800 Subject: [PATCH 18/25] modify example --- ding/example/edac.py | 4 ++-- ding/model/common/head.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ding/example/edac.py b/ding/example/edac.py index 53bdbdbe14..0b2da5d565 100755 --- a/ding/example/edac.py +++ b/ding/example/edac.py @@ -1,6 +1,6 @@ import gym from ditk import logging -from ding.model import Q_ensemble +from ding.model import QACEnsemble from ding.policy import EDACPolicy from ding.envs import DingEnvWrapper, BaseEnvManagerV2 from ding.data import create_dataset @@ -27,7 +27,7 @@ def main(): set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) dataset = create_dataset(cfg) - model = Q_ensemble(**cfg.policy.model) + model = QACEnsemble(**cfg.policy.model) policy = EDACPolicy(cfg.policy, model=model) task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) diff --git a/ding/model/common/head.py b/ding/model/common/head.py index 5ed3ca2384..def699564b 100755 --- a/ding/model/common/head.py +++ b/ding/model/common/head.py @@ -1385,7 +1385,7 @@ def forward(self, x: torch.Tensor) -> Dict: >>> assert isinstance(outputs, dict) >>> assert outputs['pred'].shape == torch.Size([10, 64 * 10]) """ - x = self.pred(x).squeeze() + x = self.pred(x).squeeze(-1) return {'pred': x} From a54ca4e3ed6b9e753155fa07e7dfbb916e6e55df Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Sun, 23 Apr 2023 14:44:48 +0800 Subject: [PATCH 19/25] forma --- ding/model/common/tests/test_head.py | 2 +- ding/model/template/edac.py | 17 ++++++++--------- ding/policy/command_mode_policy_instance.py | 1 + ding/policy/edac.py | 1 - ding/policy/td3_bc.py | 5 ++--- .../config/halfcheetah_medium_edac_config.py | 4 ++-- 
.../halfcheetah_medium_expert_edac_config.py | 4 ++-- dizoo/d4rl/config/hopper_medium_edac_config.py | 4 ++-- .../config/hopper_medium_expert_edac_config.py | 4 ++-- dizoo/d4rl/entry/d4rl_edac_main.py | 4 ++-- 10 files changed, 22 insertions(+), 24 deletions(-) diff --git a/ding/model/common/tests/test_head.py b/ding/model/common/tests/test_head.py index 0fa78a343c..7ad68cd805 100644 --- a/ding/model/common/tests/test_head.py +++ b/ding/model/common/tests/test_head.py @@ -87,7 +87,7 @@ def test_stochastic_dueling(self): def test_ensemble(self): inputs = torch.randn(B, embedding_dim * 3, 1) - model = EnsembleHead(embedding_dim, action_shape, 3, 3,3) + model = EnsembleHead(embedding_dim, action_shape, 3, 3, 3) outputs = model(inputs)['pred'] self.output_check(model, outputs) assert outputs.shape == (B, action_shape * 3, 1) diff --git a/ding/model/template/edac.py b/ding/model/template/edac.py index 649808669f..1cbc4b3fb3 100755 --- a/ding/model/template/edac.py +++ b/ding/model/template/edac.py @@ -70,15 +70,14 @@ def __init__( critic_input_size = obs_shape + action_shape self.critic = EnsembleHead( - critic_input_size, - 1, - critic_head_hidden_size, - critic_head_layer_num, - self.ensemble_num, - activation=activation, - norm_type=norm_type - ) - + critic_input_size, + 1, + critic_head_hidden_size, + critic_head_layer_num, + self.ensemble_num, + activation=activation, + norm_type=norm_type + ) def forward(self, inputs: Union[torch.Tensor, Dict[str, torch.Tensor]], mode: str) -> Dict[str, torch.Tensor]: """ diff --git a/ding/policy/command_mode_policy_instance.py b/ding/policy/command_mode_policy_instance.py index 7e93b1d4a9..3ed6aec5f9 100755 --- a/ding/policy/command_mode_policy_instance.py +++ b/ding/policy/command_mode_policy_instance.py @@ -381,6 +381,7 @@ class SQILSACCommandModePolicy(SQILSACPolicy, DummyCommandModePolicy): class IBCCommandModePolicy(IBCPolicy, DummyCommandModePolicy): pass + @POLICY_REGISTRY.register('edac_command') class EDACCommandModelPolicy(EDACPolicy, DummyCommandModePolicy): pass diff --git a/ding/policy/edac.py b/ding/policy/edac.py index de8d133cd8..ce68a24c23 100755 --- a/ding/policy/edac.py +++ b/ding/policy/edac.py @@ -151,7 +151,6 @@ def _init_learn(self) -> None: self._with_q_entropy = self._cfg.learn.with_q_entropy self._forward_learn_cnt = 0 - def _forward_learn(self, data: dict) -> Dict[str, Any]: loss_dict = {} data = default_preprocess_learn( diff --git a/ding/policy/td3_bc.py b/ding/policy/td3_bc.py index 97c19a70d6..c3295d70d0 100644 --- a/ding/policy/td3_bc.py +++ b/ding/policy/td3_bc.py @@ -240,9 +240,8 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: # target q value. 
with torch.no_grad(): next_action = self._target_model.forward(next_obs, mode='compute_actor')['action'] - noise = ( - torch.randn_like(next_action) * self.noise_sigma - ).clamp(self.noise_range['min'], self.noise_range['max']) + noise = (torch.randn_like(next_action) * + self.noise_sigma).clamp(self.noise_range['min'], self.noise_range['max']) next_action = (next_action + noise).clamp(-1, 1) next_data = {'obs': next_obs, 'action': next_action} target_q_value = self._target_model.forward(next_data, mode='compute_critic')['q_value'] diff --git a/dizoo/d4rl/config/halfcheetah_medium_edac_config.py b/dizoo/d4rl/config/halfcheetah_medium_edac_config.py index b2774cea54..66ea8039dc 100755 --- a/dizoo/d4rl/config/halfcheetah_medium_edac_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_edac_config.py @@ -37,7 +37,7 @@ collect=dict(data_type='d4rl', ), eval=dict(evaluator=dict(eval_freq=500, )), ), - seed = 0, + seed=0, ) main_config = EasyDict(main_config) @@ -55,4 +55,4 @@ ), ) create_config = EasyDict(create_config) -create_config = create_config \ No newline at end of file +create_config = create_config diff --git a/dizoo/d4rl/config/halfcheetah_medium_expert_edac_config.py b/dizoo/d4rl/config/halfcheetah_medium_expert_edac_config.py index 3e6839188e..17e897f048 100755 --- a/dizoo/d4rl/config/halfcheetah_medium_expert_edac_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_expert_edac_config.py @@ -37,7 +37,7 @@ collect=dict(data_type='d4rl', ), eval=dict(evaluator=dict(eval_freq=500, )), ), - seed = 123, + seed=123, ) main_config = EasyDict(main_config) @@ -55,4 +55,4 @@ ), ) create_config = EasyDict(create_config) -create_config = create_config \ No newline at end of file +create_config = create_config diff --git a/dizoo/d4rl/config/hopper_medium_edac_config.py b/dizoo/d4rl/config/hopper_medium_edac_config.py index 15a78938d0..f14fad350f 100755 --- a/dizoo/d4rl/config/hopper_medium_edac_config.py +++ b/dizoo/d4rl/config/hopper_medium_edac_config.py @@ -37,7 +37,7 @@ collect=dict(data_type='d4rl', ), eval=dict(evaluator=dict(eval_freq=500, )), ), - seed = 0, + seed=0, ) main_config = EasyDict(main_config) @@ -55,4 +55,4 @@ ), ) create_config = EasyDict(create_config) -create_config = create_config \ No newline at end of file +create_config = create_config diff --git a/dizoo/d4rl/config/hopper_medium_expert_edac_config.py b/dizoo/d4rl/config/hopper_medium_expert_edac_config.py index a7b1902c6d..5bbc5b375d 100755 --- a/dizoo/d4rl/config/hopper_medium_expert_edac_config.py +++ b/dizoo/d4rl/config/hopper_medium_expert_edac_config.py @@ -37,7 +37,7 @@ collect=dict(data_type='d4rl', ), eval=dict(evaluator=dict(eval_freq=500, )), ), - seed = 0, + seed=0, ) main_config = EasyDict(main_config) @@ -55,4 +55,4 @@ ), ) create_config = EasyDict(create_config) -create_config = create_config \ No newline at end of file +create_config = create_config diff --git a/dizoo/d4rl/entry/d4rl_edac_main.py b/dizoo/d4rl/entry/d4rl_edac_main.py index 9ee28815ab..b6710836cb 100755 --- a/dizoo/d4rl/entry/d4rl_edac_main.py +++ b/dizoo/d4rl/entry/d4rl_edac_main.py @@ -5,7 +5,7 @@ def train(args): # launch from anywhere - config = Path(__file__).absolute().parent.parent / 'config' / args.config + config = Path(__file__).absolute().parent.parent / 'config' / args.config config = read_config(str(config)) config[0].exp_name = config[0].exp_name.replace('0', str(args.seed)) serial_pipeline_offline(config, seed=args.seed) @@ -18,4 +18,4 @@ def train(args): parser.add_argument('--seed', '-s', type=int, default=0) 
parser.add_argument('--config', '-c', type=str, default='halfcheetah_medium_edac_config.py') args = parser.parse_args() - train(args) \ No newline at end of file + train(args) From e4f663d9f034473e6d13737960a27978d048ab43 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Sun, 23 Apr 2023 14:50:37 +0800 Subject: [PATCH 20/25] format --- ding/example/edac.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ding/example/edac.py b/ding/example/edac.py index 0b2da5d565..40230f3008 100755 --- a/ding/example/edac.py +++ b/ding/example/edac.py @@ -10,7 +10,7 @@ from ding.framework.middleware import interaction_evaluator, trainer, CkptSaver, offline_data_fetcher, offline_logger from ding.utils import set_pkg_seed from dizoo.d4rl.envs import D4RLEnv -from dizoo.d4rl.config.halfcheetah_medium_edac_config import main_config,create_config +from dizoo.d4rl.config.halfcheetah_medium_edac_config import main_config, create_config def main(): @@ -39,4 +39,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() From 62f172efa6b3b94bbfe8bbcc2932e23a3c0a0e69 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Mon, 24 Apr 2023 09:51:35 +0800 Subject: [PATCH 21/25] format --- ding/model/template/__init__.py | 2 +- ding/model/template/qac.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ding/model/template/__init__.py b/ding/model/template/__init__.py index c4ecd9397e..55bd468034 100755 --- a/ding/model/template/__init__.py +++ b/ding/model/template/__init__.py @@ -23,4 +23,4 @@ from .vae import VanillaVAE from .decision_transformer import DecisionTransformer from .procedure_cloning import ProcedureCloningMCTS, ProcedureCloningBFS -from .edac import QACEnsemble \ No newline at end of file +from .edac import QACEnsemble diff --git a/ding/model/template/qac.py b/ding/model/template/qac.py index 7618e7a1fa..5435424e2f 100755 --- a/ding/model/template/qac.py +++ b/ding/model/template/qac.py @@ -559,4 +559,5 @@ def compute_critic(self, inputs: Dict) -> Dict: x = [m(inputs['obs'])['logit'] for m in self.critic] else: x = self.critic(inputs['obs'])['logit'] - return {'q_value': x} \ No newline at end of file + return {'q_value': x} + \ No newline at end of file From 4d4aaa8f72a67fae0e07838f9bbf47cca882533d Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Mon, 24 Apr 2023 13:53:16 +0800 Subject: [PATCH 22/25] format --- ding/model/template/qac.py | 1 - dizoo/d4rl/config/halfcheetah_expert_td3bc_config.py | 2 +- dizoo/d4rl/config/halfcheetah_medium_expert_td3bc_config.py | 2 +- dizoo/d4rl/config/halfcheetah_medium_replay_td3bc_config.py | 2 +- dizoo/d4rl/config/halfcheetah_medium_td3bc_config.py | 2 +- dizoo/d4rl/config/halfcheetah_random_td3bc_config.py | 2 +- dizoo/d4rl/config/hopper_random_td3bc_config.py | 2 +- dizoo/d4rl/config/walker2d_expert_td3bc_config.py | 2 +- dizoo/d4rl/config/walker2d_medium_expert_td3bc_config.py | 2 +- dizoo/d4rl/config/walker2d_medium_replay_td3bc_config.py | 2 +- dizoo/d4rl/config/walker2d_medium_td3bc_config.py | 2 +- dizoo/d4rl/config/walker2d_random_td3bc_config.py | 2 +- 12 files changed, 11 insertions(+), 12 deletions(-) diff --git a/ding/model/template/qac.py b/ding/model/template/qac.py index 5435424e2f..aa0cc42b0e 100755 --- a/ding/model/template/qac.py +++ b/ding/model/template/qac.py @@ -560,4 +560,3 @@ def compute_critic(self, inputs: Dict) -> Dict: else: x = self.critic(inputs['obs'])['logit'] return {'q_value': x} - \ No newline at end of file diff --git 
a/dizoo/d4rl/config/halfcheetah_expert_td3bc_config.py b/dizoo/d4rl/config/halfcheetah_expert_td3bc_config.py index 55e2ddab19..8e9c0f77e1 100755 --- a/dizoo/d4rl/config/halfcheetah_expert_td3bc_config.py +++ b/dizoo/d4rl/config/halfcheetah_expert_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='halfcheetah-expert-v2', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/halfcheetah_medium_expert_td3bc_config.py b/dizoo/d4rl/config/halfcheetah_medium_expert_td3bc_config.py index 045e4decfe..82d6a90fb5 100755 --- a/dizoo/d4rl/config/halfcheetah_medium_expert_td3bc_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_expert_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='halfcheetah-medium-expert-v2', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/halfcheetah_medium_replay_td3bc_config.py b/dizoo/d4rl/config/halfcheetah_medium_replay_td3bc_config.py index 9f41ba8a5c..46122d40f5 100755 --- a/dizoo/d4rl/config/halfcheetah_medium_replay_td3bc_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_replay_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='halfcheetah-medium-replay-v2', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/halfcheetah_medium_td3bc_config.py b/dizoo/d4rl/config/halfcheetah_medium_td3bc_config.py index 99cf4bc1ea..9d9b02d450 100755 --- a/dizoo/d4rl/config/halfcheetah_medium_td3bc_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='halfcheetah-medium-v2', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/halfcheetah_random_td3bc_config.py b/dizoo/d4rl/config/halfcheetah_random_td3bc_config.py index a823d4f4a5..606d1d2943 100755 --- a/dizoo/d4rl/config/halfcheetah_random_td3bc_config.py +++ b/dizoo/d4rl/config/halfcheetah_random_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='halfcheetah-random-v2', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/hopper_random_td3bc_config.py b/dizoo/d4rl/config/hopper_random_td3bc_config.py index 8cf796b5fb..0f1127f16a 100644 --- a/dizoo/d4rl/config/hopper_random_td3bc_config.py +++ b/dizoo/d4rl/config/hopper_random_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='hopper-random-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/walker2d_expert_td3bc_config.py b/dizoo/d4rl/config/walker2d_expert_td3bc_config.py index 55e2ddab19..8e9c0f77e1 100755 --- a/dizoo/d4rl/config/walker2d_expert_td3bc_config.py +++ b/dizoo/d4rl/config/walker2d_expert_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='halfcheetah-expert-v2', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/walker2d_medium_expert_td3bc_config.py b/dizoo/d4rl/config/walker2d_medium_expert_td3bc_config.py index 91389fdb31..6d09932a6e 100755 --- a/dizoo/d4rl/config/walker2d_medium_expert_td3bc_config.py +++ b/dizoo/d4rl/config/walker2d_medium_expert_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='walker2d-medium-expert-v2', norm_obs=dict( 
- use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/walker2d_medium_replay_td3bc_config.py b/dizoo/d4rl/config/walker2d_medium_replay_td3bc_config.py index e532676db2..1408c39b1d 100755 --- a/dizoo/d4rl/config/walker2d_medium_replay_td3bc_config.py +++ b/dizoo/d4rl/config/walker2d_medium_replay_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='walker2d-medium-replay-v2', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/walker2d_medium_td3bc_config.py b/dizoo/d4rl/config/walker2d_medium_td3bc_config.py index 3f9adc89d7..f3a6ee1933 100755 --- a/dizoo/d4rl/config/walker2d_medium_td3bc_config.py +++ b/dizoo/d4rl/config/walker2d_medium_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='walker2d-medium-v2', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/walker2d_random_td3bc_config.py b/dizoo/d4rl/config/walker2d_random_td3bc_config.py index 091744acda..5eae10df51 100755 --- a/dizoo/d4rl/config/walker2d_random_td3bc_config.py +++ b/dizoo/d4rl/config/walker2d_random_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='walker2d-random-v2', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, From 12c3b8551291cc790a24253c790a192a3f250143 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Mon, 24 Apr 2023 14:12:00 +0800 Subject: [PATCH 23/25] format --- ding/model/template/edac.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ding/model/template/edac.py b/ding/model/template/edac.py index 1cbc4b3fb3..49789f8acc 100755 --- a/ding/model/template/edac.py +++ b/ding/model/template/edac.py @@ -140,7 +140,8 @@ def compute_critic(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Ten The forward computation graph of compute_critic mode, uses observation and action tensor to produce critic output, such as ``q_value``. Arguments: - - inputs (:obj:`Dict[str, torch.Tensor]`): Dict strcture of input data, including ``obs`` and ``action`` tensor + - inputs (:obj:`Dict[str, torch.Tensor]`): Dict strcture of input data, including ``obs`` and \ + ``action`` tensor Returns: - outputs (:obj:`Dict[str, torch.Tensor]`): Critic output, such as ``q_value``. ArgumentsKeys: @@ -149,8 +150,10 @@ def compute_critic(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Ten ReturnKeys: - q_value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. Shapes: - - obs (:obj:`torch.Tensor`): :math:`(B, N1)` or '(Ensemble_num, B, N1)', where B is batch size and N1 is ``obs_shape``. - - action (:obj:`torch.Tensor`): :math:`(B, N2)` or '(Ensemble_num, B, N2)', where B is batch size and N4 is ``action_shape``. + - obs (:obj:`torch.Tensor`): :math:`(B, N1)` or '(Ensemble_num, B, N1)', where B is batch size and N1 is \ + ``obs_shape``. + - action (:obj:`torch.Tensor`): :math:`(B, N2)` or '(Ensemble_num, B, N2)', where B is batch size and N4 \ + is ``action_shape``. - q_value (:obj:`torch.Tensor`): :math:`(Ensemble_num, B)`, where B is batch size. 
Examples: >>> inputs = {'obs': torch.randn(4, 8), 'action': torch.randn(4, 1)} From 7e6e5ff2065c4238d98cc901280091ac72c6257f Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Tue, 25 Apr 2023 14:05:46 +0800 Subject: [PATCH 24/25] format --- ding/policy/__init__.py | 1 - ding/policy/edac.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ding/policy/__init__.py b/ding/policy/__init__.py index 02eac73eee..c56b38f766 100755 --- a/ding/policy/__init__.py +++ b/ding/policy/__init__.py @@ -49,7 +49,6 @@ from .pc import ProcedureCloningBFSPolicy -from .edac import EDACPolicy # new-type policy from .ppof import PPOFPolicy diff --git a/ding/policy/edac.py b/ding/policy/edac.py index ce68a24c23..bdc2e41f7c 100755 --- a/ding/policy/edac.py +++ b/ding/policy/edac.py @@ -36,8 +36,8 @@ class EDACPolicy(SACPolicy): | ``embedding_size`` | network. | 5 | ``model.soft_q_`` int 256 | Linear layer size for soft q | | ``embedding_size`` | network. | - 6 | ``model.emsemble_`` int 10 | Number of Q-ensemble network | - | ``num`` | | + 6 | ``model.emsemble_`` int 10 | Number of Q-ensemble network | + | ``num`` | | | | | is False. 7 | ``learn.learning`` float 3e-4 | Learning rate for soft q | Defalut to 1e-3, when | ``_rate_q`` | network. | model.value_network From 16ef00a4cc499bd4acb7d535eb3466d2ad7a0ca3 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Tue, 25 Apr 2023 14:24:55 +0800 Subject: [PATCH 25/25] format --- ding/policy/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ding/policy/__init__.py b/ding/policy/__init__.py index c56b38f766..2293432383 100755 --- a/ding/policy/__init__.py +++ b/ding/policy/__init__.py @@ -49,6 +49,5 @@ from .pc import ProcedureCloningBFSPolicy - # new-type policy from .ppof import PPOFPolicy
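
A minimal end-to-end shape check for the QACEnsemble model wired up in this series, following the compute_actor / compute_critic docstrings patched above. The ensemble size is left at the model default, and the (mu, sigma) unpacking of ``logit`` plus the (ensemble_num, B) layout of ``q_value`` are read from those docstrings; treat this as a sketch, not a test:

    >>> import torch
    >>> from ding.model import QACEnsemble
    >>> model = QACEnsemble(64, 64)                        # obs_shape=64, action_shape=64, default ensemble size
    >>> obs = torch.randn(4, 64)
    >>> mu, sigma = model(obs, 'compute_actor')['logit']   # reparameterization actor outputs (mu, sigma)
    >>> assert mu.shape == torch.Size([4, 64])
    >>> inputs = {'obs': obs, 'action': torch.randn(4, 64)}
    >>> q_value = model(inputs, 'compute_critic')['q_value']
    >>> # q_value is stacked over the ensemble members: (ensemble_num, B)
    >>> assert q_value.shape[-1] == 4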
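
The EDAC configs added for HalfCheetah and Hopper can be launched through the patched entry script, or directly from Python with the same helpers that script imports (read_config and serial_pipeline_offline); the config path and experiment name below are illustrative:

    >>> from pathlib import Path
    >>> from ding.config import read_config
    >>> from ding.entry import serial_pipeline_offline
    >>> cfg_path = Path('dizoo/d4rl/config') / 'hopper_medium_edac_config.py'
    >>> config = read_config(str(cfg_path))
    >>> config[0].exp_name = 'hopper_medium_edac_seed0'    # config[0] is the main config, as in d4rl_edac_main.py above
    >>> serial_pipeline_offline(config, seed=0)

Equivalently, from the command line: python dizoo/d4rl/entry/d4rl_edac_main.py -c hopper_medium_edac_config.py -s 0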