diff --git a/ding/entry/tests/test_serial_entry.py b/ding/entry/tests/test_serial_entry.py index a9123d0fcf..1a44c7b548 100644 --- a/ding/entry/tests/test_serial_entry.py +++ b/ding/entry/tests/test_serial_entry.py @@ -284,15 +284,10 @@ def test_sac_log_space(): assert False, "pipeline fail" -auto_alpha = [True, False] -log_space = [True, False] -args = [item for item in product(*[auto_alpha, log_space])] - - @pytest.mark.platformtest @pytest.mark.unittest -@pytest.mark.parametrize('auto_alpha, log_space', args) -def test_discrete_sac(auto_alpha, log_space): +def test_discrete_sac(): + auto_alpha, log_space = True, False config = [deepcopy(cartpole_sac_config), deepcopy(cartpole_sac_create_config)] config[0].policy.learn.update_per_collect = 1 config[0].policy.learn.auto_alpha = auto_alpha diff --git a/ding/framework/tests/test_parallel.py b/ding/framework/tests/test_parallel.py index 8d2cf648c2..7bdf6ea343 100644 --- a/ding/framework/tests/test_parallel.py +++ b/ding/framework/tests/test_parallel.py @@ -39,7 +39,7 @@ def uncaught_exception_main(): time.sleep(0.2) -@pytest.mark.unittest +@pytest.mark.tmp def test_uncaught_exception(): # Make one process crash, then the parent process will also crash and output the stack of the wrong process. with pytest.raises(Exception) as exc_info: @@ -70,7 +70,7 @@ def disconnected_main(): assert i == 9 -@pytest.mark.unittest +@pytest.mark.tmp def test_disconnected(): # Make one process exit normally and the rest will still run, even if the network request # is not received by other processes. @@ -141,7 +141,7 @@ def main(cls): raise Exception("Invalid node id") -@pytest.mark.unittest +@pytest.mark.tmp def test_auto_recover(): # With max_retries=1 Parallel.runner( diff --git a/ding/framework/tests/test_supervisor.py b/ding/framework/tests/test_supervisor.py index b4fdb95dc0..d6f4c646fa 100644 --- a/ding/framework/tests/test_supervisor.py +++ b/ding/framework/tests/test_supervisor.py @@ -29,7 +29,7 @@ def sleep1(self): sleep(1) -@pytest.mark.unittest +@pytest.mark.tmp @pytest.mark.parametrize("type_", [ChildType.PROCESS, ChildType.THREAD]) def test_supervisor(type_): sv = Supervisor(type_=type_) @@ -74,7 +74,7 @@ def test_supervisor(type_): sv.shutdown() -@pytest.mark.unittest +@pytest.mark.tmp def test_supervisor_spawn(): sv = Supervisor(type_=ChildType.PROCESS, mp_ctx=mp.get_context("spawn")) for _ in range(3): @@ -103,7 +103,7 @@ def step(self, _): return self._counter -# @pytest.mark.unittest +@pytest.mark.tmp @pytest.mark.parametrize("type_", [ChildType.PROCESS, ChildType.THREAD]) def test_crash_supervisor(type_): sv = Supervisor(type_=type_) @@ -143,7 +143,7 @@ def test_crash_supervisor(type_): sv.shutdown() -@pytest.mark.unittest +@pytest.mark.tmp @pytest.mark.parametrize("type_", [ChildType.PROCESS, ChildType.THREAD]) def test_recv_all(type_): sv = Supervisor(type_=type_) diff --git a/ding/framework/tests/test_task.py b/ding/framework/tests/test_task.py index 8b6f9ee1de..67f3dc34c7 100644 --- a/ding/framework/tests/test_task.py +++ b/ding/framework/tests/test_task.py @@ -124,12 +124,12 @@ def _counter(ctx): assert sync_count > 0 -@pytest.mark.unittest +@pytest.mark.tmp def test_parallel_pipeline(): Parallel.runner(n_parallel_workers=2, startup_interval=0.1)(parallel_main) -@pytest.mark.unittest +@pytest.mark.tmp def test_emit(): with task.start(): greets = [] @@ -161,12 +161,12 @@ def emit_remote_main(): assert len(greets) == 0 -@pytest.mark.unittest +@pytest.mark.tmp def test_emit_remote(): Parallel.runner(n_parallel_workers=2, 
startup_interval=0.1)(emit_remote_main) -@pytest.mark.unittest +@pytest.mark.tmp def test_wait_for(): # Wait for will only work in async or parallel mode with task.start(async_mode=True, n_async_workers=2): @@ -198,7 +198,7 @@ def step1(_): task.run(max_step=1) -@pytest.mark.unittest +@pytest.mark.tmp def test_async_exception(): with task.start(async_mode=True, n_async_workers=2): @@ -227,12 +227,12 @@ def early_stop_main(): assert task.ctx.total_step < 7 -@pytest.mark.unittest +@pytest.mark.tmp def test_early_stop(): Parallel.runner(n_parallel_workers=2, startup_interval=0.1)(early_stop_main) -@pytest.mark.unittest +@pytest.mark.tmp def test_parallel_in_sequencial(): result = [] @@ -250,7 +250,7 @@ def slow(_): assert result == ["begin", "fast", "slow"] -@pytest.mark.unittest +@pytest.mark.tmp def test_serial_in_parallel(): result = [] diff --git a/ding/model/common/encoder.py b/ding/model/common/encoder.py index 8e3f8a5c2a..e22112601e 100644 --- a/ding/model/common/encoder.py +++ b/ding/model/common/encoder.py @@ -23,7 +23,7 @@ def prod(iterable): class ConvEncoder(nn.Module): """ Overview: - The ``Convolution Encoder`` used to encode raw 2-dim image observations (e.g. Atari/Procgen). + The ``Convolution Encoder`` used to encode raw 2-dim image observations (e.g. Atari/Procgen). Interfaces: ``__init__``, ``forward``. """ diff --git a/ding/model/common/head.py b/ding/model/common/head.py index 1f2d324430..c1d27fba89 100755 --- a/ding/model/common/head.py +++ b/ding/model/common/head.py @@ -290,7 +290,7 @@ def forward(self, x: torch.Tensor) -> Dict: class RainbowHead(nn.Module): """ Overview: - The ``RainbowHead`` used to output Q-value distribution, which is used in Rainbow DQN. + The ``RainbowHead`` used to output Q-value distribution, which is used in Rainbow DQN. Interfaces: ``__init__``, ``forward``. """ @@ -394,7 +394,7 @@ def forward(self, x: torch.Tensor) -> Dict: class QRDQNHead(nn.Module): """ Overview: - The ``QRDQNHead`` (Quantile Regression DQN) used to output action quantiles. + The ``QRDQNHead`` (Quantile Regression DQN) used to output action quantiles. Interfaces: ``__init__``, ``forward``. 
""" diff --git a/ding/model/common/utils.py b/ding/model/common/utils.py index b7c2159215..0f508de0b8 100644 --- a/ding/model/common/utils.py +++ b/ding/model/common/utils.py @@ -1,3 +1,4 @@ +import copy import torch from easydict import EasyDict from ding.utils import import_module, MODEL_REGISTRY diff --git a/ding/model/template/__init__.py b/ding/model/template/__init__.py index a291936d64..df5f337888 100755 --- a/ding/model/template/__init__.py +++ b/ding/model/template/__init__.py @@ -1,12 +1,12 @@ # general -from .q_learning import DQN, RainbowDQN, QRDQN, IQN, FQF, DRQN, C51DQN, BDQ -from .qac import QAC, DiscreteQAC +from .q_learning import DQN, RainbowDQN, QRDQN, IQN, FQF, DRQN, C51DQN, BDQ, GTrXLDQN +from .qac import DiscreteQAC, ContinuousQAC from .pdqn import PDQN from .vac import VAC, DREAMERVAC from .bc import DiscreteBC, ContinuousBC -from .pg import PG from .language_transformer import LanguageTransformer # algorithm-specific +from .pg import PG from .ppg import PPG from .qmix import Mixer, QMix from .collaq import CollaQ @@ -19,10 +19,10 @@ from .mavac import MAVAC from .ngu import NGU from .qac_dist import QACDIST -from .maqac import MAQAC, ContinuousMAQAC +from .maqac import DiscreteMAQAC, ContinuousMAQAC from .madqn import MADQN from .vae import VanillaVAE from .dt import DecisionTransformer from .procedure_cloning import ProcedureCloningMCTS, ProcedureCloningBFS from .bcq import BCQ -from .edac import QACEnsemble +from .edac import EDAC diff --git a/ding/model/template/bc.py b/ding/model/template/bc.py index b40ef4f118..4568e3ce1c 100644 --- a/ding/model/template/bc.py +++ b/ding/model/template/bc.py @@ -177,10 +177,10 @@ def forward(self, inputs: Union[torch.Tensor, Dict[str, torch.Tensor]]) -> Dict[ """ Overview: The unique execution (forward) method of ContinuousBC method. - Arguments: - - inputs (:obj:`torch.Tensor`): Observation data, defaults to tensor. - Returns: - - output (:obj:`Dict`): Output dict data, including differnet key-values among distinct action_space. + Arguments: + - inputs (:obj:`torch.Tensor`): Observation data, defaults to tensor. + Returns: + - output (:obj:`Dict`): Output dict data, including differnet key-values among distinct action_space. """ if self.action_space == 'regression': x = self.actor(inputs) diff --git a/ding/model/template/edac.py b/ding/model/template/edac.py index 49789f8acc..397ba69763 100755 --- a/ding/model/template/edac.py +++ b/ding/model/template/edac.py @@ -10,10 +10,10 @@ @MODEL_REGISTRY.register('edac') -class QACEnsemble(nn.Module): - r""" +class EDAC(nn.Module): + """ Overview: - The QAC network with ensemble, which is used in EDAC. + The Q-value Actor-Critic network with the ensemble mechanism, which is used in EDAC. Interfaces: ``__init__``, ``forward``, ``compute_actor``, ``compute_critic`` """ @@ -51,7 +51,7 @@ def __init__( - norm_type (:obj:`Optional[str]`): The type of normalization to after network layer (FC, Conv), \ see ``ding.torch_utils.network`` for more details. """ - super(QACEnsemble, self).__init__() + super(EDAC, self).__init__() obs_shape: int = squeeze(obs_shape) action_shape = squeeze(action_shape) self.action_shape = action_shape @@ -94,6 +94,7 @@ def forward(self, inputs: Union[torch.Tensor, Dict[str, torch.Tensor]], mode: st - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. Returns: - output (:obj:`Dict`): Output dict data, including q_value tensor. + .. note:: For specific examples, one can refer to API doc of ``compute_actor`` and ``compute_critic`` respectively. 
""" @@ -125,7 +126,7 @@ def compute_actor(self, obs: torch.Tensor) -> Dict[str, Union[torch.Tensor, Dict - action_args (:obj:`torch.Tensor`): :math:`(B, N3)`, B is batch size and N3 corresponds to \ ``action_shape.action_args_shape``. Examples: - >>> model = QACEnsemble(64, 64,) + >>> model = EDAC(64, 64,) >>> obs = torch.randn(4, 64) >>> actor_outputs = model(obs,'compute_actor') >>> assert actor_outputs['logit'][0].shape == torch.Size([4, 64]) # mu diff --git a/ding/model/template/maqac.py b/ding/model/template/maqac.py index e6ddd996dd..798a3753f5 100644 --- a/ding/model/template/maqac.py +++ b/ding/model/template/maqac.py @@ -9,11 +9,11 @@ FCEncoder, ConvEncoder -@MODEL_REGISTRY.register('maqac') -class MAQAC(nn.Module): - r""" +@MODEL_REGISTRY.register('discrete_maqac') +class DiscreteMAQAC(nn.Module): + """ Overview: - The MAQAC model. + The discrete action Multi-Agent Q-value Actor-CritiC (MAQAC) model. Interfaces: ``__init__``, ``forward``, ``compute_actor``, ``compute_critic`` """ @@ -32,9 +32,9 @@ def __init__( activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, ) -> None: - r""" + """ Overview: - Init the MAQAC Model according to arguments. + Init the DiscreteMAQAC Model according to arguments. Arguments: - agent_obs_shape (:obj:`Union[int, SequenceType]`): Agent's observation's space. - global_obs_shape (:obj:`Union[int, SequenceType]`): Global observation's space. @@ -42,18 +42,17 @@ def __init__( - action_shape (:obj:`Union[int, SequenceType]`): Action's space. - twin_critic (:obj:`bool`): Whether include twin critic. - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor-nn's ``Head``. - - actor_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for actor's nn. + - actor_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ + for actor's nn. - critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to critic-nn's ``Head``. - - critic_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for critic's nn. - - activation (:obj:`Optional[nn.Module]`): - The type of activation function to use in ``MLP`` the after ``layer_fn``, - if ``None`` then default set to ``nn.ReLU()`` - - norm_type (:obj:`Optional[str]`): - The type of normalization to use, see ``ding.torch_utils.fc_block`` for more details. + - critic_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ + for critic's nn. + - activation (:obj:`Optional[nn.Module]`): The type of activation function to use in ``MLP`` the after \ + ``layer_fn``, if ``None`` then default set to ``nn.ReLU()`` + - norm_type (:obj:`Optional[str]`): The type of normalization to use, see ``ding.torch_utils.fc_block`` \ + for more details. """ - super(MAQAC, self).__init__() + super(DiscreteMAQAC, self).__init__() agent_obs_shape: int = squeeze(agent_obs_shape) action_shape: int = squeeze(action_shape) self.actor = nn.Sequential( @@ -188,11 +187,11 @@ def compute_critic(self, inputs: Dict) -> Dict: return {'q_value': x} -@MODEL_REGISTRY.register('maqac_continuous') +@MODEL_REGISTRY.register('continuous_maqac') class ContinuousMAQAC(nn.Module): - r""" + """ Overview: - The Continuous MAQAC model. + The continuous action Multi-Agent Q-value Actor-CritiC (MAQAC) model. 
Interfaces: ``__init__``, ``forward``, ``compute_actor``, ``compute_critic`` """ @@ -212,7 +211,7 @@ def __init__( activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, ) -> None: - r""" + """ Overview: Init the QAC Model according to arguments. Arguments: @@ -221,16 +220,15 @@ def __init__( - action_space (:obj:`str`): Whether choose ``regression`` or ``reparameterization``. - twin_critic (:obj:`bool`): Whether include twin critic. - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor-nn's ``Head``. - - actor_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for actor's nn. + - actor_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ + for actor's nn. - critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to critic-nn's ``Head``. - - critic_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for critic's nn. - - activation (:obj:`Optional[nn.Module]`): - The type of activation function to use in ``MLP`` the after ``layer_fn``, - if ``None`` then default set to ``nn.ReLU()`` - - norm_type (:obj:`Optional[str]`): - The type of normalization to use, see ``ding.torch_utils.fc_block`` for more details. + - critic_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ + for critic's nn. + - activation (:obj:`Optional[nn.Module]`): The type of activation function to use in ``MLP`` the after \ + ``layer_fn``, if ``None`` then default set to ``nn.ReLU()`` + - norm_type (:obj:`Optional[str]`): The type of normalization to use, see ``ding.torch_utils.fc_block`` \ + for more details. """ super(ContinuousMAQAC, self).__init__() obs_shape: int = squeeze(agent_obs_shape) @@ -238,7 +236,7 @@ def __init__( action_shape = squeeze(action_shape) self.action_shape = action_shape self.action_space = action_space - assert self.action_space in ['regression', 'reparameterization'] + assert self.action_space in ['regression', 'reparameterization'], self.action_space if self.action_space == 'regression': # DDPG, TD3 self.actor = nn.Sequential( nn.Linear(obs_shape, actor_head_hidden_size), activation, diff --git a/ding/model/template/q_learning.py b/ding/model/template/q_learning.py index 013790cd65..e1ddbd6e5f 100644 --- a/ding/model/template/q_learning.py +++ b/ding/model/template/q_learning.py @@ -11,6 +11,19 @@ @MODEL_REGISTRY.register('dqn') class DQN(nn.Module): + """ + Overview: + The neural nework structure and computation graph of Deep Q Network (DQN) algorithm, which is the most classic \ + value-based RL algorithm for discrete action. The DQN is composed of two parts: ``encoder`` and ``head``. \ + The ``encoder`` is used to extract the feature from various observation, and the ``head`` is used to compute \ + the Q value of each action dimension. + Interfaces: + ``__init__``, ``forward``. + + .. note:: + Current ``DQN`` supports two types of encoder: ``FCEncoder`` and ``ConvEncoder``, two types of head: \ + ``DiscreteHead`` and ``DuelingHead``. You can customize your own encoder or head by inheriting this class. + """ def __init__( self, @@ -26,21 +39,22 @@ def __init__( ) -> None: """ Overview: - Init the DQN (encoder + head) Model according to input arguments. + initialize the DQN (encoder + head) Model according to corresponding input arguments. Arguments: - obs_shape (:obj:`Union[int, SequenceType]`): Observation space shape, such as 8 or [4, 84, 84]. 
- action_shape (:obj:`Union[int, SequenceType]`): Action space shape, such as 6 or [2, 3, 3]. - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder``, \ the last element must match ``head_hidden_size``. - - dueling (:obj:`Optional[bool]`): Whether choose ``DuelingHead`` or ``DiscreteHead(default)``. - - head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` of head network. - - head_layer_num (:obj:`int`): The number of layers used in the head network to compute Q value output + - dueling (:obj:`Optional[bool]`): Whether choose ``DuelingHead`` or ``DiscreteHead (default)``. + - head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` of head network, defaults to None, \ + then it will be set to the last element of ``encoder_hidden_size_list``. + - head_layer_num (:obj:`int`): The number of layers used in the head network to compute Q value output. - activation (:obj:`Optional[nn.Module]`): The type of activation function in networks \ if ``None`` then default set it to ``nn.ReLU()``. - norm_type (:obj:`Optional[str]`): The type of normalization in networks, see \ ``ding.torch_utils.fc_block`` for more details. you can choose one of ['BN', 'IN', 'SyncBN', 'LN'] - dropout (:obj:`Optional[float]`): The dropout rate of the dropout layer. \ - if ``None`` then default no dropout layer. + if ``None`` then default disable dropout layer. """ super(DQN, self).__init__() # Squeeze data from tuple, list or dict to single object. For example, from (4, ) to 4 @@ -91,19 +105,23 @@ def forward(self, x: torch.Tensor) -> Dict: Overview: DQN forward computation graph, input observation tensor to predict q_value. Arguments: - - x (:obj:`torch.Tensor`): Observation inputs + - x (:obj:`torch.Tensor`): The input observation tensor data. Returns: - - outputs (:obj:`Dict`): DQN forward outputs, such as q_value. + - outputs (:obj:`Dict`): The output of DQN's forward, including q_value. ReturnsKeys: - - logit (:obj:`torch.Tensor`): Discrete Q-value output of each action dimension. + - logit (:obj:`torch.Tensor`): Discrete Q-value output of each possible action dimension. Shapes: - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is ``obs_shape`` - - logit (:obj:`torch.FloatTensor`): :math:`(B, M)`, where B is batch size and M is ``action_shape`` + - logit (:obj:`torch.Tensor`): :math:`(B, M)`, where B is batch size and M is ``action_shape`` Examples: >>> model = DQN(32, 6) # arguments: 'obs_shape' and 'action_shape' >>> inputs = torch.randn(4, 32) >>> outputs = model(inputs) >>> assert isinstance(outputs, dict) and outputs['logit'].shape == torch.Size([4, 6]) + + .. note:: + For consistency and compatibility, we name all the outputs of the network which are related to action \ + selections as ``logit``. """ x = self.encoder(x) x = self.head(x) @@ -207,6 +225,18 @@ def forward(self, x: torch.Tensor) -> Dict: @MODEL_REGISTRY.register('c51dqn') class C51DQN(nn.Module): + """ + Overview: + The neural network structure and computation graph of C51DQN, which combines distributional RL and DQN. \ + You can refer to https://arxiv.org/pdf/1707.06887.pdf for more details. The C51DQN is composed of \ + ``encoder`` and ``head``. ``encoder`` is used to extract the feature of observation, and ``head`` is \ + used to compute the distribution of Q-value. + Interfaces: + ``__init__``, ``forward`` + + .. note:: + Current C51DQN supports two types of encoder: ``FCEncoder`` and ``ConvEncoder``. 
+ """ def __init__( self, @@ -221,21 +251,27 @@ def __init__( v_max: Optional[float] = 10, n_atom: Optional[int] = 51, ) -> None: - r""" + """ Overview: - Init the C51 Model according to input arguments. + initialize the C51 Model according to corresponding input arguments. Arguments: - - obs_shape (:obj:`Union[int, SequenceType]`): Observation's space. - - action_shape (:obj:`Union[int, SequenceType]`): Action's space. - - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder`` - - head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to ``Head``. - - head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output - - activation (:obj:`Optional[nn.Module]`): - The type of activation function to use in ``MLP`` the after ``layer_fn``, - if ``None`` then default set to ``nn.ReLU()`` - - norm_type (:obj:`Optional[str]`): - The type of normalization to use, see ``ding.torch_utils.fc_block`` for more details` - - n_atom (:obj:`Optional[int]`): Number of atoms in the prediction distribution. + - obs_shape (:obj:`Union[int, SequenceType]`): Observation space shape, such as 8 or [4, 84, 84]. + - action_shape (:obj:`Union[int, SequenceType]`): Action space shape, such as 6 or [2, 3, 3]. + - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder``, \ + the last element must match ``head_hidden_size``. + - head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` of head network, defaults to None, \ + then it will be set to the last element of ``encoder_hidden_size_list``. + - head_layer_num (:obj:`int`): The number of layers used in the head network to compute Q value output. + - activation (:obj:`Optional[nn.Module]`): The type of activation function in networks \ + if ``None`` then default set it to ``nn.ReLU()``. + - norm_type (:obj:`Optional[str]`): The type of normalization in networks, see \ + ``ding.torch_utils.fc_block`` for more details. you can choose one of ['BN', 'IN', 'SyncBN', 'LN'] + - v_min (:obj:`Optional[float]`): The minimum value of the support of the distribution, which is related \ + to the value (discounted sum of reward) scale of the specific environment. Defaults to -10. + - v_max (:obj:`Optional[float]`): The maximum value of the support of the distribution, which is related \ + to the value (discounted sum of reward) scale of the specific environment. Defaults to 10. + - n_atom (:obj:`Optional[int]`): The number of atoms in the prediction distribution, 51 is the default \ + value in the paper, you can also try other values such as 301. """ super(C51DQN, self).__init__() # For compatibility: 1, (1, ), [4, 32, 32] @@ -279,24 +315,28 @@ def __init__( ) def forward(self, x: torch.Tensor) -> Dict: - r""" + """ + Returns: + - outputs (:obj:`Dict`): The output of DQN's forward, including q_value. + ReturnsKeys: + - logit (:obj:`torch.Tensor`): Discrete Q-value output of each possible action dimension. + Shapes: + - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is ``obs_shape`` + - logit (:obj:`torch.Tensor`): :math:`(B, M)`, where B is batch size and M is ``action_shape`` Overview: - Use observation tensor to predict C51DQN's output. - Parameter updates with C51DQN's MLPs forward setup. + C51DQN forward computation graph, input observation tensor to predict q_value and its distribution. Arguments: - - x (:obj:`torch.Tensor`): - The encoded embedding tensor w/ ``(B, N=head_hidden_size)``. 
+ - x (:obj:`torch.Tensor`): The input observation tensor data. Returns: - - outputs (:obj:`Dict`): - Run with encoder and head. Return the result prediction dictionary. - + - outputs (:obj:`Dict`): The output of DQN's forward, including q_value, and distribution. ReturnsKeys: - - logit (:obj:`torch.Tensor`): Logit tensor with same size as input ``x``. - - distribution (:obj:`torch.Tensor`): Distribution tensor of size ``(B, N, n_atom)`` + - logit (:obj:`torch.Tensor`): Discrete Q-value output of each possible action dimension. + - distribution (:obj:`torch.Tensor`): Q-Value discretized distribution, i.e., probability of each \ + uniformly spaced atom Q-value, such as dividing [-10, 10] into 51 uniform spaces. Shapes: - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is head_hidden_size. - - logit (:obj:`torch.FloatTensor`): :math:`(B, M)`, where M is action_shape. - - distribution(:obj:`torch.FloatTensor`): :math:`(B, M, P)`, where P is n_atom. + - logit (:obj:`torch.Tensor`): :math:`(B, M)`, where M is action_shape. + - distribution(:obj:`torch.Tensor`): :math:`(B, M, P)`, where P is n_atom. Examples: >>> model = C51DQN(128, 64) # arguments: 'obs_shape' and 'action_shape' @@ -307,6 +347,14 @@ def forward(self, x: torch.Tensor) -> Dict: >>> assert outputs['logit'].shape == torch.Size([4, 64]) >>> # default n_atom: int = 51 >>> assert outputs['distribution'].shape == torch.Size([4, 64, 51]) + + .. note:: + For consistency and compatibility, we name all the outputs of the network which are related to action \ + selections as ``logit``. + + .. note:: + For convenience, we recommend that the number of atoms should be odd, so that the middle atom is exactly \ + the value of the Q-value. """ x = self.encoder(x) x = self.head(x) @@ -640,7 +688,7 @@ class RainbowDQN(nn.Module): RainbowDQN network (C51 + Dueling + Noisy Block) .. note:: - RainbowDQN contains dueling architecture by default + RainbowDQN contains dueling architecture by default. """ def __init__( @@ -787,7 +835,18 @@ def reshape(d): class DRQN(nn.Module): """ Overview: - DQN + RNN = DRQN + The neural network structure and computation graph of DRQN (DQN + RNN = DRQN) algorithm, which is the most \ + common DQN variant for sequential data and paratially observable environment. The DRQN is composed of three \ + parts: ``encoder``, ``head`` and ``rnn``. The ``encoder`` is used to extract the feature from various \ + observation, the ``rnn`` is used to process the sequential observation and other data, and the ``head`` is \ + used to compute the Q value of each action dimension. + Interfaces: + ``__init__``, ``forward``. + + .. note:: + Current ``DRQN`` supports two types of encoder: ``FCEncoder`` and ``ConvEncoder``, two types of head: \ + ``DiscreteHead`` and ``DuelingHead``, three types of rnn: ``normal (LSTM with LayerNorm)``, ``pytorch`` and \ + ``gru``. You can customize your own encoder, rnn or head by inheriting this class. """ def __init__( @@ -803,21 +862,25 @@ def __init__( norm_type: Optional[str] = None, res_link: bool = False ) -> None: - r""" + """ Overview: - Init the DRQN Model according to arguments. + Initialize the DRQN Model according to the corresponding input arguments. Arguments: - - obs_shape (:obj:`Union[int, SequenceType]`): Observation's space. - - action_shape (:obj:`Union[int, SequenceType]`): Action's space. 
- - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder`` - - head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to ``Head``. - - lstm_type (:obj:`Optional[str]`): Version of rnn cell, now support ['normal', 'pytorch', 'hpc', 'gru'] - - activation (:obj:`Optional[nn.Module]`): - The type of activation function to use in ``MLP`` the after ``layer_fn``, - if ``None`` then default set to ``nn.ReLU()`` - - norm_type (:obj:`Optional[str]`): - The type of normalization to use, see ``ding.torch_utils.fc_block`` for more details` - - res_link (:obj:`bool`): use the residual link or not, default to False + - obs_shape (:obj:`Union[int, SequenceType]`): Observation space shape, such as 8 or [4, 84, 84]. + - action_shape (:obj:`Union[int, SequenceType]`): Action space shape, such as 6 or [2, 3, 3]. + - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder``, \ + the last element must match ``head_hidden_size``. + - dueling (:obj:`Optional[bool]`): Whether choose ``DuelingHead`` or ``DiscreteHead (default)``. + - head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` of head network, defaults to None, \ + then it will be set to the last element of ``encoder_hidden_size_list``. + - head_layer_num (:obj:`int`): The number of layers used in the head network to compute Q value output. + - lstm_type (:obj:`Optional[str]`): The type of RNN module, now support ['normal', 'pytorch', 'gru']. + - activation (:obj:`Optional[nn.Module]`): The type of activation function in networks \ + if ``None`` then default set it to ``nn.ReLU()``. + - norm_type (:obj:`Optional[str]`): The type of normalization in networks, see \ + ``ding.torch_utils.fc_block`` for more details. you can choose one of ['BN', 'IN', 'SyncBN', 'LN'] + - res_link (:obj:`bool`): Whether to enable the residual link, which is the skip connection between \ + single frame data and the sequential data, defaults to False. """ super(DRQN, self).__init__() # For compatibility: 1, (1, ), [4, 32, 32] @@ -858,34 +921,26 @@ def __init__( ) def forward(self, inputs: Dict, inference: bool = False, saved_state_timesteps: Optional[list] = None) -> Dict: - r""" + """ Overview: - Use observation tensor to predict DRQN output. - Parameter updates with DRQN's MLPs forward setup. + DRQN forward computation graph, input observation tensor to predict q_value. Arguments: - - inputs (:obj:`Dict`): - - inference: (:obj:'bool'): if inference is True, we unroll the one timestep transition, - if inference is False, we unroll the sequence transitions. - - saved_state_timesteps: (:obj:'Optional[list]'): when inference is False, - we unroll the sequence transitions, then we would save rnn hidden states at timesteps - that are listed in list saved_state_timesteps. - - ArgumentsKeys: - - obs (:obj:`torch.Tensor`): Encoded observation - - prev_state (:obj:`list`): Previous state's tensor of size ``(B, N)`` - + - inputs (:obj:`torch.Tensor`): The dict of input data, including observation and previous rnn state. + - inference: (:obj:'bool'): Whether to enable inference forward mode, if True, we unroll the one timestep \ + transition, otherwise, we unroll the entire sequence transitions. + - saved_state_timesteps: (:obj:'Optional[list]'): When inference is False, we unroll the sequence \ + transitions, then we would use this list to indicate how to save and return hidden state. + ArgumentsKeys: + - obs (:obj:`torch.Tensor`): The raw observation tensor.
+ - prev_state (:obj:`list`): The previous rnn state tensor, whose structure depends on ``lstm_type``. Returns: - - outputs (:obj:`Dict`): - Run ``MLP`` with ``DRQN`` setups and return the result prediction dictionary. - + - outputs (:obj:`Dict`): The output of DRQN's forward, including logit (q_value) and next state. ReturnsKeys: - - logit (:obj:`torch.Tensor`): Logit tensor with same size as input ``obs``. - - next_state (:obj:`list`): Next state's tensor of size ``(B, N)`` + - logit (:obj:`torch.Tensor`): Discrete Q-value output of each possible action dimension. + - next_state (:obj:`list`): The next rnn state tensor, whose structure depends on ``lstm_type``. Shapes: - - obs (:obj:`torch.Tensor`): :math:`(B, N=obs_space)`, where B is batch size. - - prev_state(:obj:`torch.FloatTensor list`): :math:`[(B, N)]` - - logit (:obj:`torch.FloatTensor`): :math:`(B, N)` - - next_state(:obj:`torch.FloatTensor list`): :math:`[(B, N)]` + - obs (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is ``obs_shape`` + - logit (:obj:`torch.Tensor`): :math:`(B, M)`, where B is batch size and M is ``action_shape`` Examples: >>> # Init input's Keys: @@ -958,18 +1013,24 @@ def forward(self, inputs: Dict, inference: bool = False, saved_state_timesteps: x['hidden_state'] = torch.cat(hidden_state_list, dim=0) if saved_state_timesteps is not None: # the selected saved hidden states, including the hidden state (h) and the cell state (c) - # in r2d2, set 'saved_hidden_​​state_timesteps=[self._burnin_step, self._burnin_step + self._nstep]', + # in r2d2, set 'saved_hidden_state_timesteps=[self._burnin_step, self._burnin_step + self._nstep]', # then saved_state will record the hidden_state for main_obs and target_obs to # initialize their lstm (h c) x['saved_state'] = saved_state return x -@MODEL_REGISTRY.register('gtrxl_discrete') -class GTrXLDiscreteHead(nn.Module): +@MODEL_REGISTRY.register('gtrxldqn') +class GTrXLDQN(nn.Module): """ Overview: - Add a discrete head on top of the GTrXL module. + The neural network structure and computation graph of Gated Transformer-XL DQN algorithm, which is the \ + enhanced version of DRQN, using Transformer-XL to improve long-term sequential modelling ability. The \ + GTrXL-DQN is composed of three parts: ``encoder``, ``head`` and ``core``. The ``encoder`` is used to extract \ + the feature from various observation, the ``core`` is used to process the sequential observation and other \ + data, and the ``head`` is used to compute the Q value of each action dimension. + Interfaces: + ``__init__``, ``forward``, ``reset_memory``, ``get_memory`` . """ def __init__( @@ -992,11 +1053,15 @@ def __init__( encoder_hidden_size_list: SequenceType = [128, 128, 256], encoder_norm_type: Optional[str] = None, ) -> None: - r""" + """ Overview: - Init the model according to arguments. + Initialize the GTrXLDQN model accoding to corresponding input arguments. + + .. tip:: + You can refer to GTrXl class in ``ding.torch_utils.network.gtrxl`` for more details about the input \ + arguments. + Arguments: - Refer to GTrXl class in `ding.torch_utils.network.gtrxl` for more details about the input arguments. - obs_shape (:obj:`Union[int, SequenceType]`): Used by Transformer. Observation's space. - action_shape (:obj:Union[int, SequenceType]): Used by Head. Action's space. - head_layer_num (:obj:`int`): Used by Head. Number of layers. @@ -1006,20 +1071,20 @@ def __init__( - att_mlp_num (:obj:`int`): Used by Transformer. - att_layer_num (:obj:`int`): Used by Transformer. 
- memory_len (:obj:`int`): Used by Transformer. - - activation (:obj:`Optional[nn.Module]`): Used by Transformer and Head. if ``None`` then default set to - ``nn.ReLU()``. - - head_norm_type (:obj:`Optional[str]`): Used by Head. The type of normalization to use, see - ``ding.torch_utils.fc_block`` for more details`. + - activation (:obj:`Optional[nn.Module]`): Used by Transformer and Head. if ``None`` then default set to \ + ``nn.ReLU()``. + - head_norm_type (:obj:`Optional[str]`): Used by Head. The type of normalization to use, see \ + ``ding.torch_utils.fc_block`` for more details`. - dropout (:obj:`bool`): Used by Transformer. - gru_gating (:obj:`bool`): Used by Transformer. - gru_bias (:obj:`float`): Used by Transformer. - dueling (:obj:`bool`): Used by Head. Make the head dueling. - - encoder_hidden_size_list(:obj:`SequenceType`): Used by Encoder. The collection of ``hidden_size`` if using - a custom convolutional encoder. - - encoder_norm_type (:obj:`Optional[str]`): Used by Encoder. The type of normalization to use, see + - encoder_hidden_size_list(:obj:`SequenceType`): Used by Encoder. The collection of ``hidden_size`` if \ + using a custom convolutional encoder. + - encoder_norm_type (:obj:`Optional[str]`): Used by Encoder. The type of normalization to use, see \ ``ding.torch_utils.fc_block`` for more details`. """ - super(GTrXLDiscreteHead, self).__init__() + super(GTrXLDQN, self).__init__() self.core = GTrXL( input_dim=obs_shape, head_dim=att_head_dim, @@ -1035,7 +1100,7 @@ def __init__( ) if isinstance(obs_shape, int) or len(obs_shape) == 1: - pass + raise NotImplementedError("not support obs_shape for pre-defined encoder: {}".format(obs_shape)) # replace the embedding layer of Transformer with Conv Encoder elif len(obs_shape) == 3: assert encoder_hidden_size_list[-1] == hidden_size @@ -1069,19 +1134,17 @@ def __init__( ) def forward(self, x: torch.Tensor) -> Dict: - r""" + """ Overview: Let input tensor go through GTrXl and the Head sequentially. Arguments: - x (:obj:`torch.Tensor`): input tensor of shape (seq_len, bs, obs_shape). Returns: - out (:obj:`Dict`): run ``GTrXL`` with ``DiscreteHead`` setups and return the result prediction dictionary. - Necessary Keys: - - logit (:obj:`torch.Tensor`): discrete Q-value output of each action dimension. - Shape is (bs, action_space) - - memory (:obj:`torch.Tensor`): - memory tensor of size ``(bs x layer_num+1 x memory_len x embedding_dim)`` - - transformer_out (:obj:`torch.Tensor`): output tensor of transformer with same size as input ``x``. + ReturnKeys: + - logit (:obj:`torch.Tensor`): discrete Q-value output of each action dimension, shape is (B, action_space). + - memory (:obj:`torch.Tensor`): memory tensor of size ``(bs x layer_num+1 x memory_len x embedding_dim)``. + - transformer_out (:obj:`torch.Tensor`): output tensor of transformer with same size as input ``x``. Examples: >>> # Init input's Keys: >>> obs_dim, seq_len, bs, action_dim = 128, 64, 32, 4 @@ -1102,27 +1165,23 @@ def forward(self, x: torch.Tensor) -> Dict: out['transformer_out'] = o1['logit'] # output of gtrxl, out['logit'] is final output return out - def reset_memory(self, batch_size: Optional[int] = None, state: Optional[torch.Tensor] = None): - r""" + def reset_memory(self, batch_size: Optional[int] = None, state: Optional[torch.Tensor] = None) -> None: + """ Overview: - Clear or set the memory of GTrXL. - Arguments: - - batch_size (:obj:`Optional[int]`): batch size - - state (:obj:`Optional[torch.Tensor]`): input memory. 
- Shape is (layer_num, memory_len, bs, embedding_dim). + Clear or reset the memory of GTrXL. + Arguments: + - batch_size (:obj:`Optional[int]`): The number of samples in a training batch. + - state (:obj:`Optional[torch.Tensor]`): The input memory data, whose shape is \ + (layer_num, memory_len, bs, embedding_dim). """ self.core.reset_memory(batch_size, state) def get_memory(self) -> Optional[torch.Tensor]: - r""" + """ Overview: Return the memory of GTrXL. Returns: - - memory: (:obj:`Optional[torch.Tensor]`): output memory or None if memory has not been initialized. - Shape is (layer_num, memory_len, bs, embedding_dim). + - memory: (:obj:`Optional[torch.Tensor]`): output memory or None if memory has not been initialized, \ + whose shape is (layer_num, memory_len, bs, embedding_dim). """ return self.core.get_memory() - - -class GeneralQNetwork(nn.Module): - pass diff --git a/ding/model/template/qac.py b/ding/model/template/qac.py index aa0cc42b0e..6034a4d74c 100755 --- a/ding/model/template/qac.py +++ b/ding/model/template/qac.py @@ -9,11 +9,16 @@ FCEncoder, ConvEncoder -@MODEL_REGISTRY.register('qac') -class QAC(nn.Module): - r""" +@MODEL_REGISTRY.register('continuous_qac') +class ContinuousQAC(nn.Module): + """ Overview: - The QAC network, which is used in DDPG/TD3/SAC. + The neural network and computation graph of algorithms related to Q-value Actor-Critic (QAC), such as \ + DDPG/TD3/SAC. This model now supports continuous and hybrid action space. The ContinuousQAC is composed of \ + four parts: ``actor_encoder``, ``critic_encoder``, ``actor_head`` and ``critic_head``. Encoders are used to \ + extract the feature from various observation. Heads are used to predict corresponding Q-value or action logit. \ + In high-dimensional observation space like 2D image, we often use a shared encoder for both ``actor_encoder`` \ + and ``critic_encoder``. In low-dimensional observation space like 1D vector, we often use different encoders. Interfaces: ``__init__``, ``forward``, ``compute_actor``, ``compute_critic`` """ @@ -31,32 +36,33 @@ def __init__( critic_head_layer_num: int = 1, activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, - encoder_hidden_size_list: Optional[SequenceType] = [32, 64, 256], + encoder_hidden_size_list: Optional[SequenceType] = None, share_encoder: Optional[bool] = False, ) -> None: """ Overview: - Initailize the QAC Model according to input arguments. + Initailize the ContinuousQAC Model according to input arguments. Arguments: - obs_shape (:obj:`Union[int, SequenceType]`): Observation's shape, such as 128, (156, ). - action_shape (:obj:`Union[int, SequenceType, EasyDict]`): Action's shape, such as 4, (3, ), \ EasyDict({'action_type_shape': 3, 'action_args_shape': 4}). - - action_space (:obj:`str`): The type of action space, \ - including [``regression``, ``reparameterization``, ``hybrid``]. + - action_space (:obj:`str`): The type of action space, including [``regression``, ``reparameterization``, \ + ``hybrid``], ``regression`` is used for DDPG/TD3, ``reparameterization`` is used for SAC and \ + ``hybrid`` for PADDPG. - twin_critic (:obj:`bool`): Whether to use twin critic, one of tricks in TD3. - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor head. - - actor_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ - for actor head. + - actor_head_layer_num (:obj:`int`): The num of layers used in the actor network to compute action. 
- critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to critic head. - - critic_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \ - for critic head. + - critic_head_layer_num (:obj:`int`): The num of layers used in the critic network to compute Q-value. - activation (:obj:`Optional[nn.Module]`): The type of activation function to use in ``MLP`` \ after each FC layer, if ``None`` then default set to ``nn.ReLU()``. - norm_type (:obj:`Optional[str]`): The type of normalization to after network layer (FC, Conv), \ see ``ding.torch_utils.network`` for more details. + - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder``, \ + the last element must match ``head_hidden_size``, this argument is only used in image observation. - share_encoder (:obj:`Optional[bool]`): Whether to share encoder between actor and critic. """ - super(QAC, self).__init__() + super(ContinuousQAC, self).__init__() obs_shape: int = squeeze(obs_shape) action_shape = squeeze(action_shape) self.action_shape = action_shape @@ -67,8 +73,12 @@ def __init__( self.share_encoder = share_encoder if np.isscalar(obs_shape) or len(obs_shape) == 1: assert not self.share_encoder, "Vector observation doesn't need share encoder." - self.encoder = None - self.input_size = obs_shape + assert encoder_hidden_size_list is None, "Vector obs encoder only uses one layer nn.Linear" + # Because there is already a layer nn.Linear in the head, so we use nn.Identity here to keep + # compatible with the image observation and avoid adding an extra layer nn.Linear. + self.actor_encoder = nn.Identity() + self.critic_encoder = nn.Identity() + encoder_output_size = obs_shape elif len(obs_shape) == 3: def setup_conv_encoder(): @@ -84,20 +94,18 @@ def setup_conv_encoder(): ) if self.share_encoder: - self.encoder = setup_conv_encoder() - self.input_size = self.encoder.output_size + encoder = setup_conv_encoder() + self.actor_encoder = self.critic_encoder = encoder else: - self.encoder = nn.ModuleDict({ - 'actor': setup_conv_encoder(), - 'critic': setup_conv_encoder(), - }) - self.input_size = self.encoder['actor'].output_size + self.actor_encoder = setup_conv_encoder() + self.critic_encoder = setup_conv_encoder() + encoder_output_size = self.actor_encoder.output_size else: raise RuntimeError("not support observation shape: {}".format(obs_shape)) # head if self.action_space == 'regression': # DDPG, TD3 - self.actor = nn.Sequential( - nn.Linear(self.input_size, actor_head_hidden_size), activation, + self.actor_head = nn.Sequential( + nn.Linear(encoder_output_size, actor_head_hidden_size), activation, RegressionHead( actor_head_hidden_size, action_shape, @@ -108,8 +116,8 @@ def setup_conv_encoder(): ) ) elif self.action_space == 'reparameterization': # SAC - self.actor = nn.Sequential( - nn.Linear(self.input_size, actor_head_hidden_size), activation, + self.actor_head = nn.Sequential( + nn.Linear(encoder_output_size, actor_head_hidden_size), activation, ReparameterizationHead( actor_head_hidden_size, action_shape, @@ -125,7 +133,7 @@ def setup_conv_encoder(): action_shape.action_args_shape = squeeze(action_shape.action_args_shape) action_shape.action_type_shape = squeeze(action_shape.action_type_shape) actor_action_args = nn.Sequential( - nn.Linear(self.input_size, actor_head_hidden_size), activation, + nn.Linear(encoder_output_size, actor_head_hidden_size), activation, RegressionHead( actor_head_hidden_size, action_shape.action_args_shape, @@ 
-136,7 +144,7 @@ def setup_conv_encoder(): ) ) actor_action_type = nn.Sequential( - nn.Linear(self.input_size, actor_head_hidden_size), activation, + nn.Linear(encoder_output_size, actor_head_hidden_size), activation, DiscreteHead( actor_head_hidden_size, action_shape.action_type_shape, @@ -145,17 +153,17 @@ def setup_conv_encoder(): norm_type=norm_type, ) ) - self.actor = nn.ModuleList([actor_action_type, actor_action_args]) + self.actor_head = nn.ModuleList([actor_action_type, actor_action_args]) self.twin_critic = twin_critic if self.action_space == 'hybrid': - critic_input_size = self.input_size + action_shape.action_type_shape + action_shape.action_args_shape + critic_input_size = encoder_output_size + action_shape.action_type_shape + action_shape.action_args_shape else: - critic_input_size = self.input_size + action_shape + critic_input_size = encoder_output_size + action_shape if self.twin_critic: - self.critic = nn.ModuleList() + self.critic_head = nn.ModuleList() for _ in range(2): - self.critic.append( + self.critic_head.append( nn.Sequential( nn.Linear(critic_input_size, critic_head_hidden_size), activation, RegressionHead( @@ -169,7 +177,7 @@ def setup_conv_encoder(): ) ) else: - self.critic = nn.Sequential( + self.critic_head = nn.Sequential( nn.Linear(critic_input_size, critic_head_hidden_size), activation, RegressionHead( critic_head_hidden_size, @@ -181,24 +189,41 @@ def setup_conv_encoder(): ) ) + # Convenient for calling some apis (e.g. self.critic.parameters()), + # but may cause misunderstanding when `print(self)` + self.actor = nn.ModuleList([self.actor_encoder, self.actor_head]) + self.critic = nn.ModuleList([self.critic_encoder, self.critic_head]) + def forward(self, inputs: Union[torch.Tensor, Dict[str, torch.Tensor]], mode: str) -> Dict[str, torch.Tensor]: """ Overview: - The unique execution (forward) method of QAC method, and one can indicate different modes to implement \ - different computation graph, including ``compute_actor`` and ``compute_critic`` in QAC. - Mode compute_actor: - Arguments: - - inputs (:obj:`torch.Tensor`): Observation data, defaults to tensor. - Returns: - - output (:obj:`Dict`): Output dict data, including differnet key-values among distinct action_space. - Mode compute_critic: - Arguments: - - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. - Returns: - - output (:obj:`Dict`): Output dict data, including q_value tensor. + QAC forward computation graph, input observation tensor to predict Q-value or action logit. Different \ + ``mode`` will forward with different network modules to get different outputs and save computation. + Arguments: + - inputs (:obj:`Union[torch.Tensor, Dict[str, torch.Tensor]]`): The input data for forward computation \ + graph, for ``compute_actor``, it is the observation tensor, for ``compute_critic``, it is the \ + dict data including obs and action tensor. + - mode (:obj:`str`): The forward mode, all the modes are defined in the beginning of this class. + Returns: + - output (:obj:`Dict[str, torch.Tensor]`): The output dict of QAC forward computation graph, whose \ + key-values vary in different forward modes. 
+ Examples (Actor): + >>> # Regression mode + >>> model = ContinuousQAC(64, 6, 'regression') + >>> obs = torch.randn(4, 64) + >>> actor_outputs = model(obs,'compute_actor') + >>> assert actor_outputs['action'].shape == torch.Size([4, 6]) + >>> # Reparameterization Mode + >>> model = ContinuousQAC(64, 6, 'reparameterization') + >>> obs = torch.randn(4, 64) + >>> actor_outputs = model(obs,'compute_actor') + >>> assert actor_outputs['logit'][0].shape == torch.Size([4, 6]) # mu + >>> actor_outputs['logit'][1].shape == torch.Size([4, 6]) # sigma - .. note:: - For specific examples, one can refer to API doc of ``compute_actor`` and ``compute_critic`` respectively. + Examples (Critic): + >>> inputs = {'obs': torch.randn(4, 8), 'action': torch.randn(4, 1)} + >>> model = ContinuousQAC(obs_shape=(8, ),action_shape=1, action_space='regression') + >>> assert model(inputs, mode='compute_critic')['q_value'].shape == (4, ) # q value """ assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) return getattr(self, mode)(inputs) @@ -206,26 +231,22 @@ def forward(self, inputs: Union[torch.Tensor, Dict[str, torch.Tensor]], mode: st def compute_actor(self, obs: torch.Tensor) -> Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]: """ Overview: - The forward computation graph of compute_actor mode, uses observation tensor to produce actor output, - such as ``action``, ``logit`` and so on. + QAC forward computation graph for actor part, input observation tensor to predict action or action logit. Arguments: - - obs (:obj:`torch.Tensor`): Observation tensor data, now supports a batch of 1-dim vector data, \ - i.e. ``(B, obs_shape)``. + - obs (:obj:`torch.Tensor`): The input observation tensor data. Returns: - - outputs (:obj:`Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]`): Actor output varying \ + - outputs (:obj:`Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]`): Actor output dict varying \ from action_space: ``regression``, ``reparameterization``, ``hybrid``. - - ReturnsKeys (either): - - regression action_space - - action (:obj:`torch.Tensor`): Continuous action with same size as ``action_shape``, usually in DDPG. - - reparameterization action_space - - logit (:obj:`Dict[str, torch.Tensor]`): Reparameterization logit, usually in SAC. - - - mu (:obj:`torch.Tensor`): Mean of parameterization gaussion distribution. - - sigma (:obj:`torch.Tensor`): Standard variation of parameterization gaussion distribution. - - hybrid action_space - - logit (:obj:`torch.Tensor`): Discrete action type logit. - - action_args (:obj:`torch.Tensor`): Continuous action arguments. + ReturnsKeys (regression): + - action (:obj:`torch.Tensor`): Continuous action with same size as ``action_shape``, usually in DDPG/TD3. + ReturnsKeys (reparameterization): + - logit (:obj:`Dict[str, torch.Tensor]`): The predicted reparameterization action logit, usually in SAC. \ + It is a list containing two tensors: ``mu`` and ``sigma``. The former is the mean of the gaussian \ + distribution, the latter is the standard deviation of the gaussian distribution. + ReturnsKeys (hybrid): + - logit (:obj:`torch.Tensor`): The predicted discrete action type logit, it will be the same dimension \ + as ``action_type_shape``, i.e., all the possible discrete action types. + - action_args (:obj:`torch.Tensor`): Continuous action arguments with same size as ``action_args_shape``. Shapes: - obs (:obj:`torch.Tensor`): :math:`(B, N0)`, B is batch size and N0 corresponds to ``obs_shape``.
- action (:obj:`torch.Tensor`): :math:`(B, N1)`, B is batch size and N1 corresponds to ``action_shape``. @@ -237,49 +258,44 @@ def compute_actor(self, obs: torch.Tensor) -> Dict[str, Union[torch.Tensor, Dict ``action_shape.action_args_shape``. Examples: >>> # Regression mode - >>> model = QAC(64, 64, 'regression') + >>> model = ContinuousQAC(64, 6, 'regression') >>> obs = torch.randn(4, 64) >>> actor_outputs = model(obs,'compute_actor') - >>> assert actor_outputs['action'].shape == torch.Size([4, 64]) + >>> assert actor_outputs['action'].shape == torch.Size([4, 6]) >>> # Reparameterization Mode - >>> model = QAC(64, 64, 'reparameterization') + >>> model = ContinuousQAC(64, 6, 'reparameterization') >>> obs = torch.randn(4, 64) >>> actor_outputs = model(obs,'compute_actor') - >>> assert actor_outputs['logit'][0].shape == torch.Size([4, 64]) # mu - >>> actor_outputs['logit'][1].shape == torch.Size([4, 64]) # sigma + >>> assert actor_outputs['logit'][0].shape == torch.Size([4, 6]) # mu + >>> actor_outputs['logit'][1].shape == torch.Size([4, 6]) # sigma """ - if self.encoder is not None: - if self.share_encoder: - obs = self.encoder(obs) - else: - obs = self.encoder['actor'](obs) + obs = self.actor_encoder(obs) if self.action_space == 'regression': - x = self.actor(obs) + x = self.actor_head(obs) return {'action': x['pred']} elif self.action_space == 'reparameterization': - x = self.actor(obs) + x = self.actor_head(obs) return {'logit': [x['mu'], x['sigma']]} elif self.action_space == 'hybrid': - logit = self.actor[0](obs) - action_args = self.actor[1](obs) + logit = self.actor_head[0](obs) + action_args = self.actor_head[1](obs) return {'logit': logit['logit'], 'action_args': action_args['pred']} def compute_critic(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: """ Overview: - The forward computation graph of compute_critic mode, uses observation and action tensor to produce critic - output, such as ``q_value``. + QAC forward computation graph for critic part, input observation and action tensor to predict Q-value. Arguments: - - inputs (:obj:`Dict[str, torch.Tensor]`): Dict strcture of input data, including ``obs`` and ``action`` \ - tensor, also contains ``logit`` tensor in hybrid action_space. - Returns: - - outputs (:obj:`Dict[str, torch.Tensor]`): Critic output, such as ``q_value``. - + - inputs (:obj:`Dict[str, torch.Tensor]`): The dict of input data, including ``obs`` and ``action`` \ + tensor, also contains ``logit`` and ``action_args`` tensor in hybrid action_space. ArgumentsKeys: - obs: (:obj:`torch.Tensor`): Observation tensor data, now supports a batch of 1-dim vector data. - action (:obj:`Union[torch.Tensor, Dict]`): Continuous action with same size as ``action_shape``. - logit (:obj:`torch.Tensor`): Discrete action logit, only in hybrid action_space. - action_args (:obj:`torch.Tensor`): Continuous action arguments, only in hybrid action_space. + Returns: + - outputs (:obj:`Dict[str, torch.Tensor]`): The output dict of QAC's forward computation graph for critic, \ + including ``q_value``. ReturnKeys: - q_value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. Shapes: @@ -293,17 +309,12 @@ def compute_critic(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Ten Examples: >>> inputs = {'obs': torch.randn(4, 8), 'action': torch.randn(4, 1)} - >>> model = QAC(obs_shape=(8, ),action_shape=1, action_space='regression') - >>> model(inputs, mode='compute_critic')['q_value'] # q value - ... 
tensor([0.0773, 0.1639, 0.0917, 0.0370], grad_fn=) + >>> model = ContinuousQAC(obs_shape=(8, ),action_shape=1, action_space='regression') + >>> assert model(inputs, mode='compute_critic')['q_value'].shape == (4, ) # q value """ obs, action = inputs['obs'], inputs['action'] - if self.encoder is not None: - if self.share_encoder: - obs = self.encoder(obs) - else: - obs = self.encoder['critic'](obs) + obs = self.critic_encoder(obs) assert len(obs.shape) == 2 if self.action_space == 'hybrid': action_type_logit = inputs['logit'] @@ -317,80 +328,100 @@ def compute_critic(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Ten action = action.unsqueeze(1) x = torch.cat([obs, action], dim=1) if self.twin_critic: - x = [m(x)['pred'] for m in self.critic] + x = [m(x)['pred'] for m in self.critic_head] else: - x = self.critic(x)['pred'] + x = self.critic_head(x)['pred'] return {'q_value': x} @MODEL_REGISTRY.register('discrete_qac') class DiscreteQAC(nn.Module): - r""" + """ Overview: - The Discrete QAC model, used in DiscreteSAC. + The neural network and computation graph of algorithms related to discrete action Q-value Actor-Critic (QAC), \ + such as DiscreteSAC. This model now supports only discrete action space. The DiscreteQAC is composed of \ + four parts: ``actor_encoder``, ``critic_encoder``, ``actor_head`` and ``critic_head``. Encoders are used to \ + extract the feature from various observation. Heads are used to predict corresponding Q-value or action logit. \ + In high-dimensional observation space like 2D image, we often use a shared encoder for both ``actor_encoder`` \ + and ``critic_encoder``. In low-dimensional observation space like 1D vector, we often use different encoders. Interfaces: ``__init__``, ``forward``, ``compute_actor``, ``compute_critic`` """ mode = ['compute_actor', 'compute_critic'] def __init__( - self, - agent_obs_shape: Union[int, SequenceType], - global_obs_shape: Union[int, SequenceType], - action_shape: Union[int, SequenceType], - encoder_hidden_size_list: SequenceType = [64], - twin_critic: bool = False, - actor_head_hidden_size: int = 64, - actor_head_layer_num: int = 1, - critic_head_hidden_size: int = 64, - critic_head_layer_num: int = 1, - activation: Optional[nn.Module] = nn.ReLU(), - norm_type: Optional[str] = None, + self, + obs_shape: Union[int, SequenceType], + action_shape: Union[int, SequenceType], + twin_critic: bool = False, + actor_head_hidden_size: int = 64, + actor_head_layer_num: int = 1, + critic_head_hidden_size: int = 64, + critic_head_layer_num: int = 1, + activation: Optional[nn.Module] = nn.ReLU(), + norm_type: Optional[str] = None, + encoder_hidden_size_list: SequenceType = None, + share_encoder: Optional[bool] = False, ) -> None: - r""" + """ Overview: - Init the QAC Model according to arguments. + Initialize the DiscreteQAC Model according to input arguments. Arguments: - - obs_shape (:obj:`Union[int, SequenceType]`): Observation's space. - - action_shape (:obj:`Union[int, SequenceType]`): Action's space. - - twin_critic (:obj:`bool`): Whether include twin critic. - - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor-nn's ``Head``. - - actor_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for actor's nn. - - critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to critic-nn's ``Head``. - - critic_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for critic's nn.
- - activation (:obj:`Optional[nn.Module]`): - The type of activation function to use in ``MLP`` the after ``layer_fn``, - if ``None`` then default set to ``nn.ReLU()`` - - norm_type (:obj:`Optional[str]`): - The type of normalization to use, see ``ding.torch_utils.fc_block`` for more details. + - obs_shape (:obj:`Union[int, SequenceType]`): Observation's shape, such as 128, (156, ). + - action_shape (:obj:`Union[int, SequenceType, EasyDict]`): Action's shape, such as 4, (3, ). + - twin_critic (:obj:`bool`): Whether to use twin critic. + - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor head. + - actor_head_layer_num (:obj:`int`): The num of layers used in the actor network to compute action. + - critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to critic head. + - critic_head_layer_num (:obj:`int`): The num of layers used in the critic network to compute Q-value. + - activation (:obj:`Optional[nn.Module]`): The type of activation function to use in ``MLP`` \ + after each FC layer, if ``None`` then default set to ``nn.ReLU()``. + - norm_type (:obj:`Optional[str]`): The type of normalization to use after network layer (FC, Conv), \ + see ``ding.torch_utils.network`` for more details. + - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder``, \ + the last element must match ``head_hidden_size``, this argument is only used in image observation. + - share_encoder (:obj:`Optional[bool]`): Whether to share encoder between actor and critic. """ super(DiscreteQAC, self).__init__() - agent_obs_shape: int = squeeze(agent_obs_shape) + obs_shape: int = squeeze(obs_shape) action_shape: int = squeeze(action_shape) + # encoder + self.share_encoder = share_encoder + if np.isscalar(obs_shape) or len(obs_shape) == 1: + assert not self.share_encoder, "Vector observation doesn't need share encoder." + assert encoder_hidden_size_list is None, "Vector obs encoder only uses one layer nn.Linear" + # There is already a nn.Linear layer in the head, so we use nn.Identity here to keep + # compatible with the image observation and avoid adding an extra layer nn.Linear. + self.actor_encoder = nn.Identity() + self.critic_encoder = nn.Identity() + encoder_output_size = obs_shape + elif len(obs_shape) == 3: - if isinstance(agent_obs_shape, int) or len(agent_obs_shape) == 1: - encoder_cls = FCEncoder - elif len(agent_obs_shape) == 3: - encoder_cls = ConvEncoder - else: - raise RuntimeError( - "not support obs_shape for pre-defined encoder: {}, please customize your own DQN". - format(agent_obs_shape) - ) - if isinstance(global_obs_shape, int) or len(global_obs_shape) == 1: - global_encoder_cls = FCEncoder - elif len(global_obs_shape) == 3: - global_encoder_cls = ConvEncoder + def setup_conv_encoder(): + kernel_size = [3 for _ in range(len(encoder_hidden_size_list))] + stride = [2] + [1 for _ in range(len(encoder_hidden_size_list) - 1)] + return ConvEncoder( + obs_shape, + encoder_hidden_size_list, + activation=activation, + norm_type=norm_type, + kernel_size=kernel_size, + stride=stride + ) + + if self.share_encoder: + encoder = setup_conv_encoder() + self.actor_encoder = self.critic_encoder = encoder + else: + self.actor_encoder = setup_conv_encoder() + self.critic_encoder = setup_conv_encoder() + encoder_output_size = self.actor_encoder.output_size else: - raise RuntimeError( - "not support obs_shape for pre-defined encoder: {}, please customize your own DQN".
- format(global_obs_shape) - ) + raise RuntimeError("not support observation shape: {}".format(obs_shape)) - self.actor = nn.Sequential( - encoder_cls(agent_obs_shape, encoder_hidden_size_list, activation=activation, norm_type=norm_type), + # head + self.actor_head = nn.Sequential( + nn.Linear(encoder_output_size, actor_head_hidden_size), activation, DiscreteHead( actor_head_hidden_size, action_shape, actor_head_layer_num, activation=activation, norm_type=norm_type ) @@ -398,13 +429,11 @@ def __init__( self.twin_critic = twin_critic if self.twin_critic: - self.critic = nn.ModuleList() + self.critic_head = nn.ModuleList() for _ in range(2): - self.critic.append( + self.critic_head.append( nn.Sequential( - global_encoder_cls( - agent_obs_shape, encoder_hidden_size_list, activation=activation, norm_type=norm_type - ), + nn.Linear(encoder_output_size, critic_head_hidden_size), activation, DiscreteHead( critic_head_hidden_size, action_shape, @@ -415,10 +444,8 @@ def __init__( ) ) else: - self.critic = nn.Sequential( - global_encoder_cls( - agent_obs_shape, encoder_hidden_size_list, activation=activation, norm_type=norm_type - ), + self.critic_head = nn.Sequential( + nn.Linear(encoder_output_size, critic_head_hidden_size), activation, DiscreteHead( critic_head_hidden_size, action_shape, @@ -427,136 +454,88 @@ def __init__( norm_type=norm_type ) ) + # Convenient for calling some apis (e.g. self.critic.parameters()), + # but may cause misunderstanding when `print(self)` + self.actor = nn.ModuleList([self.actor_encoder, self.actor_head]) + self.critic = nn.ModuleList([self.critic_encoder, self.critic_head]) - def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: - r""" + def forward(self, inputs: torch.Tensor, mode: str) -> Dict[str, torch.Tensor]: + """ Overview: - Use bbservation and action tensor to predict output. - Parameter updates with QAC's MLPs forward setup. + QAC forward computation graph, input observation tensor to predict Q-value or action logit. Different \ + ``mode`` will forward with different network modules to get different outputs and save computation. Arguments: - Forward with ``'compute_actor'``: - - inputs (:obj:`torch.Tensor`): - The encoded embedding tensor, determined with given ``hidden_size``, i.e. ``(B, N=hidden_size)``. - Whether ``actor_head_hidden_size`` or ``critic_head_hidden_size`` depend on ``mode``. - - Forward with ``'compute_critic'``, inputs (`Dict`) Necessary Keys: - - ``obs``, ``action`` encoded tensors. - - - mode (:obj:`str`): Name of the forward mode. + - inputs (:obj:`torch.Tensor`): The input observation tensor data. + - mode (:obj:`str`): The forward mode, all the modes are defined in the beginning of this class. Returns: - - outputs (:obj:`Dict`): Outputs of network forward. - - Forward with ``'compute_actor'``, Necessary Keys (either): - - action (:obj:`torch.Tensor`): Action tensor with same size as input ``x``. - - logit (:obj:`torch.Tensor`): - Logit tensor encoding ``mu`` and ``sigma``, both with same size as input ``x``. - - Forward with ``'compute_critic'``, Necessary Keys: - - q_value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. - Actor Shapes: - - inputs (:obj:`torch.Tensor`): :math:`(B, N0)`, B is batch size and N0 corresponds to ``hidden_size`` - - action (:obj:`torch.Tensor`): :math:`(B, N0)` - - q_value (:obj:`torch.FloatTensor`): :math:`(B, )`, where B is batch size. 
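A minimal usage sketch of the refactored ``DiscreteQAC`` interface (not part of the patch itself): the shapes and hidden sizes below are illustrative, and the import path mirrors the updated test files.

import torch
from ding.model.template import DiscreteQAC

# 1D vector observation: both encoders degenerate to nn.Identity and cannot be shared.
vec_model = DiscreteQAC(obs_shape=4, action_shape=2, twin_critic=True)
logit = vec_model(torch.randn(3, 4), mode='compute_actor')['logit']  # shape (3, 2)

# Image observation: a ConvEncoder is built from encoder_hidden_size_list and may be shared.
img_model = DiscreteQAC(
    obs_shape=[4, 84, 84],
    action_shape=6,
    twin_critic=False,
    encoder_hidden_size_list=[32, 32, 64],
    share_encoder=True,
)
q_value = img_model(torch.randn(3, 4, 84, 84), mode='compute_critic')['q_value']  # shape (3, 6)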
- - Critic Shapes: - - obs (:obj:`torch.Tensor`): :math:`(B, N1)`, where B is batch size and N1 is ``obs_shape`` - - action (:obj:`torch.Tensor`): :math:`(B, N2)`, where B is batch size and N2 is``action_shape`` - - logit (:obj:`torch.FloatTensor`): :math:`(B, N2)`, where B is batch size and N3 is ``action_shape`` - - Actor Examples: - >>> # Regression mode - >>> model = QAC(64, 64, 'regression') - >>> inputs = torch.randn(4, 64) - >>> actor_outputs = model(inputs,'compute_actor') - >>> assert actor_outputs['action'].shape == torch.Size([4, 64]) - >>> # Reparameterization Mode - >>> model = QAC(64, 64, 'reparameterization') - >>> inputs = torch.randn(4, 64) - >>> actor_outputs = model(inputs,'compute_actor') - >>> actor_outputs['logit'][0].shape # mu - >>> torch.Size([4, 64]) - >>> actor_outputs['logit'][1].shape # sigma - >>> torch.Size([4, 64]) - - Critic Examples: - >>> inputs = {'obs': torch.randn(4,N), 'action': torch.randn(4,1)} - >>> model = QAC(obs_shape=(N, ), action_shape=1, action_space='regression') - >>> model(inputs, mode='compute_critic')['q_value'] # q value - tensor([0.0773, 0.1639, 0.0917, 0.0370], grad_fn=) + - output (:obj:`Dict[str, torch.Tensor]`): The output dict of QAC forward computation graph, whose \ + key-values vary in different forward modes. + Examples (Actor): + >>> model = DiscreteQAC(64, 6) + >>> obs = torch.randn(4, 64) + >>> actor_outputs = model(obs,'compute_actor') + >>> assert actor_outputs['logit'].shape == torch.Size([4, 6]) + Examples(Critic): + >>> model = DiscreteQAC(64, 6, twin_critic=False) + >>> obs = torch.randn(4, 64) + >>> actor_outputs = model(obs,'compute_critic') + >>> assert actor_outputs['q_value'].shape == torch.Size([4, 6]) """ assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) return getattr(self, mode)(inputs) - def compute_actor(self, inputs: torch.Tensor) -> Dict: - r""" + def compute_actor(self, inputs: torch.Tensor) -> Dict[str, torch.Tensor]: + """ Overview: - Use encoded embedding tensor to predict output. - Execute parameter updates with ``'compute_actor'`` mode - Use encoded embedding tensor to predict output. + QAC forward computation graph for actor part, input observation tensor to predict action or action logit. Arguments: - - inputs (:obj:`torch.Tensor`): - The encoded embedding tensor, determined with given ``hidden_size``, i.e. ``(B, N=hidden_size)``. - ``hidden_size = actor_head_hidden_size`` - - mode (:obj:`str`): Name of the forward mode. + - inputs (:obj:`torch.Tensor`): The input observation tensor data. Returns: - - outputs (:obj:`Dict`): Outputs of forward pass encoder and head. - - ReturnsKeys (either): - - action (:obj:`torch.Tensor`): Continuous action tensor with same size as ``action_shape``. - - logit (:obj:`torch.Tensor`): - Logit tensor encoding ``mu`` and ``sigma``, both with same size as input ``x``. + - outputs (:obj:`Dict[str, torch.Tensor]`): The output dict of QAC forward computation graph for actor, \ + including discrete action ``logit``. + ReturnsKeys: + - logit (:obj:`torch.Tensor`): The predicted discrete action type logit, it will be the same dimension \ + as ``action_shape``, i.e., all the possible discrete action choices. Shapes: - - inputs (:obj:`torch.Tensor`): :math:`(B, N0)`, B is batch size and N0 corresponds to ``hidden_size`` - - action (:obj:`torch.Tensor`): :math:`(B, N0)` - - logit (:obj:`list`): 2 elements, mu and sigma, each is the shape of :math:`(B, N0)`. - - q_value (:obj:`torch.FloatTensor`): :math:`(B, )`, B is batch size. 
+ - inputs (:obj:`torch.Tensor`): :math:`(B, N0)`, B is batch size and N0 corresponds to ``obs_shape``. + - logit (:obj:`torch.Tensor`): :math:`(B, N2)`, B is batch size and N2 corresponds to \ + ``action_shape``. Examples: - >>> # Regression mode - >>> model = QAC(64, 64, 'regression') - >>> inputs = torch.randn(4, 64) - >>> actor_outputs = model(inputs,'compute_actor') - >>> assert actor_outputs['action'].shape == torch.Size([4, 64]) - >>> # Reparameterization Mode - >>> model = QAC(64, 64, 'reparameterization') - >>> inputs = torch.randn(4, 64) - >>> actor_outputs = model(inputs,'compute_actor') - >>> actor_outputs['logit'][0].shape # mu - >>> torch.Size([4, 64]) - >>> actor_outputs['logit'][1].shape # sigma - >>> torch.Size([4, 64]) + >>> model = DiscreteQAC(64, 6) + >>> obs = torch.randn(4, 64) + >>> actor_outputs = model(obs,'compute_actor') + >>> assert actor_outputs['logit'].shape == torch.Size([4, 6]) """ - x = self.actor(inputs['obs']) + x = self.actor_encoder(inputs) + x = self.actor_head(x) return {'logit': x['logit']} - def compute_critic(self, inputs: Dict) -> Dict: - r""" + def compute_critic(self, inputs: torch.Tensor) -> Dict[str, torch.Tensor]: + """ Overview: - Execute parameter updates with ``'compute_critic'`` mode - Use encoded embedding tensor to predict output. + QAC forward computation graph for critic part, input observation to predict Q-value for each possible \ + discrete action choices. Arguments: - - ``obs``, ``action`` encoded tensors. - - mode (:obj:`str`): Name of the forward mode. + - inputs (:obj:`torch.Tensor`): The input observation tensor data. Returns: - - outputs (:obj:`Dict`): Q-value output. - + - outputs (:obj:`Dict[str, torch.Tensor]`): The output dict of QAC forward computation graph for critic, \ + including ``q_value`` for each possible discrete action choices. ReturnKeys: - - q_value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. + - q_value (:obj:`torch.Tensor`): The predicted Q-value for each possible discrete action choices, it will \ + be the same dimension as ``action_shape`` and used to calculate the loss. Shapes: - - obs (:obj:`torch.Tensor`): :math:`(B, N1)`, where B is batch size and N1 is ``obs_shape`` - - action (:obj:`torch.Tensor`): :math:`(B, N2)`, where B is batch size and N2 is ``action_shape`` - - q_value (:obj:`torch.FloatTensor`): :math:`(B, )`, where B is batch size. - + - obs (:obj:`torch.Tensor`): :math:`(B, N1)`, where B is batch size and N1 is ``obs_shape``. + - q_value (:obj:`torch.Tensor`): :math:`(B, N2)`, where B is batch size and N2 is ``action_shape``. 
Examples: - >>> inputs = {'obs': torch.randn(4, N), 'action': torch.randn(4, 1)} - >>> model = QAC(obs_shape=(N, ),action_shape=1, action_space='regression') - >>> model(inputs, mode='compute_critic')['q_value'] # q value - tensor([0.0773, 0.1639, 0.0917, 0.0370], grad_fn=) - + >>> model = DiscreteQAC(64, 6, twin_critic=False) + >>> obs = torch.randn(4, 64) + >>> actor_outputs = model(obs,'compute_critic') + >>> assert actor_outputs['q_value'].shape == torch.Size([4, 6]) """ - + inputs = self.critic_encoder(inputs) if self.twin_critic: - x = [m(inputs['obs'])['logit'] for m in self.critic] + x = [m(inputs)['logit'] for m in self.critic_head] else: - x = self.critic(inputs['obs'])['logit'] + x = self.critic_head(inputs)['logit'] return {'q_value': x} diff --git a/ding/model/template/tests/test_hybrid_qac.py b/ding/model/template/tests/test_hybrid_qac.py index 018c3f2d36..3a81d55350 100644 --- a/ding/model/template/tests/test_hybrid_qac.py +++ b/ding/model/template/tests/test_hybrid_qac.py @@ -3,7 +3,7 @@ import pytest from itertools import product -from ding.model.template import QAC +from ding.model.template import ContinuousQAC from ding.torch_utils import is_differentiable from ding.utils import squeeze from easydict import EasyDict @@ -21,7 +21,7 @@ @pytest.mark.unittest -class TestHybridQAC: +class TestHybridContinuousQAC: def test_hybrid_qac( self, @@ -39,7 +39,7 @@ def test_hybrid_qac( }, 'logit': torch.randn(B, squeeze(action_shape.action_type_shape)) } - model = QAC( + model = ContinuousQAC( obs_shape=(N, ), action_shape=action_shape, action_space=action_space, @@ -50,8 +50,8 @@ def test_hybrid_qac( # compute_q q = model(inputs, mode='compute_critic')['q_value'] if twin: - is_differentiable(q[0].sum(), model.critic[0]) - is_differentiable(q[1].sum(), model.critic[1]) + is_differentiable(q[0].sum(), model.critic[1][0]) + is_differentiable(q[1].sum(), model.critic[1][1]) else: is_differentiable(q.sum(), model.critic) diff --git a/ding/model/template/tests/test_maqac.py b/ding/model/template/tests/test_maqac.py index 4b6f40e69a..fa917e7ebc 100644 --- a/ding/model/template/tests/test_maqac.py +++ b/ding/model/template/tests/test_maqac.py @@ -3,7 +3,7 @@ import pytest from itertools import product -from ding.model.template import MAQAC, ContinuousMAQAC +from ding.model.template import DiscreteMAQAC, ContinuousMAQAC from ding.torch_utils import is_differentiable from ding.utils.default_helper import squeeze @@ -17,7 +17,7 @@ @pytest.mark.unittest @pytest.mark.parametrize('agent_obs_shape, global_obs_shape, twin_critic', args) -class TestMAQAC: +class TestDiscreteMAQAC: def output_check(self, model, outputs, action_shape): if isinstance(action_shape, tuple): @@ -34,7 +34,7 @@ def test_maqac(self, agent_obs_shape, global_obs_shape, twin_critic): 'action_mask': torch.randint(0, 2, size=(B, agent_num, action_shape)) } } - model = MAQAC(agent_obs_shape, global_obs_shape, action_shape, twin_critic=twin_critic) + model = DiscreteMAQAC(agent_obs_shape, global_obs_shape, action_shape, twin_critic=twin_critic) logit = model(data, mode='compute_actor')['logit'] value = model(data, mode='compute_critic')['q_value'] diff --git a/ding/model/template/tests/test_qac.py b/ding/model/template/tests/test_qac.py index 4bcc27cc1a..7ddbf9d511 100644 --- a/ding/model/template/tests/test_qac.py +++ b/ding/model/template/tests/test_qac.py @@ -3,7 +3,7 @@ import pytest from itertools import product -from ding.model.template import QAC, MAQAC, DiscreteQAC +from ding.model.template import ContinuousQAC, 
DiscreteMAQAC, DiscreteQAC from ding.torch_utils import is_differentiable from ding.utils import squeeze @@ -18,12 +18,12 @@ @pytest.mark.unittest @pytest.mark.parametrize('action_shape, twin, action_space', args) -class TestQAC: +class TestContinuousQAC: def test_fcqac(self, action_shape, twin, action_space): N = 32 inputs = {'obs': torch.randn(B, N), 'action': torch.randn(B, squeeze(action_shape))} - model = QAC( + model = ContinuousQAC( obs_shape=(N, ), action_shape=action_shape, action_space=action_space, @@ -34,8 +34,8 @@ def test_fcqac(self, action_shape, twin, action_space): # compute_q q = model(inputs, mode='compute_critic')['q_value'] if twin: - is_differentiable(q[0].sum(), model.critic[0]) - is_differentiable(q[1].sum(), model.critic[1]) + is_differentiable(q[0].sum(), model.critic[1][0]) + is_differentiable(q[1].sum(), model.critic[1][1]) else: is_differentiable(q.sum(), model.critic) @@ -56,83 +56,38 @@ def test_fcqac(self, action_shape, twin, action_space): is_differentiable(mu.sum() + sigma.sum(), model.actor) -args = list(product(*[[True, False]])) +args = list(product(*[[True, False], [(13, ), [4, 84, 84]]])) @pytest.mark.unittest -@pytest.mark.parametrize('twin', args) +@pytest.mark.parametrize('twin, obs_shape', args) class TestDiscreteQAC: - def test_discreteqac(self, twin): - N = 32 - A = 6 - inputs = {'obs': torch.randn(B, N)} + def test_discreteqac(self, twin, obs_shape): + action_shape = 6 + inputs = torch.randn(B, *obs_shape) model = DiscreteQAC( - agent_obs_shape=N, - global_obs_shape=N, - action_shape=A, + obs_shape=obs_shape, + action_shape=action_shape, twin_critic=twin, + encoder_hidden_size_list=[32, 32, 64] if len(obs_shape) > 1 else None, ) - # compute_q + # compute_critic q = model(inputs, mode='compute_critic')['q_value'] if twin: - is_differentiable(q[0].sum(), model.critic[0]) - is_differentiable(q[1].sum(), model.critic[1]) + is_differentiable(q[0].sum(), model.critic[1][0]) + # is_differentiable(q[1].sum(), model.critic[1][1]) # backward encoder twice + assert q[0].shape == (B, action_shape) + assert q[1].shape == (B, action_shape) else: - is_differentiable(q.sum(), model.critic) + is_differentiable(q.sum(), model.critic[1]) + assert q.shape == (B, action_shape) - # compute_action + # compute_actor print(model) logit = model(inputs, mode='compute_actor')['logit'] - assert logit.shape[0] == B - assert logit.shape[1] == A - - -B = 32 -agent_obs_shape = [216, 265] -global_obs_shape = [264, 324] -agent_num = 8 -action_shape = 14 -args = list(product(*[agent_obs_shape, global_obs_shape])) - - -@pytest.mark.unittest -@pytest.mark.parametrize('agent_obs_shape, global_obs_shape', args) -class TestMAQAC: - - def output_check(self, model, outputs, action_shape): - if isinstance(action_shape, tuple): - loss = sum([t.sum() for t in outputs]) - elif np.isscalar(action_shape): - loss = outputs.sum() - is_differentiable(loss, model) - - def test_maqac(self, agent_obs_shape, global_obs_shape): - data = { - 'obs': { - 'agent_state': torch.randn(B, agent_num, agent_obs_shape), - 'global_state': torch.randn(B, agent_num, global_obs_shape), - 'action_mask': torch.randint(0, 2, size=(B, agent_num, action_shape)) - } - } - model = MAQAC(agent_obs_shape, global_obs_shape, action_shape) - - logit = model(data, mode='compute_actor')['logit'] - value = model(data, mode='compute_critic')['q_value'] - - outputs = value.sum() + logit.sum() - self.output_check(model, outputs, action_shape) - - for p in model.parameters(): - p.grad = None - logit = model(data, 
mode='compute_actor')['logit'] - self.output_check(model.actor, logit, action_shape) - - for p in model.parameters(): - p.grad = None - value = model(data, mode='compute_critic')['q_value'] - assert value.shape == (B, agent_num, action_shape) - self.output_check(model.critic, value, action_shape) + assert logit.shape == (B, action_shape) + is_differentiable(logit.sum(), model.actor) B = 4 @@ -143,11 +98,11 @@ def test_maqac(self, agent_obs_shape, global_obs_shape): @pytest.mark.unittest @pytest.mark.parametrize('action_shape, twin, share_encoder', args) -class TestQACPixel: +class TestContinuousQACPixel: def test_qacpixel(self, action_shape, twin, share_encoder): inputs = {'obs': torch.randn(B, 3, 84, 84), 'action': torch.randn(B, squeeze(action_shape))} - model = QAC( + model = ContinuousQAC( obs_shape=(3, 84, 84), action_shape=action_shape, action_space='reparameterization', @@ -169,4 +124,7 @@ def test_qacpixel(self, action_shape, twin, share_encoder): action_shape = squeeze(action_shape) assert mu.shape == (B, action_shape) assert sigma.shape == (B, action_shape) - is_differentiable(mu.sum() + sigma.sum(), model.actor) + if share_encoder: # if share_encoder, actor_encoder's grad is not None + is_differentiable(mu.sum() + sigma.sum(), model.actor_head) + else: + is_differentiable(mu.sum() + sigma.sum(), model.actor) diff --git a/ding/model/template/vac.py b/ding/model/template/vac.py index 24fe845b94..2a2a56b881 100644 --- a/ding/model/template/vac.py +++ b/ding/model/template/vac.py @@ -11,11 +11,16 @@ @MODEL_REGISTRY.register('vac') class VAC(nn.Module): - r""" + """ Overview: - The VAC model. + The neural network and computation graph of algorithms related to (state) Value Actor-Critic (VAC), such as \ + A2C/PPO/IMPALA. This model now supports discrete, continuous and hybrid action space. The VAC is composed of \ + four parts: ``actor_encoder``, ``critic_encoder``, ``actor_head`` and ``critic_head``. Encoders are used to \ + extract the feature from various observation. Heads are used to predict corresponding value or action logit. \ + In high-dimensional observation space like 2D image, we often use a shared encoder for both ``actor_encoder`` \ + and ``critic_encoder``. In low-dimensional observation space like 1D vector, we often use different encoders. Interfaces: - ``__init__``, ``forward``, ``compute_actor``, ``compute_critic`` + ``__init__``, ``forward``, ``compute_actor``, ``compute_critic``, ``compute_actor_critic``. """ mode = ['compute_actor', 'compute_critic', 'compute_actor_critic'] @@ -38,26 +43,37 @@ def __init__( encoder: Optional[torch.nn.Module] = None, impala_cnn_encoder: bool = False, ) -> None: - r""" + """ Overview: - Init the VAC Model according to arguments. + Initialize the VAC model according to corresponding input arguments. Arguments: - - obs_shape (:obj:`Union[int, SequenceType]`): Observation's space. - - action_shape (:obj:`Union[int, SequenceType]`): Action's space. - - action_space (:obj:`str`): Choose action head in ['discrete', 'continuous', 'hybrid'] - - share_encoder (:obj:`bool`): Whether share encoder. - - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder`` - - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor-nn's ``Head``. - - actor_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for actor's nn. - - critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to critic-nn's ``Head``. 
- - critic_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for critic's nn. - - activation (:obj:`Optional[nn.Module]`): - The type of activation function to use in ``MLP`` the after ``layer_fn``, - if ``None`` then default set to ``nn.ReLU()`` - - norm_type (:obj:`Optional[str]`): - The type of normalization to use, see ``ding.torch_utils.fc_block`` for more details` + - obs_shape (:obj:`Union[int, SequenceType]`): Observation space shape, such as 8 or [4, 84, 84]. + - action_shape (:obj:`Union[int, SequenceType]`): Action space shape, such as 6 or [2, 3, 3]. + - action_space (:obj:`str`): The type of different action spaces, including ['discrete', 'continuous', \ + 'hybrid'], then will instantiate corresponding head, including ``DiscreteHead``, \ + ``ReparameterizationHead``, and hybrid heads. + - share_encoder (:obj:`bool`): Whether to share observation encoders between actor and critic. + - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder``, \ + the last element must match ``head_hidden_size``. + - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` of ``actor_head`` network, defaults \ + to 64, it must match the last element of ``encoder_hidden_size_list``. + - actor_head_layer_num (:obj:`int`): The num of layers used in the ``actor_head`` network to compute action. + - critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` of ``critic_head`` network, defaults \ + to 64, it must match the last element of ``encoder_hidden_size_list``. + - critic_head_layer_num (:obj:`int`): The num of layers used in the ``critic_head`` network. + - activation (:obj:`Optional[nn.Module]`): The type of activation function in networks, \ + if ``None`` then default set it to ``nn.ReLU()``. + - norm_type (:obj:`Optional[str]`): The type of normalization in networks, see \ + ``ding.torch_utils.fc_block`` for more details. You can choose one of ['BN', 'IN', 'SyncBN', 'LN'] + - sigma_type (:obj:`Optional[str]`): The type of sigma in continuous action space, see \ + ``ding.torch_utils.network.dreamer.ReparameterizationHead`` for more details, in A2C/PPO, it defaults \ + to ``independent``, which means state-independent sigma parameters. + - fixed_sigma_value (:obj:`Optional[int]`): If ``sigma_type`` is ``fixed``, then use this value as sigma. + - bound_type (:obj:`Optional[str]`): The type of action bound methods in continuous action space, defaults \ + to ``None``, which means no bound. + - encoder (:obj:`Optional[torch.nn.Module]`): The encoder module, defaults to ``None``, you can define \ + your own encoder module and pass it into VAC to deal with different observation space. + - impala_cnn_encoder (:obj:`bool`): Whether to use IMPALA CNN encoder, defaults to ``False``. """ super(VAC, self).__init__() obs_shape: int = squeeze(obs_shape) @@ -174,7 +190,6 @@ def new_encoder(outsize): ) self.actor_head = nn.ModuleList([actor_action_type, actor_action_args]) - # must use list, not nn.ModuleList if self.share_encoder: self.actor = [self.encoder, self.actor_head] self.critic = [self.encoder, self.critic_head] @@ -186,78 +201,63 @@ def new_encoder(outsize): self.actor = nn.ModuleList(self.actor) self.critic = nn.ModuleList(self.critic) - def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: - r""" + def forward(self, x: torch.Tensor, mode: str) -> Dict: + """ Overview: - Use encoded embedding tensor to predict output. - Parameter updates with VAC's MLPs forward setup.
+ VAC forward computation graph, input observation tensor to predict state value or action logit. Different \ + ``mode`` will forward with different network modules to get different outputs and save computation. Arguments: - Forward with ``'compute_actor'`` or ``'compute_critic'``: - - inputs (:obj:`torch.Tensor`): - The encoded embedding tensor, determined with given ``hidden_size``, i.e. ``(B, N=hidden_size)``. - Whether ``actor_head_hidden_size`` or ``critic_head_hidden_size`` depend on ``mode``. + - x (:obj:`torch.Tensor`): The input observation tensor data. + - mode (:obj:`str`): The forward mode, all the modes are defined in the beginning of this class. Returns: - - outputs (:obj:`Dict`): - Run with encoder and head. + - outputs (:obj:`Dict`): The output dict of VAC's forward computation graph, whose key-values vary from \ + different ``mode``. - Forward with ``'compute_actor'``, Necessary Keys: - - logit (:obj:`torch.Tensor`): Logit encoding tensor, with same size as input ``x``. - - Forward with ``'compute_critic'``, Necessary Keys: - - value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. - Shapes: - - inputs (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N corresponding ``hidden_size`` - - logit (:obj:`torch.FloatTensor`): :math:`(B, N)`, where B is batch size and N is ``action_shape`` - - value (:obj:`torch.FloatTensor`): :math:`(B, )`, where B is batch size. - - Actor Examples: - >>> model = VAC(64,128) + Examples (Actor): + >>> model = VAC(64, 128) >>> inputs = torch.randn(4, 64) >>> actor_outputs = model(inputs,'compute_actor') >>> assert actor_outputs['logit'].shape == torch.Size([4, 128]) - Critic Examples: - >>> model = VAC(64,64) + Examples (Critic): + >>> model = VAC(64, 64) >>> inputs = torch.randn(4, 64) >>> critic_outputs = model(inputs,'compute_critic') - >>> critic_outputs['value'] - tensor([0.0252, 0.0235, 0.0201, 0.0072], grad_fn=) + >>> assert critic_outputs['value'].shape == torch.Size([4]) - Actor-Critic Examples: - >>> model = VAC(64,64) + Examples (Actor-Critic): + >>> model = VAC(64, 64) >>> inputs = torch.randn(4, 64) >>> outputs = model(inputs,'compute_actor_critic') - >>> outputs['value'] - tensor([0.0252, 0.0235, 0.0201, 0.0072], grad_fn=) + >>> assert outputs['value'].shape == torch.Size([4]) >>> assert outputs['logit'].shape == torch.Size([4, 64]) """ assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) - return getattr(self, mode)(inputs) + return getattr(self, mode)(x) def compute_actor(self, x: torch.Tensor) -> Dict: - r""" + """ Overview: - Execute parameter updates with ``'compute_actor'`` mode - Use encoded embedding tensor to predict output. + VAC forward computation graph for actor part, input observation tensor to predict action logit. Arguments: - - inputs (:obj:`torch.Tensor`): - The encoded embedding tensor, determined with given ``hidden_size``, i.e. ``(B, N=hidden_size)``. - ``hidden_size = actor_head_hidden_size`` + - x (:obj:`torch.Tensor`): The input observation tensor data. Returns: - - outputs (:obj:`Dict`): - Run with encoder and head. - + - outputs (:obj:`Dict`): The output dict of VAC's forward computation graph for actor, including ``logit``. ReturnsKeys: - - logit (:obj:`torch.Tensor`): Logit encoding tensor, with same size as input ``x``.
+ - logit (:obj:`torch.Tensor`): The predicted action logit tensor, for discrete action space, it will be \ + the same dimension real-value ranged tensor of possible action choices, and for continuous action \ + space, it will be the mu and sigma of the Gaussian distribution, and the number of mu and sigma is the \ + same as the number of continuous actions. Hybrid action space is a kind of combination of discrete \ + and continuous action space, so the logit will be a dict with ``action_type`` and ``action_args``. Shapes: - - logit (:obj:`torch.FloatTensor`): :math:`(B, N)`, where B is batch size and N is ``action_shape`` + - logit (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is ``action_shape`` Examples: - >>> model = VAC(64,64) + >>> model = VAC(64, 64) >>> inputs = torch.randn(4, 64) >>> actor_outputs = model(inputs,'compute_actor') - >>> assert actor_outputs['action'].shape == torch.Size([4, 64]) + >>> assert actor_outputs['logit'].shape == torch.Size([4, 64]) """ if self.share_encoder: x = self.encoder(x) @@ -275,29 +275,23 @@ def compute_actor(self, x: torch.Tensor) -> Dict: return {'logit': {'action_type': action_type['logit'], 'action_args': action_args}} def compute_critic(self, x: torch.Tensor) -> Dict: - r""" + """ Overview: - Execute parameter updates with ``'compute_critic'`` mode - Use encoded embedding tensor to predict output. + VAC forward computation graph for critic part, input observation tensor to predict state value. Arguments: - - inputs (:obj:`torch.Tensor`): - The encoded embedding tensor, determined with given ``hidden_size``, i.e. ``(B, N=hidden_size)``. - ``hidden_size = critic_head_hidden_size`` + - x (:obj:`torch.Tensor`): The input observation tensor data. Returns: - - outputs (:obj:`Dict`): - Run with encoder and head. - - Necessary Keys: - - value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. + - outputs (:obj:`Dict`): The output dict of VAC's forward computation graph for critic, including ``value``. + ReturnsKeys: + - value (:obj:`torch.Tensor`): The predicted state value tensor. Shapes: - - value (:obj:`torch.FloatTensor`): :math:`(B, )`, where B is batch size. + - value (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch size, (B, 1) is squeezed to (B, ). Examples: - >>> model = VAC(64,64) + >>> model = VAC(64, 64) >>> inputs = torch.randn(4, 64) >>> critic_outputs = model(inputs,'compute_critic') - >>> critic_outputs['value'] - tensor([0.0252, 0.0235, 0.0201, 0.0072], grad_fn=) + >>> assert critic_outputs['value'].shape == torch.Size([4]) """ if self.share_encoder: x = self.encoder(x) @@ -307,37 +301,37 @@ def compute_critic(self, x: torch.Tensor) -> Dict: return {'value': x['pred']} def compute_actor_critic(self, x: torch.Tensor) -> Dict: - r""" + """ Overview: - Execute parameter updates with ``'compute_actor_critic'`` mode - Use encoded embedding tensor to predict output. + VAC forward computation graph for both actor and critic part, input observation tensor to predict action \ + logit and state value. Arguments: - - inputs (:obj:`torch.Tensor`): The encoded embedding tensor. - + - x (:obj:`torch.Tensor`): The input observation tensor data. Returns: - - outputs (:obj:`Dict`): - Run with encoder and head. - + - outputs (:obj:`Dict`): The output dict of VAC's forward computation graph for both actor and critic, \ + including ``logit`` and ``value``. ReturnsKeys: - - logit (:obj:`torch.Tensor`): Logit encoding tensor, with same size as input ``x``. 
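A minimal sketch of the shared-encoder usage that ``compute_actor_critic`` targets (illustrative, not part of the patch): the constructor values are examples, and the import path and the default discrete action space are assumptions.

import torch
from ding.model.template import VAC

model = VAC(obs_shape=64, action_shape=6, share_encoder=True)
obs = torch.randn(4, 64)
# One encoder pass produces both the policy logit and the state value.
outputs = model(obs, mode='compute_actor_critic')
assert outputs['logit'].shape == torch.Size([4, 6])
assert outputs['value'].shape == torch.Size([4])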
- - value (:obj:`torch.Tensor`): Q value tensor with same size as batch size. + - logit (:obj:`torch.Tensor`): The predicted action logit tensor, for discrete action space, it will be \ + the same dimension real-value ranged tensor of possible action choices, and for continuous action \ + space, it will be the mu and sigma of the Gaussian distribution, and the number of mu and sigma is the \ + same as the number of continuous actions. Hybrid action space is a kind of combination of discrete \ + and continuous action space, so the logit will be a dict with ``action_type`` and ``action_args``. + - value (:obj:`torch.Tensor`): The predicted state value tensor. Shapes: - - logit (:obj:`torch.FloatTensor`): :math:`(B, N)`, where B is batch size and N is ``action_shape`` - - value (:obj:`torch.FloatTensor`): :math:`(B, )`, where B is batch size. + - logit (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size and N is ``action_shape`` + - value (:obj:`torch.Tensor`): :math:`(B, )`, where B is batch size, (B, 1) is squeezed to (B, ). Examples: - >>> model = VAC(64,64) + >>> model = VAC(64, 64) >>> inputs = torch.randn(4, 64) >>> outputs = model(inputs,'compute_actor_critic') - >>> outputs['value'] - tensor([0.0252, 0.0235, 0.0201, 0.0072], grad_fn=) + >>> assert outputs['value'].shape == torch.Size([4]) >>> assert outputs['logit'].shape == torch.Size([4, 64]) .. note:: - ``compute_actor_critic`` interface aims to save computation when shares encoder. - Returning the combination dictionry. - + ``compute_actor_critic`` interface aims to save computation when the encoder is shared and returns the combined \ + dict output. """ if self.share_encoder: actor_embedding = critic_embedding = self.encoder(x) @@ -361,11 +355,12 @@ def compute_actor_critic(self, x: torch.Tensor) -> Dict: @MODEL_REGISTRY.register('dreamervac') class DREAMERVAC(nn.Module): - r""" + """ Overview: - The VAC model. + The neural network and computation graph of DreamerV3 (state) Value Actor-Critic (VAC). + This model now supports discrete and continuous action spaces. Interfaces: - ``__init__``, ``forward``, ``compute_actor``, ``compute_critic`` + ``__init__``, ``forward``. """ mode = ['compute_actor', 'compute_critic', 'compute_actor_critic'] @@ -388,26 +383,12 @@ def __init__( actor_temp=0.1, action_unimix_ratio=0.01, ) -> None: - r""" + """ Overview: - Init the VAC Model according to arguments. + Initialize the ``DREAMERVAC`` model according to arguments. Arguments: - - obs_shape (:obj:`Union[int, SequenceType]`): Observation's space. - - action_shape (:obj:`Union[int, SequenceType]`): Action's space. - - action_space (:obj:`str`): Choose action head in ['discrete', 'continuous', 'hybrid'] - - share_encoder (:obj:`bool`): Whether share encoder. - - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder`` - - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor-nn's ``Head``. - - actor_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for actor's nn. - - critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to critic-nn's ``Head``. - - critic_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for critic's nn.
- - activation (:obj:`Optional[nn.Module]`): - The type of activation function to use in ``MLP`` the after ``layer_fn``, - if ``None`` then default set to ``nn.ReLU()`` - - norm_type (:obj:`Optional[str]`): - The type of normalization to use, see ``ding.torch_utils.fc_block`` for more details` + - obs_shape (:obj:`Union[int, SequenceType]`): Observation space shape, such as 8 or [4, 84, 84]. + - action_shape (:obj:`Union[int, SequenceType]`): Action space shape, such as 6 or [2, 3, 3]. """ super(DREAMERVAC, self).__init__() obs_shape: int = squeeze(obs_shape) diff --git a/ding/model/wrapper/__init__.py b/ding/model/wrapper/__init__.py index f74d3f2668..24d621e973 100644 --- a/ding/model/wrapper/__init__.py +++ b/ding/model/wrapper/__init__.py @@ -1 +1 @@ -from .model_wrappers import model_wrap, register_wrapper, IModelWrapper, BaseModelWrapper +from .model_wrappers import model_wrap, register_wrapper, IModelWrapper diff --git a/ding/model/wrapper/model_wrappers.py b/ding/model/wrapper/model_wrappers.py index 38c6346f2e..e3d57d1ce1 100644 --- a/ding/model/wrapper/model_wrappers.py +++ b/ding/model/wrapper/model_wrappers.py @@ -1,40 +1,52 @@ -from typing import Any, Tuple, Callable, Optional, List, Dict +from typing import Any, Tuple, Callable, Optional, List, Dict, Union from abc import ABC import numpy as np import torch +import torch.nn as nn import torch.nn.functional as F from torch.distributions import Categorical, Independent, Normal -from ding.torch_utils import get_tensor_data +from ding.torch_utils import get_tensor_data, zeros_like from ding.rl_utils import create_noise_generator from ding.utils.data import default_collate class IModelWrapper(ABC): - r""" + """ Overview: - the base class of Model Wrappers + The basic interface class of model wrappers. Model wrapper is a wrapper class of torch.nn.Module model, which \ + is used to add some extra operations for the wrapped model, such as hidden state maintenance for RNN-based models, \ + argmax action selection for discrete action space, etc. Interfaces: - register + ``__init__``, ``__getattr__``, ``info``, ``reset``, ``forward``. """ - def __init__(self, model: Any) -> None: + def __init__(self, model: nn.Module) -> None: + """ + Overview: + Initialize model and other necessary member variables in the model wrapper. + """ self._model = model def __getattr__(self, key: str) -> Any: - r""" + """ Overview: - Get the attrbute in model. + Get original attributes of torch.nn.Module model, such as variables and methods defined in model. Arguments: - - key (:obj:`str`): The key to query. + - key (:obj:`str`): The string key to query. Returns: - ret (:obj:`Any`): The queried attribute. """ return getattr(self._model, key) - def info(self, attr_name): - r""" + def info(self, attr_name: str) -> str: + """ Overview: - get info of attr_name + Get some string information of the indicated ``attr_name``, which is used for debugging wrappers. + This method will recursively search for the indicated ``attr_name``. + Arguments: + - attr_name (:obj:`str`): The string key to query information. + Returns: + - info_string (:obj:`str`): The information string of the indicated ``attr_name``.
""" if attr_name in dir(self): if isinstance(self._model, IModelWrapper): @@ -50,36 +62,46 @@ def info(self, attr_name): else: return '{}'.format(self._model.__class__.__name__) - -class BaseModelWrapper(IModelWrapper): - r""" - Overview: - the base class of Model Wrappers - Interfaces: - register - """ - - def reset(self, data_id: List[int] = None) -> None: - r""" + def reset(self, data_id: List[int] = None, **kwargs) -> None: + """ Overview - the reset function that the Model Wrappers with states should implement - used to reset the stored states + Basic interface, reset some stateful varaibles in the model wrapper, such as hidden state of RNN. + Here we do nothing and just implement this interface method. + Other derived model wrappers can override this method to add some extra operations. + Arguments: + - data_id (:obj:`List[int]`): The data id list to reset. If None, reset all data. In practice, \ + model wrappers often needs to maintain some stateful variables for each data trajectory, \ + so we leave this ``data_id`` argument to reset the stateful variables of the indicated data. """ pass + def forward(self, *args, **kwargs) -> Any: + """ + Overview: + Basic interface, call the wrapped model's forward method. Other derived model wrappers can override this \ + method to add some extra operations. + """ + return self._model.forward(*args, **kwargs) -def zeros_like(h): - if isinstance(h, torch.Tensor): - return torch.zeros_like(h) - elif isinstance(h, (list, tuple)): - return [zeros_like(t) for t in h] - elif isinstance(h, dict): - return {k: zeros_like(v) for k, v in h.items()} - else: - raise TypeError("not support type: {}".format(h)) + +class BaseModelWrapper(IModelWrapper): + """ + Overview: + Placeholder class for the model wrapper. This class is used to wrap the model without any extra operations, \ + including a empty ``reset`` method and a ``forward`` method which directly call the wrapped model's forward. + To keep the consistency of the model wrapper interface, we use this class to wrap the model without specific \ + operations in the implementation of DI-engine's policy. + """ + pass class HiddenStateWrapper(IModelWrapper): + """ + Overview: + Maintain the hidden state for RNN-base model. Each sample in a batch has its own state. + Interfaces: + ``__init__``, ``reset``, ``forward``. + """ def __init__( self, @@ -387,12 +409,18 @@ def sample_action(logit=None, prob=None): class ArgmaxSampleWrapper(IModelWrapper): - r""" + """ Overview: - Used to help the model to sample argmax action + Used to help the model to sample argmax action. + Interfaces: + ``forward``. """ def forward(self, *args, **kwargs): + """ + Overview: + Employ model forward computation graph, and use the output logit to greedily select max action (argmax). + """ output = self._model.forward(*args, **kwargs) assert isinstance(output, dict), "model output must be dict, but find {}".format(type(output)) logit = output['logit'] @@ -415,6 +443,8 @@ class CombinationArgmaxSampleWrapper(IModelWrapper): r""" Overview: Used to help the model to sample combination argmax action. + Interfaces: + ``forward``. """ def forward(self, shot_number, *args, **kwargs): @@ -438,6 +468,8 @@ class CombinationMultinomialSampleWrapper(IModelWrapper): r""" Overview: Used to help the model to sample combination multinomial action. + Interfaces: + ``forward``. 
""" def forward(self, shot_number, *args, **kwargs): @@ -463,6 +495,8 @@ class HybridArgmaxSampleWrapper(IModelWrapper): Overview: Used to help the model to sample argmax action in hybrid action space, i.e.{'action_type': discrete, 'action_args', continuous} + Interfaces: + ``forward``. """ def forward(self, *args, **kwargs): @@ -487,11 +521,11 @@ def forward(self, *args, **kwargs): class MultinomialSampleWrapper(IModelWrapper): - r""" + """ Overview: - Used to help the model get the corresponding action from the output['logits'] + Used to help the model get the corresponding action from the output['logits']self. Interfaces: - register + ``forward``. """ def forward(self, *args, **kwargs): @@ -529,7 +563,7 @@ class EpsGreedySampleWrapper(IModelWrapper): - float (i.e. python native scalar): for almost normal case - Dict[str, float]: for algorithm NGU Interfaces: - register + ``forward``. """ def forward(self, *args, **kwargs): @@ -583,7 +617,7 @@ class EpsGreedyMultinomialSampleWrapper(IModelWrapper): Epsilon greedy sampler coupled with multinomial sample used in collector_model to help balance exploration and exploitation. Interfaces: - register + ``forward``. """ def forward(self, *args, **kwargs): @@ -630,7 +664,7 @@ class HybridEpsGreedySampleWrapper(IModelWrapper): Epsilon greedy sampler used in collector_model to help balance exploration and exploitation. In hybrid action space, i.e.{'action_type': discrete, 'action_args', continuous} Interfaces: - register, forward + ``forward``. """ def forward(self, *args, **kwargs): @@ -670,7 +704,7 @@ class HybridEpsGreedyMultinomialSampleWrapper(IModelWrapper): to help balance exploration and exploitation. In hybrid action space, i.e.{'action_type': discrete, 'action_args', continuous} Interfaces: - register + ``forward``. """ def forward(self, *args, **kwargs): @@ -759,7 +793,7 @@ def forward(self, *args, **kwargs): return output -class DeterministicSample(IModelWrapper): +class DeterministicSampleWrapper(IModelWrapper): """ Overview: Deterministic sampler (just use mu directly) used in eval_model. @@ -774,7 +808,7 @@ def forward(self, *args, **kwargs): return output -class ReparamSample(IModelWrapper): +class ReparamSampleWrapper(IModelWrapper): """ Overview: Reparameterization gaussian sampler used in collector_model. @@ -796,7 +830,7 @@ class ActionNoiseWrapper(IModelWrapper): Overview: Add noise to collector's action output; Do clips on both generated noise and action after adding noise. Interfaces: - register, __init__, add_noise, reset + ``__init__``, ``forward``. Arguments: - model (:obj:`Any`): Wrapped model class. Should contain ``forward`` method. - noise_type (:obj:`str`): The type of noise that should be generated, support ['gauss', 'ou']. @@ -854,13 +888,6 @@ def add_noise(self, action: torch.Tensor) -> torch.Tensor: action = action.clamp(self.action_range['min'], self.action_range['max']) return action - def reset(self) -> None: - r""" - Overview: - Reset noise generator. - """ - pass - class TargetNetworkWrapper(IModelWrapper): r""" @@ -919,17 +946,15 @@ def reset_state(self, target_update_count: int = None) -> None: class TeacherNetworkWrapper(IModelWrapper): - r""" + """ Overview: Set the teacher Network. 
Set the model's model.teacher_cfg to the input teacher_cfg - - Interfaces: - register """ def __init__(self, model, teacher_cfg): super().__init__(model) self._model._teacher_cfg = teacher_cfg + raise NotImplementedError wrapper_name_map = { @@ -939,8 +964,8 @@ def __init__(self, model, teacher_cfg): 'hybrid_argmax_sample': HybridArgmaxSampleWrapper, 'eps_greedy_sample': EpsGreedySampleWrapper, 'eps_greedy_multinomial_sample': EpsGreedyMultinomialSampleWrapper, - 'deterministic_sample': DeterministicSample, - 'reparam_sample': ReparamSample, + 'deterministic_sample': DeterministicSampleWrapper, + 'reparam_sample': ReparamSampleWrapper, 'hybrid_eps_greedy_sample': HybridEpsGreedySampleWrapper, 'hybrid_eps_greedy_multinomial_sample': HybridEpsGreedyMultinomialSampleWrapper, 'hybrid_reparam_multinomial_sample': HybridReparamMultinomialSampleWrapper, @@ -958,8 +983,19 @@ def __init__(self, model, teacher_cfg): } -def model_wrap(model, wrapper_name: str = None, **kwargs): +def model_wrap(model: Union[nn.Module, IModelWrapper], wrapper_name: str = None, **kwargs): + """ + Overview: + Wrap the model with the specified wrapper and return the wrapped model. + Arguments: + - model (:obj:`Any`): The model to be wrapped. + - wrapper_name (:obj:`str`): The name of the wrapper to be used. + + .. note:: + The arguments of the wrapper should be passed in as kwargs. + """ if wrapper_name in wrapper_name_map: + # TODO test whether to remove this if branch if not isinstance(model, IModelWrapper): model = wrapper_name_map['base'](model) model = wrapper_name_map[wrapper_name](model, **kwargs) @@ -968,13 +1004,15 @@ def __init__(self, model, teacher_cfg): return model -def register_wrapper(name: str, wrapper_type: type): - r""" +def register_wrapper(name: str, wrapper_type: type) -> None: + """ Overview: - Register new wrapper to wrapper_name_map + Register a new wrapper to ``wrapper_name_map``. When a user implements a new wrapper, they must call this function \ + to complete the registration. Then the wrapper can be called by ``model_wrap``. Arguments: - - name (:obj:`str`): the name of the wrapper - - wrapper_type (subclass of :obj:`IModelWrapper`): the wrapper class added to the plguin_name_map + - name (:obj:`str`): The name of the new wrapper to be registered. + - wrapper_type (:obj:`type`): The wrapper class to be added to ``wrapper_name_map``. This argument \ + should be a subclass of ``IModelWrapper``. """ assert isinstance(name, str) assert issubclass(wrapper_type, IModelWrapper)
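A minimal sketch of the registration flow documented above (illustrative, not part of the patch): ``ClipActionWrapper`` is a hypothetical wrapper, and the wrapped policy network is assumed to return a dict containing an ``action`` tensor.

from ding.model import model_wrap, register_wrapper, IModelWrapper

class ClipActionWrapper(IModelWrapper):
    # Hypothetical wrapper: clamp the wrapped model's continuous action into [-1, 1].
    def forward(self, *args, **kwargs):
        output = self._model.forward(*args, **kwargs)
        output['action'] = output['action'].clamp(-1, 1)
        return output

register_wrapper('clip_action', ClipActionWrapper)
# After registration the wrapper can be requested by name, e.g.:
# wrapped_model = model_wrap(policy_net, wrapper_name='clip_action')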
""" assert isinstance(name, str) assert issubclass(wrapper_type, IModelWrapper) diff --git a/ding/model/wrapper/test_model_wrappers.py b/ding/model/wrapper/test_model_wrappers.py index 93334bee00..890d1eb1fc 100644 --- a/ding/model/wrapper/test_model_wrappers.py +++ b/ding/model/wrapper/test_model_wrappers.py @@ -9,7 +9,8 @@ from ding.torch_utils import get_lstm from ding.torch_utils.network.gtrxl import GTrXL -from ding.model import model_wrap, register_wrapper, IModelWrapper, BaseModelWrapper +from ding.model import model_wrap, register_wrapper, IModelWrapper +from ding.model.wrapper.model_wrappers import BaseModelWrapper class TempMLP(torch.nn.Module): @@ -38,7 +39,7 @@ def __init__(self): self.bn1 = nn.BatchNorm1d(4) self.fc2 = nn.Linear(4, 6) self.act = nn.ReLU() - self.out = nn.Softmax() + self.out = nn.Softmax(dim=-1) def forward(self, inputs, tmp=0): x = self.fc1(inputs['obs']) @@ -61,7 +62,7 @@ def __init__(self): self.bn1 = nn.BatchNorm1d(4) self.fc2 = nn.Linear(4, 6) self.act = nn.ReLU() - self.out = nn.Softmax() + self.out = nn.Softmax(dim=-1) self.fc2_cont = nn.Linear(4, 6) self.act_cont = nn.ReLU() @@ -93,7 +94,7 @@ def __init__(self): self.bn1 = nn.BatchNorm1d(4) self.fc2 = nn.Linear(4, 6) self.act = nn.ReLU() - self.out = nn.Softmax() + self.out = nn.Softmax(dim=-1) self.fc2_cont_mu = nn.Linear(4, 6) self.act_cont_mu = nn.ReLU() @@ -131,7 +132,6 @@ def __init__(self): self.bn1 = nn.BatchNorm1d(4) self.fc2 = nn.Linear(4, 6) self.act = nn.ReLU() - self.out = nn.Softmax() self.fc2_cont_mu = nn.Linear(4, 6) self.fc2_cont_sigma = nn.Linear(4, 6) @@ -553,13 +553,15 @@ def test_transformer_memory_wrapper(self): def test_combination_argmax_sample_wrapper(self): model = model_wrap(ActorMLP(), wrapper_name='combination_argmax_sample') data = {'obs': torch.randn(4, 3)} - output = model.forward(shot_number=2, inputs=data) - assert output['action'].shape == (4, ) + shot_number = 2 + output = model.forward(shot_number=shot_number, inputs=data) + assert output['action'].shape == (4, shot_number) assert (output['action'] >= 0).all() and (output['action'] < 64).all() def test_combination_multinomial_sample_wrapper(self): model = model_wrap(ActorMLP(), wrapper_name='combination_multinomial_sample') data = {'obs': torch.randn(4, 3)} - output = model.forward(shot_number=2, inputs=data) - assert output['action'].shape == (4, ) + shot_number = 2 + output = model.forward(shot_number=shot_number, inputs=data) + assert output['action'].shape == (4, shot_number) assert (output['action'] >= 0).all() and (output['action'] < 64).all() diff --git a/ding/policy/cql.py b/ding/policy/cql.py index 21dd28d21c..1622c184ad 100644 --- a/ding/policy/cql.py +++ b/ding/policy/cql.py @@ -221,18 +221,18 @@ def _init_learn(self) -> None: # Weight Init init_w = self._cfg.learn.init_w - self._model.actor[2].mu.weight.data.uniform_(-init_w, init_w) - self._model.actor[2].mu.bias.data.uniform_(-init_w, init_w) - self._model.actor[2].log_sigma_layer.weight.data.uniform_(-init_w, init_w) - self._model.actor[2].log_sigma_layer.bias.data.uniform_(-init_w, init_w) + self._model.actor_head[-1].mu.weight.data.uniform_(-init_w, init_w) + self._model.actor_head[-1].mu.bias.data.uniform_(-init_w, init_w) + self._model.actor_head[-1].log_sigma_layer.weight.data.uniform_(-init_w, init_w) + self._model.actor_head[-1].log_sigma_layer.bias.data.uniform_(-init_w, init_w) if self._twin_critic: - self._model.critic[0][2].last.weight.data.uniform_(-init_w, init_w) - self._model.critic[0][2].last.bias.data.uniform_(-init_w, init_w) - 
self._model.critic[1][2].last.weight.data.uniform_(-init_w, init_w) - self._model.critic[1][2].last.bias.data.uniform_(-init_w, init_w) + self._model.critic_head[0][-1].last.weight.data.uniform_(-init_w, init_w) + self._model.critic_head[0][-1].last.bias.data.uniform_(-init_w, init_w) + self._model.critic_head[1][-1].last.weight.data.uniform_(-init_w, init_w) + self._model.critic_head[1][-1].last.bias.data.uniform_(-init_w, init_w) else: - self._model.critic[2].last.weight.data.uniform_(-init_w, init_w) - self._model.critic[2].last.bias.data.uniform_(-init_w, init_w) + self._model.critic_head[2].last.weight.data.uniform_(-init_w, init_w) + self._model.critic_head[-1].last.bias.data.uniform_(-init_w, init_w) # Optimizers if self._value_network: diff --git a/ding/policy/ddpg.py b/ding/policy/ddpg.py index 2887b7480d..8629cca4af 100644 --- a/ding/policy/ddpg.py +++ b/ding/policy/ddpg.py @@ -148,9 +148,9 @@ class DDPGPolicy(Policy): def default_model(self) -> Tuple[str, List[str]]: if self._cfg.multi_agent: - return 'maqac_continuous', ['ding.model.template.maqac'] + return 'continuous_maqac', ['ding.model.template.maqac'] else: - return 'qac', ['ding.model.template.qac'] + return 'continuous_qac', ['ding.model.template.qac'] def _init_learn(self) -> None: r""" diff --git a/ding/policy/r2d2_gtrxl.py b/ding/policy/r2d2_gtrxl.py index 660f0aaef4..73b89239f3 100644 --- a/ding/policy/r2d2_gtrxl.py +++ b/ding/policy/r2d2_gtrxl.py @@ -1,5 +1,5 @@ import copy -import sys +import torch from collections import namedtuple from typing import List, Dict, Any, Tuple, Union, Optional @@ -10,9 +10,6 @@ from ding.utils import POLICY_REGISTRY from ding.utils.data import timestep_collate, default_collate, default_decollate from .base_policy import Policy -import torch - -from ding.model.common.head import * @POLICY_REGISTRY.register('r2d2_gtrxl') @@ -130,7 +127,7 @@ class R2D2GTrXLPolicy(Policy): ) def default_model(self) -> Tuple[str, List[str]]: - return 'gtrxl_discrete', ['ding.model.template.q_learning'] + return 'gtrxldqn', ['ding.model.template.q_learning'] def _init_learn(self) -> None: """ diff --git a/ding/policy/sac.py b/ding/policy/sac.py index ca0263305a..ebf2845e51 100644 --- a/ding/policy/sac.py +++ b/ding/policy/sac.py @@ -143,7 +143,7 @@ class SACDiscretePolicy(Policy): def default_model(self) -> Tuple[str, List[str]]: if self._cfg.multi_agent: - return 'maqac', ['ding.model.template.maqac'] + return 'discrete_maqac', ['ding.model.template.maqac'] else: return 'discrete_qac', ['ding.model.template.qac'] @@ -227,7 +227,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: action = data['action'] # 1. predict q value - q_value = self._learn_model.forward({'obs': obs}, mode='compute_critic')['q_value'] + q_value = self._learn_model.forward(obs, mode='compute_critic')['q_value'] dist = torch.distributions.categorical.Categorical(logits=logit) dist_entropy = dist.entropy() entropy = dist_entropy.mean() @@ -236,12 +236,12 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: # target q value. 
SARSA: first predict next action, then calculate next q value with torch.no_grad(): - policy_output_next = self._learn_model.forward({'obs': next_obs}, mode='compute_actor') + policy_output_next = self._learn_model.forward(next_obs, mode='compute_actor') if self._cfg.multi_agent: policy_output_next['logit'][policy_output_next['action_mask'] == 0.0] = -1e8 prob = F.softmax(policy_output_next['logit'], dim=-1) log_prob = torch.log(prob + 1e-8) - target_q_value = self._target_model.forward({'obs': next_obs}, mode='compute_critic')['q_value'] + target_q_value = self._target_model.forward(next_obs, mode='compute_critic')['q_value'] # the value of a policy according to the maximum entropy objective if self._twin_critic: # find min one as target q value @@ -270,7 +270,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: self._optimizer_q.step() # 5. evaluate to get action distribution - policy_output = self._learn_model.forward({'obs': data['obs']}, mode='compute_actor') + policy_output = self._learn_model.forward(obs, mode='compute_actor') # 6. apply discrete action mask in multi_agent setting if self._cfg.multi_agent: policy_output['logit'][policy_output['action_mask'] == 0.0] = -1e8 @@ -279,7 +279,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: log_prob = F.log_softmax(logit, dim=-1) with torch.no_grad(): - new_q_value = self._learn_model.forward({'obs': data['obs']}, mode='compute_critic')['q_value'] + new_q_value = self._learn_model.forward(obs, mode='compute_critic')['q_value'] if self._twin_critic: new_q_value = torch.min(new_q_value[0], new_q_value[1]) # 7. compute policy loss @@ -363,7 +363,7 @@ def _forward_collect(self, data: dict, eps: float) -> dict: data = to_device(data, self._device) self._collect_model.eval() with torch.no_grad(): - output = self._collect_model.forward({'obs': data}, mode='compute_actor', eps=eps) + output = self._collect_model.forward(data, mode='compute_actor', eps=eps) if self._cuda: output = to_device(output, 'cpu') output = default_decollate(output) @@ -394,7 +394,7 @@ def _forward_eval(self, data: dict) -> dict: data = to_device(data, self._device) self._eval_model.eval() with torch.no_grad(): - output = self._eval_model.forward({'obs': data}, mode='compute_actor') + output = self._eval_model.forward(data, mode='compute_actor') if self._cuda: output = to_device(output, 'cpu') output = default_decollate(output) @@ -543,9 +543,9 @@ class SACPolicy(Policy): def default_model(self) -> Tuple[str, List[str]]: if self._cfg.multi_agent: - return 'maqac_continuous', ['ding.model.template.maqac'] + return 'continuous_maqac', ['ding.model.template.maqac'] else: - return 'qac', ['ding.model.template.qac'] + return 'continuous_qac', ['ding.model.template.qac'] def _init_learn(self) -> None: self._priority = self._cfg.priority @@ -554,10 +554,10 @@ def _init_learn(self) -> None: # Weight Init for the last output layer init_w = self._cfg.learn.init_w - self._model.actor[-1].mu.weight.data.uniform_(-init_w, init_w) - self._model.actor[-1].mu.bias.data.uniform_(-init_w, init_w) - self._model.actor[-1].log_sigma_layer.weight.data.uniform_(-init_w, init_w) - self._model.actor[-1].log_sigma_layer.bias.data.uniform_(-init_w, init_w) + self._model.actor_head[-1].mu.weight.data.uniform_(-init_w, init_w) + self._model.actor_head[-1].mu.bias.data.uniform_(-init_w, init_w) + self._model.actor_head[-1].log_sigma_layer.weight.data.uniform_(-init_w, init_w) + self._model.actor_head[-1].log_sigma_layer.bias.data.uniform_(-init_w, init_w) self._optimizer_q = Adam( 
             self._model.critic.parameters(),
@@ -838,10 +838,10 @@ def _init_learn(self) -> None:
 
         # Weight Init for the last output layer
        init_w = self._cfg.learn.init_w
-        self._model.actor[2].mu.weight.data.uniform_(-init_w, init_w)
-        self._model.actor[2].mu.bias.data.uniform_(-init_w, init_w)
-        self._model.actor[2].log_sigma_layer.weight.data.uniform_(-init_w, init_w)
-        self._model.actor[2].log_sigma_layer.bias.data.uniform_(-init_w, init_w)
+        self._model.actor_head[-1].mu.weight.data.uniform_(-init_w, init_w)
+        self._model.actor_head[-1].mu.bias.data.uniform_(-init_w, init_w)
+        self._model.actor_head[-1].log_sigma_layer.weight.data.uniform_(-init_w, init_w)
+        self._model.actor_head[-1].log_sigma_layer.bias.data.uniform_(-init_w, init_w)
 
         self._optimizer_q = Adam(
             self._model.critic.parameters(),
diff --git a/ding/policy/td3_bc.py b/ding/policy/td3_bc.py
index c3295d70d0..e30b6bfc07 100644
--- a/ding/policy/td3_bc.py
+++ b/ding/policy/td3_bc.py
@@ -174,13 +174,12 @@ class from DDPG class by changing ``_actor_update_freq``, ``_twin_critic`` and n
     )
 
     def default_model(self) -> Tuple[str, List[str]]:
-        return 'qac', ['ding.model.template.qac']
+        return 'continuous_qac', ['ding.model.template.qac']
 
     def _init_learn(self) -> None:
-        r"""
+        """
         Overview:
-            Learn mode init method. Called by ``self.__init__``.
-            Init actor and critic optimizers, algorithm config.
+            Learn mode init method. Called by ``self.__init__``. Init actor and critic optimizers, algorithm config.
         """
         super(TD3BCPolicy, self)._init_learn()
         self._alpha = self._cfg.learn.alpha
diff --git a/ding/policy/td3_vae.py b/ding/policy/td3_vae.py
index 7a192d5a6b..7d029c0a91 100644
--- a/ding/policy/td3_vae.py
+++ b/ding/policy/td3_vae.py
@@ -168,7 +168,7 @@ class from DDPG class by changing ``_actor_update_freq``, ``_twin_critic`` and n
     )
 
     def default_model(self) -> Tuple[str, List[str]]:
-        return 'qac', ['ding.model.template.qac']
+        return 'continuous_qac', ['ding.model.template.qac']
 
     def _init_learn(self) -> None:
         r"""
diff --git a/ding/torch_utils/__init__.py b/ding/torch_utils/__init__.py
index 9c7b677143..c98eb3bab4 100755
--- a/ding/torch_utils/__init__.py
+++ b/ding/torch_utils/__init__.py
@@ -1,6 +1,7 @@
 from .checkpoint_helper import build_checkpoint_helper, CountVar, auto_checkpoint
 from .data_helper import to_device, to_tensor, to_ndarray, to_list, to_dtype, same_shape, tensor_to_list, \
-    build_log_buffer, CudaFetcher, get_tensor_data, unsqueeze, squeeze, get_null_data, get_shape0, to_item
+    build_log_buffer, CudaFetcher, get_tensor_data, unsqueeze, squeeze, get_null_data, get_shape0, to_item, \
+    zeros_like
 from .distribution import CategoricalPd, CategoricalPdPytorch
 from .metric import levenshtein_distance, hamming_distance
 from .network import *
diff --git a/ding/torch_utils/data_helper.py b/ding/torch_utils/data_helper.py
index e34df5308a..a985ef0345 100644
--- a/ding/torch_utils/data_helper.py
+++ b/ding/torch_utils/data_helper.py
@@ -461,3 +461,14 @@ def get_null_data(template: Any, num: int) -> List[Any]:
         data['reward'].zero_()
         ret.append(data)
     return ret
+
+
+def zeros_like(h):
+    if isinstance(h, torch.Tensor):
+        return torch.zeros_like(h)
+    elif isinstance(h, (list, tuple)):
+        return [zeros_like(t) for t in h]
+    elif isinstance(h, dict):
+        return {k: zeros_like(v) for k, v in h.items()}
+    else:
+        raise TypeError("not support type: {}".format(h))
diff --git a/ding/world_model/tests/test_world_model.py b/ding/world_model/tests/test_world_model.py
index ec5f0645ef..f8dd620c59 100644
--- a/ding/world_model/tests/test_world_model.py
+++ b/ding/world_model/tests/test_world_model.py
@@ -52,11 +52,11 @@ def step(self, obs, action):
             return (torch.zeros(B), torch.rand(B, O), obs.sum(-1) > 0)
 
     from ding.policy import SACPolicy
-    from ding.model import QAC
+    from ding.model import ContinuousQAC
     policy_config = SACPolicy.default_config()
     policy_config.model.update(dict(obs_shape=2, action_shape=2))
-    model = QAC(**policy_config.model)
+    model = ContinuousQAC(**policy_config.model)
     policy = SACPolicy(policy_config, model=model).collect_mode
 
     fake_model = FakeModel(fake_config, None, None)
diff --git a/dizoo/classic_control/cartpole/config/cartpole_sac_config.py b/dizoo/classic_control/cartpole/config/cartpole_sac_config.py
index 736c7ee930..36dcb53be6 100644
--- a/dizoo/classic_control/cartpole/config/cartpole_sac_config.py
+++ b/dizoo/classic_control/cartpole/config/cartpole_sac_config.py
@@ -13,8 +13,7 @@
         random_collect_size=0,
         multi_agent=False,
         model=dict(
-            agent_obs_shape=4,
-            global_obs_shape=4,
+            obs_shape=4,
             action_shape=2,
             twin_critic=True,
             actor_head_hidden_size=64,