diff --git a/ding/policy/edac.py b/ding/policy/edac.py
index 5e315ccbf9..b67b1203c8 100755
--- a/ding/policy/edac.py
+++ b/ding/policy/edac.py
@@ -20,7 +20,7 @@ class EDACPolicy(SACPolicy):
     """
     Overview:
-        Policy class of EDAC algorithm. https://arxiv.org/pdf/2110.01548.pdf
+        Policy class of EDAC algorithm. Paper link: https://arxiv.org/pdf/2110.01548.pdf

     Config:
         == ==================== ======== ============= ================================= =======================
@@ -139,10 +139,23 @@ class EDACPolicy(SACPolicy):
     )

     def _init_learn(self) -> None:
-        r"""
+        """
         Overview:
-            Learn mode init method. Called by ``self.__init__``.
-            Init q, value and policy's optimizers, algorithm config, main and target models.
+            Initialize the learn mode of policy, including related attributes and modules. For EDAC, in addition \
+            to the things that need to be initialized in SAC, it is also necessary to additionally define \
+            eta/with_q_entropy/forward_learn_cnt. \
+            This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``.
+
+        .. note::
+            For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \
+            and ``_load_state_dict_learn`` methods.
+
+        .. note::
+            For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method.
+
+        .. note::
+            If you want to set some special member variables in ``_init_learn`` method, you'd better name them \
+            with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``.
         """
         super()._init_learn()
         # EDAC special implementation
@@ -150,7 +163,35 @@ def _init_learn(self) -> None:
         self._with_q_entropy = self._cfg.learn.with_q_entropy
         self._forward_learn_cnt = 0

-    def _forward_learn(self, data: dict) -> Dict[str, Any]:
+    def _forward_learn(self, data: List[Dict[int, Any]]) -> Dict[str, Any]:
+        """
+        Overview:
+            Policy forward function of learn mode (training policy and updating parameters). Forward means \
+            that the policy inputs some training batch data from the replay buffer and then returns the output \
+            result, including various training information such as loss, action, priority.
+        Arguments:
+            - data (:obj:`List[Dict[int, Any]]`): The input data used for policy forward, including a batch of \
+                training samples. For each element in list, the key of the dict is the name of data items and the \
+                value is the corresponding data. Usually, the value is torch.Tensor or np.ndarray or their dict/list \
+                combinations. In the ``_forward_learn`` method, data often need to first be stacked in the batch \
+                dimension by some utility functions such as ``default_preprocess_learn``. \
+                For EDAC, each element in list is a dict containing at least the following keys: ``obs``, ``action``, \
+                ``logit``, ``reward``, ``next_obs``, ``done``. Sometimes, it also contains other keys like ``weight``.
+        Returns:
+            - info_dict (:obj:`Dict[str, Any]`): The information dict that indicates the training result, which will be \
+                recorded in text log and tensorboard, values must be python scalar or a list of scalars. For the \
+                detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method.
+
+        .. note::
+            The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \
+            For the data type that is not supported, the main reason is that the corresponding model does not support it. \
+            You can implement your own model rather than use the default model. For more information, please raise an \
+            issue in GitHub repo and we will continue to follow up.
+
+        .. note::
+            For more detailed examples, please refer to our unittest for EDACPolicy: \
+            ``ding.policy.tests.test_edac``.
+        """
         loss_dict = {}
         data = default_preprocess_learn(
             data,
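To make the list-of-dict batch format described in the new ``_forward_learn`` docstring above concrete, here is a small standalone sketch (not part of the patch) that builds a toy EDAC-style batch and stacks it with ``default_preprocess_learn``. The import path assumes the usual ``ding.policy.common_utils`` location, and all shapes and values are arbitrary, for demonstration only.

# Illustrative sketch, not part of the patch: the list-of-dict format described in
# the ``_forward_learn`` docstring, stacked into batch tensors. Shapes are arbitrary.
import torch
from ding.policy.common_utils import default_preprocess_learn

obs_dim, action_dim, batch_size = 17, 6, 4
data = [
    {
        'obs': torch.randn(obs_dim),
        'next_obs': torch.randn(obs_dim),
        'action': torch.randn(action_dim),
        'logit': torch.randn(action_dim),  # EDAC/SAC-style policies also store the logit
        'reward': torch.randn(1),
        'done': torch.zeros(1),
    } for _ in range(batch_size)
]
# Stack the list along the batch dimension; each value becomes a tensor whose
# first dimension equals batch_size, e.g. batch['obs'].shape == (4, 17).
batch = default_preprocess_learn(data, use_priority=False, ignore_done=False, use_nstep=False)
print(batch['obs'].shape, batch['done'].shape)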
diff --git a/ding/policy/iqn.py b/ding/policy/iqn.py
index 1bb9b683cc..7ff69d0528 100644
--- a/ding/policy/iqn.py
+++ b/ding/policy/iqn.py
@@ -13,9 +13,13 @@
 @POLICY_REGISTRY.register('iqn')
 class IQNPolicy(DQNPolicy):
-    r"""
+    """
     Overview:
-        Policy class of IQN algorithm.
+        Policy class of IQN algorithm. Paper link: https://arxiv.org/pdf/1806.06923.pdf. \
+        Distributional RL is a new direction of RL, which is more stable than the traditional RL algorithms. \
+        The core idea of distributional RL is to estimate the distribution of action value instead of the \
+        expectation. The difference between IQN and DQN is that IQN uses quantile regression to estimate the \
+        quantile value of the action distribution, while DQN uses the expectation of the action distribution.

     Config:
         == ==================== ======== ============== ======================================== =======================
@@ -98,13 +102,37 @@ class IQNPolicy(DQNPolicy):
     )

     def default_model(self) -> Tuple[str, List[str]]:
+        """
+        Overview:
+            Return this algorithm default neural network model setting for demonstration. ``__init__`` method will \
+            automatically call this method to get the default model setting and create model.
+        Returns:
+            - model_info (:obj:`Tuple[str, List[str]]`): The registered model name and model's import_names.
+
+        .. note::
+            The user can define and use customized network model but must obey the same interface definition indicated \
+            by import_names path. For example about IQN, its registered name is ``iqn`` and the import_names is \
+            ``ding.model.template.q_learning``.
+        """
         return 'iqn', ['ding.model.template.q_learning']

     def _init_learn(self) -> None:
-        r"""
+        """
         Overview:
-            Learn mode init method. Called by ``self.__init__``.
-            Init the optimizer, algorithm config, main and target models.
+            Initialize the learn mode of policy, including related attributes and modules. For IQN, it mainly contains \
+            optimizer, algorithm-specific arguments such as nstep, kappa and gamma, main and target model.
+            This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``.
+
+        .. note::
+            For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \
+            and ``_load_state_dict_learn`` methods.
+
+        .. note::
+            For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method.
+
+        .. note::
+            If you want to set some special member variables in ``_init_learn`` method, you'd better name them \
+            with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``.
         """
         self._priority = self._cfg.priority
         # Optimizer
@@ -126,14 +154,34 @@ def _init_learn(self) -> None:
         self._learn_model.reset()
         self._target_model.reset()

-    def _forward_learn(self, data: dict) -> Dict[str, Any]:
-        r"""
+    def _forward_learn(self, data: List[Dict[int, Any]]) -> Dict[str, Any]:
+        """
         Overview:
-            Forward and backward function of learn mode.
+            Policy forward function of learn mode (training policy and updating parameters). Forward means \
+            that the policy inputs some training batch data from the replay buffer and then returns the output \
+            result, including various training information such as loss, priority.
         Arguments:
-            - data (:obj:`dict`): Dict type data, including at least ['obs', 'action', 'reward', 'next_obs']
+            - data (:obj:`List[Dict[int, Any]]`): The input data used for policy forward, including a batch of \
+                training samples. For each element in list, the key of the dict is the name of data items and the \
+                value is the corresponding data. Usually, the value is torch.Tensor or np.ndarray or their dict/list \
+                combinations. In the ``_forward_learn`` method, data often need to first be stacked in the batch \
+                dimension by some utility functions such as ``default_preprocess_learn``. \
+                For IQN, each element in list is a dict containing at least the following keys: ``obs``, ``action``, \
+                ``reward``, ``next_obs``, ``done``. Sometimes, it also contains other keys such as ``weight`` \
+                and ``value_gamma``.
         Returns:
-            - info_dict (:obj:`Dict[str, Any]`): Including current lr and loss.
+            - info_dict (:obj:`Dict[str, Any]`): The information dict that indicates the training result, which will be \
+                recorded in text log and tensorboard, values must be python scalar or a list of scalars. For the \
+                detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method.
+
+        .. note::
+            The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \
+            For the data type that is not supported, the main reason is that the corresponding model does not support it. \
+            You can implement your own model rather than use the default model. For more information, please raise an \
+            issue in GitHub repo and we will continue to follow up.
+
+        .. note::
+            For more detailed examples, please refer to our unittest for IQNPolicy: ``ding.policy.tests.test_iqn``.
         """
         data = default_preprocess_learn(
             data, use_priority=self._priority, ignore_done=self._cfg.learn.ignore_done, use_nstep=True
         )
@@ -186,6 +234,12 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]:
         }

     def _state_dict_learn(self) -> Dict[str, Any]:
+        """
+        Overview:
+            Return the state_dict of learn mode, usually including model, target_model and optimizer.
+        Returns:
+            - state_dict (:obj:`Dict[str, Any]`): The dict of current policy learn state, for saving and restoring.
+        """
         return {
             'model': self._learn_model.state_dict(),
             'target_model': self._target_model.state_dict(),
@@ -193,6 +247,17 @@ def _state_dict_learn(self) -> Dict[str, Any]:
         }

     def _load_state_dict_learn(self, state_dict: Dict[str, Any]) -> None:
+        """
+        Overview:
+            Load the state_dict variable into policy learn mode.
+        Arguments:
+            - state_dict (:obj:`Dict[str, Any]`): The dict of policy learn state saved before.
+
+        .. tip::
+            If you want to only load some parts of model, you can simply set the ``strict`` argument in \
+            load_state_dict to ``False``, or refer to ``ding.torch_utils.checkpoint_helper`` for more \
+            complicated operation.
+        """
         self._learn_model.load_state_dict(state_dict['model'])
         self._target_model.load_state_dict(state_dict['target_model'])
         self._optimizer.load_state_dict(state_dict['optimizer'])
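The new IQN class docstring above summarizes the idea of estimating quantile values with quantile regression. The following standalone sketch (not DI-engine's exact implementation) shows the quantile Huber loss that such estimates are typically trained with; ``kappa`` plays the role of the Huber threshold documented in the config table, and the tensor shapes are assumptions for demonstration.

# Illustrative sketch, not DI-engine's exact code: the quantile Huber loss used to
# train quantile estimates in IQN-style distributional RL.
import torch


def quantile_huber_loss(pred_q: torch.Tensor, target_q: torch.Tensor, taus: torch.Tensor,
                        kappa: float = 1.0) -> torch.Tensor:
    # pred_q:   (B, N)  predicted quantile values for the taken action
    # target_q: (B, N') target quantile values (computed without gradient)
    # taus:     (B, N)  sampled quantile fractions in (0, 1), matching pred_q
    # Pairwise TD errors u[b, j, i] = target_q[b, j] - pred_q[b, i], shape (B, N', N)
    u = target_q.unsqueeze(-1) - pred_q.unsqueeze(1)
    # Huber loss with threshold kappa
    huber = torch.where(u.abs() <= kappa, 0.5 * u.pow(2), kappa * (u.abs() - 0.5 * kappa))
    # Asymmetric quantile weight |tau_i - 1{u < 0}|
    weight = (taus.unsqueeze(1) - (u.detach() < 0).float()).abs()
    # Sum over predicted quantiles, average over target quantiles and the batch
    return (weight * huber / kappa).sum(dim=-1).mean()


# Example call with random tensors (batch of 8, 32 quantiles on both sides)
loss = quantile_huber_loss(torch.randn(8, 32), torch.randn(8, 32), torch.rand(8, 32))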
diff --git a/ding/policy/pg.py b/ding/policy/pg.py
index 667439d07b..9ae0c87f25 100644
--- a/ding/policy/pg.py
+++ b/ding/policy/pg.py
@@ -13,9 +13,10 @@
 @POLICY_REGISTRY.register('pg')
 class PGPolicy(Policy):
-    r"""
+    """
     Overview:
-        Policy class of Policy Gradient (REINFORCE) algorithm.
+        Policy class of Policy Gradient (REINFORCE) algorithm. Paper link: \
+        https://proceedings.neurips.cc/paper_files/paper/1999/file/464d828b85b0bed98e80ade0a5c43b0f-Paper.pdf
     """
     config = dict(
         # (string) RL policy register name (refer to function "register_policy").
@@ -60,13 +61,33 @@ class PGPolicy(Policy):
     )

     def default_model(self) -> Tuple[str, List[str]]:
+        """
+        Overview:
+            Return this algorithm default neural network model setting for demonstration. ``__init__`` method will \
+            automatically call this method to get the default model setting and create model.
+        Returns:
+            - model_info (:obj:`Tuple[str, List[str]]`): The registered model name and model's import_names.
+        """
         return 'pg', ['ding.model.template.pg']

     def _init_learn(self) -> None:
-        r"""
+        """
         Overview:
-            Learn mode init method. Called by ``self.__init__``.
-            Init the optimizer, algorithm config, main and target models.
+            Initialize the learn mode of policy, including related attributes and modules. For PG, it mainly \
+            contains optimizer, algorithm-specific arguments such as entropy weight and grad norm. This method \
+            also executes some special network initializations.
+            This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``.
+
+        .. note::
+            For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \
+            and ``_load_state_dict_learn`` methods.
+
+        .. note::
+            For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method.
+
+        .. note::
+            If you want to set some special member variables in ``_init_learn`` method, you'd better name them \
+            with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``.
         """
         # Optimizer
         self._optimizer = Adam(self._model.parameters(), lr=self._cfg.learn.learning_rate)
@@ -75,14 +96,32 @@ def _init_learn(self) -> None:
         self._grad_norm = self._cfg.learn.grad_norm
         self._learn_model = self._model  # for compatibility

-    def _forward_learn(self, data: dict) -> Dict[str, Any]:
-        r"""
+    def _forward_learn(self, data: List[Dict[int, Any]]) -> Dict[str, Any]:
+        """
         Overview:
-            Forward and backward function of learn mode.
+            Policy forward function of learn mode (training policy and updating parameters). Forward means \
+            that the policy inputs some training batch data from the replay buffer and then returns the output \
+            result, including various training information such as policy_loss, entropy_loss, grad_norm.
         Arguments:
-            - data (:obj:`dict`): Dict type data, including at least ['obs', 'action', 'reward', 'next_obs','adv']
+            - data (:obj:`List[Dict[int, Any]]`): The input data used for policy forward, including the latest \
+                collected training samples for on-policy algorithms like PG. For each element in list, the key of the \
+                dict is the name of data items and the value is the corresponding data. Usually, the value is \
+                torch.Tensor or np.ndarray or their dict/list combinations. In the ``_forward_learn`` method, data \
+                often need to first be stacked in the batch dimension by some utility functions such as \
+                ``default_preprocess_learn``. \
+                For PG, each element in list is a dict containing at least the following keys: ``obs``, ``action``, \
+                ``return``.
         Returns:
-            - info_dict (:obj:`Dict[str, Any]`): Including current lr and loss.
+            - return_infos (:obj:`List[Dict[str, Any]]`): The information list that indicates the training result; \
+                each training iteration appends an information dict into the final list. The list will be processed \
+                and recorded in text log and tensorboard. The value of the dict must be python scalar or a list of \
+                scalars. For the detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method.
+
+        .. note::
+            The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \
+            For the data type that is not supported, the main reason is that the corresponding model does not support it. \
+            You can implement your own model rather than use the default model. For more information, please raise an \
+            issue in GitHub repo and we will continue to follow up.
         """
         data = default_preprocess_learn(data, ignore_done=self._cfg.learn.ignore_done, use_nstep=False)
         if self._cuda:
@@ -124,10 +163,43 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]:
         return return_infos

     def _init_collect(self) -> None:
+        """
+        Overview:
+            Initialize the collect mode of policy, including related attributes and modules. For PG, it contains \
+            algorithm-specific arguments such as unroll_len and gamma.
+            This method will be called in ``__init__`` method if ``collect`` field is in ``enable_field``.
+
+        .. note::
+            If you want to set some special member variables in ``_init_collect`` method, you'd better name them \
+            with prefix ``_collect_`` to avoid conflict with other modes, such as ``self._collect_attr1``.
+        """
         self._unroll_len = self._cfg.collect.unroll_len
         self._gamma = self._cfg.collect.discount_factor

-    def _forward_collect(self, data: dict) -> dict:
+    def _forward_collect(self, data: Dict[int, Any]) -> dict:
+        """
+        Overview:
+            Policy forward function of collect mode (collecting training data by interacting with envs). Forward means \
+            that the policy gets some necessary data (mainly observation) from the envs and then returns the output \
+            data, such as the action to interact with the envs.
+        Arguments:
+            - data (:obj:`Dict[int, Any]`): The input data used for policy forward, including at least the obs. The \
+                key of the dict is environment id and the value is the corresponding data of the env.
+        Returns:
+            - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action and \
+                other necessary data (action logit) for learn mode defined in ``self._process_transition`` \
+                method. The key of the dict is the same as the input data, i.e. environment id.
+
+        .. tip::
+            If you want to add more tricks on this policy, like temperature factor in multinomial sample, you can pass \
+            related data as extra keyword arguments of this method.
+
+        .. note::
+            The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \
+            For the data type that is not supported, the main reason is that the corresponding model does not support it. \
+            You can implement your own model rather than use the default model. For more information, please raise an \
+            issue in GitHub repo and we will continue to follow up.
+        """
         data_id = list(data.keys())
         data = default_collate(list(data.values()))
         if self._cuda:
@@ -141,17 +213,20 @@ def _forward_collect(self, data: dict) -> dict:
         output = default_decollate(output)
         return {i: d for i, d in zip(data_id, output)}

-    def _process_transition(self, obs: Any, model_output: dict, timestep: namedtuple) -> dict:
-        r"""
+    def _process_transition(self, obs: Any, model_output: Dict[str, torch.Tensor], timestep: namedtuple) -> dict:
+        """
         Overview:
-            Generate dict type transition data from inputs.
+            Process and pack one timestep transition data into a dict, which can be directly used for training and \
+            saved in replay buffer. For PG, it contains obs, action, reward, done.
         Arguments:
-            - obs (:obj:`Any`): Env observation
-            - model_output (:obj:`dict`): Output of collect model, including at least ['action']
-            - timestep (:obj:`namedtuple`): Output after env step, including at least ['obs', 'reward', 'done'] \
-                (here 'obs' indicates obs after env step).
+            - obs (:obj:`torch.Tensor`): The env observation of current timestep, such as stacked 2D image in Atari.
+            - model_output (:obj:`Dict[str, torch.Tensor]`): The output of the policy network with the observation \
+                as input. For PG, it contains the action.
+            - timestep (:obj:`namedtuple`): The execution result namedtuple returned by the environment step method, \
+                except all the elements have been transformed into tensor data. Usually, it contains the next obs, \
+                reward, done, info, etc.
         Returns:
-            - transition (:obj:`dict`): Dict type transition data.
+            - transition (:obj:`Dict[str, torch.Tensor]`): The processed transition data of the current timestep.
         """
         return {
             'obs': obs,
@@ -160,14 +235,22 @@ def _process_transition(self, obs: Any, model_output: dict, timestep: namedtuple
             'done': timestep.done,
         }

-    def _get_train_sample(self, data: list) -> Union[None, List[Any]]:
-        r"""
+    def _get_train_sample(self, data: List[Dict[str, Any]]) -> Union[None, List[Any]]:
+        """
         Overview:
-            Get the trajectory and the n step return data, then sample from the n_step return data
+            For a given entire episode data (a list of transitions), process it into a list of samples that \
+            can be used for training directly. In PG, a train sample is a processed transition with a newly computed \
+            ``return`` field. This method is usually used in collectors to execute necessary \
+            RL data preprocessing before training, which can help the learner amortize relevant time consumption. \
+            In addition, you can also implement this method as an identity function and do the data processing \
+            in ``self._forward_learn`` method.
         Arguments:
-            - data (:obj:`list`): The trajectory's buffer list
+            - data (:obj:`List[Dict[str, Any]]`): The episode data (a list of transitions), each element is in \
+                the same format as the return value of ``self._process_transition`` method. Note that PG needs \
+                a complete episode.
         Returns:
-            - samples (:obj:`dict`): The training samples generated
+            - samples (:obj:`List[Dict[str, Any]]`): The processed train samples, each element is in a similar format \
+                to the input transitions, but may contain more data for training, such as discounted episode return.
         """
         assert data[-1]['done'], "PG needs a complete epsiode"
@@ -193,7 +276,30 @@ def _get_train_sample(self, data: list) -> Union[None, List[Any]]:

     def _init_eval(self) -> None:
         pass

-    def _forward_eval(self, data: dict) -> dict:
+    def _forward_eval(self, data: Dict[int, Any]) -> dict:
+        """
+        Overview:
+            Policy forward function of eval mode (evaluating policy performance by interacting with envs). Forward \
+            means that the policy gets some necessary data (mainly observation) from the envs and then returns the \
+            action to interact with the envs. ``_forward_eval`` in PG often uses deterministic sample method to get \
+            actions while ``_forward_collect`` usually uses stochastic sample method to balance exploration and \
+            exploitation.
+        Arguments:
+            - data (:obj:`Dict[int, Any]`): The input data used for policy forward, including at least the obs. The \
+                key of the dict is environment id and the value is the corresponding data of the env.
+        Returns:
+            - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action. The \
+                key of the dict is the same as the input data, i.e. environment id.
+
+        .. note::
+            The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \
+            For the data type that is not supported, the main reason is that the corresponding model does not support it. \
+            You can implement your own model rather than use the default model. For more information, please raise an \
+            issue in GitHub repo and we will continue to follow up.
+
+        .. note::
+            For more detailed examples, please refer to our unittest for PGPolicy: ``ding.policy.tests.test_pg``.
+        """
         data_id = list(data.keys())
         data = default_collate(list(data.values()))
         if self._cuda:
@@ -216,4 +322,11 @@ def _forward_eval(self, data: dict) -> dict:
         return {i: d for i, d in zip(data_id, output)}

     def _monitor_vars_learn(self) -> List[str]:
+        """
+        Overview:
+            Return the necessary keys for logging the return dict of ``self._forward_learn``. The logger module, such \
+            as text logger, tensorboard logger, will use these keys to save the corresponding data.
+        Returns:
+            - necessary_keys (:obj:`List[str]`): The list of the necessary keys to be logged.
+        """
         return super()._monitor_vars_learn() + ['policy_loss', 'entropy_loss', 'return_abs_max', 'grad_norm']
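As a companion to the ``_get_train_sample`` docstring above, here is a minimal standalone sketch (not part of the patch) of how the discounted ``return`` field of a PG train sample can be computed from a complete episode; the helper name and the default ``gamma`` value are illustrative only.

# Illustrative sketch, not part of the patch: computing the discounted episode return
# that ``_get_train_sample`` attaches to each PG train sample. The helper name and
# the default gamma are hypothetical.
import torch


def attach_discounted_returns(episode, gamma: float = 0.99):
    # PG requires a complete episode, i.e. the last transition must be terminal.
    assert episode[-1]['done'], "PG needs a complete episode"
    running_return = 0.
    # Walk backwards so that G_t = r_t + gamma * G_{t+1}
    for transition in reversed(episode):
        running_return = transition['reward'] + gamma * running_return
        transition['return'] = torch.as_tensor(running_return, dtype=torch.float32)
    return episode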