From 26ccf499b38e7ca5ad8ce369ad157913a8f0750b Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 21 Feb 2020 14:50:28 +0100 Subject: [PATCH 01/17] Use normal sampling for SAC --- docs/misc/changelog.rst | 1 + setup.py | 2 +- torchy_baselines/__init__.py | 2 +- torchy_baselines/sac/sac.py | 4 ++-- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 56d0d1bc5..76616ad5f 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -21,6 +21,7 @@ Deprecations: Others: ^^^^^^^ +- SAC with SDE now sample only one matrix Documentation: ^^^^^^^^^^^^^^ diff --git a/setup.py b/setup.py index 0c39da372..8780397de 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ license="MIT", long_description="", long_description_content_type='text/markdown', - version="0.2.0", + version="0.2.1", ) # python setup.py sdist diff --git a/torchy_baselines/__init__.py b/torchy_baselines/__init__.py index cc2889b20..b201dc8ef 100644 --- a/torchy_baselines/__init__.py +++ b/torchy_baselines/__init__.py @@ -4,4 +4,4 @@ from torchy_baselines.sac import SAC from torchy_baselines.td3 import TD3 -__version__ = "0.2.0" +__version__ = "0.2.1" diff --git a/torchy_baselines/sac/sac.py b/torchy_baselines/sac/sac.py index f0930ee21..61a36057e 100644 --- a/torchy_baselines/sac/sac.py +++ b/torchy_baselines/sac/sac.py @@ -171,8 +171,8 @@ def train(self, gradient_steps: int, batch_size: int = 64): # is lost and we cannot backpropagate through again # anyway, we need to sample because `log_std` may have changed between two gradient steps if self.use_sde: - self.actor.reset_noise(batch_size=batch_size) - # self.actor.reset_noise() + # self.actor.reset_noise(batch_size=batch_size) + self.actor.reset_noise() # Action by the current actor for the sampled state action_pi, log_prob = self.actor.action_log_prob(obs) From 67894dab9f806b2815e98c3f1af5052bcc0ada66 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Mon, 9 Mar 2020 19:02:40 +0100 Subject: [PATCH 02/17] Add clip_mean parameter --- torchy_baselines/ppo/ppo.py | 1 - torchy_baselines/sac/policies.py | 67 +++++++++++++------------------- torchy_baselines/sac/sac.py | 9 +---- 3 files changed, 29 insertions(+), 48 deletions(-) diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index 2ce168b1f..6f30f7f70 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -140,7 +140,6 @@ def collect_rollouts(self, continue_training = True rollout_buffer.reset() # Sample new weights for the state dependent exploration - # TODO: ensure episodic setting? 
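# A minimal, self-contained sketch of the gSDE noise handling that the
# surrounding collect_rollouts() code relies on (class and variable names here
# are illustrative assumptions, not objects from the patch): a fresh exploration
# matrix is drawn at the start of collection and then only every
# `sde_sample_freq` steps, so the exploration noise stays smooth in between.
import torch as th
import torch.nn as nn

class TinySDEPolicy(nn.Module):
    def __init__(self, latent_dim: int = 4, action_dim: int = 2):
        super().__init__()
        self.log_std = nn.Parameter(th.zeros(latent_dim, action_dim))
        self.exploration_mat = None

    def reset_noise(self, batch_size: int = 1) -> None:
        # One noise matrix per environment, kept fixed until the next call
        self.exploration_mat = th.randn(batch_size, *self.log_std.shape) * th.exp(self.log_std)

policy, n_envs, sde_sample_freq = TinySDEPolicy(), 8, 4
policy.reset_noise(batch_size=n_envs)           # once when collection starts
for n_steps in range(16):
    if sde_sample_freq > 0 and n_steps % sde_sample_freq == 0:
        policy.reset_noise(batch_size=n_envs)   # periodic resampling during the rollout
    # ... step the environments and store transitions here ...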
if self.use_sde: self.policy.reset_noise(env.num_envs) diff --git a/torchy_baselines/sac/policies.py b/torchy_baselines/sac/policies.py index 4f9673840..e570b570c 100644 --- a/torchy_baselines/sac/policies.py +++ b/torchy_baselines/sac/policies.py @@ -1,3 +1,5 @@ +from typing import Optional, List, Tuple + import torch as th import torch.nn as nn @@ -10,28 +12,6 @@ LOG_STD_MIN = -20 -class LeakyClip(nn.Module): - """ - Cip values outside a certain range - (it is not a hard clip, there is a small slope to have non-zero gradient) - - :param min_val: (float) - :param max_val: (float) - :param slope: (float) - """ - def __init__(self, min_val=-2.0, max_val=2.0, slope=0.01): - super(LeakyClip, self).__init__() - self.min_val = min_val - self.max_val = max_val - self.slope = slope - - def forward(self, x): - linear_part = x * (x >= self.min_val) * (x <= self.max_val) - above_max_val = self.slope * (x - self.max_val) * (x > self.max_val) - below_min_val = self.slope * (x - self.min_val) * (x < self.min_val) - return linear_part + below_min_val + above_max_val - - class Actor(BaseNetwork): """ Actor network (policy) for SAC. @@ -50,10 +30,18 @@ class Actor(BaseNetwork): :param use_expln: (bool) Use `expln()` function instead of `exp()` when using SDE to ensure a positive standard deviation (cf paper). It allows to keep variance above zero and prevent it from growing too fast. In practice, `exp()` is usually enough. + :param clip_mean: (float) Clip the mean output when using SDE to avoid numerical instability. """ - def __init__(self, obs_dim, action_dim, net_arch, activation_fn=nn.ReLU, - use_sde=False, log_std_init=-3, full_std=True, - sde_net_arch=None, use_expln=False): + def __init__(self, obs_dim: int, + action_dim: int, + net_arch: List[int], + activation_fn: nn.Module = nn.ReLU, + use_sde: bool = False, + log_std_init: float = -3, + full_std: bool = True, + sde_net_arch: Optional[List[int]] = None, + use_expln: bool = False, + clip_mean: float = 2.0): super(Actor, self).__init__() latent_pi_net = create_mlp(obs_dim, -1, net_arch, activation_fn) @@ -68,23 +56,21 @@ def __init__(self, obs_dim, action_dim, net_arch, activation_fn=nn.ReLU, self.sde_feature_extractor, latent_sde_dim = create_sde_feature_extractor(obs_dim, sde_net_arch, activation_fn) - # TODO: check for the learn_features self.action_dist = StateDependentNoiseDistribution(action_dim, full_std=full_std, use_expln=use_expln, learn_features=True, squash_output=True) self.mu, self.log_std = self.action_dist.proba_distribution_net(latent_dim=net_arch[-1], latent_sde_dim=latent_sde_dim, log_std_init=log_std_init) - # Avoid saturation by limiting the mean of the Gaussian to be in [-1, 1] - # self.mu = nn.Sequential(self.mu, nn.Tanh()) - self.mu = nn.Sequential(self.mu, nn.Hardtanh(min_val=-2.0, max_val=2.0)) - # Small positive slope to have non-zero gradient - # self.mu = nn.Sequential(self.mu, LeakyClip()) + # Avoid numerical issues by limiting the mean of the Gaussian + # to be in [-clip_mean, clip_mean] + if clip_mean > 0.0: + self.mu = nn.Sequential(self.mu, nn.Hardtanh(min_val=-clip_mean, max_val=clip_mean)) else: self.action_dist = SquashedDiagGaussianDistribution(action_dim) self.mu = nn.Linear(net_arch[-1], action_dim) self.log_std = nn.Linear(net_arch[-1], action_dim) - def get_std(self): + def get_std(self) -> th.Tensor: """ Retrieve the standard deviation of the action distribution. Only useful when using SDE. 
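# A rough illustration of the clip_mean change above (a stand-alone sketch with
# assumed dimensions, not the Actor class itself): wrapping the mean head in
# nn.Hardtanh bounds the Gaussian mean to [-clip_mean, clip_mean] before the
# final tanh squashing, which is how the clip avoids the numerical instability
# mentioned in the docstring.
import torch as th
import torch.nn as nn

latent_dim, action_dim, clip_mean = 64, 6, 2.0
mu = nn.Linear(latent_dim, action_dim)
if clip_mean > 0.0:
    # Hard clipping of the mean; values inside the range keep their gradient
    mu = nn.Sequential(mu, nn.Hardtanh(min_val=-clip_mean, max_val=clip_mean))

mean_actions = mu(th.randn(32, latent_dim))
assert mean_actions.abs().max() <= clip_mean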
@@ -97,7 +83,7 @@ def get_std(self): assert isinstance(self.action_dist, StateDependentNoiseDistribution), 'get_std() is only available when using SDE' return self.action_dist.get_std(self.log_std) - def reset_noise(self, batch_size=1): + def reset_noise(self, batch_size: int = 1) -> None: """ Sample new weights for the exploration matrix, when using SDE. @@ -106,7 +92,7 @@ def reset_noise(self, batch_size=1): assert isinstance(self.action_dist, StateDependentNoiseDistribution), 'reset_noise() is only available when using SDE' self.action_dist.sample_weights(self.log_std, batch_size=batch_size) - def _get_latent(self, obs): + def _get_latent(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: latent_pi = self.latent_pi(obs) if self.sde_feature_extractor is not None: @@ -115,7 +101,7 @@ def _get_latent(self, obs): latent_sde = latent_pi return latent_pi, latent_sde - def get_action_dist_params(self, obs): + def get_action_dist_params(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]: latent_pi, latent_sde = self._get_latent(obs) if self.use_sde: @@ -126,7 +112,7 @@ def get_action_dist_params(self, obs): log_std = th.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX) return mean_actions, log_std, latent_sde - def forward(self, obs, deterministic=False): + def forward(self, obs: th.Tensor, deterministic: bool = False) -> th.Tensor: mean_actions, log_std, latent_sde = self.get_action_dist_params(obs) if self.use_sde: # Note: the action is squashed @@ -138,7 +124,7 @@ def forward(self, obs, deterministic=False): deterministic=deterministic) return action - def action_log_prob(self, obs): + def action_log_prob(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: mean_actions, log_std, latent_sde = self.get_action_dist_params(obs) if self.use_sde: @@ -195,11 +181,13 @@ class SACPolicy(BasePolicy): :param use_expln: (bool) Use `expln()` function instead of `exp()` when using SDE to ensure a positive standard deviation (cf paper). It allows to keep variance above zero and prevent it from growing too fast. In practice, `exp()` is usually enough. + :param clip_mean: (float) Clip the mean output when using SDE to avoid numerical instability. 
""" def __init__(self, observation_space, action_space, learning_rate, net_arch=None, device='cpu', activation_fn=nn.ReLU, use_sde=False, - log_std_init=-3, sde_net_arch=None, use_expln=False): + log_std_init=-3, sde_net_arch=None, + use_expln=False, clip_mean=2.0): super(SACPolicy, self).__init__(observation_space, action_space, device, squash_output=True) if net_arch is None: @@ -220,7 +208,8 @@ def __init__(self, observation_space, action_space, 'use_sde': use_sde, 'log_std_init': log_std_init, 'sde_net_arch': sde_net_arch, - 'use_expln': use_expln + 'use_expln': use_expln, + 'clip_mean': clip_mean } self.actor_kwargs.update(sde_kwargs) self.actor, self.actor_target = None, None diff --git a/torchy_baselines/sac/sac.py b/torchy_baselines/sac/sac.py index 61a36057e..fa6738ba4 100644 --- a/torchy_baselines/sac/sac.py +++ b/torchy_baselines/sac/sac.py @@ -165,13 +165,8 @@ def train(self, gradient_steps: int, batch_size: int = 64): obs, action_batch, next_obs, done, reward = replay_data - # Two options: retain_graph=True in the actor_loss.backward() - # or sample again the noise matrix - # otherwise the intermediate step `std = th.exp(log_std)` - # is lost and we cannot backpropagate through again - # anyway, we need to sample because `log_std` may have changed between two gradient steps + # We need to sample because `log_std` may have changed between two gradient steps if self.use_sde: - # self.actor.reset_noise(batch_size=batch_size) self.actor.reset_noise() # Action by the current actor for the sampled state @@ -196,8 +191,6 @@ def train(self, gradient_steps: int, batch_size: int = 64): self.ent_coef_optimizer.step() with th.no_grad(): - # if self.use_sde: - # self.actor.reset_noise(batch_size=batch_size) # Select action according to policy next_action, next_log_prob = self.actor.action_log_prob(next_obs) # Compute the target Q value From 1e81f38d664af9b108620caf58c89d5e19b89a75 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Mon, 9 Mar 2020 19:05:22 +0100 Subject: [PATCH 03/17] Update changelog --- docs/misc/changelog.rst | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 76616ad5f..b05e73a1a 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -22,6 +22,7 @@ Deprecations: Others: ^^^^^^^ - SAC with SDE now sample only one matrix +- Added ``clip_mean`` parameter to SAC policy Documentation: ^^^^^^^^^^^^^^ @@ -35,25 +36,25 @@ Pre-Release 0.2.0 (2020-02-14) Breaking Changes: ^^^^^^^^^^^^^^^^^ - Python 2 support was dropped, Torchy Baselines now requires Python 3.6 or above -- Return type of `evaluation.evaluate_policy()` has been changed +- Return type of ``evaluation.evaluate_policy()`` has been changed - Refactored the replay buffer to avoid transformation between PyTorch and NumPy - Created `OffPolicyRLModel` base class - Remove deprecated JSON format for `Monitor` New Features: ^^^^^^^^^^^^^ -- Add `seed()` method to `VecEnv` class +- Add ``seed()`` method to ``VecEnv`` class - Add support for Callback (cf https://github.com/hill-a/stable-baselines/pull/644) - Add methods for saving and loading replay buffer -- Add `extend()` method to the buffers -- Add `get_vec_normalize_env()` to `BaseRLModel` to retrieve `VecNormalize` wrapper when it exists -- Add `results_plotter` from Stable Baselines -- Improve `predict()` method to handle different type of observations (single, vectorized, ...) 
+- Add ``extend()`` method to the buffers +- Add ``get_vec_normalize_env()`` to ``BaseRLModel`` to retrieve ``VecNormalize`` wrapper when it exists +- Add ``results_plotter`` from Stable Baselines +- Improve ``predict()`` method to handle different type of observations (single, vectorized, ...) Bug Fixes: ^^^^^^^^^^ - Fix loading model on CPU that were trained on GPU -- Fix `reset_num_timesteps` that was not used +- Fix ``reset_num_timesteps`` that was not used - Fix entropy computation for squashed Gaussian (approximate it now) - Fix seeding when using multiple environments (different seed per env) @@ -64,8 +65,8 @@ Others: ^^^^^^^ - Add type check - Converted all format string to f-strings -- Add test for `OrnsteinUhlenbeckActionNoise` -- Add type aliases in `common.type_aliases` +- Add test for ``OrnsteinUhlenbeckActionNoise`` +- Add type aliases in ``common.type_aliases`` Documentation: ^^^^^^^^^^^^^^ @@ -81,7 +82,7 @@ Breaking Changes: New Features: ^^^^^^^^^^^^^ -- Initial release of A2C, CEM-RL, PPO, SAC and TD3, working only with `Box` input space +- Initial release of A2C, CEM-RL, PPO, SAC and TD3, working only with ``Box`` input space - State-Dependent Exploration (SDE) for A2C, PPO, SAC and TD3 Bug Fixes: @@ -111,4 +112,12 @@ Contributors: ------------- In random order... -Thanks to @hill-a @enerijunior @AdamGleave @Miffyli +Thanks to the maintainers of V2: @hill-a @enerijunior @AdamGleave @Miffyli + +And all the contributors: +@bjmuld @iambenzo @iandanforth @r7vme @brendenpetersen @huvar @abhiskk @JohannesAck +@EliasHasle @mrakgr @Bleyddyn @antoine-galataud @junhyeokahn @AdamGleave @keshaviyengar @tperol +@XMaster96 @kantneel @Pastafarianist @GerardMaggiolino @PatrickWalter214 @yutingsz @sc420 @Aaahh @billtubbs +@Miffyli @dwiel @miguelrass @qxcv @jaberkow @eavelardev @ruifeng96150 @pedrohbtp @srivatsankrishnan @evilsocket +@MarvineGothic @jdossgollin @SyllogismRXS @rusu24edward @jbulow @Antymon @seheevic @justinkterry @edbeeching +@flodorner @KuKuXia @NeoExtended @solliet @mmcenta @richardwu From fb4e66213d2a36d78706cc5cafd5220909cdd789 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 10 Mar 2020 16:43:10 +0100 Subject: [PATCH 04/17] Use NamedTuple for buffers --- torchy_baselines/a2c/a2c.py | 20 ++++++++++---------- torchy_baselines/common/base_class.py | 1 - torchy_baselines/common/buffers.py | 13 +++++++------ torchy_baselines/common/type_aliases.py | 20 +++++++++++++++++--- torchy_baselines/ppo/ppo.py | 25 ++++++++++++------------- torchy_baselines/sac/sac.py | 14 ++++++-------- torchy_baselines/td3/td3.py | 20 ++++++++------------ 7 files changed, 60 insertions(+), 53 deletions(-) diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index 01cbd3993..ea27d715f 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -81,30 +81,30 @@ def train(self, gradient_steps: int, batch_size=None): # Update optimizer learning rate self._update_learning_rate(self.policy.optimizer) # A2C with gradient_steps > 1 does not make sense - assert gradient_steps == 1 + assert gradient_steps == 1, "A2C does not support multiple gradient steps" # We do not use minibatches for A2C - assert batch_size is None + assert batch_size is None, "A2C does not support minibatch" for rollout_data in self.rollout_buffer.get(batch_size=None): - # Unpack - obs, action, _, _, advantage, return_batch = rollout_data + actions = rollout_data.actions if isinstance(self.action_space, spaces.Discrete): - # Convert discrete action for float to long - action = 
action.long().flatten() + # Convert discrete action from float to long + actions = actions.long().flatten() # TODO: avoid second computation of everything because of the gradient - values, log_prob, entropy = self.policy.evaluate_actions(obs, action) + values, log_prob, entropy = self.policy.evaluate_actions(rollout_data.observations, actions) values = values.flatten() # Normalize advantage (not present in the original implementation) if self.normalize_advantage: - advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8) + advantages = (rollout_data.advantages - rollout_data.advantages.mean()) / (rollout_data.advantages.std() + 1e-8) - policy_loss = -(advantage * log_prob).mean() + # Policy gradient loss + policy_loss = -(advantages * log_prob).mean() # Value loss using the TD(gae_lambda) target - value_loss = F.mse_loss(return_batch, values) + value_loss = F.mse_loss(rollout_data.returns, values) # Entropy loss favor exploration if entropy is None: diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index 6b3fb6a52..b01002576 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -956,7 +956,6 @@ def collect_rollouts(self, total_episodes += 1 episode_rewards.append(episode_reward) total_timesteps.append(episode_timesteps) - # TODO: reset SDE matrix at the end of the episode? if action_noise is not None: action_noise.reset() diff --git a/torchy_baselines/common/buffers.py b/torchy_baselines/common/buffers.py index c6e9d5c69..6ffe479a2 100644 --- a/torchy_baselines/common/buffers.py +++ b/torchy_baselines/common/buffers.py @@ -1,4 +1,4 @@ -from typing import Union, Optional, Tuple, Generator +from typing import Union, Optional, Generator import numpy as np import torch as th @@ -80,11 +80,12 @@ def reset(self) -> None: def sample(self, batch_size: int, env: Optional[VecNormalize] = None - ) -> Tuple[th.Tensor, ...]: + ): """ :param batch_size: (int) Number of element to sample :param env: (Optional[VecNormalize]) associated gym VecEnv to normalize the observations/rewards when sampling + :return: (Union[RolloutBufferSamples, ReplayBufferSamples]) """ upper_bound = self.buffer_size if self.full else self.pos batch_inds = np.random.randint(0, upper_bound, size=batch_size) @@ -93,11 +94,11 @@ def sample(self, def _get_samples(self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None - ) -> Tuple[th.Tensor, ...]: + ): """ :param batch_inds: (th.Tensor) :param env: (Optional[VecNormalize]) - :return: ([th.Tensor]) + :return: (Union[RolloutBufferSamples, ReplayBufferSamples]) """ raise NotImplementedError() @@ -184,7 +185,7 @@ def _get_samples(self, self._normalize_obs(self.next_observations[batch_inds, 0, :], env), self.dones[batch_inds], self._normalize_reward(self.rewards[batch_inds], env)) - return tuple(map(self.to_torch, data)) + return ReplayBufferSamples(*tuple(map(self.to_torch, data))) class RolloutBuffer(BaseBuffer): @@ -333,4 +334,4 @@ def _get_samples(self, batch_inds: np.ndarray, self.log_probs[batch_inds].flatten(), self.advantages[batch_inds].flatten(), self.returns[batch_inds].flatten()) - return tuple(map(self.to_torch, data)) + return RolloutBufferSamples(*tuple(map(self.to_torch, data))) diff --git a/torchy_baselines/common/type_aliases.py b/torchy_baselines/common/type_aliases.py index b9035dbd6..16576fff6 100644 --- a/torchy_baselines/common/type_aliases.py +++ b/torchy_baselines/common/type_aliases.py @@ -1,7 +1,8 @@ """ Common aliases for type hing """ -from typing import 
Union, Type, Optional, Dict, Any, List, Tuple +from typing import Union, Type, Optional, Dict, Any, List, NamedTuple +from collections import namedtuple import torch as th import gym @@ -13,6 +14,19 @@ TensorDict = Dict[str, th.Tensor] OptimizerStateDict = Dict[str, Any] # obs, action, old_values, old_log_prob, advantage, return_batch -RolloutBufferSamples = Tuple[th.Tensor, th.Tensor, th.Tensor, th.Tensor, th.Tensor, th.Tensor] +class RolloutBufferSamples(NamedTuple): + observations: th.Tensor + actions: th.Tensor + old_values: th.Tensor + old_log_prob: th.Tensor + advantages: th.Tensor + returns: th.Tensor + + # obs, action, next_obs, done, reward -ReplayBufferSamples = Tuple[th.Tensor, th.Tensor, th.Tensor, th.Tensor, th.Tensor] +class ReplayBufferSamples(NamedTuple): + observations: th.Tensor + actions: th.Tensor + next_observations: th.Tensor + dones: th.Tensor + rewards: th.Tensor diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index 6f30f7f70..f10f5e219 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -195,13 +195,12 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: for gradient_step in range(gradient_steps): approx_kl_divs = [] # Sample replay buffer - for replay_data in self.rollout_buffer.get(batch_size): - # Unpack - obs, action, old_values, old_log_prob, advantage, return_batch = replay_data + for rollout_data in self.rollout_buffer.get(batch_size): + actions = rollout_data.actions if isinstance(self.action_space, spaces.Discrete): - # Convert discrete action for float to long - action = action.long().flatten() + # Convert discrete action from float to long + actions = rollout_data.actions.long().flatten() # Re-sample the noise matrix because the log_std has changed # TODO: investigate why there is no issue with the gradient @@ -209,16 +208,16 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: if self.use_sde: self.policy.reset_noise(batch_size) - values, log_prob, entropy = self.policy.evaluate_actions(obs, action) + values, log_prob, entropy = self.policy.evaluate_actions(rollout_data.observations, actions) values = values.flatten() # Normalize advantage - advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8) + advantages = (rollout_data.advantages - rollout_data.advantages.mean()) / (rollout_data.advantages.std() + 1e-8) # ratio between old and new policy, should be one at the first iteration - ratio = th.exp(log_prob - old_log_prob) + ratio = th.exp(log_prob - rollout_data.old_log_prob) # clipped surrogate loss - policy_loss_1 = advantage * ratio - policy_loss_2 = advantage * th.clamp(ratio, 1 - clip_range, 1 + clip_range) + policy_loss_1 = advantages * ratio + policy_loss_2 = advantages * th.clamp(ratio, 1 - clip_range, 1 + clip_range) policy_loss = -th.min(policy_loss_1, policy_loss_2).mean() if self.clip_range_vf is None: @@ -227,9 +226,9 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: else: # Clip the different between old and new value # NOTE: this depends on the reward scaling - values_pred = old_values + th.clamp(values - old_values, -clip_range_vf, clip_range_vf) + values_pred = rollout_data.old_values + th.clamp(values - rollout_data.old_values, -clip_range_vf, clip_range_vf) # Value loss using the TD(gae_lambda) target - value_loss = F.mse_loss(return_batch, values_pred) + value_loss = F.mse_loss(rollout_data.returns, values_pred) # Entropy loss favor exploration if entropy is None: @@ -246,7 +245,7 @@ def train(self, gradient_steps: 
int, batch_size: int = 64) -> None: # Clip grad norm th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm) self.policy.optimizer.step() - approx_kl_divs.append(th.mean(old_log_prob - log_prob).detach().cpu().numpy()) + approx_kl_divs.append(th.mean(rollout_data.old_log_prob - log_prob).detach().cpu().numpy()) if self.target_kl is not None and np.mean(approx_kl_divs) > 1.5 * self.target_kl: print("Early stopping at step {} due to reaching max kl: {:.2f}".format(gradient_step, diff --git a/torchy_baselines/sac/sac.py b/torchy_baselines/sac/sac.py index fa6738ba4..d7906e891 100644 --- a/torchy_baselines/sac/sac.py +++ b/torchy_baselines/sac/sac.py @@ -163,14 +163,12 @@ def train(self, gradient_steps: int, batch_size: int = 64): # Sample replay buffer replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env) - obs, action_batch, next_obs, done, reward = replay_data - # We need to sample because `log_std` may have changed between two gradient steps if self.use_sde: self.actor.reset_noise() # Action by the current actor for the sampled state - action_pi, log_prob = self.actor.action_log_prob(obs) + actions_pi, log_prob = self.actor.action_log_prob(replay_data.observations) log_prob = log_prob.reshape(-1, 1) ent_coef_loss = None @@ -192,17 +190,17 @@ def train(self, gradient_steps: int, batch_size: int = 64): with th.no_grad(): # Select action according to policy - next_action, next_log_prob = self.actor.action_log_prob(next_obs) + next_actions, next_log_prob = self.actor.action_log_prob(replay_data.next_observations) # Compute the target Q value - target_q1, target_q2 = self.critic_target(next_obs, next_action) + target_q1, target_q2 = self.critic_target(replay_data.next_observations, next_actions) target_q = th.min(target_q1, target_q2) - target_q = reward + (1 - done) * self.gamma * target_q + target_q = replay_data.rewards + (1 - replay_data.dones) * self.gamma * target_q # td error + entropy term q_backup = target_q - ent_coef * next_log_prob.reshape(-1, 1) # Get current Q estimates # using action from the replay buffer - current_q1, current_q2 = self.critic(obs, action_batch) + current_q1, current_q2 = self.critic(replay_data.observations, replay_data.actions) # Compute critic loss critic_loss = 0.5 * (F.mse_loss(current_q1, q_backup) + F.mse_loss(current_q2, q_backup)) @@ -214,7 +212,7 @@ def train(self, gradient_steps: int, batch_size: int = 64): # Compute actor loss # Alternative: actor_loss = th.mean(log_prob - qf1_pi) - qf1_pi, qf2_pi = self.critic.forward(obs, action_pi) + qf1_pi, qf2_pi = self.critic.forward(replay_data.observations, actions_pi) min_qf_pi = th.min(qf1_pi, qf2_pi) actor_loss = (ent_coef * log_prob - min_qf_pi).mean() diff --git a/torchy_baselines/td3/td3.py b/torchy_baselines/td3/td3.py index 1ba594775..963338df7 100644 --- a/torchy_baselines/td3/td3.py +++ b/torchy_baselines/td3/td3.py @@ -124,22 +124,20 @@ def train_critic(self, gradient_steps: int = 1, for gradient_step in range(gradient_steps): # Sample replay buffer if replay_data is None: - obs, action, next_obs, done, reward = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env) - else: - obs, action, next_obs, done, reward = replay_data + replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env) # Select action according to policy and add clipped noise - noise = action.clone().data.normal_(0, self.target_policy_noise) + noise = replay_data.actions.clone().data.normal_(0, self.target_policy_noise) noise = 
noise.clamp(-self.target_noise_clip, self.target_noise_clip) - next_action = (self.actor_target(next_obs) + noise).clamp(-1, 1) + next_actions = (self.actor_target(replay_data.next_observations) + noise).clamp(-1, 1) # Compute the target Q value - target_q1, target_q2 = self.critic_target(next_obs, next_action) + target_q1, target_q2 = self.critic_target(replay_data.next_observations, next_actions) target_q = th.min(target_q1, target_q2) - target_q = reward + ((1 - done) * self.gamma * target_q).detach() + target_q = replay_data.rewards + ((1 - replay_data.dones) * self.gamma * target_q).detach() # Get current Q estimates - current_q1, current_q2 = self.critic(obs, action) + current_q1, current_q2 = self.critic(replay_data.observations, replay_data.actions) # Compute critic loss critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(current_q2, target_q) @@ -167,12 +165,10 @@ def train_actor(self, gradient_steps: int = 1, for gradient_step in range(gradient_steps): # Sample replay buffer if replay_data is None: - obs, _, next_obs, done, reward = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env) - else: - obs, _, next_obs, done, reward = replay_data + replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env) # Compute actor loss - actor_loss = -self.critic.q1_forward(obs, self.actor(obs)).mean() + actor_loss = -self.critic.q1_forward(replay_data.observations, self.actor(replay_data.observations)).mean() # Optimize the actor self.actor.optimizer.zero_grad() From 20ee8cb68dcd036b5045e6fce41d6b31f359e16a Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 10 Mar 2020 16:55:13 +0100 Subject: [PATCH 05/17] Update changelog and add more namedtuples --- docs/misc/changelog.rst | 1 + torchy_baselines/common/base_class.py | 10 +++++----- torchy_baselines/common/type_aliases.py | 13 +++++++++++-- torchy_baselines/sac/sac.py | 10 ++++------ torchy_baselines/td3/td3.py | 9 ++++----- 5 files changed, 25 insertions(+), 18 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index b05e73a1a..2aef391a2 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -23,6 +23,7 @@ Others: ^^^^^^^ - SAC with SDE now sample only one matrix - Added ``clip_mean`` parameter to SAC policy +- Buffers now return ``NamedTuple`` Documentation: ^^^^^^^^^^^^^^ diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index b01002576..d6bd69de3 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -16,7 +16,7 @@ from torchy_baselines.common.utils import set_random_seed, get_schedule_fn, update_learning_rate from torchy_baselines.common.vec_env import DummyVecEnv, VecEnv, unwrap_vec_normalize, VecNormalize from torchy_baselines.common.save_util import data_to_json, json_to_data, recursive_getattr, recursive_setattr -from torchy_baselines.common.type_aliases import GymEnv, TensorDict, OptimizerStateDict +from torchy_baselines.common.type_aliases import GymEnv, TensorDict, OptimizerStateDict, RolloutReturn from torchy_baselines.common.callbacks import BaseCallback, CallbackList, ConvertCallback, EvalCallback from torchy_baselines.common.monitor import Monitor from torchy_baselines.common.noise import ActionNoise @@ -830,7 +830,7 @@ def collect_rollouts(self, replay_buffer: Optional[ReplayBuffer] = None, obs: Optional[np.ndarray] = None, episode_num: int = 0, - log_interval: Optional[int] = None) -> Tuple[float, int, int, Optional[np.ndarray], bool]: + log_interval: 
Optional[int] = None) -> RolloutReturn: """ Collect rollout using the current policy (and possibly fill the replay buffer) @@ -849,6 +849,7 @@ def collect_rollouts(self, :param obs: (np.ndarray) Last observation from the environment :param episode_num: (int) Episode index :param log_interval: (int) Log data every `log_interval` episodes + :return: (RolloutReturn) """ episode_rewards, total_timesteps = [], [] total_steps, total_episodes = 0, 0 @@ -878,8 +879,7 @@ def collect_rollouts(self, # Only stop training if return value is False, not when it is None. if callback() is False: - continue_training = False - return 0.0, total_steps, total_episodes, None, continue_training + return RolloutReturn(0.0, total_steps, total_episodes, None, continue_training=False) if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix @@ -1003,4 +1003,4 @@ def collect_rollouts(self, callback.on_rollout_end() - return mean_reward, total_steps, total_episodes, obs, continue_training + return RolloutReturn(mean_reward, total_steps, total_episodes, obs, continue_training) diff --git a/torchy_baselines/common/type_aliases.py b/torchy_baselines/common/type_aliases.py index 16576fff6..12c220fe4 100644 --- a/torchy_baselines/common/type_aliases.py +++ b/torchy_baselines/common/type_aliases.py @@ -4,6 +4,7 @@ from typing import Union, Type, Optional, Dict, Any, List, NamedTuple from collections import namedtuple +import numpy as np import torch as th import gym @@ -13,7 +14,8 @@ GymEnv = Union[gym.Env, VecEnv] TensorDict = Dict[str, th.Tensor] OptimizerStateDict = Dict[str, Any] -# obs, action, old_values, old_log_prob, advantage, return_batch + + class RolloutBufferSamples(NamedTuple): observations: th.Tensor actions: th.Tensor @@ -23,10 +25,17 @@ class RolloutBufferSamples(NamedTuple): returns: th.Tensor -# obs, action, next_obs, done, reward class ReplayBufferSamples(NamedTuple): observations: th.Tensor actions: th.Tensor next_observations: th.Tensor dones: th.Tensor rewards: th.Tensor + + +class RolloutReturn(NamedTuple): + episode_reward: float + episode_timesteps: int + n_episodes: int + obs: Optional[np.ndarray] + continue_training: bool diff --git a/torchy_baselines/sac/sac.py b/torchy_baselines/sac/sac.py index d7906e891..58b19dbb8 100644 --- a/torchy_baselines/sac/sac.py +++ b/torchy_baselines/sac/sac.py @@ -249,18 +249,16 @@ def learn(self, total_timesteps, callback=None, log_interval=4, replay_buffer=self.replay_buffer, obs=obs, episode_num=episode_num, log_interval=log_interval) - # Unpack - episode_reward, episode_timesteps, n_episodes, obs, continue_training = rollout - if continue_training is False: + if rollout.continue_training is False: break - episode_num += n_episodes + obs = rollout.obs + episode_num += rollout.n_episodes self._update_current_progress(self.num_timesteps, total_timesteps) if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts: - gradient_steps = self.gradient_steps if self.gradient_steps > 0 else episode_timesteps - + gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps self.train(gradient_steps, batch_size=self.batch_size) callback.on_training_end() diff --git a/torchy_baselines/td3/td3.py b/torchy_baselines/td3/td3.py index 963338df7..767f2f070 100644 --- a/torchy_baselines/td3/td3.py +++ b/torchy_baselines/td3/td3.py @@ -259,13 +259,12 @@ def learn(self, total_timesteps, callback=None, log_interval=4, replay_buffer=self.replay_buffer, obs=obs, 
episode_num=episode_num, log_interval=log_interval) - # Unpack - episode_reward, episode_timesteps, n_episodes, obs, continue_training = rollout - if continue_training is False: + if rollout.continue_training is False: break - episode_num += n_episodes + obs = rollout.obs + episode_num += rollout.n_episodes self._update_current_progress(self.num_timesteps, total_timesteps) if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts: @@ -279,7 +278,7 @@ def learn(self, total_timesteps, callback=None, log_interval=4, # On-policy gradient self.train_sde() - gradient_steps = self.gradient_steps if self.gradient_steps > 0 else episode_timesteps + gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps self.train(gradient_steps, batch_size=self.batch_size, policy_delay=self.policy_delay) callback.on_training_end() From f159a4a9f261737db4427de072bb189056604731 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 10 Mar 2020 17:08:39 +0100 Subject: [PATCH 06/17] Bug fix for A2C --- torchy_baselines/a2c/a2c.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index ea27d715f..5f0d507b3 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -97,8 +97,9 @@ def train(self, gradient_steps: int, batch_size=None): values = values.flatten() # Normalize advantage (not present in the original implementation) + advantages = rollout_data.advantages if self.normalize_advantage: - advantages = (rollout_data.advantages - rollout_data.advantages.mean()) / (rollout_data.advantages.std() + 1e-8) + advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) # Policy gradient loss policy_loss = -(advantages * log_prob).mean() From 80fb62e22ddbf3ab431e3f6a324575a15968ca43 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 10 Mar 2020 17:10:15 +0100 Subject: [PATCH 07/17] Bump version --- setup.py | 2 +- torchy_baselines/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 8780397de..51dea7b81 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ license="MIT", long_description="", long_description_content_type='text/markdown', - version="0.2.1", + version="0.2.2", ) # python setup.py sdist diff --git a/torchy_baselines/__init__.py b/torchy_baselines/__init__.py index b201dc8ef..a548d3037 100644 --- a/torchy_baselines/__init__.py +++ b/torchy_baselines/__init__.py @@ -4,4 +4,4 @@ from torchy_baselines.sac import SAC from torchy_baselines.td3 import TD3 -__version__ = "0.2.1" +__version__ = "0.2.2" From 6ebad92e1b1191a98dc79de84352cf2b15162716 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 10 Mar 2020 17:43:54 +0100 Subject: [PATCH 08/17] Remove default seed and bump dependencies --- docs/misc/changelog.rst | 3 ++ setup.py | 4 +- torchy_baselines/a2c/a2c.py | 2 +- torchy_baselines/cem_rl/cem_rl.py | 2 +- torchy_baselines/ppo/ppo.py | 2 +- torchy_baselines/sac/policies.py | 38 ++++++++++-------- torchy_baselines/sac/sac.py | 65 +++++++++++++++++++++---------- torchy_baselines/td3/td3.py | 2 +- 8 files changed, 76 insertions(+), 42 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 2aef391a2..45d28de6c 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -9,6 +9,8 @@ Pre-Release 0.3.0a0 (WIP) Breaking Changes: ^^^^^^^^^^^^^^^^^ +- Removed default seed +- Bump dependencies (PyTorch and Gym) New Features: ^^^^^^^^^^^^^ @@ -24,6 +26,7 @@ 
Others: - SAC with SDE now sample only one matrix - Added ``clip_mean`` parameter to SAC policy - Buffers now return ``NamedTuple`` +- More typing Documentation: ^^^^^^^^^^^^^^ diff --git a/setup.py b/setup.py index 51dea7b81..1fa044cc6 100644 --- a/setup.py +++ b/setup.py @@ -7,9 +7,9 @@ packages=[package for package in find_packages() if package.startswith('torchy_baselines')], install_requires=[ - 'gym[classic_control]>=0.10.9', + 'gym[classic_control]>=0.11', 'numpy', - 'torch>=1.2.0', + 'torch>=1.4.0', 'cloudpickle', # For reading logs 'pandas', diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index 5f0d507b3..1b60e2740 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -51,7 +51,7 @@ def __init__(self, policy, env, learning_rate=7e-4, ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5, rms_prop_eps=1e-5, use_rms_prop=True, use_sde=False, sde_sample_freq=-1, normalize_advantage=False, tensorboard_log=None, create_eval_env=False, - policy_kwargs=None, verbose=0, seed=0, device='auto', + policy_kwargs=None, verbose=0, seed=None, device='auto', _init_setup_model=True): super(A2C, self).__init__(policy, env, learning_rate=learning_rate, diff --git a/torchy_baselines/cem_rl/cem_rl.py b/torchy_baselines/cem_rl/cem_rl.py index 9a2defa59..2d1838ee6 100644 --- a/torchy_baselines/cem_rl/cem_rl.py +++ b/torchy_baselines/cem_rl/cem_rl.py @@ -62,7 +62,7 @@ def __init__(self, policy, env, sigma_init=1e-3, pop_size=10, action_noise=None, target_policy_noise=0.2, target_noise_clip=0.5, n_episodes_rollout=1, update_style='original', tensorboard_log=None, create_eval_env=False, - policy_kwargs=None, verbose=0, seed=0, device='auto', + policy_kwargs=None, verbose=0, seed=None, device='auto', _init_setup_model=True): super(CEMRL, self).__init__(policy, env, diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index f10f5e219..4ee8ef1f0 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -79,7 +79,7 @@ def __init__(self, policy, env, learning_rate=3e-4, ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5, use_sde=False, sde_sample_freq=-1, target_kl=None, tensorboard_log=None, create_eval_env=False, - policy_kwargs=None, verbose=0, seed=0, device='auto', + policy_kwargs=None, verbose=0, seed=None, device='auto', _init_setup_model=True): super(PPO, self).__init__(policy, env, PPOPolicy, policy_kwargs=policy_kwargs, diff --git a/torchy_baselines/sac/policies.py b/torchy_baselines/sac/policies.py index e570b570c..15a28f2dd 100644 --- a/torchy_baselines/sac/policies.py +++ b/torchy_baselines/sac/policies.py @@ -1,5 +1,6 @@ -from typing import Optional, List, Tuple +from typing import Optional, List, Tuple, Callable, Union +import gym import torch as th import torch.nn as nn @@ -143,8 +144,10 @@ class Critic(BaseNetwork): :param net_arch: ([int]) Network architecture :param activation_fn: (nn.Module) Activation function """ - def __init__(self, obs_dim, action_dim, - net_arch, activation_fn=nn.ReLU): + def __init__(self, obs_dim: int, + action_dim: int, + net_arch: List[int], + activation_fn: nn.Module = nn.ReLU): super(Critic, self).__init__() q1_net = create_mlp(obs_dim + action_dim, 1, net_arch, activation_fn) @@ -155,13 +158,10 @@ def __init__(self, obs_dim, action_dim, self.q_networks = [self.q1_net, self.q2_net] - def forward(self, obs, action): + def forward(self, obs: th.Tensor, action: th.Tensor) -> List[th.Tensor]: qvalue_input = th.cat([obs, action], dim=1) return [q_net(qvalue_input) for q_net in self.q_networks] - def 
q1_forward(self, obs, action): - return self.q_networks[0](th.cat([obs, action], dim=1)) - class SACPolicy(BasePolicy): """ @@ -183,11 +183,17 @@ class SACPolicy(BasePolicy): above zero and prevent it from growing too fast. In practice, `exp()` is usually enough. :param clip_mean: (float) Clip the mean output when using SDE to avoid numerical instability. """ - def __init__(self, observation_space, action_space, - learning_rate, net_arch=None, device='cpu', - activation_fn=nn.ReLU, use_sde=False, - log_std_init=-3, sde_net_arch=None, - use_expln=False, clip_mean=2.0): + def __init__(self, observation_space: gym.spaces.Space, + action_space: gym.spaces.Space, + learning_rate: Callable, + net_arch: Optional[List[int]] = None, + device: Union[th.device, str] = 'cpu', + activation_fn: nn.Module = nn.ReLU, + use_sde: bool = False, + log_std_init: float = -3, + sde_net_arch: Optional[List[int]] = None, + use_expln: bool = False, + clip_mean: float = 2.0): super(SACPolicy, self).__init__(observation_space, action_space, device, squash_output=True) if net_arch is None: @@ -217,7 +223,7 @@ def __init__(self, observation_space, action_space, self._build(learning_rate) - def _build(self, learning_rate): + def _build(self, learning_rate: Callable) -> None: self.actor = self.make_actor() self.actor.optimizer = th.optim.Adam(self.actor.parameters(), lr=learning_rate(1)) @@ -226,13 +232,13 @@ def _build(self, learning_rate): self.critic_target.load_state_dict(self.critic.state_dict()) self.critic.optimizer = th.optim.Adam(self.critic.parameters(), lr=learning_rate(1)) - def make_actor(self): + def make_actor(self) -> Actor: return Actor(**self.actor_kwargs).to(self.device) - def make_critic(self): + def make_critic(self) -> Critic: return Critic(**self.net_args).to(self.device) - def forward(self, obs): + def forward(self, obs: th.Tensor) -> th.Tensor: return self.actor(obs) def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: diff --git a/torchy_baselines/sac/sac.py b/torchy_baselines/sac/sac.py index 58b19dbb8..2ac822fc6 100644 --- a/torchy_baselines/sac/sac.py +++ b/torchy_baselines/sac/sac.py @@ -1,13 +1,16 @@ -from typing import List, Tuple +from typing import List, Tuple, Type, Union, Callable, Optional, Dict, Any import torch as th import torch.nn.functional as F import numpy as np +from torchy_baselines.common import logger from torchy_baselines.common.base_class import OffPolicyRLModel from torchy_baselines.common.buffers import ReplayBuffer +from torchy_baselines.common.type_aliases import GymEnv +from torchy_baselines.common.noise import ActionNoise +from torchy_baselines.common.callbacks import BaseCallback from torchy_baselines.sac.policies import SACPolicy -from torchy_baselines.common import logger class SAC(OffPolicyRLModel): @@ -25,7 +28,7 @@ class SAC(OffPolicyRLModel): in https://github.com/hill-a/stable-baselines/issues/270 :param policy: (SACPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, ...) 
- :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) + :param env: (GymEnv or str) The environment to learn from (if registered in Gym, can be str) :param learning_rate: (float or callable) learning rate for adam optimizer, the same learning rate will be used for all networks (Q-Values, Actor and Value function) it can be a function of the current progress (from 1 to 0) @@ -61,16 +64,31 @@ class SAC(OffPolicyRLModel): :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance """ - def __init__(self, policy, env, learning_rate=3e-4, buffer_size=int(1e6), - learning_starts=100, batch_size=256, - tau=0.005, ent_coef='auto', target_update_interval=1, - train_freq=1, gradient_steps=1, n_episodes_rollout=-1, - target_entropy='auto', action_noise=None, - gamma=0.99, use_sde=False, sde_sample_freq=-1, - use_sde_at_warmup=False, - tensorboard_log=None, create_eval_env=False, - policy_kwargs=None, verbose=0, seed=0, device='auto', - _init_setup_model=True): + def __init__(self, policy: Union[str, Type[SACPolicy]], + env: Union[GymEnv, str], + learning_rate: Union[float, Callable] = 3e-4, + buffer_size: int = int(1e6), + learning_starts: int = 100, + batch_size: int = 256, + tau: float = 0.005, + ent_coef: Union[str, float] = 'auto', + target_update_interval: int = 1, + train_freq: int = 1, + gradient_steps: int = 1, + n_episodes_rollout: int = -1, + target_entropy: Union[str, float] = 'auto', + action_noise: Optional[ActionNoise] = None, + gamma: float = 0.99, + use_sde: bool = False, + sde_sample_freq: int = -1, + use_sde_at_warmup: bool = False, + tensorboard_log: Optional[str] = None, + create_eval_env: bool = False, + policy_kwargs: Dict[str, Any] = None, + verbose: int = 0, + seed: Optional[int] = None, + device: Union[th.device, str] = 'auto', + _init_setup_model: bool = True): super(SAC, self).__init__(policy, env, SACPolicy, policy_kwargs, verbose, device, create_eval_env=create_eval_env, seed=seed, @@ -79,7 +97,7 @@ def __init__(self, policy, env, learning_rate=3e-4, buffer_size=int(1e6), self.learning_rate = learning_rate self.target_entropy = target_entropy - self.log_ent_coef = None + self.log_ent_coef = None # type: Optional[th.Tensor] self.target_update_interval = target_update_interval self.buffer_size = buffer_size # In the original paper, same learning rate is used for all networks @@ -101,7 +119,7 @@ def __init__(self, policy, env, learning_rate=3e-4, buffer_size=int(1e6), if _init_setup_model: self._setup_model() - def _setup_model(self): + def _setup_model(self) -> None: self._setup_learning_rate() obs_dim, action_dim = self.observation_space.shape[0], self.action_space.shape[0] if self.seed is not None: @@ -143,12 +161,12 @@ def _setup_model(self): self.policy = self.policy.to(self.device) self._create_aliases() - def _create_aliases(self): + def _create_aliases(self) -> None: self.actor = self.policy.actor self.critic = self.policy.critic self.critic_target = self.policy.critic_target - def train(self, gradient_steps: int, batch_size: int = 64): + def train(self, gradient_steps: int, batch_size: int = 64) -> None: # Update optimizers learning rate optimizers = [self.actor.optimizer, self.critic.optimizer] if self.ent_coef_optimizer is not None: @@ -233,9 +251,16 @@ def train(self, gradient_steps: int, batch_size: int = 64): if ent_coef_loss is not None: logger.logkv("ent_coef_loss", ent_coef_loss.item()) - def learn(self, total_timesteps, callback=None, log_interval=4, - eval_env=None, 
eval_freq=-1, n_eval_episodes=5, tb_log_name="SAC", - eval_log_path=None, reset_num_timesteps=True): + def learn(self, + total_timesteps: int, + callback: Optional[BaseCallback] = None, + log_interval: int = 4, + eval_env: Optional[GymEnv] = None, + eval_freq: int = -1, + n_eval_episodes: int = 5, + tb_log_name: str = "SAC", + eval_log_path: Optional[str] = None, + reset_num_timesteps: bool = True) -> OffPolicyRLModel: episode_num, obs, callback = self._setup_learn(eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps) diff --git a/torchy_baselines/td3/td3.py b/torchy_baselines/td3/td3.py index 767f2f070..7dbaf1636 100644 --- a/torchy_baselines/td3/td3.py +++ b/torchy_baselines/td3/td3.py @@ -65,7 +65,7 @@ def __init__(self, policy, env, buffer_size=int(1e6), learning_rate=1e-3, use_sde=False, sde_sample_freq=-1, sde_max_grad_norm=1, sde_ent_coef=0.0, sde_log_std_scheduler=None, use_sde_at_warmup=False, tensorboard_log=None, create_eval_env=False, policy_kwargs=None, verbose=0, - seed=0, device='auto', _init_setup_model=True): + seed=None, device='auto', _init_setup_model=True): super(TD3, self).__init__(policy, env, TD3Policy, policy_kwargs, verbose, device, create_eval_env=create_eval_env, seed=seed, From 35d0d2b32056ccc614770ec7b58065fb637b5cec Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 10 Mar 2020 18:09:45 +0100 Subject: [PATCH 09/17] More typing --- torchy_baselines/common/policies.py | 7 ++-- torchy_baselines/ppo/policies.py | 56 +++++++++++++++++------------ 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/torchy_baselines/common/policies.py b/torchy_baselines/common/policies.py index abab7d845..a5e08b5c6 100644 --- a/torchy_baselines/common/policies.py +++ b/torchy_baselines/common/policies.py @@ -237,7 +237,10 @@ class MlpExtractor(nn.Module): :param activation_fn: (nn.Module) The activation function to use for the networks. :param device: (th.device) """ - def __init__(self, feature_dim, net_arch, activation_fn, device='cpu'): + def __init__(self, feature_dim: int, + net_arch: List[Union[int, Dict[str, List[int]]]], + activation_fn: nn.Module, + device: Union[th.device, str] = 'cpu'): super(MlpExtractor, self).__init__() shared_net, policy_net, value_net = [], [], [] @@ -291,7 +294,7 @@ def __init__(self, feature_dim, net_arch, activation_fn, device='cpu'): self.policy_net = nn.Sequential(*policy_net).to(device) self.value_net = nn.Sequential(*value_net).to(device) - def forward(self, features): + def forward(self, features: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: """ :return: (th.Tensor, th.Tensor) latent_policy, latent_value of the specified network. 
If all layers are shared, then ``latent_policy == latent_value`` diff --git a/torchy_baselines/ppo/policies.py b/torchy_baselines/ppo/policies.py index 3e47375d5..491737a02 100644 --- a/torchy_baselines/ppo/policies.py +++ b/torchy_baselines/ppo/policies.py @@ -1,13 +1,16 @@ +from typing import Optional, List, Tuple, Callable, Union, Dict from functools import partial +import gym import torch as th import torch.nn as nn import numpy as np -from torchy_baselines.common.policies import BasePolicy, register_policy, MlpExtractor, \ - create_sde_feature_extractor -from torchy_baselines.common.distributions import make_proba_distribution,\ - DiagGaussianDistribution, CategoricalDistribution, StateDependentNoiseDistribution +from torchy_baselines.common.policies import (BasePolicy, register_policy, MlpExtractor, + create_sde_feature_extractor) +from torchy_baselines.common.distributions import (make_proba_distribution, Distribution, + DiagGaussianDistribution, CategoricalDistribution, StateDependentNoiseDistribution) + class PPOPolicy(BasePolicy): @@ -35,12 +38,21 @@ class PPOPolicy(BasePolicy): :param squash_output: (bool) Whether to squash the output using a tanh function, this allows to ensure boundaries when using SDE. """ - def __init__(self, observation_space, action_space, - learning_rate, net_arch=None, device='cpu', - activation_fn=nn.Tanh, adam_epsilon=1e-5, - ortho_init=True, use_sde=False, - log_std_init=0.0, full_std=True, - sde_net_arch=None, use_expln=False, squash_output=False): + def __init__(self, + observation_space: gym.spaces.Space, + action_space: gym.spaces.Space, + learning_rate: Callable, + net_arch: Optional[List[Union[int, Dict[str, List[int]]]]] = None, + device: Union[th.device, str] = 'cpu', + activation_fn: nn.Module = nn.Tanh, + adam_epsilon: float = 1e-5, + ortho_init: bool = True, + use_sde: bool = False, + log_std_init: float = 0.0, + full_std: bool = True, + sde_net_arch: Optional[List[int]] = None, + use_expln: bool = False, + squash_output: bool = False): super(PPOPolicy, self).__init__(observation_space, action_space, device, squash_output=squash_output) self.obs_dim = self.observation_space.shape[0] @@ -83,7 +95,7 @@ def __init__(self, observation_space, action_space, self._build(learning_rate) - def reset_noise(self, n_envs: int = 1): + def reset_noise(self, n_envs: int = 1) -> None: """ Sample new weights for the exploration matrix. 
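# For context on what sample_weights()/reset_noise() provide, a hedged,
# stand-alone sketch of generalized state-dependent exploration (shapes and
# helper names are assumptions, not the real StateDependentNoiseDistribution):
import torch as th

latent_sde_dim, action_dim, n_envs = 64, 6, 8
log_std = th.zeros(latent_sde_dim, action_dim)   # learned parameter in the real code

# "Sampling new weights" means drawing one weight matrix per environment,
# scaled by the current standard deviation exp(log_std).
exploration_mat = th.randn(n_envs, latent_sde_dim, action_dim) * th.exp(log_std)

def state_dependent_noise(latent_sde: th.Tensor) -> th.Tensor:
    # Given the sampled matrix, the noise is a deterministic function of the
    # state features, hence "state-dependent" exploration.
    return th.einsum('bi,bij->bj', latent_sde, exploration_mat)

actions = th.zeros(n_envs, action_dim) + state_dependent_noise(th.randn(n_envs, latent_sde_dim))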
@@ -92,7 +104,7 @@ def reset_noise(self, n_envs: int = 1): assert isinstance(self.action_dist, StateDependentNoiseDistribution), 'reset_noise() is only available when using SDE' self.action_dist.sample_weights(self.log_std, batch_size=n_envs) - def _build(self, learning_rate): + def _build(self, learning_rate: Callable) -> None: self.mlp_extractor = MlpExtractor(self.features_dim, net_arch=self.net_arch, activation_fn=self.activation_fn, device=self.device) @@ -129,7 +141,7 @@ def _build(self, learning_rate): module.apply(partial(self.init_weights, gain=gain)) self.optimizer = th.optim.Adam(self.parameters(), lr=learning_rate(1), eps=self.adam_epsilon) - def forward(self, obs, deterministic=False): + def forward(self, obs: th.Tensor, deterministic: bool = False) -> Tuple[th.Tensor, th.Tensor, th.Tensor]: if not isinstance(obs, th.Tensor): obs = th.FloatTensor(obs).to(self.device) latent_pi, latent_vf, latent_sde = self._get_latent(obs) @@ -139,7 +151,7 @@ def forward(self, obs, deterministic=False): log_prob = action_distribution.log_prob(action) return action, value, log_prob - def _get_latent(self, obs): + def _get_latent(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]: features = self.features_extractor(obs) latent_pi, latent_vf = self.mlp_extractor(features) # Features for sde @@ -148,7 +160,9 @@ def _get_latent(self, obs): latent_sde = self.sde_feature_extractor(features) return latent_pi, latent_vf, latent_sde - def _get_action_dist_from_latent(self, latent_pi, latent_sde=None, deterministic=False): + def _get_action_dist_from_latent(self, latent_pi: th.Tensor, + latent_sde: Optional[th.Tensor] = None, + deterministic: bool = False) -> Tuple[th.Tensor, Distribution]: mean_actions = self.action_net(latent_pi) if isinstance(self.action_dist, DiagGaussianDistribution): @@ -169,7 +183,7 @@ def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Ten action, _ = self._get_action_dist_from_latent(latent_pi, latent_sde, deterministic=deterministic) return action - def evaluate_actions(self, obs, action, deterministic=False): + def evaluate_actions(self, obs: th.Tensor, actions: th.Tensor, deterministic: bool = False) -> Tuple[th.Tensor, th.Tensor, th.Tensor]: """ Evaluate actions according to the current policy, given the observations. 
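# A hedged sketch of how the (values, log_prob, entropy) triple returned by
# evaluate_actions() is typically consumed; `rollout_data` stands for the
# RolloutBufferSamples NamedTuple introduced earlier in this series, and the
# function itself is illustrative rather than part of the patch.
import torch as th
import torch.nn.functional as F

def ppo_losses(values, log_prob, entropy, rollout_data, clip_range=0.2):
    # Normalized advantages and the clipped surrogate objective
    advantages = rollout_data.advantages
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    ratio = th.exp(log_prob - rollout_data.old_log_prob)
    policy_loss = -th.min(advantages * ratio,
                          advantages * th.clamp(ratio, 1.0 - clip_range, 1.0 + clip_range)).mean()
    # Value loss against the TD(gae_lambda) returns stored in the buffer
    value_loss = F.mse_loss(rollout_data.returns, values)
    # Fall back to -log_prob when the distribution has no closed-form entropy
    entropy_loss = -entropy.mean() if entropy is not None else -log_prob.mean()
    return policy_loss, value_loss, entropy_loss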
@@ -182,13 +196,9 @@ def evaluate_actions(self, obs, action, deterministic=False): """ latent_pi, latent_vf, latent_sde = self._get_latent(obs) _, action_distribution = self._get_action_dist_from_latent(latent_pi, latent_sde, deterministic=deterministic) - log_prob = action_distribution.log_prob(action) - value = self.value_net(latent_vf) - return value, log_prob, action_distribution.entropy() - - def value_forward(self, obs): - _, latent_vf, _ = self._get_latent(obs) - return self.value_net(latent_vf) + log_prob = action_distribution.log_prob(actions) + values = self.value_net(latent_vf) + return values, log_prob, action_distribution.entropy() MlpPolicy = PPOPolicy From 7e3736ed56a8fbc86a312ab1587558e4f34011b9 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 10 Mar 2020 18:17:47 +0100 Subject: [PATCH 10/17] Type A2C and PPO init --- torchy_baselines/a2c/a2c.py | 35 +++++++++++++++++++++++++++-------- torchy_baselines/ppo/ppo.py | 36 ++++++++++++++++++++++++++---------- 2 files changed, 53 insertions(+), 18 deletions(-) diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index 1b60e2740..4c24c7faf 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -1,10 +1,15 @@ +from typing import List, Tuple, Type, Union, Callable, Optional, Dict, Any + from gym import spaces import torch as th import torch.nn.functional as F from torchy_baselines.common.utils import explained_variance -from torchy_baselines.ppo.ppo import PPO from torchy_baselines.common import logger +from torchy_baselines.common.type_aliases import GymEnv +from torchy_baselines.ppo.ppo import PPO +from torchy_baselines.ppo.policies import PPOPolicy + class A2C(PPO): @@ -46,13 +51,27 @@ class A2C(PPO): Setting it to auto, the code will be run on the GPU if possible. 
:param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance """ - def __init__(self, policy, env, learning_rate=7e-4, - n_steps=5, gamma=0.99, gae_lambda=1.0, - ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5, - rms_prop_eps=1e-5, use_rms_prop=True, use_sde=False, sde_sample_freq=-1, - normalize_advantage=False, tensorboard_log=None, create_eval_env=False, - policy_kwargs=None, verbose=0, seed=None, device='auto', - _init_setup_model=True): + def __init__(self, policy: Union[str, Type[PPOPolicy]], + env: Union[GymEnv, str], + learning_rate: Union[float, Callable] = 7e-4, + n_steps: int = 5, + gamma: float = 0.99, + gae_lambda: float = 1.0, + ent_coef: float = 0.0, + vf_coef: float = 0.5, + max_grad_norm: float = 0.5, + rms_prop_eps: float = 1e-5, + use_rms_prop: bool = True, + use_sde: bool = False, + sde_sample_freq: int = -1, + normalize_advantage: bool = False, + tensorboard_log: Optional[str] = None, + create_eval_env: bool = False, + policy_kwargs: Optional[Dict[str, Any]] = None, + verbose: int = 0, + seed: Optional[int] = None, + device: Union[th.device, str] = 'auto', + _init_setup_model: bool = True): super(A2C, self).__init__(policy, env, learning_rate=learning_rate, n_steps=n_steps, batch_size=None, n_epochs=1, diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index 4ee8ef1f0..6f9a4839d 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -1,6 +1,6 @@ import os import time -from typing import Optional, Tuple, List +from typing import List, Tuple, Type, Union, Callable, Optional, Dict, Any import gym from gym import spaces @@ -14,12 +14,13 @@ SummaryWriter = None import numpy as np +from torchy_baselines.common import logger from torchy_baselines.common.base_class import BaseRLModel +from torchy_baselines.common.type_aliases import GymEnv from torchy_baselines.common.buffers import RolloutBuffer from torchy_baselines.common.utils import explained_variance, get_schedule_fn from torchy_baselines.common.vec_env import VecEnv from torchy_baselines.common.callbacks import BaseCallback -from torchy_baselines.common import logger from torchy_baselines.ppo.policies import PPOPolicy @@ -73,14 +74,29 @@ class PPO(BaseRLModel): :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance """ - def __init__(self, policy, env, learning_rate=3e-4, - n_steps=2048, batch_size=64, n_epochs=10, - gamma=0.99, gae_lambda=0.95, clip_range=0.2, clip_range_vf=None, - ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5, - use_sde=False, sde_sample_freq=-1, - target_kl=None, tensorboard_log=None, create_eval_env=False, - policy_kwargs=None, verbose=0, seed=None, device='auto', - _init_setup_model=True): + def __init__(self, policy: Union[str, Type[PPOPolicy]], + env: Union[GymEnv, str], + learning_rate: Union[float, Callable] = 3e-4, + n_steps: int = 2048, + batch_size: Optional[int] = 64, + n_epochs: int = 10, + gamma: float = 0.99, + gae_lambda: float = 0.95, + clip_range: float = 0.2, + clip_range_vf: Optional[float] = None, + ent_coef: float = 0.0, + vf_coef: float = 0.5, + max_grad_norm: float = 0.5, + use_sde: bool = False, + sde_sample_freq: int = -1, + target_kl: Optional[float] = None, + tensorboard_log: Optional[str] = None, + create_eval_env: bool = False, + policy_kwargs: Optional[Dict[str, Any]] = None, + verbose: int = 0, + seed: Optional[int] = None, + device: Union[th.device, str] = 'auto', + _init_setup_model: bool = True): super(PPO, self).__init__(policy, env, PPOPolicy, 
policy_kwargs=policy_kwargs, verbose=verbose, device=device, use_sde=use_sde, sde_sample_freq=sde_sample_freq, From 90d1558534192cb6893ebca8524cd966b39da267 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 11 Mar 2020 12:45:21 +0100 Subject: [PATCH 11/17] Type and reorder arguments --- torchy_baselines/cem_rl/cem_rl.py | 97 +++++++++++++++++---------- torchy_baselines/common/base_class.py | 2 +- torchy_baselines/sac/sac.py | 28 ++++---- torchy_baselines/td3/td3.py | 90 ++++++++++++++++--------- 4 files changed, 137 insertions(+), 80 deletions(-) diff --git a/torchy_baselines/cem_rl/cem_rl.py b/torchy_baselines/cem_rl/cem_rl.py index 2d1838ee6..867f2c418 100644 --- a/torchy_baselines/cem_rl/cem_rl.py +++ b/torchy_baselines/cem_rl/cem_rl.py @@ -1,11 +1,13 @@ -import time +from typing import Type, Union, Callable, Optional, Dict, Any import torch as th +from torchy_baselines.common.base_class import OffPolicyRLModel +from torchy_baselines.common.callbacks import BaseCallback +from torchy_baselines.common.type_aliases import GymEnv +from torchy_baselines.common.noise import ActionNoise +from torchy_baselines.td3.td3 import TD3, TD3Policy from torchy_baselines.cem_rl.cem import CEM -from torchy_baselines.common.evaluation import evaluate_policy -from torchy_baselines.td3.td3 import TD3 -from torchy_baselines.common.vec_env import sync_envs_normalization class CEMRL(TD3): @@ -16,30 +18,30 @@ class CEMRL(TD3): Code: https://github.com/apourchot/CEM-RL :param policy: (TD3Policy or str) The policy model to use (MlpPolicy, CnnPolicy, ...) - :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) - :param sigma_init: (float) Initial standard deviation of the population distribution - :param pop_size: (int) Number of individuals in the population - :param damping_init: (float) Initial value of damping for preventing from early convergence. - :param damping_final: (float) Final value of damping - :param elitism: (bool) Keep the best known individual in the population - :param n_grad: (int) Number of individuals that will receive a gradient update. - Half of the population size in the paper. - :param buffer_size: (int) size of the replay buffer + :param env: (GymEnv or str) The environment to learn from (if registered in Gym, can be str) :param learning_rate: (float or callable) learning rate for adam optimizer, - the same learning rate will be used for all networks (Q-Values and Actor networks) + the same learning rate will be used for all networks (Q-Values, Actor and Value function) it can be a function of the current progress (from 1 to 0) - :param policy_delay: (int) Policy and target networks will only be updated once every policy_delay steps - per training steps. The Q values will be updated policy_delay more often (update every training step). + :param buffer_size: (int) size of the replay buffer :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts - :param gamma: (float) the discount factor :param batch_size: (int) Minibatch size for each gradient update - :param tau: (float) the soft update coefficient ("Polyak update" of the target networks, between 0 and 1) - :param action_noise: (ActionNoise) the action noise type. Cf common.noise for the different action noise type. + :param tau: (float) the soft update coefficient ("polyak update", between 0 and 1) + :param gamma: (float) the discount factor + :param n_episodes_rollout: (int) Update the model every ``n_episodes_rollout`` episodes. 
+ :param action_noise: (ActionNoise) the action noise type (None by default), this can help + for hard exploration problem. Cf common.noise for the different action noise type. + :param policy_delay: (int) Policy and target networks will only be updated once every policy_delay steps + per training steps. The Q values will be updated policy_delay more often (update every training step). :param target_policy_noise: (float) Standard deviation of Gaussian noise added to target policy (smoothing noise) :param target_noise_clip: (float) Limit for absolute value of target policy smoothing noise. - :param n_episodes_rollout: (int) Update the model every `n_episodes_rollout` episodes. - Note that this cannot be used at the same time as `train_freq` + :param sigma_init: (float) Initial standard deviation of the population distribution + :param pop_size: (int) Number of individuals in the population + :param damping_init: (float) Initial value of damping for preventing from early convergence. + :param damping_final: (float) Final value of damping + :param elitism: (bool) Keep the best known individual in the population + :param n_grad: (int) Number of individuals that will receive a gradient update. + Half of the population size in the paper. :param update_style: (str) Update style for the individual that will use the gradient: - original: original implementation (actor_steps // n_grad steps for the critic and actor_steps gradient steps per individual) @@ -55,15 +57,33 @@ class CEMRL(TD3): Setting it to auto, the code will be run on the GPU if possible. :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance """ - def __init__(self, policy, env, sigma_init=1e-3, pop_size=10, - damping_init=1e-3, damping_final=1e-5, elitism=False, n_grad=5, - buffer_size=int(1e6), learning_rate=1e-3, policy_delay=2, - learning_starts=100, gamma=0.99, batch_size=100, tau=0.005, - action_noise=None, target_policy_noise=0.2, target_noise_clip=0.5, - n_episodes_rollout=1, update_style='original', - tensorboard_log=None, create_eval_env=False, - policy_kwargs=None, verbose=0, seed=None, device='auto', - _init_setup_model=True): + def __init__(self, policy: Union[str, Type[TD3Policy]], + env: Union[GymEnv, str], + learning_rate: Union[float, Callable] = 1e-3, + buffer_size: int = int(1e6), + learning_starts: int = 100, + batch_size: int = 100, + tau: float = 0.005, + gamma: float = 0.99, + n_episodes_rollout: int = 1, + action_noise: Optional[ActionNoise] = None, + policy_delay: int = 2, + target_policy_noise: float = 0.2, + target_noise_clip: float = 0.5, + sigma_init: float = 1e-3, + pop_size: int = 10, + damping_init: float = 1e-3, + damping_final: float = 1e-5, + elitism: bool = False, + n_grad: int = 5, + update_style: str = 'original', + tensorboard_log: Optional[str] = None, + create_eval_env: bool = False, + policy_kwargs: Dict[str, Any] = None, + verbose: int = 0, + seed: Optional[int] = None, + device: Union[th.device, str] = 'auto', + _init_setup_model: bool = True): super(CEMRL, self).__init__(policy, env, buffer_size=buffer_size, learning_rate=learning_rate, seed=seed, device=device, @@ -77,7 +97,7 @@ def __init__(self, policy, env, sigma_init=1e-3, pop_size=10, # Evolution strategy method that follows cma-es interface (ask-tell) # for now, only CEM is implemented - self.es = None + self.es = None # type: Optional[CEM] self.sigma_init = sigma_init self.pop_size = pop_size self.damping_init = damping_init @@ -91,7 +111,7 @@ def __init__(self, policy, env, 
sigma_init=1e-3, pop_size=10, if _init_setup_model: self._setup_model() - def _setup_model(self, seed=None): + def _setup_model(self) -> None: super(CEMRL, self)._setup_model() params_vector = self.actor.parameters_to_vector() self.es = CEM(len(params_vector), mu_init=params_vector, @@ -99,9 +119,16 @@ def _setup_model(self, seed=None): pop_size=self.pop_size, antithetic=not self.pop_size % 2, parents=self.pop_size // 2, elitism=self.elitism) - def learn(self, total_timesteps, callback=None, log_interval=4, - eval_env=None, eval_freq=-1, n_eval_episodes=5, - tb_log_name="CEMRL", eval_log_path=None, reset_num_timesteps=True): + def learn(self, + total_timesteps: int, + callback: Optional[BaseCallback] = None, + log_interval: int = 4, + eval_env: Optional[GymEnv] = None, + eval_freq: int = -1, + n_eval_episodes: int = 5, + tb_log_name: str = "CEMRL", + eval_log_path: Optional[str] = None, + reset_num_timesteps: bool = True) -> OffPolicyRLModel: episode_num, obs, callback = self._setup_learn(eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps) diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index d6bd69de3..b77237dbe 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -288,7 +288,7 @@ def learn(self, total_timesteps: int, eval_freq: int = -1, n_eval_episodes: int = 5, eval_log_path: Optional[str] = None, - reset_num_timesteps: bool = True): + reset_num_timesteps: bool = True) -> 'BaseRLModel': """ Return a trained model. diff --git a/torchy_baselines/sac/sac.py b/torchy_baselines/sac/sac.py index 2ac822fc6..d0309e8b0 100644 --- a/torchy_baselines/sac/sac.py +++ b/torchy_baselines/sac/sac.py @@ -33,21 +33,21 @@ class SAC(OffPolicyRLModel): the same learning rate will be used for all networks (Q-Values, Actor and Value function) it can be a function of the current progress (from 1 to 0) :param buffer_size: (int) size of the replay buffer + :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts :param batch_size: (int) Minibatch size for each gradient update :param tau: (float) the soft update coefficient ("polyak update", between 0 and 1) - :param ent_coef: (str or float) Entropy regularization coefficient. (Equivalent to - inverse of reward scale in the original SAC paper.) Controlling exploration/exploitation trade-off. - Set it to 'auto' to learn it automatically (and 'auto_0.1' for using 0.1 as initial value) - :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts - :param target_update_interval: (int) update the target network every `target_network_update_freq` steps. - :param train_freq: (int) Update the model every `train_freq` steps. + :param gamma: (float) the discount factor + :param train_freq: (int) Update the model every ``train_freq`` steps. :param gradient_steps: (int) How many gradient update after each step - :param n_episodes_rollout: (int) Update the model every `n_episodes_rollout` episodes. - Note that this cannot be used at the same time as `train_freq` - :param target_entropy: (str or float) target entropy when learning `ent_coef` (`ent_coef = 'auto'`) + :param n_episodes_rollout: (int) Update the model every ``n_episodes_rollout`` episodes. + Note that this cannot be used at the same time as ``train_freq`` :param action_noise: (ActionNoise) the action noise type (None by default), this can help for hard exploration problem. 
Cf common.noise for the different action noise type. - :param gamma: (float) the discount factor + :param ent_coef: (str or float) Entropy regularization coefficient. (Equivalent to + inverse of reward scale in the original SAC paper.) Controlling exploration/exploitation trade-off. + Set it to 'auto' to learn it automatically (and 'auto_0.1' for using 0.1 as initial value) + :param target_update_interval: (int) update the target network every ``target_network_update_freq`` steps. + :param target_entropy: (str or float) target entropy when learning ``ent_coef`` (``ent_coef = 'auto'``) :param use_sde: (bool) Whether to use State Dependent Exploration (SDE) instead of action noise exploration (default: False) :param sde_sample_freq: (int) Sample a new noise matrix every n steps when using SDE @@ -71,14 +71,14 @@ def __init__(self, policy: Union[str, Type[SACPolicy]], learning_starts: int = 100, batch_size: int = 256, tau: float = 0.005, - ent_coef: Union[str, float] = 'auto', - target_update_interval: int = 1, + gamma: float = 0.99, train_freq: int = 1, gradient_steps: int = 1, n_episodes_rollout: int = -1, - target_entropy: Union[str, float] = 'auto', action_noise: Optional[ActionNoise] = None, - gamma: float = 0.99, + ent_coef: Union[str, float] = 'auto', + target_update_interval: int = 1, + target_entropy: Union[str, float] = 'auto', use_sde: bool = False, sde_sample_freq: int = -1, use_sde_at_warmup: bool = False, diff --git a/torchy_baselines/td3/td3.py b/torchy_baselines/td3/td3.py index 7dbaf1636..ccb835e50 100644 --- a/torchy_baselines/td3/td3.py +++ b/torchy_baselines/td3/td3.py @@ -1,4 +1,4 @@ -from typing import List, Tuple, Optional +from typing import List, Tuple, Type, Union, Callable, Optional, Dict, Any import torch as th import torch.nn.functional as F @@ -6,7 +6,9 @@ from torchy_baselines.common.base_class import OffPolicyRLModel from torchy_baselines.common.buffers import ReplayBuffer -from torchy_baselines.common.type_aliases import ReplayBufferSamples +from torchy_baselines.common.type_aliases import ReplayBufferSamples, GymEnv +from torchy_baselines.common.noise import ActionNoise +from torchy_baselines.common.callbacks import BaseCallback from torchy_baselines.td3.policies import TD3Policy @@ -20,22 +22,23 @@ class TD3(OffPolicyRLModel): Introduction to TD3: https://spinningup.openai.com/en/latest/algorithms/td3.html :param policy: (TD3Policy or str) The policy model to use (MlpPolicy, CnnPolicy, ...) - :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) - :param buffer_size: (int) size of the replay buffer + :param env: (GymEnv or str) The environment to learn from (if registered in Gym, can be str) :param learning_rate: (float or callable) learning rate for adam optimizer, - the same learning rate will be used for all networks (Q-Values and Actor networks) + the same learning rate will be used for all networks (Q-Values, Actor and Value function) it can be a function of the current progress (from 1 to 0) - :param policy_delay: (int) Policy and target networks will only be updated once every policy_delay steps - per training steps. The Q values will be updated policy_delay more often (update every training step). 
+ :param buffer_size: (int) size of the replay buffer :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts - :param gamma: (float) the discount factor :param batch_size: (int) Minibatch size for each gradient update - :param train_freq: (int) Update the model every `train_freq` steps. + :param tau: (float) the soft update coefficient ("polyak update", between 0 and 1) + :param gamma: (float) the discount factor + :param train_freq: (int) Update the model every ``train_freq`` steps. :param gradient_steps: (int) How many gradient update after each step - :param n_episodes_rollout: (int) Update the model every `n_episodes_rollout` episodes. - Note that this cannot be used at the same time as `train_freq` - :param tau: (float) the soft update coefficient ("Polyak update" of the target networks, between 0 and 1) - :param action_noise: (ActionNoise) the action noise type. Cf common.noise for the different action noise type. + :param n_episodes_rollout: (int) Update the model every ``n_episodes_rollout`` episodes. + Note that this cannot be used at the same time as ``train_freq`` + :param action_noise: (ActionNoise) the action noise type (None by default), this can help + for hard exploration problem. Cf common.noise for the different action noise type. + :param policy_delay: (int) Policy and target networks will only be updated once every policy_delay steps + per training steps. The Q values will be updated policy_delay more often (update every training step). :param target_policy_noise: (float) Standard deviation of Gaussian noise added to target policy (smoothing noise) :param target_noise_clip: (float) Limit for absolute value of target policy smoothing noise. @@ -58,14 +61,34 @@ class TD3(OffPolicyRLModel): :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance """ - def __init__(self, policy, env, buffer_size=int(1e6), learning_rate=1e-3, - policy_delay=2, learning_starts=100, gamma=0.99, batch_size=100, - train_freq=-1, gradient_steps=-1, n_episodes_rollout=1, - tau=0.005, action_noise=None, target_policy_noise=0.2, target_noise_clip=0.5, - use_sde=False, sde_sample_freq=-1, sde_max_grad_norm=1, - sde_ent_coef=0.0, sde_log_std_scheduler=None, use_sde_at_warmup=False, - tensorboard_log=None, create_eval_env=False, policy_kwargs=None, verbose=0, - seed=None, device='auto', _init_setup_model=True): + def __init__(self, policy: Union[str, Type[TD3Policy]], + env: Union[GymEnv, str], + learning_rate: Union[float, Callable] = 1e-3, + buffer_size: int = int(1e6), + learning_starts: int = 100, + batch_size: int = 100, + tau: float = 0.005, + gamma: float = 0.99, + train_freq: int = -1, + gradient_steps: int = -1, + n_episodes_rollout: int = 1, + action_noise: Optional[ActionNoise] = None, + policy_delay: int = 2, + target_policy_noise: float = 0.2, + target_noise_clip: float = 0.5, + use_sde: bool = False, + sde_sample_freq: int = -1, + sde_max_grad_norm: float = 1, + sde_ent_coef: float = 0.0, + sde_log_std_scheduler: Optional[Callable] = None, + use_sde_at_warmup: bool = False, + tensorboard_log: Optional[str] = None, + create_eval_env: bool = False, + policy_kwargs: Dict[str, Any] = None, + verbose: int = 0, + seed: Optional[int] = None, + device: Union[th.device, str] = 'auto', + _init_setup_model: bool = True): super(TD3, self).__init__(policy, env, TD3Policy, policy_kwargs, verbose, device, create_eval_env=create_eval_env, seed=seed, @@ -96,7 +119,7 @@ def __init__(self, policy, env, 
buffer_size=int(1e6), learning_rate=1e-3, if _init_setup_model: self._setup_model() - def _setup_model(self): + def _setup_model(self) -> None: self._setup_learning_rate() obs_dim, action_dim = self.observation_space.shape[0], self.action_space.shape[0] self.set_random_seed(self.seed) @@ -107,7 +130,7 @@ def _setup_model(self): self.policy = self.policy.to(self.device) self._create_aliases() - def _create_aliases(self): + def _create_aliases(self) -> None: self.actor = self.policy.actor self.actor_target = self.policy.actor_target self.critic = self.policy.critic @@ -117,7 +140,7 @@ def _create_aliases(self): def train_critic(self, gradient_steps: int = 1, batch_size: int = 100, replay_data: Optional[ReplayBufferSamples] = None, - tau: float = 0.0): + tau: float = 0.0) -> None: # Update optimizer learning rate self._update_learning_rate(self.critic.optimizer) @@ -158,7 +181,7 @@ def train_actor(self, gradient_steps: int = 1, batch_size: int = 100, tau_actor: float = 0.005, tau_critic: float = 0.005, - replay_data: Optional[ReplayBufferSamples] = None): + replay_data: Optional[ReplayBufferSamples] = None) -> None: # Update optimizer learning rate self._update_learning_rate(self.actor.optimizer) @@ -183,7 +206,7 @@ def train_actor(self, gradient_steps: int = 1, for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): target_param.data.copy_(tau_actor * param.data + (1 - tau_actor) * target_param.data) - def train(self, gradient_steps: int, batch_size: int = 100, policy_delay: int = 2): + def train(self, gradient_steps: int, batch_size: int = 100, policy_delay: int = 2) -> None: for gradient_step in range(gradient_steps): @@ -195,7 +218,7 @@ def train(self, gradient_steps: int, batch_size: int = 100, policy_delay: int = if gradient_step % policy_delay == 0: self.train_actor(replay_data=replay_data, tau_actor=self.tau, tau_critic=self.tau) - def train_sde(self): + def train_sde(self) -> None: # Update optimizer learning rate # self._update_learning_rate(self.policy.optimizer) @@ -241,9 +264,16 @@ def train_sde(self): del self.rollout_data - def learn(self, total_timesteps, callback=None, log_interval=4, - eval_env=None, eval_freq=-1, n_eval_episodes=5, - tb_log_name="TD3", eval_log_path=None, reset_num_timesteps=True): + def learn(self, + total_timesteps: int, + callback: Optional[BaseCallback] = None, + log_interval: int = 4, + eval_env: Optional[GymEnv] = None, + eval_freq: int = -1, + n_eval_episodes: int = 5, + tb_log_name: str = "TD3", + eval_log_path: Optional[str] = None, + reset_num_timesteps: bool = True) -> OffPolicyRLModel: episode_num, obs, callback = self._setup_learn(eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps) From c5e58128949b932ed32f95ec12131c17b1338a59 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 11 Mar 2020 13:01:42 +0100 Subject: [PATCH 12/17] Finish typing A2C and PPO --- torchy_baselines/a2c/a2c.py | 19 +++++++++++++------ torchy_baselines/cem_rl/cem.py | 21 +++++++++++++++------ torchy_baselines/ppo/ppo.py | 15 +++++++++++---- 3 files changed, 39 insertions(+), 16 deletions(-) diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index 4c24c7faf..6e0a1f2e9 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -7,11 +7,11 @@ from torchy_baselines.common.utils import explained_variance from torchy_baselines.common import logger from torchy_baselines.common.type_aliases import GymEnv +from torchy_baselines.common.callbacks import BaseCallback from 
torchy_baselines.ppo.ppo import PPO from torchy_baselines.ppo.policies import PPOPolicy - class A2C(PPO): """ Advantage Actor Critic (A2C) @@ -89,14 +89,14 @@ def __init__(self, policy: Union[str, Type[PPOPolicy]], if _init_setup_model: self._setup_model() - def _setup_model(self): + def _setup_model(self) -> None: super(A2C, self)._setup_model() if self.use_rms_prop: self.policy.optimizer = th.optim.RMSprop(self.policy.parameters(), lr=self.learning_rate(1), alpha=0.99, eps=self.rms_prop_eps, weight_decay=0) - def train(self, gradient_steps: int, batch_size=None): + def train(self, gradient_steps: int, batch_size: Optional[int] = None) -> None: # Update optimizer learning rate self._update_learning_rate(self.policy.optimizer) # A2C with gradient_steps > 1 does not make sense @@ -153,9 +153,16 @@ def train(self, gradient_steps: int, batch_size=None): if hasattr(self.policy, 'log_std'): logger.logkv("std", th.exp(self.policy.log_std).mean().item()) - def learn(self, total_timesteps, callback=None, log_interval=100, - eval_env=None, eval_freq=-1, n_eval_episodes=5, - tb_log_name="A2C", eval_log_path=None, reset_num_timesteps=True): + def learn(self, + total_timesteps: int, + callback: Optional[BaseCallback] = None, + log_interval: int = 100, + eval_env: Optional[GymEnv] = None, + eval_freq: int = -1, + n_eval_episodes: int = 5, + tb_log_name: str = "A2C", + eval_log_path: Optional[str] = None, + reset_num_timesteps: bool = True) -> 'A2C': return super(A2C, self).learn(total_timesteps=total_timesteps, callback=callback, log_interval=log_interval, eval_env=eval_env, eval_freq=eval_freq, n_eval_episodes=n_eval_episodes, diff --git a/torchy_baselines/cem_rl/cem.py b/torchy_baselines/cem_rl/cem.py index ee4f484bc..d1d221c86 100644 --- a/torchy_baselines/cem_rl/cem.py +++ b/torchy_baselines/cem_rl/cem.py @@ -1,3 +1,5 @@ +from typing import Type, Tuple, Optional, List + import numpy as np @@ -21,9 +23,16 @@ class CEM(object): :param antithetic: (bool) Use a finite difference like method for sampling (mu + epsilon, mu - epsilon) """ - def __init__(self, num_params, mu_init=None, sigma_init=1e-3, - pop_size=256, damping_init=1e-3, damping_final=1e-5, - parents=None, elitism=False, antithetic=False): + def __init__(self, + num_params: int, + mu_init: Optional[np.ndarray] = None, + sigma_init: float = 1e-3, + pop_size: int = 256, + damping_init: float = 1e-3, + damping_final: float = 1e-5, + parents: Optional[int] = None, + elitism: bool = False, + antithetic: bool = False): super(CEM, self).__init__() self.num_params = num_params @@ -66,7 +75,7 @@ def __init__(self, num_params, mu_init=None, sigma_init=1e-3, for i in range(1, self.parents + 1)]) self.weights /= self.weights.sum() - def ask(self, pop_size): + def ask(self, pop_size: int) -> List[np.ndarray]: """ Returns a list of candidates parameters @@ -87,7 +96,7 @@ def ask(self, pop_size): return individuals - def tell(self, solutions, scores): + def tell(self, solutions: List[np.ndarray], scores: List[float]) -> None: """ Updates the distribution @@ -114,7 +123,7 @@ def tell(self, solutions, scores): self.elite = solutions[idx_sorted[0]] self.elite_score = scores[idx_sorted[0]] - def get_distrib_params(self): + def get_distrib_params(self) -> Tuple[np.ndarray, np.ndarray]: """ Returns the parameters of the distribution: the mean and standard deviation. 
diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index 6f9a4839d..a7dcf0bdb 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -121,7 +121,7 @@ def __init__(self, policy: Union[str, Type[PPOPolicy]], if _init_setup_model: self._setup_model() - def _setup_model(self): + def _setup_model(self) -> None: self._setup_learning_rate() # TODO: preprocessing: one hot vector for obs discrete state_dim = self.observation_space.shape[0] @@ -284,9 +284,16 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: if hasattr(self.policy, 'log_std'): logger.logkv("std", th.exp(self.policy.log_std).mean().item()) - def learn(self, total_timesteps, callback=None, log_interval=1, - eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="PPO", - eval_log_path=None, reset_num_timesteps=True): + def learn(self, + total_timesteps: int, + callback: Optional[BaseCallback] = None, + log_interval: int = 1, + eval_env: Optional[GymEnv] = None, + eval_freq: int = -1, + n_eval_episodes: int = 5, + tb_log_name: str = "PPO", + eval_log_path: Optional[str] = None, + reset_num_timesteps: bool = True) -> 'PPO': episode_num, obs, callback = self._setup_learn(eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps) From 037986a91d272a79cb6a2d434fedf5fb27f23904 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 11 Mar 2020 16:35:13 +0100 Subject: [PATCH 13/17] Add test for `expln` --- docs/misc/changelog.rst | 1 + tests/test_sde.py | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 45d28de6c..3bd119bf5 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -27,6 +27,7 @@ Others: - Added ``clip_mean`` parameter to SAC policy - Buffers now return ``NamedTuple`` - More typing +- Add test for ``expln`` Documentation: ^^^^^^^^^^^^^^ diff --git a/tests/test_sde.py b/tests/test_sde.py index 497f39887..96f6c8f38 100644 --- a/tests/test_sde.py +++ b/tests/test_sde.py @@ -2,7 +2,7 @@ import torch as th from torch.distributions import Normal -from torchy_baselines import A2C, TD3, SAC +from torchy_baselines import A2C, TD3, SAC, PPO def test_state_dependent_exploration_grad(): @@ -55,12 +55,13 @@ def test_state_dependent_exploration_grad(): assert sigma_hat.grad.allclose(grad) -@pytest.mark.parametrize("model_class", [TD3, SAC, A2C]) +@pytest.mark.parametrize("model_class", [TD3, SAC, A2C, PPO]) @pytest.mark.parametrize("sde_net_arch", [None, [32, 16], []]) -def test_state_dependent_offpolicy_noise(model_class, sde_net_arch): +@pytest.mark.parametrize("use_expln", [False, True]) +def test_state_dependent_offpolicy_noise(model_class, sde_net_arch, use_expln): model = model_class('MlpPolicy', 'Pendulum-v0', use_sde=True, seed=None, create_eval_env=True, - verbose=1, policy_kwargs=dict(log_std_init=-2, sde_net_arch=sde_net_arch)) - model.learn(total_timesteps=int(1000), eval_freq=500) + verbose=1, policy_kwargs=dict(log_std_init=-2, sde_net_arch=sde_net_arch, use_expln=use_expln)) + model.learn(total_timesteps=int(500), eval_freq=250) def test_scheduler(): From 18f38f8cf5b969403d5416e62eb9a4461538c0db Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 12 Mar 2020 11:12:10 +0100 Subject: [PATCH 14/17] Reformat --- tests/test_callbacks.py | 4 ++-- tests/test_distributions.py | 1 + tests/test_logger.py | 3 ++- tests/test_predict.py | 1 + tests/test_run.py | 4 +--- tests/test_save_load.py | 1 + tests/test_vec_normalize.py | 7 +++++-- 
torchy_baselines/a2c/a2c.py | 11 +++++------ torchy_baselines/cem_rl/cem.py | 3 +-- torchy_baselines/common/base_class.py | 4 ++-- torchy_baselines/common/buffers.py | 10 ++++++---- torchy_baselines/common/distributions.py | 4 ++-- torchy_baselines/common/monitor.py | 5 +++-- torchy_baselines/common/noise.py | 3 +++ torchy_baselines/common/save_util.py | 2 -- torchy_baselines/common/type_aliases.py | 5 ++--- .../common/vec_env/base_vec_env.py | 2 +- .../common/vec_env/subproc_vec_env.py | 4 ++-- .../common/vec_env/vec_normalize.py | 6 +++--- torchy_baselines/ppo/policies.py | 10 ++++++---- torchy_baselines/ppo/ppo.py | 18 +++++++++--------- torchy_baselines/sac/policies.py | 1 + torchy_baselines/td3/policies.py | 5 ++--- torchy_baselines/td3/td3.py | 14 ++++++-------- 24 files changed, 67 insertions(+), 61 deletions(-) diff --git a/tests/test_callbacks.py b/tests/test_callbacks.py index 96db45a77..a4ffede11 100644 --- a/tests/test_callbacks.py +++ b/tests/test_callbacks.py @@ -6,7 +6,7 @@ from torchy_baselines import A2C, CEMRL, PPO, SAC, TD3 from torchy_baselines.common.callbacks import (CallbackList, CheckpointCallback, EvalCallback, - EveryNTimesteps, StopTrainingOnRewardThreshold) + EveryNTimesteps, StopTrainingOnRewardThreshold) @pytest.mark.parametrize("model_class", [A2C, CEMRL, PPO, SAC, TD3]) @@ -44,6 +44,6 @@ def test_callbacks(model_class): # Transform callback into a callback list automatically model.learn(500, callback=[checkpoint_callback, eval_callback]) # Automatic wrapping, old way of doing callbacks - model.learn(500, callback=lambda _locals, _globals : True) + model.learn(500, callback=lambda _locals, _globals: True) if os.path.exists(log_folder): shutil.rmtree(log_folder) diff --git a/tests/test_distributions.py b/tests/test_distributions.py index 4b5d7927f..7d28ad7dd 100644 --- a/tests/test_distributions.py +++ b/tests/test_distributions.py @@ -22,6 +22,7 @@ def test_bijector(): # Check the inverse method assert th.isclose(TanhBijector.inverse(squashed_actions), actions).all() + @pytest.mark.parametrize("model_class", [A2C, PPO]) def test_squashed_gaussian(model_class): """ diff --git a/tests/test_logger.py b/tests/test_logger.py index b55a61633..5ca0437b5 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -5,7 +5,8 @@ import numpy as np from torchy_baselines.common.logger import (make_output_format, read_csv, read_json, DEBUG, ScopedConfigure, - info, debug, set_level, configure, logkv, logkvs, dumpkvs, logkv_mean, warn, error, reset) + info, debug, set_level, configure, logkv, logkvs, dumpkvs, logkv_mean, warn, + error, reset) KEY_VALUES = { "test": 1, diff --git a/tests/test_predict.py b/tests/test_predict.py index 6f2245ce8..e68954f60 100644 --- a/tests/test_predict.py +++ b/tests/test_predict.py @@ -12,6 +12,7 @@ SAC, ] + @pytest.mark.parametrize("model_class", MODEL_LIST) def test_auto_wrap(model_class): # test auto wrapping of env into a VecEnv diff --git a/tests/test_run.py b/tests/test_run.py index fdfcff6da..1a3f99110 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -1,7 +1,5 @@ -import os - -import pytest import numpy as np +import pytest from torchy_baselines import A2C, CEMRL, PPO, SAC, TD3 from torchy_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise diff --git a/tests/test_save_load.py b/tests/test_save_load.py index 9d73ddf74..bdec3eda0 100644 --- a/tests/test_save_load.py +++ b/tests/test_save_load.py @@ -16,6 +16,7 @@ SAC, ] + @pytest.mark.parametrize("model_class", MODEL_LIST) def 
test_save_load(model_class): """ diff --git a/tests/test_vec_normalize.py b/tests/test_vec_normalize.py index 3c21f69aa..75fd3a857 100644 --- a/tests/test_vec_normalize.py +++ b/tests/test_vec_normalize.py @@ -8,9 +8,11 @@ ENV_ID = 'Pendulum-v0' + def make_env(): return gym.make(ENV_ID) + def check_rms_equal(rmsa, rmsb): assert np.all(rmsa.mean == rmsb.mean) assert np.all(rmsa.var == rmsb.var) @@ -34,6 +36,7 @@ def check_vec_norm_equal(norma, normb): assert norma.epsilon == normb.epsilon assert norma.training == normb.training + def _make_warmstart_cartpole(): """Warm-start VecNormalize by stepping through CartPole""" venv = DummyVecEnv([lambda: gym.make("CartPole-v1")]) @@ -50,8 +53,8 @@ def _make_warmstart_cartpole(): def test_runningmeanstd(): """Test RunningMeanStd object""" for (x_1, x_2, x_3) in [ - (np.random.randn(3), np.random.randn(4), np.random.randn(5)), - (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2))]: + (np.random.randn(3), np.random.randn(4), np.random.randn(5)), + (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2))]: rms = RunningMeanStd(epsilon=0.0, shape=x_1.shape[1:]) x_cat = np.concatenate([x_1, x_2, x_3], axis=0) diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index 6e0a1f2e9..5f9730df2 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -1,15 +1,14 @@ -from typing import List, Tuple, Type, Union, Callable, Optional, Dict, Any - -from gym import spaces import torch as th import torch.nn.functional as F +from gym import spaces +from typing import Type, Union, Callable, Optional, Dict, Any -from torchy_baselines.common.utils import explained_variance from torchy_baselines.common import logger -from torchy_baselines.common.type_aliases import GymEnv from torchy_baselines.common.callbacks import BaseCallback -from torchy_baselines.ppo.ppo import PPO +from torchy_baselines.common.type_aliases import GymEnv +from torchy_baselines.common.utils import explained_variance from torchy_baselines.ppo.policies import PPOPolicy +from torchy_baselines.ppo.ppo import PPO class A2C(PPO): diff --git a/torchy_baselines/cem_rl/cem.py b/torchy_baselines/cem_rl/cem.py index d1d221c86..7527b996c 100644 --- a/torchy_baselines/cem_rl/cem.py +++ b/torchy_baselines/cem_rl/cem.py @@ -1,6 +1,5 @@ -from typing import Type, Tuple, Optional, List - import numpy as np +from typing import Tuple, Optional, List # TODO: add more from https://github.com/hardmaru/estool/blob/master/es.py diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index b77237dbe..f1d205a0e 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -16,7 +16,7 @@ from torchy_baselines.common.utils import set_random_seed, get_schedule_fn, update_learning_rate from torchy_baselines.common.vec_env import DummyVecEnv, VecEnv, unwrap_vec_normalize, VecNormalize from torchy_baselines.common.save_util import data_to_json, json_to_data, recursive_getattr, recursive_setattr -from torchy_baselines.common.type_aliases import GymEnv, TensorDict, OptimizerStateDict, RolloutReturn +from torchy_baselines.common.type_aliases import GymEnv, TensorDict, RolloutReturn from torchy_baselines.common.callbacks import BaseCallback, CallbackList, ConvertCallback, EvalCallback from torchy_baselines.common.monitor import Monitor from torchy_baselines.common.noise import ActionNoise @@ -494,7 +494,7 @@ def _load_from_file(load_path: str, load_data: bool = True) -> (Tuple[Optional[D if "data" in 
namelist and load_data: # Load class parameters and convert to string json_data = archive.read("data").decode() - data = json_to_data(json_data, device) + data = json_to_data(json_data) if "tensors.pth" in namelist and load_data: # Load extra tensors diff --git a/torchy_baselines/common/buffers.py b/torchy_baselines/common/buffers.py index 6ffe479a2..5ddc29c39 100644 --- a/torchy_baselines/common/buffers.py +++ b/torchy_baselines/common/buffers.py @@ -18,6 +18,7 @@ class BaseBuffer(object): to which the values will be converted :param n_envs: (int) Number of parallel environments """ + def __init__(self, buffer_size: int, obs_dim: int, @@ -118,13 +119,13 @@ def to_torch(self, array: np.ndarray, copy: bool = True) -> th.Tensor: @staticmethod def _normalize_obs(obs: np.ndarray, - env: Optional[VecNormalize] = None) -> np.ndarray: + env: Optional[VecNormalize] = None) -> np.ndarray: if env is not None: return env.normalize_obs(obs).astype(np.float32) return obs - def _normalize_reward(self, - reward: np.ndarray, + @staticmethod + def _normalize_reward(reward: np.ndarray, env: Optional[VecNormalize] = None) -> np.ndarray: if env is not None: return env.normalize_reward(reward).astype(np.float32) @@ -141,13 +142,13 @@ class ReplayBuffer(BaseBuffer): :param device: (th.device) :param n_envs: (int) Number of parallel environments """ + def __init__(self, buffer_size: int, obs_dim: int, action_dim: int, device: Union[th.device, str] = 'cpu', n_envs: int = 1): - super(ReplayBuffer, self).__init__(buffer_size, obs_dim, action_dim, device, n_envs=n_envs) assert n_envs == 1, "Replay buffer only support single environment for now" @@ -201,6 +202,7 @@ class RolloutBuffer(BaseBuffer): :param gamma: (float) Discount factor :param n_envs: (int) Number of parallel environments """ + def __init__(self, buffer_size: int, obs_dim: int, diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py index a7f219ac8..6a0ab2f11 100644 --- a/torchy_baselines/common/distributions.py +++ b/torchy_baselines/common/distributions.py @@ -317,7 +317,7 @@ def sample_weights(self, log_std: th.Tensor, batch_size: int = 1) -> None: self.exploration_matrices = self.weights_dist.rsample((batch_size,)) def proba_distribution_net(self, latent_dim: int, log_std_init: float = -2.0, - latent_sde_dim: Optional[th.Tensor] = None) -> Tuple[nn.Module, nn.Parameter]: + latent_sde_dim: Optional[int] = None) -> Tuple[nn.Module, nn.Parameter]: """ Create the layers and parameter that represent the distribution: one output will be the deterministic action, the other parameter will be the @@ -325,7 +325,7 @@ def proba_distribution_net(self, latent_dim: int, log_std_init: float = -2.0, :param latent_dim: (int) Dimension of the last layer of the policy (before the action layer) :param log_std_init: (float) Initial value for the log standard deviation - :param latent_sde_dim: (int) Dimension of the last layer of the feature extractor + :param latent_sde_dim: (Optional[int]) Dimension of the last layer of the feature extractor for SDE. By default, it is shared with the policy network. 
:return: (nn.Linear, nn.Parameter) """ diff --git a/torchy_baselines/common/monitor.py b/torchy_baselines/common/monitor.py index 3d84b9ba9..5a8716902 100644 --- a/torchy_baselines/common/monitor.py +++ b/torchy_baselines/common/monitor.py @@ -27,8 +27,9 @@ def __init__(self, :param env: (gym.Env) The environment :param filename: (Optional[str]) the location to save a log file, can be None for no log :param allow_early_resets: (bool) allows the reset of the environment before it is done - :param reset_keywords: (Tuple[str, ...]) extra keywords for the reset call, if extra parameters are needed at reset - :param info_keywords: (Tuple[str, ...]) extra information to log, from the information return of environment.step + :param reset_keywords: (Tuple[str, ...]) extra keywords for the reset call, + if extra parameters are needed at reset + :param info_keywords: (Tuple[str, ...]) extra information to log, from the information return of env.step() """ super(Monitor, self).__init__(env=env) self.t_start = time.time() diff --git a/torchy_baselines/common/noise.py b/torchy_baselines/common/noise.py index fa25f42ad..8511010e9 100644 --- a/torchy_baselines/common/noise.py +++ b/torchy_baselines/common/noise.py @@ -9,6 +9,7 @@ class ActionNoise(ABC): """ The action noise base class """ + def __init__(self): super(ActionNoise, self).__init__() @@ -22,6 +23,7 @@ def reset(self): def __call__(self): pass + class NormalActionNoise(ActionNoise): """ A Gaussian action noise @@ -29,6 +31,7 @@ class NormalActionNoise(ActionNoise): :param mean: (float) the mean value of the noise :param sigma: (float) the scale of the noise (std here) """ + def __init__(self, mean, sigma): self._mu = mean self._sigma = sigma diff --git a/torchy_baselines/common/save_util.py b/torchy_baselines/common/save_util.py index a9dedbc94..85fb6fd12 100644 --- a/torchy_baselines/common/save_util.py +++ b/torchy_baselines/common/save_util.py @@ -122,14 +122,12 @@ def data_to_json(data: Dict[str, Any]) -> str: def json_to_data(json_string: str, - device: Union[th.device, str] = 'cpu', custom_objects: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: """ Turn JSON serialization of class-parameters back into dictionary. :param json_string: (str) JSON serialization of the class-parameters that should be loaded. - :param device: torch.device device to which the data should be mapped if errors occur :param custom_objects: (dict) Dictionary of objects to replace upon loading. 
If a variable is present in this dictionary as a key, it will not be deserialized and the corresponding item diff --git a/torchy_baselines/common/type_aliases.py b/torchy_baselines/common/type_aliases.py index 12c220fe4..60042be0e 100644 --- a/torchy_baselines/common/type_aliases.py +++ b/torchy_baselines/common/type_aliases.py @@ -1,8 +1,7 @@ """ -Common aliases for type hing +Common aliases for type hint """ -from typing import Union, Type, Optional, Dict, Any, List, NamedTuple -from collections import namedtuple +from typing import Union, Dict, Any, NamedTuple, Optional import numpy as np import torch as th diff --git a/torchy_baselines/common/vec_env/base_vec_env.py b/torchy_baselines/common/vec_env/base_vec_env.py index 18e8a5956..4a81e6ea6 100644 --- a/torchy_baselines/common/vec_env/base_vec_env.py +++ b/torchy_baselines/common/vec_env/base_vec_env.py @@ -244,7 +244,7 @@ def __getattr__(self, name): if blocked_class is not None: own_class = f"{type(self).__module__}.{type(self).__name__}" error_str = (f"Error: Recursive attribute lookup for {name} from {own_class} is " - "ambiguous and hides attribute from {blocked_class}") + "ambiguous and hides attribute from {blocked_class}") raise AttributeError(error_str) return self.getattr_recursive(name) diff --git a/torchy_baselines/common/vec_env/subproc_vec_env.py b/torchy_baselines/common/vec_env/subproc_vec_env.py index 920568377..5e6ee858c 100644 --- a/torchy_baselines/common/vec_env/subproc_vec_env.py +++ b/torchy_baselines/common/vec_env/subproc_vec_env.py @@ -61,11 +61,11 @@ def tile_images(img_nhwc): new_width = int(np.ceil(float(n_images) / new_height)) img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0] * 0 for _ in range(n_images, new_height * new_width)]) # img_HWhwc - out_image = img_nhwc.reshape(new_height, new_width, height, width, n_channels) + out_image = img_nhwc.reshape((new_height, new_width, height, width, n_channels)) # img_HhWwc out_image = out_image.transpose(0, 2, 1, 3, 4) # img_Hh_Ww_c - out_image = out_image.reshape(new_height * height, new_width * width, n_channels) + out_image = out_image.reshape((new_height * height, new_width * width, n_channels)) return out_image diff --git a/torchy_baselines/common/vec_env/vec_normalize.py b/torchy_baselines/common/vec_env/vec_normalize.py index ea94bc009..87fb70ae6 100644 --- a/torchy_baselines/common/vec_env/vec_normalize.py +++ b/torchy_baselines/common/vec_env/vec_normalize.py @@ -86,7 +86,7 @@ def step_wait(self): """ obs, rews, news, infos = self.venv.step_wait() self.old_obs = obs - self.old_rews = rews + self.old_reward = rews if self.training: self.obs_rms.update(obs) @@ -122,7 +122,7 @@ def normalize_reward(self, reward): """ if self.norm_reward: reward = np.clip(reward / np.sqrt(self.ret_rms.var + self.epsilon), - -self.clip_reward, self.clip_reward) + -self.clip_reward, self.clip_reward) return reward def unnormalize_obs(self, obs): @@ -146,7 +146,7 @@ def get_original_reward(self): """ Returns an unnormalized version of the rewards from the most recent step. 
""" - return self.old_rews.copy() + return self.old_reward.copy() def reset(self): """ diff --git a/torchy_baselines/ppo/policies.py b/torchy_baselines/ppo/policies.py index 491737a02..b98977683 100644 --- a/torchy_baselines/ppo/policies.py +++ b/torchy_baselines/ppo/policies.py @@ -9,8 +9,8 @@ from torchy_baselines.common.policies import (BasePolicy, register_policy, MlpExtractor, create_sde_feature_extractor) from torchy_baselines.common.distributions import (make_proba_distribution, Distribution, - DiagGaussianDistribution, CategoricalDistribution, StateDependentNoiseDistribution) - + DiagGaussianDistribution, CategoricalDistribution, + StateDependentNoiseDistribution) class PPOPolicy(BasePolicy): @@ -183,13 +183,15 @@ def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Ten action, _ = self._get_action_dist_from_latent(latent_pi, latent_sde, deterministic=deterministic) return action - def evaluate_actions(self, obs: th.Tensor, actions: th.Tensor, deterministic: bool = False) -> Tuple[th.Tensor, th.Tensor, th.Tensor]: + def evaluate_actions(self, obs: th.Tensor, + actions: th.Tensor, + deterministic: bool = False) -> Tuple[th.Tensor, th.Tensor, th.Tensor]: """ Evaluate actions according to the current policy, given the observations. :param obs: (th.Tensor) - :param action: (th.Tensor) + :param actions: (th.Tensor) :param deterministic: (bool) :return: (th.Tensor, th.Tensor, th.Tensor) estimated value, log likelihood of taking those actions and entropy of the action distribution. diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index a7dcf0bdb..5ac1a9644 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -146,11 +146,11 @@ def _setup_model(self) -> None: self.clip_range_vf = get_schedule_fn(self.clip_range_vf) def collect_rollouts(self, - env: VecEnv, - callback: BaseCallback, - rollout_buffer: RolloutBuffer, - n_rollout_steps: int = 256, - obs: Optional[np.ndarray] = None) -> Tuple[Optional[np.ndarray], bool]: + env: VecEnv, + callback: BaseCallback, + rollout_buffer: RolloutBuffer, + n_rollout_steps: int = 256, + obs: Optional[np.ndarray] = None) -> Tuple[Optional[np.ndarray], bool]: n_steps = 0 continue_training = True @@ -167,7 +167,6 @@ def collect_rollouts(self, continue_training = False return None, continue_training - if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.policy.reset_noise(env.num_envs) @@ -227,7 +226,8 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: values, log_prob, entropy = self.policy.evaluate_actions(rollout_data.observations, actions) values = values.flatten() # Normalize advantage - advantages = (rollout_data.advantages - rollout_data.advantages.mean()) / (rollout_data.advantages.std() + 1e-8) + advantages = (rollout_data.advantages - rollout_data.advantages.mean()) / ( + rollout_data.advantages.std() + 1e-8) # ratio between old and new policy, should be one at the first iteration ratio = th.exp(log_prob - rollout_data.old_log_prob) @@ -242,7 +242,8 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: else: # Clip the different between old and new value # NOTE: this depends on the reward scaling - values_pred = rollout_data.old_values + th.clamp(values - rollout_data.old_values, -clip_range_vf, clip_range_vf) + values_pred = rollout_data.old_values + th.clamp(values - rollout_data.old_values, -clip_range_vf, + clip_range_vf) # Value loss using the TD(gae_lambda) target 
value_loss = F.mse_loss(rollout_data.returns, values_pred) @@ -275,7 +276,6 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: if self.clip_range_vf is not None: logger.logkv("clip_range_vf", clip_range_vf) - logger.logkv("explained_variance", explained_var) # TODO: gather stats for the entropy and other losses? logger.logkv("entropy_loss", entropy_loss.item()) diff --git a/torchy_baselines/sac/policies.py b/torchy_baselines/sac/policies.py index 15a28f2dd..1a1dc33bb 100644 --- a/torchy_baselines/sac/policies.py +++ b/torchy_baselines/sac/policies.py @@ -244,6 +244,7 @@ def forward(self, obs: th.Tensor) -> th.Tensor: def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: return self.actor.forward(observation, deterministic) + MlpPolicy = SACPolicy register_policy("MlpPolicy", MlpPolicy) diff --git a/torchy_baselines/td3/policies.py b/torchy_baselines/td3/policies.py index f75cdea68..3020d268d 100644 --- a/torchy_baselines/td3/policies.py +++ b/torchy_baselines/td3/policies.py @@ -1,12 +1,11 @@ -from typing import List, Tuple, Callable, Optional - import torch import torch as th import torch.nn as nn +from typing import List, Tuple, Optional +from torchy_baselines.common.distributions import StateDependentNoiseDistribution from torchy_baselines.common.policies import BasePolicy, register_policy, create_mlp, BaseNetwork, \ create_sde_feature_extractor -from torchy_baselines.common.distributions import StateDependentNoiseDistribution class Actor(BaseNetwork): diff --git a/torchy_baselines/td3/td3.py b/torchy_baselines/td3/td3.py index ccb835e50..9de88c106 100644 --- a/torchy_baselines/td3/td3.py +++ b/torchy_baselines/td3/td3.py @@ -1,14 +1,12 @@ -from typing import List, Tuple, Type, Union, Callable, Optional, Dict, Any - import torch as th import torch.nn.functional as F -import numpy as np +from typing import List, Tuple, Type, Union, Callable, Optional, Dict, Any from torchy_baselines.common.base_class import OffPolicyRLModel from torchy_baselines.common.buffers import ReplayBuffer -from torchy_baselines.common.type_aliases import ReplayBufferSamples, GymEnv -from torchy_baselines.common.noise import ActionNoise from torchy_baselines.common.callbacks import BaseCallback +from torchy_baselines.common.noise import ActionNoise +from torchy_baselines.common.type_aliases import ReplayBufferSamples, GymEnv from torchy_baselines.td3.policies import TD3Policy @@ -138,9 +136,9 @@ def _create_aliases(self) -> None: self.vf_net = self.policy.vf_net def train_critic(self, gradient_steps: int = 1, - batch_size: int = 100, - replay_data: Optional[ReplayBufferSamples] = None, - tau: float = 0.0) -> None: + batch_size: int = 100, + replay_data: Optional[ReplayBufferSamples] = None, + tau: float = 0.0) -> None: # Update optimizer learning rate self._update_learning_rate(self.critic.optimizer) From b64873ffff1923b02a00c7b683099959a288ff6c Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 12 Mar 2020 12:34:25 +0100 Subject: [PATCH 15/17] Sync callbacks --- docs/misc/changelog.rst | 1 + tests/test_run.py | 2 +- torchy_baselines/a2c/a2c.py | 5 +- torchy_baselines/cem_rl/cem_rl.py | 5 +- torchy_baselines/common/base_class.py | 12 ++-- torchy_baselines/common/callbacks.py | 62 +++++++++++++++------ torchy_baselines/common/type_aliases.py | 5 +- torchy_baselines/common/vec_env/__init__.py | 15 +++-- torchy_baselines/ppo/ppo.py | 12 ++-- torchy_baselines/sac/sac.py | 5 +- torchy_baselines/td3/td3.py | 5 +- 11 files changed, 81 insertions(+), 48 deletions(-) 
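A minimal usage sketch of the synced callback API (illustrative only: ``SimpleCallback``, the ``-200`` reward threshold and the timestep budget are hypothetical; ``EvalCallback``, ``StopTrainingOnRewardThreshold`` and the list-to-``CallbackList`` wrapping are the behaviours touched by this patch):

    import gym

    from torchy_baselines import SAC
    from torchy_baselines.common.callbacks import (BaseCallback, EvalCallback,
                                                   StopTrainingOnRewardThreshold)


    class SimpleCallback(BaseCallback):
        """Hypothetical callback: stop training after 1000 calls to env.step()."""

        def _on_step(self) -> bool:
            # Returning False aborts training early
            return self.num_timesteps < 1000


    # Stop training once the best mean evaluation reward reaches -200
    stop_callback = StopTrainingOnRewardThreshold(reward_threshold=-200, verbose=1)
    # After this patch, EvalCallback wraps a plain gym.Env into a DummyVecEnv itself
    eval_callback = EvalCallback(gym.make('Pendulum-v0'),
                                 callback_on_new_best=stop_callback,
                                 eval_freq=500, n_eval_episodes=5)

    model = SAC('MlpPolicy', 'Pendulum-v0', verbose=1)
    # A list of callbacks is automatically wrapped into a CallbackList
    model.learn(total_timesteps=5000, callback=[SimpleCallback(), eval_callback])
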
diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 3bd119bf5..38a63d5b5 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -17,6 +17,7 @@ New Features: Bug Fixes: ^^^^^^^^^^ +- Synced callbacks with Stable-Baselines Deprecations: ^^^^^^^^^^^^^ diff --git a/tests/test_run.py b/tests/test_run.py index 1a3f99110..db33cc75f 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -23,7 +23,7 @@ def test_cemrl(): @pytest.mark.parametrize("model_class", [A2C, PPO]) @pytest.mark.parametrize("env_id", ['CartPole-v1', 'Pendulum-v0']) def test_onpolicy(model_class, env_id): - model = model_class('MlpPolicy', env_id, policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True) + model = model_class('MlpPolicy', env_id, seed=0, policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True) model.learn(total_timesteps=1000, eval_freq=500) diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index 5f9730df2..252f8bf0b 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -4,8 +4,7 @@ from typing import Type, Union, Callable, Optional, Dict, Any from torchy_baselines.common import logger -from torchy_baselines.common.callbacks import BaseCallback -from torchy_baselines.common.type_aliases import GymEnv +from torchy_baselines.common.type_aliases import GymEnv, MaybeCallback from torchy_baselines.common.utils import explained_variance from torchy_baselines.ppo.policies import PPOPolicy from torchy_baselines.ppo.ppo import PPO @@ -154,7 +153,7 @@ def train(self, gradient_steps: int, batch_size: Optional[int] = None) -> None: def learn(self, total_timesteps: int, - callback: Optional[BaseCallback] = None, + callback: MaybeCallback = None, log_interval: int = 100, eval_env: Optional[GymEnv] = None, eval_freq: int = -1, diff --git a/torchy_baselines/cem_rl/cem_rl.py b/torchy_baselines/cem_rl/cem_rl.py index 867f2c418..0069aef0c 100644 --- a/torchy_baselines/cem_rl/cem_rl.py +++ b/torchy_baselines/cem_rl/cem_rl.py @@ -3,8 +3,7 @@ import torch as th from torchy_baselines.common.base_class import OffPolicyRLModel -from torchy_baselines.common.callbacks import BaseCallback -from torchy_baselines.common.type_aliases import GymEnv +from torchy_baselines.common.type_aliases import GymEnv, MaybeCallback from torchy_baselines.common.noise import ActionNoise from torchy_baselines.td3.td3 import TD3, TD3Policy from torchy_baselines.cem_rl.cem import CEM @@ -121,7 +120,7 @@ def _setup_model(self) -> None: def learn(self, total_timesteps: int, - callback: Optional[BaseCallback] = None, + callback: MaybeCallback = None, log_interval: int = 4, eval_env: Optional[GymEnv] = None, eval_freq: int = -1, diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index f1d205a0e..19ea791d2 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -16,7 +16,7 @@ from torchy_baselines.common.utils import set_random_seed, get_schedule_fn, update_learning_rate from torchy_baselines.common.vec_env import DummyVecEnv, VecEnv, unwrap_vec_normalize, VecNormalize from torchy_baselines.common.save_util import data_to_json, json_to_data, recursive_getattr, recursive_setattr -from torchy_baselines.common.type_aliases import GymEnv, TensorDict, RolloutReturn +from torchy_baselines.common.type_aliases import GymEnv, TensorDict, RolloutReturn, MaybeCallback from torchy_baselines.common.callbacks import BaseCallback, CallbackList, ConvertCallback, EvalCallback from 
torchy_baselines.common.monitor import Monitor from torchy_baselines.common.noise import ActionNoise @@ -281,7 +281,7 @@ def get_torch_variables(self) -> Tuple[List[str], List[str]]: @abstractmethod def learn(self, total_timesteps: int, - callback: Union[None, Callable, List[BaseCallback], BaseCallback] = None, + callback: MaybeCallback = None, log_interval: int = 100, tb_log_name: str = "run", eval_env: Optional[GymEnv] = None, @@ -877,10 +877,6 @@ def collect_rollouts(self, while not done: - # Only stop training if return value is False, not when it is None. - if callback() is False: - return RolloutReturn(0.0, total_steps, total_episodes, None, continue_training=False) - if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.actor.reset_noise() @@ -913,6 +909,10 @@ def collect_rollouts(self, # Rescale and perform action new_obs, reward, done, infos = env.step(self.unscale_action(clipped_action)) + # Only stop training if return value is False, not when it is None. + if callback.on_step() is False: + return RolloutReturn(0.0, total_steps, total_episodes, None, continue_training=False) + episode_reward += reward # Retrieve reward and episode length if using Monitor wrapper diff --git a/torchy_baselines/common/callbacks.py b/torchy_baselines/common/callbacks.py index 392716c96..1a8403ac3 100644 --- a/torchy_baselines/common/callbacks.py +++ b/torchy_baselines/common/callbacks.py @@ -1,12 +1,13 @@ import os from abc import ABC, abstractmethod +import warnings import typing from typing import Union, List, Dict, Any, Optional import gym import numpy as np -from torchy_baselines.common.vec_env import VecEnv, sync_envs_normalization +from torchy_baselines.common.vec_env import DummyVecEnv, VecEnv, sync_envs_normalization from torchy_baselines.common.evaluation import evaluate_policy from torchy_baselines.common.logger import Logger @@ -22,9 +23,13 @@ class BaseCallback(ABC): """ def __init__(self, verbose: int = 0): super(BaseCallback, self).__init__() + # The RL model self.model = None # type: Optional[BaseRLModel] + # An alias for self.model.get_env(), the environment used for training self.training_env = None # type: Union[gym.Env, VecEnv, None] + # Number of time the callback was called self.n_calls = 0 # type: int + # n_envs * n times env.step() was called self.num_timesteps = 0 # type: int self.verbose = verbose self.locals = None # type: Optional[Dict[str, Any]] @@ -70,9 +75,13 @@ def _on_step(self) -> bool: """ return True - def __call__(self) -> bool: + def on_step(self) -> bool: """ - This method will be called by the model. This is the equivalent to the callback function. + This method will be called by the model after each call to ``env.step()``. + + For child callback (of an ``EventCallback``), this will be called + when the event is triggered. + :return: (bool) If the callback returns False, training is aborted early. """ self.n_calls += 1 @@ -128,6 +137,12 @@ def _on_step(self) -> bool: class CallbackList(BaseCallback): + """ + Class for chaining callbacks. + + :param callbacks: (List[BaseCallback]) A list of callbacks that will be called + sequentially. 
+ """ def __init__(self, callbacks: List[BaseCallback]): super(CallbackList, self).__init__() assert isinstance(callbacks, list) @@ -141,16 +156,21 @@ def _on_training_start(self) -> None: for callback in self.callbacks: callback.on_training_start(self.locals, self.globals) + def _on_rollout_start(self) -> None: + for callback in self.callbacks: + callback.on_rollout_start() + def _on_step(self) -> bool: continue_training = True for callback in self.callbacks: - # # Update variables - # callback.num_timesteps = self.num_timesteps - # callback.n_calls = self.n_calls # Return False (stop training) if at least one callback returns False - continue_training = callback() and continue_training + continue_training = callback.on_step() and continue_training return continue_training + def _on_rollout_end(self) -> None: + for callback in self.callbacks: + callback.on_rollout_end() + def _on_training_end(self) -> None: for callback in self.callbacks: callback.on_training_end() @@ -158,7 +178,7 @@ def _on_training_end(self) -> None: class CheckpointCallback(BaseCallback): """ - Callback for saving a model every `save_freq` steps + Callback for saving a model every ``save_freq`` steps :param save_freq: (int) :param save_path: (str) Path to the folder where the model will be saved. @@ -207,16 +227,17 @@ class EvalCallback(EventCallback): :param eval_env: (Union[gym.Env, VecEnv]) The environment used for initialization :param callback_on_new_best: (Optional[BaseCallback]) Callback to trigger - when there is a new best model according to the `mean_reward` + when there is a new best model according to the ``mean_reward`` :param n_eval_episodes: (int) The number of episodes to test the agent :param eval_freq: (int) Evaluate the agent every eval_freq call of the callback. - :param log_path: (str) Path to a folder where the evaluations (`evaluations.npz`) + :param log_path: (str) Path to a folder where the evaluations (``evaluations.npz``) will be saved. It will be updated at each evaluation. :param best_model_save_path: (str) Path to a folder where the best model according to performance on the eval env will be saved. :param deterministic: (bool) Whether the evaluation should use a stochastic or deterministic actions. 
:param deterministic: (bool) Whether to render or not the environment during evaluation + :param render: (bool) Whether to render or not the environment during evaluation :param verbose: (int) """ def __init__(self, eval_env: Union[gym.Env, VecEnv], @@ -236,12 +257,16 @@ def __init__(self, eval_env: Union[gym.Env, VecEnv], self.deterministic = deterministic self.render = render + # Convert to VecEnv for consistency + if not isinstance(eval_env, VecEnv): + eval_env = DummyVecEnv([lambda: eval_env]) + if isinstance(eval_env, VecEnv): assert eval_env.num_envs == 1, "You must pass only one environment for evaluation" self.eval_env = eval_env self.best_model_save_path = best_model_save_path - # Logs will be written in `evaluations.npz` + # Logs will be written in ``evaluations.npz`` if log_path is not None: log_path = os.path.join(log_path, 'evaluations') self.log_path = log_path @@ -250,9 +275,10 @@ def __init__(self, eval_env: Union[gym.Env, VecEnv], self.evaluations_length = [] def _init_callback(self): - # Does not work when eval_env is a gym.Env and training_env is a VecEnv - # assert type(self.training_env) is type(self.eval_env), ("training and eval env are not of the same type", - # "{} != {}".format(self.training_env, self.eval_env)) + # Does not work in some corner cases, where the wrapper is not the same + if not type(self.training_env) is type(self.eval_env): + warnings.warn("Training and eval env are not of the same type" + f"{self.training_env} != {self.eval_env}") # Create folders if needed if self.best_model_save_path is not None: @@ -306,7 +332,7 @@ class StopTrainingOnRewardThreshold(BaseCallback): Stop the training once a threshold in episodic reward has been reached (i.e. when the model is good enough). - It must be used with the `EvalCallback`. + It must be used with the ``EvalCallback``. :param reward_threshold: (float) Minimum expected reward per episode to stop training. @@ -317,8 +343,8 @@ def __init__(self, reward_threshold: float, verbose: int = 0): self.reward_threshold = reward_threshold def _on_step(self) -> bool: - assert self.parent is not None, ("`StopTrainingOnMinimumReward` callback must be used " - "with an `EvalCallback`") + assert self.parent is not None, ("``StopTrainingOnMinimumReward`` callback must be used " + "with an ``EvalCallback``") # Convert np.bool to bool, otherwise callback() is False won't work continue_training = bool(self.parent.best_mean_reward < self.reward_threshold) if self.verbose > 0 and not continue_training: @@ -329,7 +355,7 @@ def _on_step(self) -> bool: class EveryNTimesteps(EventCallback): """ - Trigger a callback every `n_steps` timesteps + Trigger a callback every ``n_steps`` timesteps :param n_steps: (int) Number of timesteps between two trigger. 
:param callback: (BaseCallback) Callback that will be called diff --git a/torchy_baselines/common/type_aliases.py b/torchy_baselines/common/type_aliases.py index 60042be0e..53c152c81 100644 --- a/torchy_baselines/common/type_aliases.py +++ b/torchy_baselines/common/type_aliases.py @@ -1,18 +1,21 @@ """ Common aliases for type hint """ -from typing import Union, Dict, Any, NamedTuple, Optional +import typing +from typing import Union, Dict, Any, NamedTuple, Optional, List, Callable import numpy as np import torch as th import gym from torchy_baselines.common.vec_env import VecEnv +from torchy_baselines.common.callbacks import BaseCallback GymEnv = Union[gym.Env, VecEnv] TensorDict = Dict[str, th.Tensor] OptimizerStateDict = Dict[str, Any] +MaybeCallback = Union[None, Callable, List[BaseCallback], BaseCallback] class RolloutBufferSamples(NamedTuple): diff --git a/torchy_baselines/common/vec_env/__init__.py b/torchy_baselines/common/vec_env/__init__.py index 38099af44..2cbb34992 100644 --- a/torchy_baselines/common/vec_env/__init__.py +++ b/torchy_baselines/common/vec_env/__init__.py @@ -1,4 +1,6 @@ # flake8: noqa F401 +import typing +from typing import Optional from copy import deepcopy from torchy_baselines.common.vec_env.base_vec_env import AlreadySteppingError, NotSteppingError,\ @@ -8,8 +10,12 @@ from torchy_baselines.common.vec_env.vec_frame_stack import VecFrameStack from torchy_baselines.common.vec_env.vec_normalize import VecNormalize +# Avoid circular import +if typing.TYPE_CHECKING: + from torchy_baselines.common.type_aliases import GymEnv -def unwrap_vec_normalize(env): + +def unwrap_vec_normalize(env: 'GymEnv') -> Optional[VecNormalize]: """ :param env: (gym.Env) :return: (VecNormalize) @@ -23,16 +29,17 @@ def unwrap_vec_normalize(env): # Define here to avoid circular import -def sync_envs_normalization(env, eval_env): +def sync_envs_normalization(env: 'GymEnv', eval_env: 'GymEnv') -> None: """ Sync eval env and train env when using VecNormalize - :param env: (gym.Env) - :param eval_env: (gym.Env) + :param env: (GymEnv) + :param eval_env: (GymEnv) """ env_tmp, eval_env_tmp = env, eval_env while isinstance(env_tmp, VecEnvWrapper): if isinstance(env_tmp, VecNormalize): eval_env_tmp.obs_rms = deepcopy(env_tmp.obs_rms) + eval_env_tmp.ret_rms = deepcopy(env_tmp.ret_rms) env_tmp = env_tmp.venv eval_env_tmp = eval_env_tmp.venv diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index 5ac1a9644..7b7cd74ab 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -16,7 +16,7 @@ from torchy_baselines.common import logger from torchy_baselines.common.base_class import BaseRLModel -from torchy_baselines.common.type_aliases import GymEnv +from torchy_baselines.common.type_aliases import GymEnv, MaybeCallback from torchy_baselines.common.buffers import RolloutBuffer from torchy_baselines.common.utils import explained_variance, get_schedule_fn from torchy_baselines.common.vec_env import VecEnv @@ -163,10 +163,6 @@ def collect_rollouts(self, while n_steps < n_rollout_steps: - if callback() is False: - continue_training = False - return None, continue_training - if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.policy.reset_noise(env.num_envs) @@ -182,6 +178,10 @@ def collect_rollouts(self, clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) new_obs, rewards, dones, infos = env.step(clipped_actions) + if callback.on_step() is False: + continue_training = False 
+ return None, continue_training + self._update_info_buffer(infos) n_steps += 1 self.num_timesteps += env.num_envs @@ -286,7 +286,7 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: def learn(self, total_timesteps: int, - callback: Optional[BaseCallback] = None, + callback: MaybeCallback = None, log_interval: int = 1, eval_env: Optional[GymEnv] = None, eval_freq: int = -1, diff --git a/torchy_baselines/sac/sac.py b/torchy_baselines/sac/sac.py index d0309e8b0..9d517b3b2 100644 --- a/torchy_baselines/sac/sac.py +++ b/torchy_baselines/sac/sac.py @@ -7,9 +7,8 @@ from torchy_baselines.common import logger from torchy_baselines.common.base_class import OffPolicyRLModel from torchy_baselines.common.buffers import ReplayBuffer -from torchy_baselines.common.type_aliases import GymEnv +from torchy_baselines.common.type_aliases import GymEnv, MaybeCallback from torchy_baselines.common.noise import ActionNoise -from torchy_baselines.common.callbacks import BaseCallback from torchy_baselines.sac.policies import SACPolicy @@ -253,7 +252,7 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: def learn(self, total_timesteps: int, - callback: Optional[BaseCallback] = None, + callback: MaybeCallback = None, log_interval: int = 4, eval_env: Optional[GymEnv] = None, eval_freq: int = -1, diff --git a/torchy_baselines/td3/td3.py b/torchy_baselines/td3/td3.py index 9de88c106..c2cf27836 100644 --- a/torchy_baselines/td3/td3.py +++ b/torchy_baselines/td3/td3.py @@ -4,9 +4,8 @@ from torchy_baselines.common.base_class import OffPolicyRLModel from torchy_baselines.common.buffers import ReplayBuffer -from torchy_baselines.common.callbacks import BaseCallback from torchy_baselines.common.noise import ActionNoise -from torchy_baselines.common.type_aliases import ReplayBufferSamples, GymEnv +from torchy_baselines.common.type_aliases import ReplayBufferSamples, GymEnv, MaybeCallback from torchy_baselines.td3.policies import TD3Policy @@ -264,7 +263,7 @@ def train_sde(self) -> None: def learn(self, total_timesteps: int, - callback: Optional[BaseCallback] = None, + callback: MaybeCallback = None, log_interval: int = 4, eval_env: Optional[GymEnv] = None, eval_freq: int = -1, From 765d8fc5b268a948e19e38fb0c8884d1b0b051e1 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 12 Mar 2020 13:24:11 +0100 Subject: [PATCH 16/17] Fix event callback --- torchy_baselines/common/callbacks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchy_baselines/common/callbacks.py b/torchy_baselines/common/callbacks.py index 1a8403ac3..b5a015d1f 100644 --- a/torchy_baselines/common/callbacks.py +++ b/torchy_baselines/common/callbacks.py @@ -129,7 +129,7 @@ def _on_training_start(self) -> None: def _on_event(self) -> bool: if self.callback is not None: - return self.callback() + return self.callback.on_step() return True def _on_step(self) -> bool: From 70e601c03cdb0135438d621a19f5f2fcf514de18 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 12 Mar 2020 15:34:35 +0100 Subject: [PATCH 17/17] Improve code and bump version --- setup.py | 2 +- tests/test_vec_normalize.py | 12 +++++++- torchy_baselines/__init__.py | 2 +- torchy_baselines/a2c/a2c.py | 2 +- torchy_baselines/cem_rl/cem_rl.py | 2 +- torchy_baselines/ppo/policies.py | 4 +-- torchy_baselines/ppo/ppo.py | 6 ++-- torchy_baselines/sac/policies.py | 46 +++++++++++++------------------ torchy_baselines/sac/sac.py | 2 +- torchy_baselines/td3/policies.py | 12 ++++---- torchy_baselines/td3/td3.py | 2 +- 11 files changed, 47 
insertions(+), 45 deletions(-) diff --git a/setup.py b/setup.py index 1fa044cc6..044d839b1 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ license="MIT", long_description="", long_description_content_type='text/markdown', - version="0.2.2", + version="0.2.3", ) # python setup.py sdist diff --git a/tests/test_vec_normalize.py b/tests/test_vec_normalize.py index 75fd3a857..d80462c07 100644 --- a/tests/test_vec_normalize.py +++ b/tests/test_vec_normalize.py @@ -3,7 +3,7 @@ import numpy as np from torchy_baselines.common.running_mean_std import RunningMeanStd -from torchy_baselines.common.vec_env import DummyVecEnv, VecNormalize, VecFrameStack, sync_envs_normalization +from torchy_baselines.common.vec_env import DummyVecEnv, VecNormalize, VecFrameStack, sync_envs_normalization, unwrap_vec_normalize from torchy_baselines import CEMRL, SAC, TD3 ENV_ID = 'Pendulum-v0' @@ -132,9 +132,17 @@ def test_offpolicy_normalization(model_class): def test_sync_vec_normalize(): env = DummyVecEnv([make_env]) + + assert unwrap_vec_normalize(env) is None + env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.) + + assert isinstance(unwrap_vec_normalize(env), VecNormalize) + env = VecFrameStack(env, 1) + assert isinstance(unwrap_vec_normalize(env), VecNormalize) + eval_env = DummyVecEnv([make_env]) eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.) eval_env = VecFrameStack(eval_env, 1) @@ -146,6 +154,7 @@ def test_sync_vec_normalize(): obs = env.reset() original_obs = env.get_original_obs() + dummy_rewards = np.random.rand(10) # Normalization must be different assert not np.allclose(obs, eval_env.normalize_obs(original_obs)) @@ -153,3 +162,4 @@ def test_sync_vec_normalize(): # Now they must be synced assert np.allclose(obs, eval_env.normalize_obs(original_obs)) + assert np.allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards)) diff --git a/torchy_baselines/__init__.py b/torchy_baselines/__init__.py index a548d3037..8c6637646 100644 --- a/torchy_baselines/__init__.py +++ b/torchy_baselines/__init__.py @@ -4,4 +4,4 @@ from torchy_baselines.sac import SAC from torchy_baselines.td3 import TD3 -__version__ = "0.2.2" +__version__ = "0.2.3" diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index 252f8bf0b..6b2c22339 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -43,7 +43,7 @@ class A2C(PPO): :param create_eval_env: (bool) Whether to create a second environment that will be used for evaluating the agent periodically. (Only available when passing string for the environment) :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug + :param verbose: (int) the verbosity level: 0 no output, 1 info, 2 debug :param seed: (int) Seed for the pseudo random generators :param device: (str or th.device) Device (cpu, cuda, ...) on which the code should be run. Setting it to auto, the code will be run on the GPU if possible. diff --git a/torchy_baselines/cem_rl/cem_rl.py b/torchy_baselines/cem_rl/cem_rl.py index 0069aef0c..585db55ab 100644 --- a/torchy_baselines/cem_rl/cem_rl.py +++ b/torchy_baselines/cem_rl/cem_rl.py @@ -50,7 +50,7 @@ class CEMRL(TD3): :param create_eval_env: (bool) Whether to create a second environment that will be used for evaluating the agent periodically. 
(Only available when passing string for the environment) :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug + :param verbose: (int) the verbosity level: 0 no output, 1 info, 2 debug :param seed: (int) Seed for the pseudo random generators :param device: (str or th.device) Device (cpu, cuda, ...) on which the code should be run. Setting it to auto, the code will be run on the GPU if possible. diff --git a/torchy_baselines/ppo/policies.py b/torchy_baselines/ppo/policies.py index b98977683..2d4904a61 100644 --- a/torchy_baselines/ppo/policies.py +++ b/torchy_baselines/ppo/policies.py @@ -32,9 +32,9 @@ class PPOPolicy(BasePolicy): :param sde_net_arch: ([int]) Network architecture for extracting features when using SDE. If None, the latent features from the policy will be used. Pass an empty list to use the states as features. - :param use_expln: (bool) Use `expln()` function instead of `exp()` to ensure + :param use_expln: (bool) Use ``expln()`` function instead of ``exp()`` to ensure a positive standard deviation (cf paper). It allows to keep variance - above zero and prevent it from growing too fast. In practice, `exp()` is usually enough. + above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough. :param squash_output: (bool) Whether to squash the output using a tanh function, this allows to ensure boundaries when using SDE. """ diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index 7b7cd74ab..2ee6a7538 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -67,7 +67,7 @@ class PPO(BaseRLModel): :param create_eval_env: (bool) Whether to create a second environment that will be used for evaluating the agent periodically. (Only available when passing string for the environment) :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug + :param verbose: (int) the verbosity level: 0 no output, 1 info, 2 debug :param seed: (int) Seed for the pseudo random generators :param device: (str or th.device) Device (cpu, cuda, ...) on which the code should be run. Setting it to auto, the code will be run on the GPU if possible. @@ -226,8 +226,8 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: values, log_prob, entropy = self.policy.evaluate_actions(rollout_data.observations, actions) values = values.flatten() # Normalize advantage - advantages = (rollout_data.advantages - rollout_data.advantages.mean()) / ( - rollout_data.advantages.std() + 1e-8) + advantages = rollout_data.advantages + advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) # ratio between old and new policy, should be one at the first iteration ratio = th.exp(log_prob - rollout_data.old_log_prob) diff --git a/torchy_baselines/sac/policies.py b/torchy_baselines/sac/policies.py index 1a1dc33bb..00d7457bd 100644 --- a/torchy_baselines/sac/policies.py +++ b/torchy_baselines/sac/policies.py @@ -28,9 +28,9 @@ class Actor(BaseNetwork): :param sde_net_arch: ([int]) Network architecture for extracting features when using SDE. If None, the latent features from the policy will be used. Pass an empty list to use the states as features. 
- :param use_expln: (bool) Use `expln()` function instead of `exp()` when using SDE to ensure + :param use_expln: (bool) Use ``expln()`` function instead of ``exp()`` when using SDE to ensure a positive standard deviation (cf paper). It allows to keep variance - above zero and prevent it from growing too fast. In practice, `exp()` is usually enough. + above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough. :param clip_mean: (float) Clip the mean output when using SDE to avoid numerical instability. """ def __init__(self, obs_dim: int, @@ -75,8 +75,8 @@ def get_std(self) -> th.Tensor: """ Retrieve the standard deviation of the action distribution. Only useful when using SDE. - It corresponds to `th.exp(log_std)` in the normal case, - but is slightly different when using `expln` function + It corresponds to ``th.exp(log_std)`` in the normal case, + but is slightly different when using ``expln`` function (cf StateDependentNoiseDistribution doc). :return: (th.Tensor) @@ -96,43 +96,35 @@ def reset_noise(self, batch_size: int = 1) -> None: def _get_latent(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: latent_pi = self.latent_pi(obs) - if self.sde_feature_extractor is not None: - latent_sde = self.sde_feature_extractor(obs) - else: - latent_sde = latent_pi + latent_sde = self.sde_feature_extractor(obs) if self.sde_feature_extractor is not None else latent_pi + return latent_pi, latent_sde def get_action_dist_params(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]: latent_pi, latent_sde = self._get_latent(obs) + mean_actions = self.mu(latent_pi) if self.use_sde: - mean_actions, log_std = self.mu(latent_pi), self.log_std + log_std = self.log_std else: - mean_actions, log_std = self.mu(latent_pi), self.log_std(latent_pi) + log_std = self.log_std(latent_pi) # Original Implementation to cap the standard deviation log_std = th.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX) return mean_actions, log_std, latent_sde def forward(self, obs: th.Tensor, deterministic: bool = False) -> th.Tensor: mean_actions, log_std, latent_sde = self.get_action_dist_params(obs) - if self.use_sde: - # Note: the action is squashed - action, _ = self.action_dist.proba_distribution(mean_actions, log_std, latent_sde, - deterministic=deterministic) - else: - # Note: the action is squashed - action, _ = self.action_dist.proba_distribution(mean_actions, log_std, - deterministic=deterministic) + kwargs = dict(latent_sde=latent_sde) if self.use_sde else {} + # Note: the action is squashed + action, _ = self.action_dist.proba_distribution(mean_actions, log_std, + deterministic=deterministic, **kwargs) return action def action_log_prob(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: mean_actions, log_std, latent_sde = self.get_action_dist_params(obs) - - if self.use_sde: - action, log_prob = self.action_dist.log_prob_from_params(mean_actions, self.log_std, latent_sde) - else: - action, log_prob = self.action_dist.log_prob_from_params(mean_actions, log_std) - return action, log_prob + kwargs = dict(latent_sde=latent_sde) if self.use_sde else {} + # return action and associated log prob + return self.action_dist.log_prob_from_params(mean_actions, log_std, **kwargs) class Critic(BaseNetwork): @@ -178,9 +170,9 @@ class SACPolicy(BasePolicy): :param sde_net_arch: ([int]) Network architecture for extracting features when using SDE. If None, the latent features from the policy will be used. Pass an empty list to use the states as features. 
- :param use_expln: (bool) Use `expln()` function instead of `exp()` when using SDE to ensure + :param use_expln: (bool) Use ``expln()`` function instead of ``exp()`` when using SDE to ensure a positive standard deviation (cf paper). It allows to keep variance - above zero and prevent it from growing too fast. In practice, `exp()` is usually enough. + above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough. :param clip_mean: (float) Clip the mean output when using SDE to avoid numerical instability. """ def __init__(self, observation_space: gym.spaces.Space, @@ -239,7 +231,7 @@ def make_critic(self) -> Critic: return Critic(**self.net_args).to(self.device) def forward(self, obs: th.Tensor) -> th.Tensor: - return self.actor(obs) + return self.predict(obs, deterministic=False) def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: return self.actor.forward(observation, deterministic) diff --git a/torchy_baselines/sac/sac.py b/torchy_baselines/sac/sac.py index 9d517b3b2..1696bad2d 100644 --- a/torchy_baselines/sac/sac.py +++ b/torchy_baselines/sac/sac.py @@ -56,7 +56,7 @@ class SAC(OffPolicyRLModel): :param create_eval_env: (bool) Whether to create a second environment that will be used for evaluating the agent periodically. (Only available when passing string for the environment) :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug + :param verbose: (int) the verbosity level: 0 no output, 1 info, 2 debug :param seed: (int) Seed for the pseudo random generators :param device: (str or th.device) Device (cpu, cuda, ...) on which the code should be run. Setting it to auto, the code will be run on the GPU if possible. diff --git a/torchy_baselines/td3/policies.py b/torchy_baselines/td3/policies.py index 3020d268d..fc86c7790 100644 --- a/torchy_baselines/td3/policies.py +++ b/torchy_baselines/td3/policies.py @@ -25,9 +25,9 @@ class Actor(BaseNetwork): :param sde_net_arch: ([int]) Network architecture for extracting features when using SDE. If None, the latent features from the policy will be used. Pass an empty list to use the states as features. - :param use_expln: (bool) Use `expln()` function instead of `exp()` when using SDE to ensure + :param use_expln: (bool) Use ``expln()`` function instead of ``exp()`` when using SDE to ensure a positive standard deviation (cf paper). It allows to keep variance - above zero and prevent it from growing too fast. In practice, `exp()` is usually enough. + above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough. """ def __init__(self, obs_dim: int, @@ -80,8 +80,8 @@ def get_std(self) -> torch.Tensor: """ Retrieve the standard deviation of the action distribution. Only useful when using SDE. - It corresponds to `th.exp(log_std)` in the normal case, - but is slightly different when using `expln` function + It corresponds to ``th.exp(log_std)`` in the normal case, + but is slightly different when using ``expln`` function (cf StateDependentNoiseDistribution doc). :return: (th.Tensor) @@ -206,9 +206,9 @@ class TD3Policy(BasePolicy): :param sde_net_arch: ([int]) Network architecture for extracting features when using SDE. If None, the latent features from the policy will be used. Pass an empty list to use the states as features. 
- :param use_expln: (bool) Use `expln()` function instead of `exp()` when using SDE to ensure + :param use_expln: (bool) Use ``expln()`` function instead of ``exp()`` when using SDE to ensure a positive standard deviation (cf paper). It allows to keep variance - above zero and prevent it from growing too fast. In practice, `exp()` is usually enough. + above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough. """ def __init__(self, observation_space, action_space, learning_rate, net_arch=None, device='cpu', diff --git a/torchy_baselines/td3/td3.py b/torchy_baselines/td3/td3.py index c2cf27836..b2a7dbe23 100644 --- a/torchy_baselines/td3/td3.py +++ b/torchy_baselines/td3/td3.py @@ -51,7 +51,7 @@ class TD3(OffPolicyRLModel): :param create_eval_env: (bool) Whether to create a second environment that will be used for evaluating the agent periodically. (Only available when passing string for the environment) :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug + :param verbose: (int) the verbosity level: 0 no output, 1 info, 2 debug :param seed: (int) Seed for the pseudo random generators :param device: (str or th.device) Device (cpu, cuda, ...) on which the code should be run. Setting it to auto, the code will be run on the GPU if possible.
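
Editor's note: the patches above rename the callback entry point from ``__call__`` to ``on_step()``, introduce the ``MaybeCallback`` type alias, and extend ``EvalCallback`` (automatic ``DummyVecEnv`` wrapping, ``render`` parameter, child callbacks via ``callback_on_new_best``). The following is a minimal usage sketch of that API, written against the class and method names visible in these diffs; the environment id, reward threshold, and timestep counts are arbitrary illustration values, and the snippet is an outline of intended usage rather than part of any commit.

    import gym

    from torchy_baselines import SAC
    from torchy_baselines.common.callbacks import (BaseCallback, EvalCallback,
                                                   StopTrainingOnRewardThreshold)


    class ProgressCallback(BaseCallback):
        """Custom callback: the model calls ``on_step()`` after every ``env.step()``,
        which in turn dispatches to ``_on_step()``."""

        def _on_step(self) -> bool:
            if self.verbose > 0 and self.n_calls % 1000 == 0:
                print(f"{self.num_timesteps} timesteps so far")
            # Returning False would stop training early
            return True


    # EvalCallback wraps a plain gym.Env into a DummyVecEnv internally (see the diff above).
    eval_env = gym.make('Pendulum-v0')
    # Child callback triggered by its parent EvalCallback when a new best model is found.
    stop_callback = StopTrainingOnRewardThreshold(reward_threshold=-200, verbose=1)
    eval_callback = EvalCallback(eval_env, callback_on_new_best=stop_callback,
                                 eval_freq=500, deterministic=True, render=False)

    model = SAC('MlpPolicy', 'Pendulum-v0', verbose=1)
    # ``callback`` is a MaybeCallback: None, a callable, a BaseCallback, or a list of callbacks
    # (a list is chained through CallbackList).
    model.learn(total_timesteps=5000, callback=[ProgressCallback(verbose=1), eval_callback])
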