From 26ccf499b38e7ca5ad8ce369ad157913a8f0750b Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Fri, 21 Feb 2020 14:50:28 +0100 Subject: [PATCH 01/17] Use normal sampling for SAC --- docs/misc/changelog.rst | 1 + setup.py | 2 +- torchy_baselines/__init__.py | 2 +- torchy_baselines/sac/sac.py | 4 ++-- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 56d0d1bc5..76616ad5f 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -21,6 +21,7 @@ Deprecations: Others: ^^^^^^^ +- SAC with SDE now sample only one matrix Documentation: ^^^^^^^^^^^^^^ diff --git a/setup.py b/setup.py index 0c39da372..8780397de 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ license="MIT", long_description="", long_description_content_type='text/markdown', - version="0.2.0", + version="0.2.1", ) # python setup.py sdist diff --git a/torchy_baselines/__init__.py b/torchy_baselines/__init__.py index cc2889b20..b201dc8ef 100644 --- a/torchy_baselines/__init__.py +++ b/torchy_baselines/__init__.py @@ -4,4 +4,4 @@ from torchy_baselines.sac import SAC from torchy_baselines.td3 import TD3 -__version__ = "0.2.0" +__version__ = "0.2.1" diff --git a/torchy_baselines/sac/sac.py b/torchy_baselines/sac/sac.py index f0930ee21..61a36057e 100644 --- a/torchy_baselines/sac/sac.py +++ b/torchy_baselines/sac/sac.py @@ -171,8 +171,8 @@ def train(self, gradient_steps: int, batch_size: int = 64): # is lost and we cannot backpropagate through again # anyway, we need to sample because `log_std` may have changed between two gradient steps if self.use_sde: - self.actor.reset_noise(batch_size=batch_size) - # self.actor.reset_noise() + # self.actor.reset_noise(batch_size=batch_size) + self.actor.reset_noise() # Action by the current actor for the sampled state action_pi, log_prob = self.actor.action_log_prob(obs) From 67894dab9f806b2815e98c3f1af5052bcc0ada66 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Mon, 9 Mar 2020 19:02:40 +0100 Subject: [PATCH 02/17] Add clip_mean parameter --- torchy_baselines/ppo/ppo.py | 1 - torchy_baselines/sac/policies.py | 67 +++++++++++++------------------- torchy_baselines/sac/sac.py | 9 +---- 3 files changed, 29 insertions(+), 48 deletions(-) diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index 2ce168b1f..6f30f7f70 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -140,7 +140,6 @@ def collect_rollouts(self, continue_training = True rollout_buffer.reset() # Sample new weights for the state dependent exploration - # TODO: ensure episodic setting? 
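# A minimal, self-contained sketch of the gSDE noise handling that the
# surrounding collect_rollouts() code relies on (class and variable names here
# are illustrative assumptions, not objects from the patch): a fresh exploration
# matrix is drawn at the start of collection and then only every
# `sde_sample_freq` steps, so the exploration noise stays smooth in between.
import torch as th
import torch.nn as nn

class TinySDEPolicy(nn.Module):
    def __init__(self, latent_dim: int = 4, action_dim: int = 2):
        super().__init__()
        self.log_std = nn.Parameter(th.zeros(latent_dim, action_dim))
        self.exploration_mat = None

    def reset_noise(self, batch_size: int = 1) -> None:
        # One noise matrix per environment, kept fixed until the next call
        self.exploration_mat = th.randn(batch_size, *self.log_std.shape) * th.exp(self.log_std)

policy, n_envs, sde_sample_freq = TinySDEPolicy(), 8, 4
policy.reset_noise(batch_size=n_envs)           # once when collection starts
for n_steps in range(16):
    if sde_sample_freq > 0 and n_steps % sde_sample_freq == 0:
        policy.reset_noise(batch_size=n_envs)   # periodic resampling during the rollout
    # ... step the environments and store transitions here ...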
if self.use_sde: self.policy.reset_noise(env.num_envs) diff --git a/torchy_baselines/sac/policies.py b/torchy_baselines/sac/policies.py index 4f9673840..e570b570c 100644 --- a/torchy_baselines/sac/policies.py +++ b/torchy_baselines/sac/policies.py @@ -1,3 +1,5 @@ +from typing import Optional, List, Tuple + import torch as th import torch.nn as nn @@ -10,28 +12,6 @@ LOG_STD_MIN = -20 -class LeakyClip(nn.Module): - """ - Cip values outside a certain range - (it is not a hard clip, there is a small slope to have non-zero gradient) - - :param min_val: (float) - :param max_val: (float) - :param slope: (float) - """ - def __init__(self, min_val=-2.0, max_val=2.0, slope=0.01): - super(LeakyClip, self).__init__() - self.min_val = min_val - self.max_val = max_val - self.slope = slope - - def forward(self, x): - linear_part = x * (x >= self.min_val) * (x <= self.max_val) - above_max_val = self.slope * (x - self.max_val) * (x > self.max_val) - below_min_val = self.slope * (x - self.min_val) * (x < self.min_val) - return linear_part + below_min_val + above_max_val - - class Actor(BaseNetwork): """ Actor network (policy) for SAC. @@ -50,10 +30,18 @@ class Actor(BaseNetwork): :param use_expln: (bool) Use `expln()` function instead of `exp()` when using SDE to ensure a positive standard deviation (cf paper). It allows to keep variance above zero and prevent it from growing too fast. In practice, `exp()` is usually enough. + :param clip_mean: (float) Clip the mean output when using SDE to avoid numerical instability. """ - def __init__(self, obs_dim, action_dim, net_arch, activation_fn=nn.ReLU, - use_sde=False, log_std_init=-3, full_std=True, - sde_net_arch=None, use_expln=False): + def __init__(self, obs_dim: int, + action_dim: int, + net_arch: List[int], + activation_fn: nn.Module = nn.ReLU, + use_sde: bool = False, + log_std_init: float = -3, + full_std: bool = True, + sde_net_arch: Optional[List[int]] = None, + use_expln: bool = False, + clip_mean: float = 2.0): super(Actor, self).__init__() latent_pi_net = create_mlp(obs_dim, -1, net_arch, activation_fn) @@ -68,23 +56,21 @@ def __init__(self, obs_dim, action_dim, net_arch, activation_fn=nn.ReLU, self.sde_feature_extractor, latent_sde_dim = create_sde_feature_extractor(obs_dim, sde_net_arch, activation_fn) - # TODO: check for the learn_features self.action_dist = StateDependentNoiseDistribution(action_dim, full_std=full_std, use_expln=use_expln, learn_features=True, squash_output=True) self.mu, self.log_std = self.action_dist.proba_distribution_net(latent_dim=net_arch[-1], latent_sde_dim=latent_sde_dim, log_std_init=log_std_init) - # Avoid saturation by limiting the mean of the Gaussian to be in [-1, 1] - # self.mu = nn.Sequential(self.mu, nn.Tanh()) - self.mu = nn.Sequential(self.mu, nn.Hardtanh(min_val=-2.0, max_val=2.0)) - # Small positive slope to have non-zero gradient - # self.mu = nn.Sequential(self.mu, LeakyClip()) + # Avoid numerical issues by limiting the mean of the Gaussian + # to be in [-clip_mean, clip_mean] + if clip_mean > 0.0: + self.mu = nn.Sequential(self.mu, nn.Hardtanh(min_val=-clip_mean, max_val=clip_mean)) else: self.action_dist = SquashedDiagGaussianDistribution(action_dim) self.mu = nn.Linear(net_arch[-1], action_dim) self.log_std = nn.Linear(net_arch[-1], action_dim) - def get_std(self): + def get_std(self) -> th.Tensor: """ Retrieve the standard deviation of the action distribution. Only useful when using SDE. 
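# A rough illustration of the clip_mean change above (a stand-alone sketch with
# assumed dimensions, not the Actor class itself): wrapping the mean head in
# nn.Hardtanh bounds the Gaussian mean to [-clip_mean, clip_mean] before the
# final tanh squashing, which is how the clip avoids the numerical instability
# mentioned in the docstring.
import torch as th
import torch.nn as nn

latent_dim, action_dim, clip_mean = 64, 6, 2.0
mu = nn.Linear(latent_dim, action_dim)
if clip_mean > 0.0:
    # Hard clipping of the mean; values inside the range keep their gradient
    mu = nn.Sequential(mu, nn.Hardtanh(min_val=-clip_mean, max_val=clip_mean))

mean_actions = mu(th.randn(32, latent_dim))
assert mean_actions.abs().max() <= clip_mean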
@@ -97,7 +83,7 @@ def get_std(self): assert isinstance(self.action_dist, StateDependentNoiseDistribution), 'get_std() is only available when using SDE' return self.action_dist.get_std(self.log_std) - def reset_noise(self, batch_size=1): + def reset_noise(self, batch_size: int = 1) -> None: """ Sample new weights for the exploration matrix, when using SDE. @@ -106,7 +92,7 @@ def reset_noise(self, batch_size=1): assert isinstance(self.action_dist, StateDependentNoiseDistribution), 'reset_noise() is only available when using SDE' self.action_dist.sample_weights(self.log_std, batch_size=batch_size) - def _get_latent(self, obs): + def _get_latent(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: latent_pi = self.latent_pi(obs) if self.sde_feature_extractor is not None: @@ -115,7 +101,7 @@ def _get_latent(self, obs): latent_sde = latent_pi return latent_pi, latent_sde - def get_action_dist_params(self, obs): + def get_action_dist_params(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]: latent_pi, latent_sde = self._get_latent(obs) if self.use_sde: @@ -126,7 +112,7 @@ def get_action_dist_params(self, obs): log_std = th.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX) return mean_actions, log_std, latent_sde - def forward(self, obs, deterministic=False): + def forward(self, obs: th.Tensor, deterministic: bool = False) -> th.Tensor: mean_actions, log_std, latent_sde = self.get_action_dist_params(obs) if self.use_sde: # Note: the action is squashed @@ -138,7 +124,7 @@ def forward(self, obs, deterministic=False): deterministic=deterministic) return action - def action_log_prob(self, obs): + def action_log_prob(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: mean_actions, log_std, latent_sde = self.get_action_dist_params(obs) if self.use_sde: @@ -195,11 +181,13 @@ class SACPolicy(BasePolicy): :param use_expln: (bool) Use `expln()` function instead of `exp()` when using SDE to ensure a positive standard deviation (cf paper). It allows to keep variance above zero and prevent it from growing too fast. In practice, `exp()` is usually enough. + :param clip_mean: (float) Clip the mean output when using SDE to avoid numerical instability. 
""" def __init__(self, observation_space, action_space, learning_rate, net_arch=None, device='cpu', activation_fn=nn.ReLU, use_sde=False, - log_std_init=-3, sde_net_arch=None, use_expln=False): + log_std_init=-3, sde_net_arch=None, + use_expln=False, clip_mean=2.0): super(SACPolicy, self).__init__(observation_space, action_space, device, squash_output=True) if net_arch is None: @@ -220,7 +208,8 @@ def __init__(self, observation_space, action_space, 'use_sde': use_sde, 'log_std_init': log_std_init, 'sde_net_arch': sde_net_arch, - 'use_expln': use_expln + 'use_expln': use_expln, + 'clip_mean': clip_mean } self.actor_kwargs.update(sde_kwargs) self.actor, self.actor_target = None, None diff --git a/torchy_baselines/sac/sac.py b/torchy_baselines/sac/sac.py index 61a36057e..fa6738ba4 100644 --- a/torchy_baselines/sac/sac.py +++ b/torchy_baselines/sac/sac.py @@ -165,13 +165,8 @@ def train(self, gradient_steps: int, batch_size: int = 64): obs, action_batch, next_obs, done, reward = replay_data - # Two options: retain_graph=True in the actor_loss.backward() - # or sample again the noise matrix - # otherwise the intermediate step `std = th.exp(log_std)` - # is lost and we cannot backpropagate through again - # anyway, we need to sample because `log_std` may have changed between two gradient steps + # We need to sample because `log_std` may have changed between two gradient steps if self.use_sde: - # self.actor.reset_noise(batch_size=batch_size) self.actor.reset_noise() # Action by the current actor for the sampled state @@ -196,8 +191,6 @@ def train(self, gradient_steps: int, batch_size: int = 64): self.ent_coef_optimizer.step() with th.no_grad(): - # if self.use_sde: - # self.actor.reset_noise(batch_size=batch_size) # Select action according to policy next_action, next_log_prob = self.actor.action_log_prob(next_obs) # Compute the target Q value From 1e81f38d664af9b108620caf58c89d5e19b89a75 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Mon, 9 Mar 2020 19:05:22 +0100 Subject: [PATCH 03/17] Update changelog --- docs/misc/changelog.rst | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 76616ad5f..b05e73a1a 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -22,6 +22,7 @@ Deprecations: Others: ^^^^^^^ - SAC with SDE now sample only one matrix +- Added ``clip_mean`` parameter to SAC policy Documentation: ^^^^^^^^^^^^^^ @@ -35,25 +36,25 @@ Pre-Release 0.2.0 (2020-02-14) Breaking Changes: ^^^^^^^^^^^^^^^^^ - Python 2 support was dropped, Torchy Baselines now requires Python 3.6 or above -- Return type of `evaluation.evaluate_policy()` has been changed +- Return type of ``evaluation.evaluate_policy()`` has been changed - Refactored the replay buffer to avoid transformation between PyTorch and NumPy - Created `OffPolicyRLModel` base class - Remove deprecated JSON format for `Monitor` New Features: ^^^^^^^^^^^^^ -- Add `seed()` method to `VecEnv` class +- Add ``seed()`` method to ``VecEnv`` class - Add support for Callback (cf https://github.com/hill-a/stable-baselines/pull/644) - Add methods for saving and loading replay buffer -- Add `extend()` method to the buffers -- Add `get_vec_normalize_env()` to `BaseRLModel` to retrieve `VecNormalize` wrapper when it exists -- Add `results_plotter` from Stable Baselines -- Improve `predict()` method to handle different type of observations (single, vectorized, ...) 
+- Add ``extend()`` method to the buffers +- Add ``get_vec_normalize_env()`` to ``BaseRLModel`` to retrieve ``VecNormalize`` wrapper when it exists +- Add ``results_plotter`` from Stable Baselines +- Improve ``predict()`` method to handle different type of observations (single, vectorized, ...) Bug Fixes: ^^^^^^^^^^ - Fix loading model on CPU that were trained on GPU -- Fix `reset_num_timesteps` that was not used +- Fix ``reset_num_timesteps`` that was not used - Fix entropy computation for squashed Gaussian (approximate it now) - Fix seeding when using multiple environments (different seed per env) @@ -64,8 +65,8 @@ Others: ^^^^^^^ - Add type check - Converted all format string to f-strings -- Add test for `OrnsteinUhlenbeckActionNoise` -- Add type aliases in `common.type_aliases` +- Add test for ``OrnsteinUhlenbeckActionNoise`` +- Add type aliases in ``common.type_aliases`` Documentation: ^^^^^^^^^^^^^^ @@ -81,7 +82,7 @@ Breaking Changes: New Features: ^^^^^^^^^^^^^ -- Initial release of A2C, CEM-RL, PPO, SAC and TD3, working only with `Box` input space +- Initial release of A2C, CEM-RL, PPO, SAC and TD3, working only with ``Box`` input space - State-Dependent Exploration (SDE) for A2C, PPO, SAC and TD3 Bug Fixes: @@ -111,4 +112,12 @@ Contributors: ------------- In random order... -Thanks to @hill-a @enerijunior @AdamGleave @Miffyli +Thanks to the maintainers of V2: @hill-a @enerijunior @AdamGleave @Miffyli + +And all the contributors: +@bjmuld @iambenzo @iandanforth @r7vme @brendenpetersen @huvar @abhiskk @JohannesAck +@EliasHasle @mrakgr @Bleyddyn @antoine-galataud @junhyeokahn @AdamGleave @keshaviyengar @tperol +@XMaster96 @kantneel @Pastafarianist @GerardMaggiolino @PatrickWalter214 @yutingsz @sc420 @Aaahh @billtubbs +@Miffyli @dwiel @miguelrass @qxcv @jaberkow @eavelardev @ruifeng96150 @pedrohbtp @srivatsankrishnan @evilsocket +@MarvineGothic @jdossgollin @SyllogismRXS @rusu24edward @jbulow @Antymon @seheevic @justinkterry @edbeeching +@flodorner @KuKuXia @NeoExtended @solliet @mmcenta @richardwu From fb4e66213d2a36d78706cc5cafd5220909cdd789 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 10 Mar 2020 16:43:10 +0100 Subject: [PATCH 04/17] Use NamedTuple for buffers --- torchy_baselines/a2c/a2c.py | 20 ++++++++++---------- torchy_baselines/common/base_class.py | 1 - torchy_baselines/common/buffers.py | 13 +++++++------ torchy_baselines/common/type_aliases.py | 20 +++++++++++++++++--- torchy_baselines/ppo/ppo.py | 25 ++++++++++++------------- torchy_baselines/sac/sac.py | 14 ++++++-------- torchy_baselines/td3/td3.py | 20 ++++++++------------ 7 files changed, 60 insertions(+), 53 deletions(-) diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index 01cbd3993..ea27d715f 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -81,30 +81,30 @@ def train(self, gradient_steps: int, batch_size=None): # Update optimizer learning rate self._update_learning_rate(self.policy.optimizer) # A2C with gradient_steps > 1 does not make sense - assert gradient_steps == 1 + assert gradient_steps == 1, "A2C does not support multiple gradient steps" # We do not use minibatches for A2C - assert batch_size is None + assert batch_size is None, "A2C does not support minibatch" for rollout_data in self.rollout_buffer.get(batch_size=None): - # Unpack - obs, action, _, _, advantage, return_batch = rollout_data + actions = rollout_data.actions if isinstance(self.action_space, spaces.Discrete): - # Convert discrete action for float to long - action = 
action.long().flatten() + # Convert discrete action from float to long + actions = actions.long().flatten() # TODO: avoid second computation of everything because of the gradient - values, log_prob, entropy = self.policy.evaluate_actions(obs, action) + values, log_prob, entropy = self.policy.evaluate_actions(rollout_data.observations, actions) values = values.flatten() # Normalize advantage (not present in the original implementation) if self.normalize_advantage: - advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8) + advantages = (rollout_data.advantages - rollout_data.advantages.mean()) / (rollout_data.advantages.std() + 1e-8) - policy_loss = -(advantage * log_prob).mean() + # Policy gradient loss + policy_loss = -(advantages * log_prob).mean() # Value loss using the TD(gae_lambda) target - value_loss = F.mse_loss(return_batch, values) + value_loss = F.mse_loss(rollout_data.returns, values) # Entropy loss favor exploration if entropy is None: diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index 6b3fb6a52..b01002576 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -956,7 +956,6 @@ def collect_rollouts(self, total_episodes += 1 episode_rewards.append(episode_reward) total_timesteps.append(episode_timesteps) - # TODO: reset SDE matrix at the end of the episode? if action_noise is not None: action_noise.reset() diff --git a/torchy_baselines/common/buffers.py b/torchy_baselines/common/buffers.py index c6e9d5c69..6ffe479a2 100644 --- a/torchy_baselines/common/buffers.py +++ b/torchy_baselines/common/buffers.py @@ -1,4 +1,4 @@ -from typing import Union, Optional, Tuple, Generator +from typing import Union, Optional, Generator import numpy as np import torch as th @@ -80,11 +80,12 @@ def reset(self) -> None: def sample(self, batch_size: int, env: Optional[VecNormalize] = None - ) -> Tuple[th.Tensor, ...]: + ): """ :param batch_size: (int) Number of element to sample :param env: (Optional[VecNormalize]) associated gym VecEnv to normalize the observations/rewards when sampling + :return: (Union[RolloutBufferSamples, ReplayBufferSamples]) """ upper_bound = self.buffer_size if self.full else self.pos batch_inds = np.random.randint(0, upper_bound, size=batch_size) @@ -93,11 +94,11 @@ def sample(self, def _get_samples(self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None - ) -> Tuple[th.Tensor, ...]: + ): """ :param batch_inds: (th.Tensor) :param env: (Optional[VecNormalize]) - :return: ([th.Tensor]) + :return: (Union[RolloutBufferSamples, ReplayBufferSamples]) """ raise NotImplementedError() @@ -184,7 +185,7 @@ def _get_samples(self, self._normalize_obs(self.next_observations[batch_inds, 0, :], env), self.dones[batch_inds], self._normalize_reward(self.rewards[batch_inds], env)) - return tuple(map(self.to_torch, data)) + return ReplayBufferSamples(*tuple(map(self.to_torch, data))) class RolloutBuffer(BaseBuffer): @@ -333,4 +334,4 @@ def _get_samples(self, batch_inds: np.ndarray, self.log_probs[batch_inds].flatten(), self.advantages[batch_inds].flatten(), self.returns[batch_inds].flatten()) - return tuple(map(self.to_torch, data)) + return RolloutBufferSamples(*tuple(map(self.to_torch, data))) diff --git a/torchy_baselines/common/type_aliases.py b/torchy_baselines/common/type_aliases.py index b9035dbd6..16576fff6 100644 --- a/torchy_baselines/common/type_aliases.py +++ b/torchy_baselines/common/type_aliases.py @@ -1,7 +1,8 @@ """ Common aliases for type hing """ -from typing import 
Union, Type, Optional, Dict, Any, List, Tuple +from typing import Union, Type, Optional, Dict, Any, List, NamedTuple +from collections import namedtuple import torch as th import gym @@ -13,6 +14,19 @@ TensorDict = Dict[str, th.Tensor] OptimizerStateDict = Dict[str, Any] # obs, action, old_values, old_log_prob, advantage, return_batch -RolloutBufferSamples = Tuple[th.Tensor, th.Tensor, th.Tensor, th.Tensor, th.Tensor, th.Tensor] +class RolloutBufferSamples(NamedTuple): + observations: th.Tensor + actions: th.Tensor + old_values: th.Tensor + old_log_prob: th.Tensor + advantages: th.Tensor + returns: th.Tensor + + # obs, action, next_obs, done, reward -ReplayBufferSamples = Tuple[th.Tensor, th.Tensor, th.Tensor, th.Tensor, th.Tensor] +class ReplayBufferSamples(NamedTuple): + observations: th.Tensor + actions: th.Tensor + next_observations: th.Tensor + dones: th.Tensor + rewards: th.Tensor diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index 6f30f7f70..f10f5e219 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -195,13 +195,12 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: for gradient_step in range(gradient_steps): approx_kl_divs = [] # Sample replay buffer - for replay_data in self.rollout_buffer.get(batch_size): - # Unpack - obs, action, old_values, old_log_prob, advantage, return_batch = replay_data + for rollout_data in self.rollout_buffer.get(batch_size): + actions = rollout_data.actions if isinstance(self.action_space, spaces.Discrete): - # Convert discrete action for float to long - action = action.long().flatten() + # Convert discrete action from float to long + actions = rollout_data.actions.long().flatten() # Re-sample the noise matrix because the log_std has changed # TODO: investigate why there is no issue with the gradient @@ -209,16 +208,16 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: if self.use_sde: self.policy.reset_noise(batch_size) - values, log_prob, entropy = self.policy.evaluate_actions(obs, action) + values, log_prob, entropy = self.policy.evaluate_actions(rollout_data.observations, actions) values = values.flatten() # Normalize advantage - advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8) + advantages = (rollout_data.advantages - rollout_data.advantages.mean()) / (rollout_data.advantages.std() + 1e-8) # ratio between old and new policy, should be one at the first iteration - ratio = th.exp(log_prob - old_log_prob) + ratio = th.exp(log_prob - rollout_data.old_log_prob) # clipped surrogate loss - policy_loss_1 = advantage * ratio - policy_loss_2 = advantage * th.clamp(ratio, 1 - clip_range, 1 + clip_range) + policy_loss_1 = advantages * ratio + policy_loss_2 = advantages * th.clamp(ratio, 1 - clip_range, 1 + clip_range) policy_loss = -th.min(policy_loss_1, policy_loss_2).mean() if self.clip_range_vf is None: @@ -227,9 +226,9 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: else: # Clip the different between old and new value # NOTE: this depends on the reward scaling - values_pred = old_values + th.clamp(values - old_values, -clip_range_vf, clip_range_vf) + values_pred = rollout_data.old_values + th.clamp(values - rollout_data.old_values, -clip_range_vf, clip_range_vf) # Value loss using the TD(gae_lambda) target - value_loss = F.mse_loss(return_batch, values_pred) + value_loss = F.mse_loss(rollout_data.returns, values_pred) # Entropy loss favor exploration if entropy is None: @@ -246,7 +245,7 @@ def train(self, gradient_steps: 
int, batch_size: int = 64) -> None: # Clip grad norm th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm) self.policy.optimizer.step() - approx_kl_divs.append(th.mean(old_log_prob - log_prob).detach().cpu().numpy()) + approx_kl_divs.append(th.mean(rollout_data.old_log_prob - log_prob).detach().cpu().numpy()) if self.target_kl is not None and np.mean(approx_kl_divs) > 1.5 * self.target_kl: print("Early stopping at step {} due to reaching max kl: {:.2f}".format(gradient_step, diff --git a/torchy_baselines/sac/sac.py b/torchy_baselines/sac/sac.py index fa6738ba4..d7906e891 100644 --- a/torchy_baselines/sac/sac.py +++ b/torchy_baselines/sac/sac.py @@ -163,14 +163,12 @@ def train(self, gradient_steps: int, batch_size: int = 64): # Sample replay buffer replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env) - obs, action_batch, next_obs, done, reward = replay_data - # We need to sample because `log_std` may have changed between two gradient steps if self.use_sde: self.actor.reset_noise() # Action by the current actor for the sampled state - action_pi, log_prob = self.actor.action_log_prob(obs) + actions_pi, log_prob = self.actor.action_log_prob(replay_data.observations) log_prob = log_prob.reshape(-1, 1) ent_coef_loss = None @@ -192,17 +190,17 @@ def train(self, gradient_steps: int, batch_size: int = 64): with th.no_grad(): # Select action according to policy - next_action, next_log_prob = self.actor.action_log_prob(next_obs) + next_actions, next_log_prob = self.actor.action_log_prob(replay_data.next_observations) # Compute the target Q value - target_q1, target_q2 = self.critic_target(next_obs, next_action) + target_q1, target_q2 = self.critic_target(replay_data.next_observations, next_actions) target_q = th.min(target_q1, target_q2) - target_q = reward + (1 - done) * self.gamma * target_q + target_q = replay_data.rewards + (1 - replay_data.dones) * self.gamma * target_q # td error + entropy term q_backup = target_q - ent_coef * next_log_prob.reshape(-1, 1) # Get current Q estimates # using action from the replay buffer - current_q1, current_q2 = self.critic(obs, action_batch) + current_q1, current_q2 = self.critic(replay_data.observations, replay_data.actions) # Compute critic loss critic_loss = 0.5 * (F.mse_loss(current_q1, q_backup) + F.mse_loss(current_q2, q_backup)) @@ -214,7 +212,7 @@ def train(self, gradient_steps: int, batch_size: int = 64): # Compute actor loss # Alternative: actor_loss = th.mean(log_prob - qf1_pi) - qf1_pi, qf2_pi = self.critic.forward(obs, action_pi) + qf1_pi, qf2_pi = self.critic.forward(replay_data.observations, actions_pi) min_qf_pi = th.min(qf1_pi, qf2_pi) actor_loss = (ent_coef * log_prob - min_qf_pi).mean() diff --git a/torchy_baselines/td3/td3.py b/torchy_baselines/td3/td3.py index 1ba594775..963338df7 100644 --- a/torchy_baselines/td3/td3.py +++ b/torchy_baselines/td3/td3.py @@ -124,22 +124,20 @@ def train_critic(self, gradient_steps: int = 1, for gradient_step in range(gradient_steps): # Sample replay buffer if replay_data is None: - obs, action, next_obs, done, reward = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env) - else: - obs, action, next_obs, done, reward = replay_data + replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env) # Select action according to policy and add clipped noise - noise = action.clone().data.normal_(0, self.target_policy_noise) + noise = replay_data.actions.clone().data.normal_(0, self.target_policy_noise) noise = 
noise.clamp(-self.target_noise_clip, self.target_noise_clip) - next_action = (self.actor_target(next_obs) + noise).clamp(-1, 1) + next_actions = (self.actor_target(replay_data.next_observations) + noise).clamp(-1, 1) # Compute the target Q value - target_q1, target_q2 = self.critic_target(next_obs, next_action) + target_q1, target_q2 = self.critic_target(replay_data.next_observations, next_actions) target_q = th.min(target_q1, target_q2) - target_q = reward + ((1 - done) * self.gamma * target_q).detach() + target_q = replay_data.rewards + ((1 - replay_data.dones) * self.gamma * target_q).detach() # Get current Q estimates - current_q1, current_q2 = self.critic(obs, action) + current_q1, current_q2 = self.critic(replay_data.observations, replay_data.actions) # Compute critic loss critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(current_q2, target_q) @@ -167,12 +165,10 @@ def train_actor(self, gradient_steps: int = 1, for gradient_step in range(gradient_steps): # Sample replay buffer if replay_data is None: - obs, _, next_obs, done, reward = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env) - else: - obs, _, next_obs, done, reward = replay_data + replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env) # Compute actor loss - actor_loss = -self.critic.q1_forward(obs, self.actor(obs)).mean() + actor_loss = -self.critic.q1_forward(replay_data.observations, self.actor(replay_data.observations)).mean() # Optimize the actor self.actor.optimizer.zero_grad() From 20ee8cb68dcd036b5045e6fce41d6b31f359e16a Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 10 Mar 2020 16:55:13 +0100 Subject: [PATCH 05/17] Update changelog and add more namedtuples --- docs/misc/changelog.rst | 1 + torchy_baselines/common/base_class.py | 10 +++++----- torchy_baselines/common/type_aliases.py | 13 +++++++++++-- torchy_baselines/sac/sac.py | 10 ++++------ torchy_baselines/td3/td3.py | 9 ++++----- 5 files changed, 25 insertions(+), 18 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index b05e73a1a..2aef391a2 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -23,6 +23,7 @@ Others: ^^^^^^^ - SAC with SDE now sample only one matrix - Added ``clip_mean`` parameter to SAC policy +- Buffers now return ``NamedTuple`` Documentation: ^^^^^^^^^^^^^^ diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index b01002576..d6bd69de3 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -16,7 +16,7 @@ from torchy_baselines.common.utils import set_random_seed, get_schedule_fn, update_learning_rate from torchy_baselines.common.vec_env import DummyVecEnv, VecEnv, unwrap_vec_normalize, VecNormalize from torchy_baselines.common.save_util import data_to_json, json_to_data, recursive_getattr, recursive_setattr -from torchy_baselines.common.type_aliases import GymEnv, TensorDict, OptimizerStateDict +from torchy_baselines.common.type_aliases import GymEnv, TensorDict, OptimizerStateDict, RolloutReturn from torchy_baselines.common.callbacks import BaseCallback, CallbackList, ConvertCallback, EvalCallback from torchy_baselines.common.monitor import Monitor from torchy_baselines.common.noise import ActionNoise @@ -830,7 +830,7 @@ def collect_rollouts(self, replay_buffer: Optional[ReplayBuffer] = None, obs: Optional[np.ndarray] = None, episode_num: int = 0, - log_interval: Optional[int] = None) -> Tuple[float, int, int, Optional[np.ndarray], bool]: + log_interval: 
Optional[int] = None) -> RolloutReturn: """ Collect rollout using the current policy (and possibly fill the replay buffer) @@ -849,6 +849,7 @@ def collect_rollouts(self, :param obs: (np.ndarray) Last observation from the environment :param episode_num: (int) Episode index :param log_interval: (int) Log data every `log_interval` episodes + :return: (RolloutReturn) """ episode_rewards, total_timesteps = [], [] total_steps, total_episodes = 0, 0 @@ -878,8 +879,7 @@ def collect_rollouts(self, # Only stop training if return value is False, not when it is None. if callback() is False: - continue_training = False - return 0.0, total_steps, total_episodes, None, continue_training + return RolloutReturn(0.0, total_steps, total_episodes, None, continue_training=False) if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix @@ -1003,4 +1003,4 @@ def collect_rollouts(self, callback.on_rollout_end() - return mean_reward, total_steps, total_episodes, obs, continue_training + return RolloutReturn(mean_reward, total_steps, total_episodes, obs, continue_training) diff --git a/torchy_baselines/common/type_aliases.py b/torchy_baselines/common/type_aliases.py index 16576fff6..12c220fe4 100644 --- a/torchy_baselines/common/type_aliases.py +++ b/torchy_baselines/common/type_aliases.py @@ -4,6 +4,7 @@ from typing import Union, Type, Optional, Dict, Any, List, NamedTuple from collections import namedtuple +import numpy as np import torch as th import gym @@ -13,7 +14,8 @@ GymEnv = Union[gym.Env, VecEnv] TensorDict = Dict[str, th.Tensor] OptimizerStateDict = Dict[str, Any] -# obs, action, old_values, old_log_prob, advantage, return_batch + + class RolloutBufferSamples(NamedTuple): observations: th.Tensor actions: th.Tensor @@ -23,10 +25,17 @@ class RolloutBufferSamples(NamedTuple): returns: th.Tensor -# obs, action, next_obs, done, reward class ReplayBufferSamples(NamedTuple): observations: th.Tensor actions: th.Tensor next_observations: th.Tensor dones: th.Tensor rewards: th.Tensor + + +class RolloutReturn(NamedTuple): + episode_reward: float + episode_timesteps: int + n_episodes: int + obs: Optional[np.ndarray] + continue_training: bool diff --git a/torchy_baselines/sac/sac.py b/torchy_baselines/sac/sac.py index d7906e891..58b19dbb8 100644 --- a/torchy_baselines/sac/sac.py +++ b/torchy_baselines/sac/sac.py @@ -249,18 +249,16 @@ def learn(self, total_timesteps, callback=None, log_interval=4, replay_buffer=self.replay_buffer, obs=obs, episode_num=episode_num, log_interval=log_interval) - # Unpack - episode_reward, episode_timesteps, n_episodes, obs, continue_training = rollout - if continue_training is False: + if rollout.continue_training is False: break - episode_num += n_episodes + obs = rollout.obs + episode_num += rollout.n_episodes self._update_current_progress(self.num_timesteps, total_timesteps) if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts: - gradient_steps = self.gradient_steps if self.gradient_steps > 0 else episode_timesteps - + gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps self.train(gradient_steps, batch_size=self.batch_size) callback.on_training_end() diff --git a/torchy_baselines/td3/td3.py b/torchy_baselines/td3/td3.py index 963338df7..767f2f070 100644 --- a/torchy_baselines/td3/td3.py +++ b/torchy_baselines/td3/td3.py @@ -259,13 +259,12 @@ def learn(self, total_timesteps, callback=None, log_interval=4, replay_buffer=self.replay_buffer, obs=obs, 
episode_num=episode_num, log_interval=log_interval) - # Unpack - episode_reward, episode_timesteps, n_episodes, obs, continue_training = rollout - if continue_training is False: + if rollout.continue_training is False: break - episode_num += n_episodes + obs = rollout.obs + episode_num += rollout.n_episodes self._update_current_progress(self.num_timesteps, total_timesteps) if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts: @@ -279,7 +278,7 @@ def learn(self, total_timesteps, callback=None, log_interval=4, # On-policy gradient self.train_sde() - gradient_steps = self.gradient_steps if self.gradient_steps > 0 else episode_timesteps + gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps self.train(gradient_steps, batch_size=self.batch_size, policy_delay=self.policy_delay) callback.on_training_end() From f159a4a9f261737db4427de072bb189056604731 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 10 Mar 2020 17:08:39 +0100 Subject: [PATCH 06/17] Bug fix for A2C --- torchy_baselines/a2c/a2c.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index ea27d715f..5f0d507b3 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -97,8 +97,9 @@ def train(self, gradient_steps: int, batch_size=None): values = values.flatten() # Normalize advantage (not present in the original implementation) + advantages = rollout_data.advantages if self.normalize_advantage: - advantages = (rollout_data.advantages - rollout_data.advantages.mean()) / (rollout_data.advantages.std() + 1e-8) + advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) # Policy gradient loss policy_loss = -(advantages * log_prob).mean() From 80fb62e22ddbf3ab431e3f6a324575a15968ca43 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 10 Mar 2020 17:10:15 +0100 Subject: [PATCH 07/17] Bump version --- setup.py | 2 +- torchy_baselines/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 8780397de..51dea7b81 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ license="MIT", long_description="", long_description_content_type='text/markdown', - version="0.2.1", + version="0.2.2", ) # python setup.py sdist diff --git a/torchy_baselines/__init__.py b/torchy_baselines/__init__.py index b201dc8ef..a548d3037 100644 --- a/torchy_baselines/__init__.py +++ b/torchy_baselines/__init__.py @@ -4,4 +4,4 @@ from torchy_baselines.sac import SAC from torchy_baselines.td3 import TD3 -__version__ = "0.2.1" +__version__ = "0.2.2" From 6ebad92e1b1191a98dc79de84352cf2b15162716 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 10 Mar 2020 17:43:54 +0100 Subject: [PATCH 08/17] Remove default seed and bump dependencies --- docs/misc/changelog.rst | 3 ++ setup.py | 4 +- torchy_baselines/a2c/a2c.py | 2 +- torchy_baselines/cem_rl/cem_rl.py | 2 +- torchy_baselines/ppo/ppo.py | 2 +- torchy_baselines/sac/policies.py | 38 ++++++++++-------- torchy_baselines/sac/sac.py | 65 +++++++++++++++++++++---------- torchy_baselines/td3/td3.py | 2 +- 8 files changed, 76 insertions(+), 42 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 2aef391a2..45d28de6c 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -9,6 +9,8 @@ Pre-Release 0.3.0a0 (WIP) Breaking Changes: ^^^^^^^^^^^^^^^^^ +- Removed default seed +- Bump dependencies (PyTorch and Gym) New Features: ^^^^^^^^^^^^^ @@ -24,6 +26,7 @@ 
Others: - SAC with SDE now sample only one matrix - Added ``clip_mean`` parameter to SAC policy - Buffers now return ``NamedTuple`` +- More typing Documentation: ^^^^^^^^^^^^^^ diff --git a/setup.py b/setup.py index 51dea7b81..1fa044cc6 100644 --- a/setup.py +++ b/setup.py @@ -7,9 +7,9 @@ packages=[package for package in find_packages() if package.startswith('torchy_baselines')], install_requires=[ - 'gym[classic_control]>=0.10.9', + 'gym[classic_control]>=0.11', 'numpy', - 'torch>=1.2.0', + 'torch>=1.4.0', 'cloudpickle', # For reading logs 'pandas', diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index 5f0d507b3..1b60e2740 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -51,7 +51,7 @@ def __init__(self, policy, env, learning_rate=7e-4, ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5, rms_prop_eps=1e-5, use_rms_prop=True, use_sde=False, sde_sample_freq=-1, normalize_advantage=False, tensorboard_log=None, create_eval_env=False, - policy_kwargs=None, verbose=0, seed=0, device='auto', + policy_kwargs=None, verbose=0, seed=None, device='auto', _init_setup_model=True): super(A2C, self).__init__(policy, env, learning_rate=learning_rate, diff --git a/torchy_baselines/cem_rl/cem_rl.py b/torchy_baselines/cem_rl/cem_rl.py index 9a2defa59..2d1838ee6 100644 --- a/torchy_baselines/cem_rl/cem_rl.py +++ b/torchy_baselines/cem_rl/cem_rl.py @@ -62,7 +62,7 @@ def __init__(self, policy, env, sigma_init=1e-3, pop_size=10, action_noise=None, target_policy_noise=0.2, target_noise_clip=0.5, n_episodes_rollout=1, update_style='original', tensorboard_log=None, create_eval_env=False, - policy_kwargs=None, verbose=0, seed=0, device='auto', + policy_kwargs=None, verbose=0, seed=None, device='auto', _init_setup_model=True): super(CEMRL, self).__init__(policy, env, diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index f10f5e219..4ee8ef1f0 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -79,7 +79,7 @@ def __init__(self, policy, env, learning_rate=3e-4, ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5, use_sde=False, sde_sample_freq=-1, target_kl=None, tensorboard_log=None, create_eval_env=False, - policy_kwargs=None, verbose=0, seed=0, device='auto', + policy_kwargs=None, verbose=0, seed=None, device='auto', _init_setup_model=True): super(PPO, self).__init__(policy, env, PPOPolicy, policy_kwargs=policy_kwargs, diff --git a/torchy_baselines/sac/policies.py b/torchy_baselines/sac/policies.py index e570b570c..15a28f2dd 100644 --- a/torchy_baselines/sac/policies.py +++ b/torchy_baselines/sac/policies.py @@ -1,5 +1,6 @@ -from typing import Optional, List, Tuple +from typing import Optional, List, Tuple, Callable, Union +import gym import torch as th import torch.nn as nn @@ -143,8 +144,10 @@ class Critic(BaseNetwork): :param net_arch: ([int]) Network architecture :param activation_fn: (nn.Module) Activation function """ - def __init__(self, obs_dim, action_dim, - net_arch, activation_fn=nn.ReLU): + def __init__(self, obs_dim: int, + action_dim: int, + net_arch: List[int], + activation_fn: nn.Module = nn.ReLU): super(Critic, self).__init__() q1_net = create_mlp(obs_dim + action_dim, 1, net_arch, activation_fn) @@ -155,13 +158,10 @@ def __init__(self, obs_dim, action_dim, self.q_networks = [self.q1_net, self.q2_net] - def forward(self, obs, action): + def forward(self, obs: th.Tensor, action: th.Tensor) -> List[th.Tensor]: qvalue_input = th.cat([obs, action], dim=1) return [q_net(qvalue_input) for q_net in self.q_networks] - def 
q1_forward(self, obs, action): - return self.q_networks[0](th.cat([obs, action], dim=1)) - class SACPolicy(BasePolicy): """ @@ -183,11 +183,17 @@ class SACPolicy(BasePolicy): above zero and prevent it from growing too fast. In practice, `exp()` is usually enough. :param clip_mean: (float) Clip the mean output when using SDE to avoid numerical instability. """ - def __init__(self, observation_space, action_space, - learning_rate, net_arch=None, device='cpu', - activation_fn=nn.ReLU, use_sde=False, - log_std_init=-3, sde_net_arch=None, - use_expln=False, clip_mean=2.0): + def __init__(self, observation_space: gym.spaces.Space, + action_space: gym.spaces.Space, + learning_rate: Callable, + net_arch: Optional[List[int]] = None, + device: Union[th.device, str] = 'cpu', + activation_fn: nn.Module = nn.ReLU, + use_sde: bool = False, + log_std_init: float = -3, + sde_net_arch: Optional[List[int]] = None, + use_expln: bool = False, + clip_mean: float = 2.0): super(SACPolicy, self).__init__(observation_space, action_space, device, squash_output=True) if net_arch is None: @@ -217,7 +223,7 @@ def __init__(self, observation_space, action_space, self._build(learning_rate) - def _build(self, learning_rate): + def _build(self, learning_rate: Callable) -> None: self.actor = self.make_actor() self.actor.optimizer = th.optim.Adam(self.actor.parameters(), lr=learning_rate(1)) @@ -226,13 +232,13 @@ def _build(self, learning_rate): self.critic_target.load_state_dict(self.critic.state_dict()) self.critic.optimizer = th.optim.Adam(self.critic.parameters(), lr=learning_rate(1)) - def make_actor(self): + def make_actor(self) -> Actor: return Actor(**self.actor_kwargs).to(self.device) - def make_critic(self): + def make_critic(self) -> Critic: return Critic(**self.net_args).to(self.device) - def forward(self, obs): + def forward(self, obs: th.Tensor) -> th.Tensor: return self.actor(obs) def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: diff --git a/torchy_baselines/sac/sac.py b/torchy_baselines/sac/sac.py index 58b19dbb8..2ac822fc6 100644 --- a/torchy_baselines/sac/sac.py +++ b/torchy_baselines/sac/sac.py @@ -1,13 +1,16 @@ -from typing import List, Tuple +from typing import List, Tuple, Type, Union, Callable, Optional, Dict, Any import torch as th import torch.nn.functional as F import numpy as np +from torchy_baselines.common import logger from torchy_baselines.common.base_class import OffPolicyRLModel from torchy_baselines.common.buffers import ReplayBuffer +from torchy_baselines.common.type_aliases import GymEnv +from torchy_baselines.common.noise import ActionNoise +from torchy_baselines.common.callbacks import BaseCallback from torchy_baselines.sac.policies import SACPolicy -from torchy_baselines.common import logger class SAC(OffPolicyRLModel): @@ -25,7 +28,7 @@ class SAC(OffPolicyRLModel): in https://github.com/hill-a/stable-baselines/issues/270 :param policy: (SACPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, ...) 
- :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) + :param env: (GymEnv or str) The environment to learn from (if registered in Gym, can be str) :param learning_rate: (float or callable) learning rate for adam optimizer, the same learning rate will be used for all networks (Q-Values, Actor and Value function) it can be a function of the current progress (from 1 to 0) @@ -61,16 +64,31 @@ class SAC(OffPolicyRLModel): :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance """ - def __init__(self, policy, env, learning_rate=3e-4, buffer_size=int(1e6), - learning_starts=100, batch_size=256, - tau=0.005, ent_coef='auto', target_update_interval=1, - train_freq=1, gradient_steps=1, n_episodes_rollout=-1, - target_entropy='auto', action_noise=None, - gamma=0.99, use_sde=False, sde_sample_freq=-1, - use_sde_at_warmup=False, - tensorboard_log=None, create_eval_env=False, - policy_kwargs=None, verbose=0, seed=0, device='auto', - _init_setup_model=True): + def __init__(self, policy: Union[str, Type[SACPolicy]], + env: Union[GymEnv, str], + learning_rate: Union[float, Callable] = 3e-4, + buffer_size: int = int(1e6), + learning_starts: int = 100, + batch_size: int = 256, + tau: float = 0.005, + ent_coef: Union[str, float] = 'auto', + target_update_interval: int = 1, + train_freq: int = 1, + gradient_steps: int = 1, + n_episodes_rollout: int = -1, + target_entropy: Union[str, float] = 'auto', + action_noise: Optional[ActionNoise] = None, + gamma: float = 0.99, + use_sde: bool = False, + sde_sample_freq: int = -1, + use_sde_at_warmup: bool = False, + tensorboard_log: Optional[str] = None, + create_eval_env: bool = False, + policy_kwargs: Dict[str, Any] = None, + verbose: int = 0, + seed: Optional[int] = None, + device: Union[th.device, str] = 'auto', + _init_setup_model: bool = True): super(SAC, self).__init__(policy, env, SACPolicy, policy_kwargs, verbose, device, create_eval_env=create_eval_env, seed=seed, @@ -79,7 +97,7 @@ def __init__(self, policy, env, learning_rate=3e-4, buffer_size=int(1e6), self.learning_rate = learning_rate self.target_entropy = target_entropy - self.log_ent_coef = None + self.log_ent_coef = None # type: Optional[th.Tensor] self.target_update_interval = target_update_interval self.buffer_size = buffer_size # In the original paper, same learning rate is used for all networks @@ -101,7 +119,7 @@ def __init__(self, policy, env, learning_rate=3e-4, buffer_size=int(1e6), if _init_setup_model: self._setup_model() - def _setup_model(self): + def _setup_model(self) -> None: self._setup_learning_rate() obs_dim, action_dim = self.observation_space.shape[0], self.action_space.shape[0] if self.seed is not None: @@ -143,12 +161,12 @@ def _setup_model(self): self.policy = self.policy.to(self.device) self._create_aliases() - def _create_aliases(self): + def _create_aliases(self) -> None: self.actor = self.policy.actor self.critic = self.policy.critic self.critic_target = self.policy.critic_target - def train(self, gradient_steps: int, batch_size: int = 64): + def train(self, gradient_steps: int, batch_size: int = 64) -> None: # Update optimizers learning rate optimizers = [self.actor.optimizer, self.critic.optimizer] if self.ent_coef_optimizer is not None: @@ -233,9 +251,16 @@ def train(self, gradient_steps: int, batch_size: int = 64): if ent_coef_loss is not None: logger.logkv("ent_coef_loss", ent_coef_loss.item()) - def learn(self, total_timesteps, callback=None, log_interval=4, - eval_env=None, 
eval_freq=-1, n_eval_episodes=5, tb_log_name="SAC", - eval_log_path=None, reset_num_timesteps=True): + def learn(self, + total_timesteps: int, + callback: Optional[BaseCallback] = None, + log_interval: int = 4, + eval_env: Optional[GymEnv] = None, + eval_freq: int = -1, + n_eval_episodes: int = 5, + tb_log_name: str = "SAC", + eval_log_path: Optional[str] = None, + reset_num_timesteps: bool = True) -> OffPolicyRLModel: episode_num, obs, callback = self._setup_learn(eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps) diff --git a/torchy_baselines/td3/td3.py b/torchy_baselines/td3/td3.py index 767f2f070..7dbaf1636 100644 --- a/torchy_baselines/td3/td3.py +++ b/torchy_baselines/td3/td3.py @@ -65,7 +65,7 @@ def __init__(self, policy, env, buffer_size=int(1e6), learning_rate=1e-3, use_sde=False, sde_sample_freq=-1, sde_max_grad_norm=1, sde_ent_coef=0.0, sde_log_std_scheduler=None, use_sde_at_warmup=False, tensorboard_log=None, create_eval_env=False, policy_kwargs=None, verbose=0, - seed=0, device='auto', _init_setup_model=True): + seed=None, device='auto', _init_setup_model=True): super(TD3, self).__init__(policy, env, TD3Policy, policy_kwargs, verbose, device, create_eval_env=create_eval_env, seed=seed, From 35d0d2b32056ccc614770ec7b58065fb637b5cec Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 10 Mar 2020 18:09:45 +0100 Subject: [PATCH 09/17] More typing --- torchy_baselines/common/policies.py | 7 ++-- torchy_baselines/ppo/policies.py | 56 +++++++++++++++++------------ 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/torchy_baselines/common/policies.py b/torchy_baselines/common/policies.py index abab7d845..a5e08b5c6 100644 --- a/torchy_baselines/common/policies.py +++ b/torchy_baselines/common/policies.py @@ -237,7 +237,10 @@ class MlpExtractor(nn.Module): :param activation_fn: (nn.Module) The activation function to use for the networks. :param device: (th.device) """ - def __init__(self, feature_dim, net_arch, activation_fn, device='cpu'): + def __init__(self, feature_dim: int, + net_arch: List[Union[int, Dict[str, List[int]]]], + activation_fn: nn.Module, + device: Union[th.device, str] = 'cpu'): super(MlpExtractor, self).__init__() shared_net, policy_net, value_net = [], [], [] @@ -291,7 +294,7 @@ def __init__(self, feature_dim, net_arch, activation_fn, device='cpu'): self.policy_net = nn.Sequential(*policy_net).to(device) self.value_net = nn.Sequential(*value_net).to(device) - def forward(self, features): + def forward(self, features: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: """ :return: (th.Tensor, th.Tensor) latent_policy, latent_value of the specified network. 
If all layers are shared, then ``latent_policy == latent_value`` diff --git a/torchy_baselines/ppo/policies.py b/torchy_baselines/ppo/policies.py index 3e47375d5..491737a02 100644 --- a/torchy_baselines/ppo/policies.py +++ b/torchy_baselines/ppo/policies.py @@ -1,13 +1,16 @@ +from typing import Optional, List, Tuple, Callable, Union, Dict from functools import partial +import gym import torch as th import torch.nn as nn import numpy as np -from torchy_baselines.common.policies import BasePolicy, register_policy, MlpExtractor, \ - create_sde_feature_extractor -from torchy_baselines.common.distributions import make_proba_distribution,\ - DiagGaussianDistribution, CategoricalDistribution, StateDependentNoiseDistribution +from torchy_baselines.common.policies import (BasePolicy, register_policy, MlpExtractor, + create_sde_feature_extractor) +from torchy_baselines.common.distributions import (make_proba_distribution, Distribution, + DiagGaussianDistribution, CategoricalDistribution, StateDependentNoiseDistribution) + class PPOPolicy(BasePolicy): @@ -35,12 +38,21 @@ class PPOPolicy(BasePolicy): :param squash_output: (bool) Whether to squash the output using a tanh function, this allows to ensure boundaries when using SDE. """ - def __init__(self, observation_space, action_space, - learning_rate, net_arch=None, device='cpu', - activation_fn=nn.Tanh, adam_epsilon=1e-5, - ortho_init=True, use_sde=False, - log_std_init=0.0, full_std=True, - sde_net_arch=None, use_expln=False, squash_output=False): + def __init__(self, + observation_space: gym.spaces.Space, + action_space: gym.spaces.Space, + learning_rate: Callable, + net_arch: Optional[List[Union[int, Dict[str, List[int]]]]] = None, + device: Union[th.device, str] = 'cpu', + activation_fn: nn.Module = nn.Tanh, + adam_epsilon: float = 1e-5, + ortho_init: bool = True, + use_sde: bool = False, + log_std_init: float = 0.0, + full_std: bool = True, + sde_net_arch: Optional[List[int]] = None, + use_expln: bool = False, + squash_output: bool = False): super(PPOPolicy, self).__init__(observation_space, action_space, device, squash_output=squash_output) self.obs_dim = self.observation_space.shape[0] @@ -83,7 +95,7 @@ def __init__(self, observation_space, action_space, self._build(learning_rate) - def reset_noise(self, n_envs: int = 1): + def reset_noise(self, n_envs: int = 1) -> None: """ Sample new weights for the exploration matrix. 
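# For context on what sample_weights()/reset_noise() provide, a hedged,
# stand-alone sketch of generalized state-dependent exploration (shapes and
# helper names are assumptions, not the real StateDependentNoiseDistribution):
import torch as th

latent_sde_dim, action_dim, n_envs = 64, 6, 8
log_std = th.zeros(latent_sde_dim, action_dim)   # learned parameter in the real code

# "Sampling new weights" means drawing one weight matrix per environment,
# scaled by the current standard deviation exp(log_std).
exploration_mat = th.randn(n_envs, latent_sde_dim, action_dim) * th.exp(log_std)

def state_dependent_noise(latent_sde: th.Tensor) -> th.Tensor:
    # Given the sampled matrix, the noise is a deterministic function of the
    # state features, hence "state-dependent" exploration.
    return th.einsum('bi,bij->bj', latent_sde, exploration_mat)

actions = th.zeros(n_envs, action_dim) + state_dependent_noise(th.randn(n_envs, latent_sde_dim))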
@@ -92,7 +104,7 @@ def reset_noise(self, n_envs: int = 1): assert isinstance(self.action_dist, StateDependentNoiseDistribution), 'reset_noise() is only available when using SDE' self.action_dist.sample_weights(self.log_std, batch_size=n_envs) - def _build(self, learning_rate): + def _build(self, learning_rate: Callable) -> None: self.mlp_extractor = MlpExtractor(self.features_dim, net_arch=self.net_arch, activation_fn=self.activation_fn, device=self.device) @@ -129,7 +141,7 @@ def _build(self, learning_rate): module.apply(partial(self.init_weights, gain=gain)) self.optimizer = th.optim.Adam(self.parameters(), lr=learning_rate(1), eps=self.adam_epsilon) - def forward(self, obs, deterministic=False): + def forward(self, obs: th.Tensor, deterministic: bool = False) -> Tuple[th.Tensor, th.Tensor, th.Tensor]: if not isinstance(obs, th.Tensor): obs = th.FloatTensor(obs).to(self.device) latent_pi, latent_vf, latent_sde = self._get_latent(obs) @@ -139,7 +151,7 @@ def forward(self, obs, deterministic=False): log_prob = action_distribution.log_prob(action) return action, value, log_prob - def _get_latent(self, obs): + def _get_latent(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]: features = self.features_extractor(obs) latent_pi, latent_vf = self.mlp_extractor(features) # Features for sde @@ -148,7 +160,9 @@ def _get_latent(self, obs): latent_sde = self.sde_feature_extractor(features) return latent_pi, latent_vf, latent_sde - def _get_action_dist_from_latent(self, latent_pi, latent_sde=None, deterministic=False): + def _get_action_dist_from_latent(self, latent_pi: th.Tensor, + latent_sde: Optional[th.Tensor] = None, + deterministic: bool = False) -> Tuple[th.Tensor, Distribution]: mean_actions = self.action_net(latent_pi) if isinstance(self.action_dist, DiagGaussianDistribution): @@ -169,7 +183,7 @@ def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Ten action, _ = self._get_action_dist_from_latent(latent_pi, latent_sde, deterministic=deterministic) return action - def evaluate_actions(self, obs, action, deterministic=False): + def evaluate_actions(self, obs: th.Tensor, actions: th.Tensor, deterministic: bool = False) -> Tuple[th.Tensor, th.Tensor, th.Tensor]: """ Evaluate actions according to the current policy, given the observations. 
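# A hedged sketch of how the (values, log_prob, entropy) triple returned by
# evaluate_actions() is typically consumed; `rollout_data` stands for the
# RolloutBufferSamples NamedTuple introduced earlier in this series, and the
# function itself is illustrative rather than part of the patch.
import torch as th
import torch.nn.functional as F

def ppo_losses(values, log_prob, entropy, rollout_data, clip_range=0.2):
    # Normalized advantages and the clipped surrogate objective
    advantages = rollout_data.advantages
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    ratio = th.exp(log_prob - rollout_data.old_log_prob)
    policy_loss = -th.min(advantages * ratio,
                          advantages * th.clamp(ratio, 1.0 - clip_range, 1.0 + clip_range)).mean()
    # Value loss against the TD(gae_lambda) returns stored in the buffer
    value_loss = F.mse_loss(rollout_data.returns, values)
    # Fall back to -log_prob when the distribution has no closed-form entropy
    entropy_loss = -entropy.mean() if entropy is not None else -log_prob.mean()
    return policy_loss, value_loss, entropy_loss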
@@ -182,13 +196,9 @@ def evaluate_actions(self, obs, action, deterministic=False): """ latent_pi, latent_vf, latent_sde = self._get_latent(obs) _, action_distribution = self._get_action_dist_from_latent(latent_pi, latent_sde, deterministic=deterministic) - log_prob = action_distribution.log_prob(action) - value = self.value_net(latent_vf) - return value, log_prob, action_distribution.entropy() - - def value_forward(self, obs): - _, latent_vf, _ = self._get_latent(obs) - return self.value_net(latent_vf) + log_prob = action_distribution.log_prob(actions) + values = self.value_net(latent_vf) + return values, log_prob, action_distribution.entropy() MlpPolicy = PPOPolicy From 7e3736ed56a8fbc86a312ab1587558e4f34011b9 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Tue, 10 Mar 2020 18:17:47 +0100 Subject: [PATCH 10/17] Type A2C and PPO init --- torchy_baselines/a2c/a2c.py | 35 +++++++++++++++++++++++++++-------- torchy_baselines/ppo/ppo.py | 36 ++++++++++++++++++++++++++---------- 2 files changed, 53 insertions(+), 18 deletions(-) diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index 1b60e2740..4c24c7faf 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -1,10 +1,15 @@ +from typing import List, Tuple, Type, Union, Callable, Optional, Dict, Any + from gym import spaces import torch as th import torch.nn.functional as F from torchy_baselines.common.utils import explained_variance -from torchy_baselines.ppo.ppo import PPO from torchy_baselines.common import logger +from torchy_baselines.common.type_aliases import GymEnv +from torchy_baselines.ppo.ppo import PPO +from torchy_baselines.ppo.policies import PPOPolicy + class A2C(PPO): @@ -46,13 +51,27 @@ class A2C(PPO): Setting it to auto, the code will be run on the GPU if possible. 
:param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance """ - def __init__(self, policy, env, learning_rate=7e-4, - n_steps=5, gamma=0.99, gae_lambda=1.0, - ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5, - rms_prop_eps=1e-5, use_rms_prop=True, use_sde=False, sde_sample_freq=-1, - normalize_advantage=False, tensorboard_log=None, create_eval_env=False, - policy_kwargs=None, verbose=0, seed=None, device='auto', - _init_setup_model=True): + def __init__(self, policy: Union[str, Type[PPOPolicy]], + env: Union[GymEnv, str], + learning_rate: Union[float, Callable] = 7e-4, + n_steps: int = 5, + gamma: float = 0.99, + gae_lambda: float = 1.0, + ent_coef: float = 0.0, + vf_coef: float = 0.5, + max_grad_norm: float = 0.5, + rms_prop_eps: float = 1e-5, + use_rms_prop: bool = True, + use_sde: bool = False, + sde_sample_freq: int = -1, + normalize_advantage: bool = False, + tensorboard_log: Optional[str] = None, + create_eval_env: bool = False, + policy_kwargs: Optional[Dict[str, Any]] = None, + verbose: int = 0, + seed: Optional[int] = None, + device: Union[th.device, str] = 'auto', + _init_setup_model: bool = True): super(A2C, self).__init__(policy, env, learning_rate=learning_rate, n_steps=n_steps, batch_size=None, n_epochs=1, diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index 4ee8ef1f0..6f9a4839d 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -1,6 +1,6 @@ import os import time -from typing import Optional, Tuple, List +from typing import List, Tuple, Type, Union, Callable, Optional, Dict, Any import gym from gym import spaces @@ -14,12 +14,13 @@ SummaryWriter = None import numpy as np +from torchy_baselines.common import logger from torchy_baselines.common.base_class import BaseRLModel +from torchy_baselines.common.type_aliases import GymEnv from torchy_baselines.common.buffers import RolloutBuffer from torchy_baselines.common.utils import explained_variance, get_schedule_fn from torchy_baselines.common.vec_env import VecEnv from torchy_baselines.common.callbacks import BaseCallback -from torchy_baselines.common import logger from torchy_baselines.ppo.policies import PPOPolicy @@ -73,14 +74,29 @@ class PPO(BaseRLModel): :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance """ - def __init__(self, policy, env, learning_rate=3e-4, - n_steps=2048, batch_size=64, n_epochs=10, - gamma=0.99, gae_lambda=0.95, clip_range=0.2, clip_range_vf=None, - ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5, - use_sde=False, sde_sample_freq=-1, - target_kl=None, tensorboard_log=None, create_eval_env=False, - policy_kwargs=None, verbose=0, seed=None, device='auto', - _init_setup_model=True): + def __init__(self, policy: Union[str, Type[PPOPolicy]], + env: Union[GymEnv, str], + learning_rate: Union[float, Callable] = 3e-4, + n_steps: int = 2048, + batch_size: Optional[int] = 64, + n_epochs: int = 10, + gamma: float = 0.99, + gae_lambda: float = 0.95, + clip_range: float = 0.2, + clip_range_vf: Optional[float] = None, + ent_coef: float = 0.0, + vf_coef: float = 0.5, + max_grad_norm: float = 0.5, + use_sde: bool = False, + sde_sample_freq: int = -1, + target_kl: Optional[float] = None, + tensorboard_log: Optional[str] = None, + create_eval_env: bool = False, + policy_kwargs: Optional[Dict[str, Any]] = None, + verbose: int = 0, + seed: Optional[int] = None, + device: Union[th.device, str] = 'auto', + _init_setup_model: bool = True): super(PPO, self).__init__(policy, env, PPOPolicy, 
policy_kwargs=policy_kwargs, verbose=verbose, device=device, use_sde=use_sde, sde_sample_freq=sde_sample_freq, From 90d1558534192cb6893ebca8524cd966b39da267 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 11 Mar 2020 12:45:21 +0100 Subject: [PATCH 11/17] Type and reorder arguments --- torchy_baselines/cem_rl/cem_rl.py | 97 +++++++++++++++++---------- torchy_baselines/common/base_class.py | 2 +- torchy_baselines/sac/sac.py | 28 ++++---- torchy_baselines/td3/td3.py | 90 ++++++++++++++++--------- 4 files changed, 137 insertions(+), 80 deletions(-) diff --git a/torchy_baselines/cem_rl/cem_rl.py b/torchy_baselines/cem_rl/cem_rl.py index 2d1838ee6..867f2c418 100644 --- a/torchy_baselines/cem_rl/cem_rl.py +++ b/torchy_baselines/cem_rl/cem_rl.py @@ -1,11 +1,13 @@ -import time +from typing import Type, Union, Callable, Optional, Dict, Any import torch as th +from torchy_baselines.common.base_class import OffPolicyRLModel +from torchy_baselines.common.callbacks import BaseCallback +from torchy_baselines.common.type_aliases import GymEnv +from torchy_baselines.common.noise import ActionNoise +from torchy_baselines.td3.td3 import TD3, TD3Policy from torchy_baselines.cem_rl.cem import CEM -from torchy_baselines.common.evaluation import evaluate_policy -from torchy_baselines.td3.td3 import TD3 -from torchy_baselines.common.vec_env import sync_envs_normalization class CEMRL(TD3): @@ -16,30 +18,30 @@ class CEMRL(TD3): Code: https://github.com/apourchot/CEM-RL :param policy: (TD3Policy or str) The policy model to use (MlpPolicy, CnnPolicy, ...) - :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) - :param sigma_init: (float) Initial standard deviation of the population distribution - :param pop_size: (int) Number of individuals in the population - :param damping_init: (float) Initial value of damping for preventing from early convergence. - :param damping_final: (float) Final value of damping - :param elitism: (bool) Keep the best known individual in the population - :param n_grad: (int) Number of individuals that will receive a gradient update. - Half of the population size in the paper. - :param buffer_size: (int) size of the replay buffer + :param env: (GymEnv or str) The environment to learn from (if registered in Gym, can be str) :param learning_rate: (float or callable) learning rate for adam optimizer, - the same learning rate will be used for all networks (Q-Values and Actor networks) + the same learning rate will be used for all networks (Q-Values, Actor and Value function) it can be a function of the current progress (from 1 to 0) - :param policy_delay: (int) Policy and target networks will only be updated once every policy_delay steps - per training steps. The Q values will be updated policy_delay more often (update every training step). + :param buffer_size: (int) size of the replay buffer :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts - :param gamma: (float) the discount factor :param batch_size: (int) Minibatch size for each gradient update - :param tau: (float) the soft update coefficient ("Polyak update" of the target networks, between 0 and 1) - :param action_noise: (ActionNoise) the action noise type. Cf common.noise for the different action noise type. + :param tau: (float) the soft update coefficient ("polyak update", between 0 and 1) + :param gamma: (float) the discount factor + :param n_episodes_rollout: (int) Update the model every ``n_episodes_rollout`` episodes. 
+ :param action_noise: (ActionNoise) the action noise type (None by default), this can help + for hard exploration problem. Cf common.noise for the different action noise type. + :param policy_delay: (int) Policy and target networks will only be updated once every policy_delay steps + per training steps. The Q values will be updated policy_delay more often (update every training step). :param target_policy_noise: (float) Standard deviation of Gaussian noise added to target policy (smoothing noise) :param target_noise_clip: (float) Limit for absolute value of target policy smoothing noise. - :param n_episodes_rollout: (int) Update the model every `n_episodes_rollout` episodes. - Note that this cannot be used at the same time as `train_freq` + :param sigma_init: (float) Initial standard deviation of the population distribution + :param pop_size: (int) Number of individuals in the population + :param damping_init: (float) Initial value of damping for preventing from early convergence. + :param damping_final: (float) Final value of damping + :param elitism: (bool) Keep the best known individual in the population + :param n_grad: (int) Number of individuals that will receive a gradient update. + Half of the population size in the paper. :param update_style: (str) Update style for the individual that will use the gradient: - original: original implementation (actor_steps // n_grad steps for the critic and actor_steps gradient steps per individual) @@ -55,15 +57,33 @@ class CEMRL(TD3): Setting it to auto, the code will be run on the GPU if possible. :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance """ - def __init__(self, policy, env, sigma_init=1e-3, pop_size=10, - damping_init=1e-3, damping_final=1e-5, elitism=False, n_grad=5, - buffer_size=int(1e6), learning_rate=1e-3, policy_delay=2, - learning_starts=100, gamma=0.99, batch_size=100, tau=0.005, - action_noise=None, target_policy_noise=0.2, target_noise_clip=0.5, - n_episodes_rollout=1, update_style='original', - tensorboard_log=None, create_eval_env=False, - policy_kwargs=None, verbose=0, seed=None, device='auto', - _init_setup_model=True): + def __init__(self, policy: Union[str, Type[TD3Policy]], + env: Union[GymEnv, str], + learning_rate: Union[float, Callable] = 1e-3, + buffer_size: int = int(1e6), + learning_starts: int = 100, + batch_size: int = 100, + tau: float = 0.005, + gamma: float = 0.99, + n_episodes_rollout: int = 1, + action_noise: Optional[ActionNoise] = None, + policy_delay: int = 2, + target_policy_noise: float = 0.2, + target_noise_clip: float = 0.5, + sigma_init: float = 1e-3, + pop_size: int = 10, + damping_init: float = 1e-3, + damping_final: float = 1e-5, + elitism: bool = False, + n_grad: int = 5, + update_style: str = 'original', + tensorboard_log: Optional[str] = None, + create_eval_env: bool = False, + policy_kwargs: Dict[str, Any] = None, + verbose: int = 0, + seed: Optional[int] = None, + device: Union[th.device, str] = 'auto', + _init_setup_model: bool = True): super(CEMRL, self).__init__(policy, env, buffer_size=buffer_size, learning_rate=learning_rate, seed=seed, device=device, @@ -77,7 +97,7 @@ def __init__(self, policy, env, sigma_init=1e-3, pop_size=10, # Evolution strategy method that follows cma-es interface (ask-tell) # for now, only CEM is implemented - self.es = None + self.es = None # type: Optional[CEM] self.sigma_init = sigma_init self.pop_size = pop_size self.damping_init = damping_init @@ -91,7 +111,7 @@ def __init__(self, policy, env, 
sigma_init=1e-3, pop_size=10, if _init_setup_model: self._setup_model() - def _setup_model(self, seed=None): + def _setup_model(self) -> None: super(CEMRL, self)._setup_model() params_vector = self.actor.parameters_to_vector() self.es = CEM(len(params_vector), mu_init=params_vector, @@ -99,9 +119,16 @@ def _setup_model(self, seed=None): pop_size=self.pop_size, antithetic=not self.pop_size % 2, parents=self.pop_size // 2, elitism=self.elitism) - def learn(self, total_timesteps, callback=None, log_interval=4, - eval_env=None, eval_freq=-1, n_eval_episodes=5, - tb_log_name="CEMRL", eval_log_path=None, reset_num_timesteps=True): + def learn(self, + total_timesteps: int, + callback: Optional[BaseCallback] = None, + log_interval: int = 4, + eval_env: Optional[GymEnv] = None, + eval_freq: int = -1, + n_eval_episodes: int = 5, + tb_log_name: str = "CEMRL", + eval_log_path: Optional[str] = None, + reset_num_timesteps: bool = True) -> OffPolicyRLModel: episode_num, obs, callback = self._setup_learn(eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps) diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index d6bd69de3..b77237dbe 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -288,7 +288,7 @@ def learn(self, total_timesteps: int, eval_freq: int = -1, n_eval_episodes: int = 5, eval_log_path: Optional[str] = None, - reset_num_timesteps: bool = True): + reset_num_timesteps: bool = True) -> 'BaseRLModel': """ Return a trained model. diff --git a/torchy_baselines/sac/sac.py b/torchy_baselines/sac/sac.py index 2ac822fc6..d0309e8b0 100644 --- a/torchy_baselines/sac/sac.py +++ b/torchy_baselines/sac/sac.py @@ -33,21 +33,21 @@ class SAC(OffPolicyRLModel): the same learning rate will be used for all networks (Q-Values, Actor and Value function) it can be a function of the current progress (from 1 to 0) :param buffer_size: (int) size of the replay buffer + :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts :param batch_size: (int) Minibatch size for each gradient update :param tau: (float) the soft update coefficient ("polyak update", between 0 and 1) - :param ent_coef: (str or float) Entropy regularization coefficient. (Equivalent to - inverse of reward scale in the original SAC paper.) Controlling exploration/exploitation trade-off. - Set it to 'auto' to learn it automatically (and 'auto_0.1' for using 0.1 as initial value) - :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts - :param target_update_interval: (int) update the target network every `target_network_update_freq` steps. - :param train_freq: (int) Update the model every `train_freq` steps. + :param gamma: (float) the discount factor + :param train_freq: (int) Update the model every ``train_freq`` steps. :param gradient_steps: (int) How many gradient update after each step - :param n_episodes_rollout: (int) Update the model every `n_episodes_rollout` episodes. - Note that this cannot be used at the same time as `train_freq` - :param target_entropy: (str or float) target entropy when learning `ent_coef` (`ent_coef = 'auto'`) + :param n_episodes_rollout: (int) Update the model every ``n_episodes_rollout`` episodes. + Note that this cannot be used at the same time as ``train_freq`` :param action_noise: (ActionNoise) the action noise type (None by default), this can help for hard exploration problem. 
Cf common.noise for the different action noise type. - :param gamma: (float) the discount factor + :param ent_coef: (str or float) Entropy regularization coefficient. (Equivalent to + inverse of reward scale in the original SAC paper.) Controlling exploration/exploitation trade-off. + Set it to 'auto' to learn it automatically (and 'auto_0.1' for using 0.1 as initial value) + :param target_update_interval: (int) update the target network every ``target_network_update_freq`` steps. + :param target_entropy: (str or float) target entropy when learning ``ent_coef`` (``ent_coef = 'auto'``) :param use_sde: (bool) Whether to use State Dependent Exploration (SDE) instead of action noise exploration (default: False) :param sde_sample_freq: (int) Sample a new noise matrix every n steps when using SDE @@ -71,14 +71,14 @@ def __init__(self, policy: Union[str, Type[SACPolicy]], learning_starts: int = 100, batch_size: int = 256, tau: float = 0.005, - ent_coef: Union[str, float] = 'auto', - target_update_interval: int = 1, + gamma: float = 0.99, train_freq: int = 1, gradient_steps: int = 1, n_episodes_rollout: int = -1, - target_entropy: Union[str, float] = 'auto', action_noise: Optional[ActionNoise] = None, - gamma: float = 0.99, + ent_coef: Union[str, float] = 'auto', + target_update_interval: int = 1, + target_entropy: Union[str, float] = 'auto', use_sde: bool = False, sde_sample_freq: int = -1, use_sde_at_warmup: bool = False, diff --git a/torchy_baselines/td3/td3.py b/torchy_baselines/td3/td3.py index 7dbaf1636..ccb835e50 100644 --- a/torchy_baselines/td3/td3.py +++ b/torchy_baselines/td3/td3.py @@ -1,4 +1,4 @@ -from typing import List, Tuple, Optional +from typing import List, Tuple, Type, Union, Callable, Optional, Dict, Any import torch as th import torch.nn.functional as F @@ -6,7 +6,9 @@ from torchy_baselines.common.base_class import OffPolicyRLModel from torchy_baselines.common.buffers import ReplayBuffer -from torchy_baselines.common.type_aliases import ReplayBufferSamples +from torchy_baselines.common.type_aliases import ReplayBufferSamples, GymEnv +from torchy_baselines.common.noise import ActionNoise +from torchy_baselines.common.callbacks import BaseCallback from torchy_baselines.td3.policies import TD3Policy @@ -20,22 +22,23 @@ class TD3(OffPolicyRLModel): Introduction to TD3: https://spinningup.openai.com/en/latest/algorithms/td3.html :param policy: (TD3Policy or str) The policy model to use (MlpPolicy, CnnPolicy, ...) - :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) - :param buffer_size: (int) size of the replay buffer + :param env: (GymEnv or str) The environment to learn from (if registered in Gym, can be str) :param learning_rate: (float or callable) learning rate for adam optimizer, - the same learning rate will be used for all networks (Q-Values and Actor networks) + the same learning rate will be used for all networks (Q-Values, Actor and Value function) it can be a function of the current progress (from 1 to 0) - :param policy_delay: (int) Policy and target networks will only be updated once every policy_delay steps - per training steps. The Q values will be updated policy_delay more often (update every training step). 
+ :param buffer_size: (int) size of the replay buffer :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts - :param gamma: (float) the discount factor :param batch_size: (int) Minibatch size for each gradient update - :param train_freq: (int) Update the model every `train_freq` steps. + :param tau: (float) the soft update coefficient ("polyak update", between 0 and 1) + :param gamma: (float) the discount factor + :param train_freq: (int) Update the model every ``train_freq`` steps. :param gradient_steps: (int) How many gradient update after each step - :param n_episodes_rollout: (int) Update the model every `n_episodes_rollout` episodes. - Note that this cannot be used at the same time as `train_freq` - :param tau: (float) the soft update coefficient ("Polyak update" of the target networks, between 0 and 1) - :param action_noise: (ActionNoise) the action noise type. Cf common.noise for the different action noise type. + :param n_episodes_rollout: (int) Update the model every ``n_episodes_rollout`` episodes. + Note that this cannot be used at the same time as ``train_freq`` + :param action_noise: (ActionNoise) the action noise type (None by default), this can help + for hard exploration problem. Cf common.noise for the different action noise type. + :param policy_delay: (int) Policy and target networks will only be updated once every policy_delay steps + per training steps. The Q values will be updated policy_delay more often (update every training step). :param target_policy_noise: (float) Standard deviation of Gaussian noise added to target policy (smoothing noise) :param target_noise_clip: (float) Limit for absolute value of target policy smoothing noise. @@ -58,14 +61,34 @@ class TD3(OffPolicyRLModel): :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance """ - def __init__(self, policy, env, buffer_size=int(1e6), learning_rate=1e-3, - policy_delay=2, learning_starts=100, gamma=0.99, batch_size=100, - train_freq=-1, gradient_steps=-1, n_episodes_rollout=1, - tau=0.005, action_noise=None, target_policy_noise=0.2, target_noise_clip=0.5, - use_sde=False, sde_sample_freq=-1, sde_max_grad_norm=1, - sde_ent_coef=0.0, sde_log_std_scheduler=None, use_sde_at_warmup=False, - tensorboard_log=None, create_eval_env=False, policy_kwargs=None, verbose=0, - seed=None, device='auto', _init_setup_model=True): + def __init__(self, policy: Union[str, Type[TD3Policy]], + env: Union[GymEnv, str], + learning_rate: Union[float, Callable] = 1e-3, + buffer_size: int = int(1e6), + learning_starts: int = 100, + batch_size: int = 100, + tau: float = 0.005, + gamma: float = 0.99, + train_freq: int = -1, + gradient_steps: int = -1, + n_episodes_rollout: int = 1, + action_noise: Optional[ActionNoise] = None, + policy_delay: int = 2, + target_policy_noise: float = 0.2, + target_noise_clip: float = 0.5, + use_sde: bool = False, + sde_sample_freq: int = -1, + sde_max_grad_norm: float = 1, + sde_ent_coef: float = 0.0, + sde_log_std_scheduler: Optional[Callable] = None, + use_sde_at_warmup: bool = False, + tensorboard_log: Optional[str] = None, + create_eval_env: bool = False, + policy_kwargs: Dict[str, Any] = None, + verbose: int = 0, + seed: Optional[int] = None, + device: Union[th.device, str] = 'auto', + _init_setup_model: bool = True): super(TD3, self).__init__(policy, env, TD3Policy, policy_kwargs, verbose, device, create_eval_env=create_eval_env, seed=seed, @@ -96,7 +119,7 @@ def __init__(self, policy, env, 
buffer_size=int(1e6), learning_rate=1e-3, if _init_setup_model: self._setup_model() - def _setup_model(self): + def _setup_model(self) -> None: self._setup_learning_rate() obs_dim, action_dim = self.observation_space.shape[0], self.action_space.shape[0] self.set_random_seed(self.seed) @@ -107,7 +130,7 @@ def _setup_model(self): self.policy = self.policy.to(self.device) self._create_aliases() - def _create_aliases(self): + def _create_aliases(self) -> None: self.actor = self.policy.actor self.actor_target = self.policy.actor_target self.critic = self.policy.critic @@ -117,7 +140,7 @@ def _create_aliases(self): def train_critic(self, gradient_steps: int = 1, batch_size: int = 100, replay_data: Optional[ReplayBufferSamples] = None, - tau: float = 0.0): + tau: float = 0.0) -> None: # Update optimizer learning rate self._update_learning_rate(self.critic.optimizer) @@ -158,7 +181,7 @@ def train_actor(self, gradient_steps: int = 1, batch_size: int = 100, tau_actor: float = 0.005, tau_critic: float = 0.005, - replay_data: Optional[ReplayBufferSamples] = None): + replay_data: Optional[ReplayBufferSamples] = None) -> None: # Update optimizer learning rate self._update_learning_rate(self.actor.optimizer) @@ -183,7 +206,7 @@ def train_actor(self, gradient_steps: int = 1, for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): target_param.data.copy_(tau_actor * param.data + (1 - tau_actor) * target_param.data) - def train(self, gradient_steps: int, batch_size: int = 100, policy_delay: int = 2): + def train(self, gradient_steps: int, batch_size: int = 100, policy_delay: int = 2) -> None: for gradient_step in range(gradient_steps): @@ -195,7 +218,7 @@ def train(self, gradient_steps: int, batch_size: int = 100, policy_delay: int = if gradient_step % policy_delay == 0: self.train_actor(replay_data=replay_data, tau_actor=self.tau, tau_critic=self.tau) - def train_sde(self): + def train_sde(self) -> None: # Update optimizer learning rate # self._update_learning_rate(self.policy.optimizer) @@ -241,9 +264,16 @@ def train_sde(self): del self.rollout_data - def learn(self, total_timesteps, callback=None, log_interval=4, - eval_env=None, eval_freq=-1, n_eval_episodes=5, - tb_log_name="TD3", eval_log_path=None, reset_num_timesteps=True): + def learn(self, + total_timesteps: int, + callback: Optional[BaseCallback] = None, + log_interval: int = 4, + eval_env: Optional[GymEnv] = None, + eval_freq: int = -1, + n_eval_episodes: int = 5, + tb_log_name: str = "TD3", + eval_log_path: Optional[str] = None, + reset_num_timesteps: bool = True) -> OffPolicyRLModel: episode_num, obs, callback = self._setup_learn(eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps) From c5e58128949b932ed32f95ec12131c17b1338a59 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 11 Mar 2020 13:01:42 +0100 Subject: [PATCH 12/17] Finish typing A2C and PPO --- torchy_baselines/a2c/a2c.py | 19 +++++++++++++------ torchy_baselines/cem_rl/cem.py | 21 +++++++++++++++------ torchy_baselines/ppo/ppo.py | 15 +++++++++++---- 3 files changed, 39 insertions(+), 16 deletions(-) diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index 4c24c7faf..6e0a1f2e9 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -7,11 +7,11 @@ from torchy_baselines.common.utils import explained_variance from torchy_baselines.common import logger from torchy_baselines.common.type_aliases import GymEnv +from torchy_baselines.common.callbacks import BaseCallback from 
torchy_baselines.ppo.ppo import PPO from torchy_baselines.ppo.policies import PPOPolicy - class A2C(PPO): """ Advantage Actor Critic (A2C) @@ -89,14 +89,14 @@ def __init__(self, policy: Union[str, Type[PPOPolicy]], if _init_setup_model: self._setup_model() - def _setup_model(self): + def _setup_model(self) -> None: super(A2C, self)._setup_model() if self.use_rms_prop: self.policy.optimizer = th.optim.RMSprop(self.policy.parameters(), lr=self.learning_rate(1), alpha=0.99, eps=self.rms_prop_eps, weight_decay=0) - def train(self, gradient_steps: int, batch_size=None): + def train(self, gradient_steps: int, batch_size: Optional[int] = None) -> None: # Update optimizer learning rate self._update_learning_rate(self.policy.optimizer) # A2C with gradient_steps > 1 does not make sense @@ -153,9 +153,16 @@ def train(self, gradient_steps: int, batch_size=None): if hasattr(self.policy, 'log_std'): logger.logkv("std", th.exp(self.policy.log_std).mean().item()) - def learn(self, total_timesteps, callback=None, log_interval=100, - eval_env=None, eval_freq=-1, n_eval_episodes=5, - tb_log_name="A2C", eval_log_path=None, reset_num_timesteps=True): + def learn(self, + total_timesteps: int, + callback: Optional[BaseCallback] = None, + log_interval: int = 100, + eval_env: Optional[GymEnv] = None, + eval_freq: int = -1, + n_eval_episodes: int = 5, + tb_log_name: str = "A2C", + eval_log_path: Optional[str] = None, + reset_num_timesteps: bool = True) -> 'A2C': return super(A2C, self).learn(total_timesteps=total_timesteps, callback=callback, log_interval=log_interval, eval_env=eval_env, eval_freq=eval_freq, n_eval_episodes=n_eval_episodes, diff --git a/torchy_baselines/cem_rl/cem.py b/torchy_baselines/cem_rl/cem.py index ee4f484bc..d1d221c86 100644 --- a/torchy_baselines/cem_rl/cem.py +++ b/torchy_baselines/cem_rl/cem.py @@ -1,3 +1,5 @@ +from typing import Type, Tuple, Optional, List + import numpy as np @@ -21,9 +23,16 @@ class CEM(object): :param antithetic: (bool) Use a finite difference like method for sampling (mu + epsilon, mu - epsilon) """ - def __init__(self, num_params, mu_init=None, sigma_init=1e-3, - pop_size=256, damping_init=1e-3, damping_final=1e-5, - parents=None, elitism=False, antithetic=False): + def __init__(self, + num_params: int, + mu_init: Optional[np.ndarray] = None, + sigma_init: float = 1e-3, + pop_size: int = 256, + damping_init: float = 1e-3, + damping_final: float = 1e-5, + parents: Optional[int] = None, + elitism: bool = False, + antithetic: bool = False): super(CEM, self).__init__() self.num_params = num_params @@ -66,7 +75,7 @@ def __init__(self, num_params, mu_init=None, sigma_init=1e-3, for i in range(1, self.parents + 1)]) self.weights /= self.weights.sum() - def ask(self, pop_size): + def ask(self, pop_size: int) -> List[np.ndarray]: """ Returns a list of candidates parameters @@ -87,7 +96,7 @@ def ask(self, pop_size): return individuals - def tell(self, solutions, scores): + def tell(self, solutions: List[np.ndarray], scores: List[float]) -> None: """ Updates the distribution @@ -114,7 +123,7 @@ def tell(self, solutions, scores): self.elite = solutions[idx_sorted[0]] self.elite_score = scores[idx_sorted[0]] - def get_distrib_params(self): + def get_distrib_params(self) -> Tuple[np.ndarray, np.ndarray]: """ Returns the parameters of the distribution: the mean and standard deviation. 
diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index 6f9a4839d..a7dcf0bdb 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -121,7 +121,7 @@ def __init__(self, policy: Union[str, Type[PPOPolicy]], if _init_setup_model: self._setup_model() - def _setup_model(self): + def _setup_model(self) -> None: self._setup_learning_rate() # TODO: preprocessing: one hot vector for obs discrete state_dim = self.observation_space.shape[0] @@ -284,9 +284,16 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: if hasattr(self.policy, 'log_std'): logger.logkv("std", th.exp(self.policy.log_std).mean().item()) - def learn(self, total_timesteps, callback=None, log_interval=1, - eval_env=None, eval_freq=-1, n_eval_episodes=5, tb_log_name="PPO", - eval_log_path=None, reset_num_timesteps=True): + def learn(self, + total_timesteps: int, + callback: Optional[BaseCallback] = None, + log_interval: int = 1, + eval_env: Optional[GymEnv] = None, + eval_freq: int = -1, + n_eval_episodes: int = 5, + tb_log_name: str = "PPO", + eval_log_path: Optional[str] = None, + reset_num_timesteps: bool = True) -> 'PPO': episode_num, obs, callback = self._setup_learn(eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps) From 037986a91d272a79cb6a2d434fedf5fb27f23904 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Wed, 11 Mar 2020 16:35:13 +0100 Subject: [PATCH 13/17] Add test for `expln` --- docs/misc/changelog.rst | 1 + tests/test_sde.py | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 45d28de6c..3bd119bf5 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -27,6 +27,7 @@ Others: - Added ``clip_mean`` parameter to SAC policy - Buffers now return ``NamedTuple`` - More typing +- Add test for ``expln`` Documentation: ^^^^^^^^^^^^^^ diff --git a/tests/test_sde.py b/tests/test_sde.py index 497f39887..96f6c8f38 100644 --- a/tests/test_sde.py +++ b/tests/test_sde.py @@ -2,7 +2,7 @@ import torch as th from torch.distributions import Normal -from torchy_baselines import A2C, TD3, SAC +from torchy_baselines import A2C, TD3, SAC, PPO def test_state_dependent_exploration_grad(): @@ -55,12 +55,13 @@ def test_state_dependent_exploration_grad(): assert sigma_hat.grad.allclose(grad) -@pytest.mark.parametrize("model_class", [TD3, SAC, A2C]) +@pytest.mark.parametrize("model_class", [TD3, SAC, A2C, PPO]) @pytest.mark.parametrize("sde_net_arch", [None, [32, 16], []]) -def test_state_dependent_offpolicy_noise(model_class, sde_net_arch): +@pytest.mark.parametrize("use_expln", [False, True]) +def test_state_dependent_offpolicy_noise(model_class, sde_net_arch, use_expln): model = model_class('MlpPolicy', 'Pendulum-v0', use_sde=True, seed=None, create_eval_env=True, - verbose=1, policy_kwargs=dict(log_std_init=-2, sde_net_arch=sde_net_arch)) - model.learn(total_timesteps=int(1000), eval_freq=500) + verbose=1, policy_kwargs=dict(log_std_init=-2, sde_net_arch=sde_net_arch, use_expln=use_expln)) + model.learn(total_timesteps=int(500), eval_freq=250) def test_scheduler(): From 18f38f8cf5b969403d5416e62eb9a4461538c0db Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 12 Mar 2020 11:12:10 +0100 Subject: [PATCH 14/17] Reformat --- tests/test_callbacks.py | 4 ++-- tests/test_distributions.py | 1 + tests/test_logger.py | 3 ++- tests/test_predict.py | 1 + tests/test_run.py | 4 +--- tests/test_save_load.py | 1 + tests/test_vec_normalize.py | 7 +++++-- 
torchy_baselines/a2c/a2c.py | 11 +++++------ torchy_baselines/cem_rl/cem.py | 3 +-- torchy_baselines/common/base_class.py | 4 ++-- torchy_baselines/common/buffers.py | 10 ++++++---- torchy_baselines/common/distributions.py | 4 ++-- torchy_baselines/common/monitor.py | 5 +++-- torchy_baselines/common/noise.py | 3 +++ torchy_baselines/common/save_util.py | 2 -- torchy_baselines/common/type_aliases.py | 5 ++--- .../common/vec_env/base_vec_env.py | 2 +- .../common/vec_env/subproc_vec_env.py | 4 ++-- .../common/vec_env/vec_normalize.py | 6 +++--- torchy_baselines/ppo/policies.py | 10 ++++++---- torchy_baselines/ppo/ppo.py | 18 +++++++++--------- torchy_baselines/sac/policies.py | 1 + torchy_baselines/td3/policies.py | 5 ++--- torchy_baselines/td3/td3.py | 14 ++++++-------- 24 files changed, 67 insertions(+), 61 deletions(-) diff --git a/tests/test_callbacks.py b/tests/test_callbacks.py index 96db45a77..a4ffede11 100644 --- a/tests/test_callbacks.py +++ b/tests/test_callbacks.py @@ -6,7 +6,7 @@ from torchy_baselines import A2C, CEMRL, PPO, SAC, TD3 from torchy_baselines.common.callbacks import (CallbackList, CheckpointCallback, EvalCallback, - EveryNTimesteps, StopTrainingOnRewardThreshold) + EveryNTimesteps, StopTrainingOnRewardThreshold) @pytest.mark.parametrize("model_class", [A2C, CEMRL, PPO, SAC, TD3]) @@ -44,6 +44,6 @@ def test_callbacks(model_class): # Transform callback into a callback list automatically model.learn(500, callback=[checkpoint_callback, eval_callback]) # Automatic wrapping, old way of doing callbacks - model.learn(500, callback=lambda _locals, _globals : True) + model.learn(500, callback=lambda _locals, _globals: True) if os.path.exists(log_folder): shutil.rmtree(log_folder) diff --git a/tests/test_distributions.py b/tests/test_distributions.py index 4b5d7927f..7d28ad7dd 100644 --- a/tests/test_distributions.py +++ b/tests/test_distributions.py @@ -22,6 +22,7 @@ def test_bijector(): # Check the inverse method assert th.isclose(TanhBijector.inverse(squashed_actions), actions).all() + @pytest.mark.parametrize("model_class", [A2C, PPO]) def test_squashed_gaussian(model_class): """ diff --git a/tests/test_logger.py b/tests/test_logger.py index b55a61633..5ca0437b5 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -5,7 +5,8 @@ import numpy as np from torchy_baselines.common.logger import (make_output_format, read_csv, read_json, DEBUG, ScopedConfigure, - info, debug, set_level, configure, logkv, logkvs, dumpkvs, logkv_mean, warn, error, reset) + info, debug, set_level, configure, logkv, logkvs, dumpkvs, logkv_mean, warn, + error, reset) KEY_VALUES = { "test": 1, diff --git a/tests/test_predict.py b/tests/test_predict.py index 6f2245ce8..e68954f60 100644 --- a/tests/test_predict.py +++ b/tests/test_predict.py @@ -12,6 +12,7 @@ SAC, ] + @pytest.mark.parametrize("model_class", MODEL_LIST) def test_auto_wrap(model_class): # test auto wrapping of env into a VecEnv diff --git a/tests/test_run.py b/tests/test_run.py index fdfcff6da..1a3f99110 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -1,7 +1,5 @@ -import os - -import pytest import numpy as np +import pytest from torchy_baselines import A2C, CEMRL, PPO, SAC, TD3 from torchy_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise diff --git a/tests/test_save_load.py b/tests/test_save_load.py index 9d73ddf74..bdec3eda0 100644 --- a/tests/test_save_load.py +++ b/tests/test_save_load.py @@ -16,6 +16,7 @@ SAC, ] + @pytest.mark.parametrize("model_class", MODEL_LIST) def 
test_save_load(model_class): """ diff --git a/tests/test_vec_normalize.py b/tests/test_vec_normalize.py index 3c21f69aa..75fd3a857 100644 --- a/tests/test_vec_normalize.py +++ b/tests/test_vec_normalize.py @@ -8,9 +8,11 @@ ENV_ID = 'Pendulum-v0' + def make_env(): return gym.make(ENV_ID) + def check_rms_equal(rmsa, rmsb): assert np.all(rmsa.mean == rmsb.mean) assert np.all(rmsa.var == rmsb.var) @@ -34,6 +36,7 @@ def check_vec_norm_equal(norma, normb): assert norma.epsilon == normb.epsilon assert norma.training == normb.training + def _make_warmstart_cartpole(): """Warm-start VecNormalize by stepping through CartPole""" venv = DummyVecEnv([lambda: gym.make("CartPole-v1")]) @@ -50,8 +53,8 @@ def _make_warmstart_cartpole(): def test_runningmeanstd(): """Test RunningMeanStd object""" for (x_1, x_2, x_3) in [ - (np.random.randn(3), np.random.randn(4), np.random.randn(5)), - (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2))]: + (np.random.randn(3), np.random.randn(4), np.random.randn(5)), + (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2))]: rms = RunningMeanStd(epsilon=0.0, shape=x_1.shape[1:]) x_cat = np.concatenate([x_1, x_2, x_3], axis=0) diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index 6e0a1f2e9..5f9730df2 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -1,15 +1,14 @@ -from typing import List, Tuple, Type, Union, Callable, Optional, Dict, Any - -from gym import spaces import torch as th import torch.nn.functional as F +from gym import spaces +from typing import Type, Union, Callable, Optional, Dict, Any -from torchy_baselines.common.utils import explained_variance from torchy_baselines.common import logger -from torchy_baselines.common.type_aliases import GymEnv from torchy_baselines.common.callbacks import BaseCallback -from torchy_baselines.ppo.ppo import PPO +from torchy_baselines.common.type_aliases import GymEnv +from torchy_baselines.common.utils import explained_variance from torchy_baselines.ppo.policies import PPOPolicy +from torchy_baselines.ppo.ppo import PPO class A2C(PPO): diff --git a/torchy_baselines/cem_rl/cem.py b/torchy_baselines/cem_rl/cem.py index d1d221c86..7527b996c 100644 --- a/torchy_baselines/cem_rl/cem.py +++ b/torchy_baselines/cem_rl/cem.py @@ -1,6 +1,5 @@ -from typing import Type, Tuple, Optional, List - import numpy as np +from typing import Tuple, Optional, List # TODO: add more from https://github.com/hardmaru/estool/blob/master/es.py diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index b77237dbe..f1d205a0e 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -16,7 +16,7 @@ from torchy_baselines.common.utils import set_random_seed, get_schedule_fn, update_learning_rate from torchy_baselines.common.vec_env import DummyVecEnv, VecEnv, unwrap_vec_normalize, VecNormalize from torchy_baselines.common.save_util import data_to_json, json_to_data, recursive_getattr, recursive_setattr -from torchy_baselines.common.type_aliases import GymEnv, TensorDict, OptimizerStateDict, RolloutReturn +from torchy_baselines.common.type_aliases import GymEnv, TensorDict, RolloutReturn from torchy_baselines.common.callbacks import BaseCallback, CallbackList, ConvertCallback, EvalCallback from torchy_baselines.common.monitor import Monitor from torchy_baselines.common.noise import ActionNoise @@ -494,7 +494,7 @@ def _load_from_file(load_path: str, load_data: bool = True) -> (Tuple[Optional[D if "data" in 
namelist and load_data: # Load class parameters and convert to string json_data = archive.read("data").decode() - data = json_to_data(json_data, device) + data = json_to_data(json_data) if "tensors.pth" in namelist and load_data: # Load extra tensors diff --git a/torchy_baselines/common/buffers.py b/torchy_baselines/common/buffers.py index 6ffe479a2..5ddc29c39 100644 --- a/torchy_baselines/common/buffers.py +++ b/torchy_baselines/common/buffers.py @@ -18,6 +18,7 @@ class BaseBuffer(object): to which the values will be converted :param n_envs: (int) Number of parallel environments """ + def __init__(self, buffer_size: int, obs_dim: int, @@ -118,13 +119,13 @@ def to_torch(self, array: np.ndarray, copy: bool = True) -> th.Tensor: @staticmethod def _normalize_obs(obs: np.ndarray, - env: Optional[VecNormalize] = None) -> np.ndarray: + env: Optional[VecNormalize] = None) -> np.ndarray: if env is not None: return env.normalize_obs(obs).astype(np.float32) return obs - def _normalize_reward(self, - reward: np.ndarray, + @staticmethod + def _normalize_reward(reward: np.ndarray, env: Optional[VecNormalize] = None) -> np.ndarray: if env is not None: return env.normalize_reward(reward).astype(np.float32) @@ -141,13 +142,13 @@ class ReplayBuffer(BaseBuffer): :param device: (th.device) :param n_envs: (int) Number of parallel environments """ + def __init__(self, buffer_size: int, obs_dim: int, action_dim: int, device: Union[th.device, str] = 'cpu', n_envs: int = 1): - super(ReplayBuffer, self).__init__(buffer_size, obs_dim, action_dim, device, n_envs=n_envs) assert n_envs == 1, "Replay buffer only support single environment for now" @@ -201,6 +202,7 @@ class RolloutBuffer(BaseBuffer): :param gamma: (float) Discount factor :param n_envs: (int) Number of parallel environments """ + def __init__(self, buffer_size: int, obs_dim: int, diff --git a/torchy_baselines/common/distributions.py b/torchy_baselines/common/distributions.py index a7f219ac8..6a0ab2f11 100644 --- a/torchy_baselines/common/distributions.py +++ b/torchy_baselines/common/distributions.py @@ -317,7 +317,7 @@ def sample_weights(self, log_std: th.Tensor, batch_size: int = 1) -> None: self.exploration_matrices = self.weights_dist.rsample((batch_size,)) def proba_distribution_net(self, latent_dim: int, log_std_init: float = -2.0, - latent_sde_dim: Optional[th.Tensor] = None) -> Tuple[nn.Module, nn.Parameter]: + latent_sde_dim: Optional[int] = None) -> Tuple[nn.Module, nn.Parameter]: """ Create the layers and parameter that represent the distribution: one output will be the deterministic action, the other parameter will be the @@ -325,7 +325,7 @@ def proba_distribution_net(self, latent_dim: int, log_std_init: float = -2.0, :param latent_dim: (int) Dimension of the last layer of the policy (before the action layer) :param log_std_init: (float) Initial value for the log standard deviation - :param latent_sde_dim: (int) Dimension of the last layer of the feature extractor + :param latent_sde_dim: (Optional[int]) Dimension of the last layer of the feature extractor for SDE. By default, it is shared with the policy network. 
:return: (nn.Linear, nn.Parameter) """ diff --git a/torchy_baselines/common/monitor.py b/torchy_baselines/common/monitor.py index 3d84b9ba9..5a8716902 100644 --- a/torchy_baselines/common/monitor.py +++ b/torchy_baselines/common/monitor.py @@ -27,8 +27,9 @@ def __init__(self, :param env: (gym.Env) The environment :param filename: (Optional[str]) the location to save a log file, can be None for no log :param allow_early_resets: (bool) allows the reset of the environment before it is done - :param reset_keywords: (Tuple[str, ...]) extra keywords for the reset call, if extra parameters are needed at reset - :param info_keywords: (Tuple[str, ...]) extra information to log, from the information return of environment.step + :param reset_keywords: (Tuple[str, ...]) extra keywords for the reset call, + if extra parameters are needed at reset + :param info_keywords: (Tuple[str, ...]) extra information to log, from the information return of env.step() """ super(Monitor, self).__init__(env=env) self.t_start = time.time() diff --git a/torchy_baselines/common/noise.py b/torchy_baselines/common/noise.py index fa25f42ad..8511010e9 100644 --- a/torchy_baselines/common/noise.py +++ b/torchy_baselines/common/noise.py @@ -9,6 +9,7 @@ class ActionNoise(ABC): """ The action noise base class """ + def __init__(self): super(ActionNoise, self).__init__() @@ -22,6 +23,7 @@ def reset(self): def __call__(self): pass + class NormalActionNoise(ActionNoise): """ A Gaussian action noise @@ -29,6 +31,7 @@ class NormalActionNoise(ActionNoise): :param mean: (float) the mean value of the noise :param sigma: (float) the scale of the noise (std here) """ + def __init__(self, mean, sigma): self._mu = mean self._sigma = sigma diff --git a/torchy_baselines/common/save_util.py b/torchy_baselines/common/save_util.py index a9dedbc94..85fb6fd12 100644 --- a/torchy_baselines/common/save_util.py +++ b/torchy_baselines/common/save_util.py @@ -122,14 +122,12 @@ def data_to_json(data: Dict[str, Any]) -> str: def json_to_data(json_string: str, - device: Union[th.device, str] = 'cpu', custom_objects: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: """ Turn JSON serialization of class-parameters back into dictionary. :param json_string: (str) JSON serialization of the class-parameters that should be loaded. - :param device: torch.device device to which the data should be mapped if errors occur :param custom_objects: (dict) Dictionary of objects to replace upon loading. 
If a variable is present in this dictionary as a key, it will not be deserialized and the corresponding item diff --git a/torchy_baselines/common/type_aliases.py b/torchy_baselines/common/type_aliases.py index 12c220fe4..60042be0e 100644 --- a/torchy_baselines/common/type_aliases.py +++ b/torchy_baselines/common/type_aliases.py @@ -1,8 +1,7 @@ """ -Common aliases for type hing +Common aliases for type hint """ -from typing import Union, Type, Optional, Dict, Any, List, NamedTuple -from collections import namedtuple +from typing import Union, Dict, Any, NamedTuple, Optional import numpy as np import torch as th diff --git a/torchy_baselines/common/vec_env/base_vec_env.py b/torchy_baselines/common/vec_env/base_vec_env.py index 18e8a5956..4a81e6ea6 100644 --- a/torchy_baselines/common/vec_env/base_vec_env.py +++ b/torchy_baselines/common/vec_env/base_vec_env.py @@ -244,7 +244,7 @@ def __getattr__(self, name): if blocked_class is not None: own_class = f"{type(self).__module__}.{type(self).__name__}" error_str = (f"Error: Recursive attribute lookup for {name} from {own_class} is " - "ambiguous and hides attribute from {blocked_class}") + "ambiguous and hides attribute from {blocked_class}") raise AttributeError(error_str) return self.getattr_recursive(name) diff --git a/torchy_baselines/common/vec_env/subproc_vec_env.py b/torchy_baselines/common/vec_env/subproc_vec_env.py index 920568377..5e6ee858c 100644 --- a/torchy_baselines/common/vec_env/subproc_vec_env.py +++ b/torchy_baselines/common/vec_env/subproc_vec_env.py @@ -61,11 +61,11 @@ def tile_images(img_nhwc): new_width = int(np.ceil(float(n_images) / new_height)) img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0] * 0 for _ in range(n_images, new_height * new_width)]) # img_HWhwc - out_image = img_nhwc.reshape(new_height, new_width, height, width, n_channels) + out_image = img_nhwc.reshape((new_height, new_width, height, width, n_channels)) # img_HhWwc out_image = out_image.transpose(0, 2, 1, 3, 4) # img_Hh_Ww_c - out_image = out_image.reshape(new_height * height, new_width * width, n_channels) + out_image = out_image.reshape((new_height * height, new_width * width, n_channels)) return out_image diff --git a/torchy_baselines/common/vec_env/vec_normalize.py b/torchy_baselines/common/vec_env/vec_normalize.py index ea94bc009..87fb70ae6 100644 --- a/torchy_baselines/common/vec_env/vec_normalize.py +++ b/torchy_baselines/common/vec_env/vec_normalize.py @@ -86,7 +86,7 @@ def step_wait(self): """ obs, rews, news, infos = self.venv.step_wait() self.old_obs = obs - self.old_rews = rews + self.old_reward = rews if self.training: self.obs_rms.update(obs) @@ -122,7 +122,7 @@ def normalize_reward(self, reward): """ if self.norm_reward: reward = np.clip(reward / np.sqrt(self.ret_rms.var + self.epsilon), - -self.clip_reward, self.clip_reward) + -self.clip_reward, self.clip_reward) return reward def unnormalize_obs(self, obs): @@ -146,7 +146,7 @@ def get_original_reward(self): """ Returns an unnormalized version of the rewards from the most recent step. 
""" - return self.old_rews.copy() + return self.old_reward.copy() def reset(self): """ diff --git a/torchy_baselines/ppo/policies.py b/torchy_baselines/ppo/policies.py index 491737a02..b98977683 100644 --- a/torchy_baselines/ppo/policies.py +++ b/torchy_baselines/ppo/policies.py @@ -9,8 +9,8 @@ from torchy_baselines.common.policies import (BasePolicy, register_policy, MlpExtractor, create_sde_feature_extractor) from torchy_baselines.common.distributions import (make_proba_distribution, Distribution, - DiagGaussianDistribution, CategoricalDistribution, StateDependentNoiseDistribution) - + DiagGaussianDistribution, CategoricalDistribution, + StateDependentNoiseDistribution) class PPOPolicy(BasePolicy): @@ -183,13 +183,15 @@ def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Ten action, _ = self._get_action_dist_from_latent(latent_pi, latent_sde, deterministic=deterministic) return action - def evaluate_actions(self, obs: th.Tensor, actions: th.Tensor, deterministic: bool = False) -> Tuple[th.Tensor, th.Tensor, th.Tensor]: + def evaluate_actions(self, obs: th.Tensor, + actions: th.Tensor, + deterministic: bool = False) -> Tuple[th.Tensor, th.Tensor, th.Tensor]: """ Evaluate actions according to the current policy, given the observations. :param obs: (th.Tensor) - :param action: (th.Tensor) + :param actions: (th.Tensor) :param deterministic: (bool) :return: (th.Tensor, th.Tensor, th.Tensor) estimated value, log likelihood of taking those actions and entropy of the action distribution. diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index a7dcf0bdb..5ac1a9644 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -146,11 +146,11 @@ def _setup_model(self) -> None: self.clip_range_vf = get_schedule_fn(self.clip_range_vf) def collect_rollouts(self, - env: VecEnv, - callback: BaseCallback, - rollout_buffer: RolloutBuffer, - n_rollout_steps: int = 256, - obs: Optional[np.ndarray] = None) -> Tuple[Optional[np.ndarray], bool]: + env: VecEnv, + callback: BaseCallback, + rollout_buffer: RolloutBuffer, + n_rollout_steps: int = 256, + obs: Optional[np.ndarray] = None) -> Tuple[Optional[np.ndarray], bool]: n_steps = 0 continue_training = True @@ -167,7 +167,6 @@ def collect_rollouts(self, continue_training = False return None, continue_training - if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.policy.reset_noise(env.num_envs) @@ -227,7 +226,8 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: values, log_prob, entropy = self.policy.evaluate_actions(rollout_data.observations, actions) values = values.flatten() # Normalize advantage - advantages = (rollout_data.advantages - rollout_data.advantages.mean()) / (rollout_data.advantages.std() + 1e-8) + advantages = (rollout_data.advantages - rollout_data.advantages.mean()) / ( + rollout_data.advantages.std() + 1e-8) # ratio between old and new policy, should be one at the first iteration ratio = th.exp(log_prob - rollout_data.old_log_prob) @@ -242,7 +242,8 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: else: # Clip the different between old and new value # NOTE: this depends on the reward scaling - values_pred = rollout_data.old_values + th.clamp(values - rollout_data.old_values, -clip_range_vf, clip_range_vf) + values_pred = rollout_data.old_values + th.clamp(values - rollout_data.old_values, -clip_range_vf, + clip_range_vf) # Value loss using the TD(gae_lambda) target 
value_loss = F.mse_loss(rollout_data.returns, values_pred) @@ -275,7 +276,6 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: if self.clip_range_vf is not None: logger.logkv("clip_range_vf", clip_range_vf) - logger.logkv("explained_variance", explained_var) # TODO: gather stats for the entropy and other losses? logger.logkv("entropy_loss", entropy_loss.item()) diff --git a/torchy_baselines/sac/policies.py b/torchy_baselines/sac/policies.py index 15a28f2dd..1a1dc33bb 100644 --- a/torchy_baselines/sac/policies.py +++ b/torchy_baselines/sac/policies.py @@ -244,6 +244,7 @@ def forward(self, obs: th.Tensor) -> th.Tensor: def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: return self.actor.forward(observation, deterministic) + MlpPolicy = SACPolicy register_policy("MlpPolicy", MlpPolicy) diff --git a/torchy_baselines/td3/policies.py b/torchy_baselines/td3/policies.py index f75cdea68..3020d268d 100644 --- a/torchy_baselines/td3/policies.py +++ b/torchy_baselines/td3/policies.py @@ -1,12 +1,11 @@ -from typing import List, Tuple, Callable, Optional - import torch import torch as th import torch.nn as nn +from typing import List, Tuple, Optional +from torchy_baselines.common.distributions import StateDependentNoiseDistribution from torchy_baselines.common.policies import BasePolicy, register_policy, create_mlp, BaseNetwork, \ create_sde_feature_extractor -from torchy_baselines.common.distributions import StateDependentNoiseDistribution class Actor(BaseNetwork): diff --git a/torchy_baselines/td3/td3.py b/torchy_baselines/td3/td3.py index ccb835e50..9de88c106 100644 --- a/torchy_baselines/td3/td3.py +++ b/torchy_baselines/td3/td3.py @@ -1,14 +1,12 @@ -from typing import List, Tuple, Type, Union, Callable, Optional, Dict, Any - import torch as th import torch.nn.functional as F -import numpy as np +from typing import List, Tuple, Type, Union, Callable, Optional, Dict, Any from torchy_baselines.common.base_class import OffPolicyRLModel from torchy_baselines.common.buffers import ReplayBuffer -from torchy_baselines.common.type_aliases import ReplayBufferSamples, GymEnv -from torchy_baselines.common.noise import ActionNoise from torchy_baselines.common.callbacks import BaseCallback +from torchy_baselines.common.noise import ActionNoise +from torchy_baselines.common.type_aliases import ReplayBufferSamples, GymEnv from torchy_baselines.td3.policies import TD3Policy @@ -138,9 +136,9 @@ def _create_aliases(self) -> None: self.vf_net = self.policy.vf_net def train_critic(self, gradient_steps: int = 1, - batch_size: int = 100, - replay_data: Optional[ReplayBufferSamples] = None, - tau: float = 0.0) -> None: + batch_size: int = 100, + replay_data: Optional[ReplayBufferSamples] = None, + tau: float = 0.0) -> None: # Update optimizer learning rate self._update_learning_rate(self.critic.optimizer) From b64873ffff1923b02a00c7b683099959a288ff6c Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 12 Mar 2020 12:34:25 +0100 Subject: [PATCH 15/17] Sync callbacks --- docs/misc/changelog.rst | 1 + tests/test_run.py | 2 +- torchy_baselines/a2c/a2c.py | 5 +- torchy_baselines/cem_rl/cem_rl.py | 5 +- torchy_baselines/common/base_class.py | 12 ++-- torchy_baselines/common/callbacks.py | 62 +++++++++++++++------ torchy_baselines/common/type_aliases.py | 5 +- torchy_baselines/common/vec_env/__init__.py | 15 +++-- torchy_baselines/ppo/ppo.py | 12 ++-- torchy_baselines/sac/sac.py | 5 +- torchy_baselines/td3/td3.py | 5 +- 11 files changed, 81 insertions(+), 48 deletions(-) 
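A minimal usage sketch of the synced callback API (illustrative only: ``SimpleCallback``, the ``-200`` reward threshold and the timestep budget are hypothetical; ``EvalCallback``, ``StopTrainingOnRewardThreshold`` and the list-to-``CallbackList`` wrapping are the behaviours touched by this patch):

    import gym

    from torchy_baselines import SAC
    from torchy_baselines.common.callbacks import (BaseCallback, EvalCallback,
                                                   StopTrainingOnRewardThreshold)


    class SimpleCallback(BaseCallback):
        """Hypothetical callback: stop training after 1000 calls to env.step()."""

        def _on_step(self) -> bool:
            # Returning False aborts training early
            return self.num_timesteps < 1000


    # Stop training once the best mean evaluation reward reaches -200
    stop_callback = StopTrainingOnRewardThreshold(reward_threshold=-200, verbose=1)
    # After this patch, EvalCallback wraps a plain gym.Env into a DummyVecEnv itself
    eval_callback = EvalCallback(gym.make('Pendulum-v0'),
                                 callback_on_new_best=stop_callback,
                                 eval_freq=500, n_eval_episodes=5)

    model = SAC('MlpPolicy', 'Pendulum-v0', verbose=1)
    # A list of callbacks is automatically wrapped into a CallbackList
    model.learn(total_timesteps=5000, callback=[SimpleCallback(), eval_callback])
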
diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 3bd119bf5..38a63d5b5 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -17,6 +17,7 @@ New Features: Bug Fixes: ^^^^^^^^^^ +- Synced callbacks with Stable-Baselines Deprecations: ^^^^^^^^^^^^^ diff --git a/tests/test_run.py b/tests/test_run.py index 1a3f99110..db33cc75f 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -23,7 +23,7 @@ def test_cemrl(): @pytest.mark.parametrize("model_class", [A2C, PPO]) @pytest.mark.parametrize("env_id", ['CartPole-v1', 'Pendulum-v0']) def test_onpolicy(model_class, env_id): - model = model_class('MlpPolicy', env_id, policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True) + model = model_class('MlpPolicy', env_id, seed=0, policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True) model.learn(total_timesteps=1000, eval_freq=500) diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index 5f9730df2..252f8bf0b 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -4,8 +4,7 @@ from typing import Type, Union, Callable, Optional, Dict, Any from torchy_baselines.common import logger -from torchy_baselines.common.callbacks import BaseCallback -from torchy_baselines.common.type_aliases import GymEnv +from torchy_baselines.common.type_aliases import GymEnv, MaybeCallback from torchy_baselines.common.utils import explained_variance from torchy_baselines.ppo.policies import PPOPolicy from torchy_baselines.ppo.ppo import PPO @@ -154,7 +153,7 @@ def train(self, gradient_steps: int, batch_size: Optional[int] = None) -> None: def learn(self, total_timesteps: int, - callback: Optional[BaseCallback] = None, + callback: MaybeCallback = None, log_interval: int = 100, eval_env: Optional[GymEnv] = None, eval_freq: int = -1, diff --git a/torchy_baselines/cem_rl/cem_rl.py b/torchy_baselines/cem_rl/cem_rl.py index 867f2c418..0069aef0c 100644 --- a/torchy_baselines/cem_rl/cem_rl.py +++ b/torchy_baselines/cem_rl/cem_rl.py @@ -3,8 +3,7 @@ import torch as th from torchy_baselines.common.base_class import OffPolicyRLModel -from torchy_baselines.common.callbacks import BaseCallback -from torchy_baselines.common.type_aliases import GymEnv +from torchy_baselines.common.type_aliases import GymEnv, MaybeCallback from torchy_baselines.common.noise import ActionNoise from torchy_baselines.td3.td3 import TD3, TD3Policy from torchy_baselines.cem_rl.cem import CEM @@ -121,7 +120,7 @@ def _setup_model(self) -> None: def learn(self, total_timesteps: int, - callback: Optional[BaseCallback] = None, + callback: MaybeCallback = None, log_interval: int = 4, eval_env: Optional[GymEnv] = None, eval_freq: int = -1, diff --git a/torchy_baselines/common/base_class.py b/torchy_baselines/common/base_class.py index f1d205a0e..19ea791d2 100644 --- a/torchy_baselines/common/base_class.py +++ b/torchy_baselines/common/base_class.py @@ -16,7 +16,7 @@ from torchy_baselines.common.utils import set_random_seed, get_schedule_fn, update_learning_rate from torchy_baselines.common.vec_env import DummyVecEnv, VecEnv, unwrap_vec_normalize, VecNormalize from torchy_baselines.common.save_util import data_to_json, json_to_data, recursive_getattr, recursive_setattr -from torchy_baselines.common.type_aliases import GymEnv, TensorDict, RolloutReturn +from torchy_baselines.common.type_aliases import GymEnv, TensorDict, RolloutReturn, MaybeCallback from torchy_baselines.common.callbacks import BaseCallback, CallbackList, ConvertCallback, EvalCallback from 
torchy_baselines.common.monitor import Monitor from torchy_baselines.common.noise import ActionNoise @@ -281,7 +281,7 @@ def get_torch_variables(self) -> Tuple[List[str], List[str]]: @abstractmethod def learn(self, total_timesteps: int, - callback: Union[None, Callable, List[BaseCallback], BaseCallback] = None, + callback: MaybeCallback = None, log_interval: int = 100, tb_log_name: str = "run", eval_env: Optional[GymEnv] = None, @@ -877,10 +877,6 @@ def collect_rollouts(self, while not done: - # Only stop training if return value is False, not when it is None. - if callback() is False: - return RolloutReturn(0.0, total_steps, total_episodes, None, continue_training=False) - if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.actor.reset_noise() @@ -913,6 +909,10 @@ def collect_rollouts(self, # Rescale and perform action new_obs, reward, done, infos = env.step(self.unscale_action(clipped_action)) + # Only stop training if return value is False, not when it is None. + if callback.on_step() is False: + return RolloutReturn(0.0, total_steps, total_episodes, None, continue_training=False) + episode_reward += reward # Retrieve reward and episode length if using Monitor wrapper diff --git a/torchy_baselines/common/callbacks.py b/torchy_baselines/common/callbacks.py index 392716c96..1a8403ac3 100644 --- a/torchy_baselines/common/callbacks.py +++ b/torchy_baselines/common/callbacks.py @@ -1,12 +1,13 @@ import os from abc import ABC, abstractmethod +import warnings import typing from typing import Union, List, Dict, Any, Optional import gym import numpy as np -from torchy_baselines.common.vec_env import VecEnv, sync_envs_normalization +from torchy_baselines.common.vec_env import DummyVecEnv, VecEnv, sync_envs_normalization from torchy_baselines.common.evaluation import evaluate_policy from torchy_baselines.common.logger import Logger @@ -22,9 +23,13 @@ class BaseCallback(ABC): """ def __init__(self, verbose: int = 0): super(BaseCallback, self).__init__() + # The RL model self.model = None # type: Optional[BaseRLModel] + # An alias for self.model.get_env(), the environment used for training self.training_env = None # type: Union[gym.Env, VecEnv, None] + # Number of time the callback was called self.n_calls = 0 # type: int + # n_envs * n times env.step() was called self.num_timesteps = 0 # type: int self.verbose = verbose self.locals = None # type: Optional[Dict[str, Any]] @@ -70,9 +75,13 @@ def _on_step(self) -> bool: """ return True - def __call__(self) -> bool: + def on_step(self) -> bool: """ - This method will be called by the model. This is the equivalent to the callback function. + This method will be called by the model after each call to ``env.step()``. + + For child callback (of an ``EventCallback``), this will be called + when the event is triggered. + :return: (bool) If the callback returns False, training is aborted early. """ self.n_calls += 1 @@ -128,6 +137,12 @@ def _on_step(self) -> bool: class CallbackList(BaseCallback): + """ + Class for chaining callbacks. + + :param callbacks: (List[BaseCallback]) A list of callbacks that will be called + sequentially. 
+ """ def __init__(self, callbacks: List[BaseCallback]): super(CallbackList, self).__init__() assert isinstance(callbacks, list) @@ -141,16 +156,21 @@ def _on_training_start(self) -> None: for callback in self.callbacks: callback.on_training_start(self.locals, self.globals) + def _on_rollout_start(self) -> None: + for callback in self.callbacks: + callback.on_rollout_start() + def _on_step(self) -> bool: continue_training = True for callback in self.callbacks: - # # Update variables - # callback.num_timesteps = self.num_timesteps - # callback.n_calls = self.n_calls # Return False (stop training) if at least one callback returns False - continue_training = callback() and continue_training + continue_training = callback.on_step() and continue_training return continue_training + def _on_rollout_end(self) -> None: + for callback in self.callbacks: + callback.on_rollout_end() + def _on_training_end(self) -> None: for callback in self.callbacks: callback.on_training_end() @@ -158,7 +178,7 @@ def _on_training_end(self) -> None: class CheckpointCallback(BaseCallback): """ - Callback for saving a model every `save_freq` steps + Callback for saving a model every ``save_freq`` steps :param save_freq: (int) :param save_path: (str) Path to the folder where the model will be saved. @@ -207,16 +227,17 @@ class EvalCallback(EventCallback): :param eval_env: (Union[gym.Env, VecEnv]) The environment used for initialization :param callback_on_new_best: (Optional[BaseCallback]) Callback to trigger - when there is a new best model according to the `mean_reward` + when there is a new best model according to the ``mean_reward`` :param n_eval_episodes: (int) The number of episodes to test the agent :param eval_freq: (int) Evaluate the agent every eval_freq call of the callback. - :param log_path: (str) Path to a folder where the evaluations (`evaluations.npz`) + :param log_path: (str) Path to a folder where the evaluations (``evaluations.npz``) will be saved. It will be updated at each evaluation. :param best_model_save_path: (str) Path to a folder where the best model according to performance on the eval env will be saved. :param deterministic: (bool) Whether the evaluation should use a stochastic or deterministic actions. 
:param deterministic: (bool) Whether to render or not the environment during evaluation + :param render: (bool) Whether to render or not the environment during evaluation :param verbose: (int) """ def __init__(self, eval_env: Union[gym.Env, VecEnv], @@ -236,12 +257,16 @@ def __init__(self, eval_env: Union[gym.Env, VecEnv], self.deterministic = deterministic self.render = render + # Convert to VecEnv for consistency + if not isinstance(eval_env, VecEnv): + eval_env = DummyVecEnv([lambda: eval_env]) + if isinstance(eval_env, VecEnv): assert eval_env.num_envs == 1, "You must pass only one environment for evaluation" self.eval_env = eval_env self.best_model_save_path = best_model_save_path - # Logs will be written in `evaluations.npz` + # Logs will be written in ``evaluations.npz`` if log_path is not None: log_path = os.path.join(log_path, 'evaluations') self.log_path = log_path @@ -250,9 +275,10 @@ def __init__(self, eval_env: Union[gym.Env, VecEnv], self.evaluations_length = [] def _init_callback(self): - # Does not work when eval_env is a gym.Env and training_env is a VecEnv - # assert type(self.training_env) is type(self.eval_env), ("training and eval env are not of the same type", - # "{} != {}".format(self.training_env, self.eval_env)) + # Does not work in some corner cases, where the wrapper is not the same + if not type(self.training_env) is type(self.eval_env): + warnings.warn("Training and eval env are not of the same type" + f"{self.training_env} != {self.eval_env}") # Create folders if needed if self.best_model_save_path is not None: @@ -306,7 +332,7 @@ class StopTrainingOnRewardThreshold(BaseCallback): Stop the training once a threshold in episodic reward has been reached (i.e. when the model is good enough). - It must be used with the `EvalCallback`. + It must be used with the ``EvalCallback``. :param reward_threshold: (float) Minimum expected reward per episode to stop training. @@ -317,8 +343,8 @@ def __init__(self, reward_threshold: float, verbose: int = 0): self.reward_threshold = reward_threshold def _on_step(self) -> bool: - assert self.parent is not None, ("`StopTrainingOnMinimumReward` callback must be used " - "with an `EvalCallback`") + assert self.parent is not None, ("``StopTrainingOnMinimumReward`` callback must be used " + "with an ``EvalCallback``") # Convert np.bool to bool, otherwise callback() is False won't work continue_training = bool(self.parent.best_mean_reward < self.reward_threshold) if self.verbose > 0 and not continue_training: @@ -329,7 +355,7 @@ def _on_step(self) -> bool: class EveryNTimesteps(EventCallback): """ - Trigger a callback every `n_steps` timesteps + Trigger a callback every ``n_steps`` timesteps :param n_steps: (int) Number of timesteps between two trigger. 
:param callback: (BaseCallback) Callback that will be called diff --git a/torchy_baselines/common/type_aliases.py b/torchy_baselines/common/type_aliases.py index 60042be0e..53c152c81 100644 --- a/torchy_baselines/common/type_aliases.py +++ b/torchy_baselines/common/type_aliases.py @@ -1,18 +1,21 @@ """ Common aliases for type hint """ -from typing import Union, Dict, Any, NamedTuple, Optional +import typing +from typing import Union, Dict, Any, NamedTuple, Optional, List, Callable import numpy as np import torch as th import gym from torchy_baselines.common.vec_env import VecEnv +from torchy_baselines.common.callbacks import BaseCallback GymEnv = Union[gym.Env, VecEnv] TensorDict = Dict[str, th.Tensor] OptimizerStateDict = Dict[str, Any] +MaybeCallback = Union[None, Callable, List[BaseCallback], BaseCallback] class RolloutBufferSamples(NamedTuple): diff --git a/torchy_baselines/common/vec_env/__init__.py b/torchy_baselines/common/vec_env/__init__.py index 38099af44..2cbb34992 100644 --- a/torchy_baselines/common/vec_env/__init__.py +++ b/torchy_baselines/common/vec_env/__init__.py @@ -1,4 +1,6 @@ # flake8: noqa F401 +import typing +from typing import Optional from copy import deepcopy from torchy_baselines.common.vec_env.base_vec_env import AlreadySteppingError, NotSteppingError,\ @@ -8,8 +10,12 @@ from torchy_baselines.common.vec_env.vec_frame_stack import VecFrameStack from torchy_baselines.common.vec_env.vec_normalize import VecNormalize +# Avoid circular import +if typing.TYPE_CHECKING: + from torchy_baselines.common.type_aliases import GymEnv -def unwrap_vec_normalize(env): + +def unwrap_vec_normalize(env: 'GymEnv') -> Optional[VecNormalize]: """ :param env: (gym.Env) :return: (VecNormalize) @@ -23,16 +29,17 @@ def unwrap_vec_normalize(env): # Define here to avoid circular import -def sync_envs_normalization(env, eval_env): +def sync_envs_normalization(env: 'GymEnv', eval_env: 'GymEnv') -> None: """ Sync eval env and train env when using VecNormalize - :param env: (gym.Env) - :param eval_env: (gym.Env) + :param env: (GymEnv) + :param eval_env: (GymEnv) """ env_tmp, eval_env_tmp = env, eval_env while isinstance(env_tmp, VecEnvWrapper): if isinstance(env_tmp, VecNormalize): eval_env_tmp.obs_rms = deepcopy(env_tmp.obs_rms) + eval_env_tmp.ret_rms = deepcopy(env_tmp.ret_rms) env_tmp = env_tmp.venv eval_env_tmp = eval_env_tmp.venv diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index 5ac1a9644..7b7cd74ab 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -16,7 +16,7 @@ from torchy_baselines.common import logger from torchy_baselines.common.base_class import BaseRLModel -from torchy_baselines.common.type_aliases import GymEnv +from torchy_baselines.common.type_aliases import GymEnv, MaybeCallback from torchy_baselines.common.buffers import RolloutBuffer from torchy_baselines.common.utils import explained_variance, get_schedule_fn from torchy_baselines.common.vec_env import VecEnv @@ -163,10 +163,6 @@ def collect_rollouts(self, while n_steps < n_rollout_steps: - if callback() is False: - continue_training = False - return None, continue_training - if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.policy.reset_noise(env.num_envs) @@ -182,6 +178,10 @@ def collect_rollouts(self, clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) new_obs, rewards, dones, infos = env.step(clipped_actions) + if callback.on_step() is False: + continue_training = False 
+ return None, continue_training + self._update_info_buffer(infos) n_steps += 1 self.num_timesteps += env.num_envs @@ -286,7 +286,7 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: def learn(self, total_timesteps: int, - callback: Optional[BaseCallback] = None, + callback: MaybeCallback = None, log_interval: int = 1, eval_env: Optional[GymEnv] = None, eval_freq: int = -1, diff --git a/torchy_baselines/sac/sac.py b/torchy_baselines/sac/sac.py index d0309e8b0..9d517b3b2 100644 --- a/torchy_baselines/sac/sac.py +++ b/torchy_baselines/sac/sac.py @@ -7,9 +7,8 @@ from torchy_baselines.common import logger from torchy_baselines.common.base_class import OffPolicyRLModel from torchy_baselines.common.buffers import ReplayBuffer -from torchy_baselines.common.type_aliases import GymEnv +from torchy_baselines.common.type_aliases import GymEnv, MaybeCallback from torchy_baselines.common.noise import ActionNoise -from torchy_baselines.common.callbacks import BaseCallback from torchy_baselines.sac.policies import SACPolicy @@ -253,7 +252,7 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: def learn(self, total_timesteps: int, - callback: Optional[BaseCallback] = None, + callback: MaybeCallback = None, log_interval: int = 4, eval_env: Optional[GymEnv] = None, eval_freq: int = -1, diff --git a/torchy_baselines/td3/td3.py b/torchy_baselines/td3/td3.py index 9de88c106..c2cf27836 100644 --- a/torchy_baselines/td3/td3.py +++ b/torchy_baselines/td3/td3.py @@ -4,9 +4,8 @@ from torchy_baselines.common.base_class import OffPolicyRLModel from torchy_baselines.common.buffers import ReplayBuffer -from torchy_baselines.common.callbacks import BaseCallback from torchy_baselines.common.noise import ActionNoise -from torchy_baselines.common.type_aliases import ReplayBufferSamples, GymEnv +from torchy_baselines.common.type_aliases import ReplayBufferSamples, GymEnv, MaybeCallback from torchy_baselines.td3.policies import TD3Policy @@ -264,7 +263,7 @@ def train_sde(self) -> None: def learn(self, total_timesteps: int, - callback: Optional[BaseCallback] = None, + callback: MaybeCallback = None, log_interval: int = 4, eval_env: Optional[GymEnv] = None, eval_freq: int = -1, From 765d8fc5b268a948e19e38fb0c8884d1b0b051e1 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 12 Mar 2020 13:24:11 +0100 Subject: [PATCH 16/17] Fix event callback --- torchy_baselines/common/callbacks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchy_baselines/common/callbacks.py b/torchy_baselines/common/callbacks.py index 1a8403ac3..b5a015d1f 100644 --- a/torchy_baselines/common/callbacks.py +++ b/torchy_baselines/common/callbacks.py @@ -129,7 +129,7 @@ def _on_training_start(self) -> None: def _on_event(self) -> bool: if self.callback is not None: - return self.callback() + return self.callback.on_step() return True def _on_step(self) -> bool: From 70e601c03cdb0135438d621a19f5f2fcf514de18 Mon Sep 17 00:00:00 2001 From: Antonin Raffin Date: Thu, 12 Mar 2020 15:34:35 +0100 Subject: [PATCH 17/17] Improve code and bump version --- setup.py | 2 +- tests/test_vec_normalize.py | 12 +++++++- torchy_baselines/__init__.py | 2 +- torchy_baselines/a2c/a2c.py | 2 +- torchy_baselines/cem_rl/cem_rl.py | 2 +- torchy_baselines/ppo/policies.py | 4 +-- torchy_baselines/ppo/ppo.py | 6 ++-- torchy_baselines/sac/policies.py | 46 +++++++++++++------------------ torchy_baselines/sac/sac.py | 2 +- torchy_baselines/td3/policies.py | 12 ++++---- torchy_baselines/td3/td3.py | 2 +- 11 files changed, 47 
insertions(+), 45 deletions(-) diff --git a/setup.py b/setup.py index 1fa044cc6..044d839b1 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ license="MIT", long_description="", long_description_content_type='text/markdown', - version="0.2.2", + version="0.2.3", ) # python setup.py sdist diff --git a/tests/test_vec_normalize.py b/tests/test_vec_normalize.py index 75fd3a857..d80462c07 100644 --- a/tests/test_vec_normalize.py +++ b/tests/test_vec_normalize.py @@ -3,7 +3,7 @@ import numpy as np from torchy_baselines.common.running_mean_std import RunningMeanStd -from torchy_baselines.common.vec_env import DummyVecEnv, VecNormalize, VecFrameStack, sync_envs_normalization +from torchy_baselines.common.vec_env import DummyVecEnv, VecNormalize, VecFrameStack, sync_envs_normalization, unwrap_vec_normalize from torchy_baselines import CEMRL, SAC, TD3 ENV_ID = 'Pendulum-v0' @@ -132,9 +132,17 @@ def test_offpolicy_normalization(model_class): def test_sync_vec_normalize(): env = DummyVecEnv([make_env]) + + assert unwrap_vec_normalize(env) is None + env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.) + + assert isinstance(unwrap_vec_normalize(env), VecNormalize) + env = VecFrameStack(env, 1) + assert isinstance(unwrap_vec_normalize(env), VecNormalize) + eval_env = DummyVecEnv([make_env]) eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.) eval_env = VecFrameStack(eval_env, 1) @@ -146,6 +154,7 @@ def test_sync_vec_normalize(): obs = env.reset() original_obs = env.get_original_obs() + dummy_rewards = np.random.rand(10) # Normalization must be different assert not np.allclose(obs, eval_env.normalize_obs(original_obs)) @@ -153,3 +162,4 @@ def test_sync_vec_normalize(): # Now they must be synced assert np.allclose(obs, eval_env.normalize_obs(original_obs)) + assert np.allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards)) diff --git a/torchy_baselines/__init__.py b/torchy_baselines/__init__.py index a548d3037..8c6637646 100644 --- a/torchy_baselines/__init__.py +++ b/torchy_baselines/__init__.py @@ -4,4 +4,4 @@ from torchy_baselines.sac import SAC from torchy_baselines.td3 import TD3 -__version__ = "0.2.2" +__version__ = "0.2.3" diff --git a/torchy_baselines/a2c/a2c.py b/torchy_baselines/a2c/a2c.py index 252f8bf0b..6b2c22339 100644 --- a/torchy_baselines/a2c/a2c.py +++ b/torchy_baselines/a2c/a2c.py @@ -43,7 +43,7 @@ class A2C(PPO): :param create_eval_env: (bool) Whether to create a second environment that will be used for evaluating the agent periodically. (Only available when passing string for the environment) :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug + :param verbose: (int) the verbosity level: 0 no output, 1 info, 2 debug :param seed: (int) Seed for the pseudo random generators :param device: (str or th.device) Device (cpu, cuda, ...) on which the code should be run. Setting it to auto, the code will be run on the GPU if possible. diff --git a/torchy_baselines/cem_rl/cem_rl.py b/torchy_baselines/cem_rl/cem_rl.py index 0069aef0c..585db55ab 100644 --- a/torchy_baselines/cem_rl/cem_rl.py +++ b/torchy_baselines/cem_rl/cem_rl.py @@ -50,7 +50,7 @@ class CEMRL(TD3): :param create_eval_env: (bool) Whether to create a second environment that will be used for evaluating the agent periodically. 
(Only available when passing string for the environment) :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug + :param verbose: (int) the verbosity level: 0 no output, 1 info, 2 debug :param seed: (int) Seed for the pseudo random generators :param device: (str or th.device) Device (cpu, cuda, ...) on which the code should be run. Setting it to auto, the code will be run on the GPU if possible. diff --git a/torchy_baselines/ppo/policies.py b/torchy_baselines/ppo/policies.py index b98977683..2d4904a61 100644 --- a/torchy_baselines/ppo/policies.py +++ b/torchy_baselines/ppo/policies.py @@ -32,9 +32,9 @@ class PPOPolicy(BasePolicy): :param sde_net_arch: ([int]) Network architecture for extracting features when using SDE. If None, the latent features from the policy will be used. Pass an empty list to use the states as features. - :param use_expln: (bool) Use `expln()` function instead of `exp()` to ensure + :param use_expln: (bool) Use ``expln()`` function instead of ``exp()`` to ensure a positive standard deviation (cf paper). It allows to keep variance - above zero and prevent it from growing too fast. In practice, `exp()` is usually enough. + above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough. :param squash_output: (bool) Whether to squash the output using a tanh function, this allows to ensure boundaries when using SDE. """ diff --git a/torchy_baselines/ppo/ppo.py b/torchy_baselines/ppo/ppo.py index 7b7cd74ab..2ee6a7538 100644 --- a/torchy_baselines/ppo/ppo.py +++ b/torchy_baselines/ppo/ppo.py @@ -67,7 +67,7 @@ class PPO(BaseRLModel): :param create_eval_env: (bool) Whether to create a second environment that will be used for evaluating the agent periodically. (Only available when passing string for the environment) :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug + :param verbose: (int) the verbosity level: 0 no output, 1 info, 2 debug :param seed: (int) Seed for the pseudo random generators :param device: (str or th.device) Device (cpu, cuda, ...) on which the code should be run. Setting it to auto, the code will be run on the GPU if possible. @@ -226,8 +226,8 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None: values, log_prob, entropy = self.policy.evaluate_actions(rollout_data.observations, actions) values = values.flatten() # Normalize advantage - advantages = (rollout_data.advantages - rollout_data.advantages.mean()) / ( - rollout_data.advantages.std() + 1e-8) + advantages = rollout_data.advantages + advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) # ratio between old and new policy, should be one at the first iteration ratio = th.exp(log_prob - rollout_data.old_log_prob) diff --git a/torchy_baselines/sac/policies.py b/torchy_baselines/sac/policies.py index 1a1dc33bb..00d7457bd 100644 --- a/torchy_baselines/sac/policies.py +++ b/torchy_baselines/sac/policies.py @@ -28,9 +28,9 @@ class Actor(BaseNetwork): :param sde_net_arch: ([int]) Network architecture for extracting features when using SDE. If None, the latent features from the policy will be used. Pass an empty list to use the states as features. 
- :param use_expln: (bool) Use `expln()` function instead of `exp()` when using SDE to ensure + :param use_expln: (bool) Use ``expln()`` function instead of ``exp()`` when using SDE to ensure a positive standard deviation (cf paper). It allows to keep variance - above zero and prevent it from growing too fast. In practice, `exp()` is usually enough. + above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough. :param clip_mean: (float) Clip the mean output when using SDE to avoid numerical instability. """ def __init__(self, obs_dim: int, @@ -75,8 +75,8 @@ def get_std(self) -> th.Tensor: """ Retrieve the standard deviation of the action distribution. Only useful when using SDE. - It corresponds to `th.exp(log_std)` in the normal case, - but is slightly different when using `expln` function + It corresponds to ``th.exp(log_std)`` in the normal case, + but is slightly different when using ``expln`` function (cf StateDependentNoiseDistribution doc). :return: (th.Tensor) @@ -96,43 +96,35 @@ def reset_noise(self, batch_size: int = 1) -> None: def _get_latent(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: latent_pi = self.latent_pi(obs) - if self.sde_feature_extractor is not None: - latent_sde = self.sde_feature_extractor(obs) - else: - latent_sde = latent_pi + latent_sde = self.sde_feature_extractor(obs) if self.sde_feature_extractor is not None else latent_pi + return latent_pi, latent_sde def get_action_dist_params(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]: latent_pi, latent_sde = self._get_latent(obs) + mean_actions = self.mu(latent_pi) if self.use_sde: - mean_actions, log_std = self.mu(latent_pi), self.log_std + log_std = self.log_std else: - mean_actions, log_std = self.mu(latent_pi), self.log_std(latent_pi) + log_std = self.log_std(latent_pi) # Original Implementation to cap the standard deviation log_std = th.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX) return mean_actions, log_std, latent_sde def forward(self, obs: th.Tensor, deterministic: bool = False) -> th.Tensor: mean_actions, log_std, latent_sde = self.get_action_dist_params(obs) - if self.use_sde: - # Note: the action is squashed - action, _ = self.action_dist.proba_distribution(mean_actions, log_std, latent_sde, - deterministic=deterministic) - else: - # Note: the action is squashed - action, _ = self.action_dist.proba_distribution(mean_actions, log_std, - deterministic=deterministic) + kwargs = dict(latent_sde=latent_sde) if self.use_sde else {} + # Note: the action is squashed + action, _ = self.action_dist.proba_distribution(mean_actions, log_std, + deterministic=deterministic, **kwargs) return action def action_log_prob(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: mean_actions, log_std, latent_sde = self.get_action_dist_params(obs) - - if self.use_sde: - action, log_prob = self.action_dist.log_prob_from_params(mean_actions, self.log_std, latent_sde) - else: - action, log_prob = self.action_dist.log_prob_from_params(mean_actions, log_std) - return action, log_prob + kwargs = dict(latent_sde=latent_sde) if self.use_sde else {} + # return action and associated log prob + return self.action_dist.log_prob_from_params(mean_actions, log_std, **kwargs) class Critic(BaseNetwork): @@ -178,9 +170,9 @@ class SACPolicy(BasePolicy): :param sde_net_arch: ([int]) Network architecture for extracting features when using SDE. If None, the latent features from the policy will be used. Pass an empty list to use the states as features. 
- :param use_expln: (bool) Use `expln()` function instead of `exp()` when using SDE to ensure + :param use_expln: (bool) Use ``expln()`` function instead of ``exp()`` when using SDE to ensure a positive standard deviation (cf paper). It allows to keep variance - above zero and prevent it from growing too fast. In practice, `exp()` is usually enough. + above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough. :param clip_mean: (float) Clip the mean output when using SDE to avoid numerical instability. """ def __init__(self, observation_space: gym.spaces.Space, @@ -239,7 +231,7 @@ def make_critic(self) -> Critic: return Critic(**self.net_args).to(self.device) def forward(self, obs: th.Tensor) -> th.Tensor: - return self.actor(obs) + return self.predict(obs, deterministic=False) def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: return self.actor.forward(observation, deterministic) diff --git a/torchy_baselines/sac/sac.py b/torchy_baselines/sac/sac.py index 9d517b3b2..1696bad2d 100644 --- a/torchy_baselines/sac/sac.py +++ b/torchy_baselines/sac/sac.py @@ -56,7 +56,7 @@ class SAC(OffPolicyRLModel): :param create_eval_env: (bool) Whether to create a second environment that will be used for evaluating the agent periodically. (Only available when passing string for the environment) :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug + :param verbose: (int) the verbosity level: 0 no output, 1 info, 2 debug :param seed: (int) Seed for the pseudo random generators :param device: (str or th.device) Device (cpu, cuda, ...) on which the code should be run. Setting it to auto, the code will be run on the GPU if possible. diff --git a/torchy_baselines/td3/policies.py b/torchy_baselines/td3/policies.py index 3020d268d..fc86c7790 100644 --- a/torchy_baselines/td3/policies.py +++ b/torchy_baselines/td3/policies.py @@ -25,9 +25,9 @@ class Actor(BaseNetwork): :param sde_net_arch: ([int]) Network architecture for extracting features when using SDE. If None, the latent features from the policy will be used. Pass an empty list to use the states as features. - :param use_expln: (bool) Use `expln()` function instead of `exp()` when using SDE to ensure + :param use_expln: (bool) Use ``expln()`` function instead of ``exp()`` when using SDE to ensure a positive standard deviation (cf paper). It allows to keep variance - above zero and prevent it from growing too fast. In practice, `exp()` is usually enough. + above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough. """ def __init__(self, obs_dim: int, @@ -80,8 +80,8 @@ def get_std(self) -> torch.Tensor: """ Retrieve the standard deviation of the action distribution. Only useful when using SDE. - It corresponds to `th.exp(log_std)` in the normal case, - but is slightly different when using `expln` function + It corresponds to ``th.exp(log_std)`` in the normal case, + but is slightly different when using ``expln`` function (cf StateDependentNoiseDistribution doc). :return: (th.Tensor) @@ -206,9 +206,9 @@ class TD3Policy(BasePolicy): :param sde_net_arch: ([int]) Network architecture for extracting features when using SDE. If None, the latent features from the policy will be used. Pass an empty list to use the states as features. 
- :param use_expln: (bool) Use `expln()` function instead of `exp()` when using SDE to ensure + :param use_expln: (bool) Use ``expln()`` function instead of ``exp()`` when using SDE to ensure a positive standard deviation (cf paper). It allows to keep variance - above zero and prevent it from growing too fast. In practice, `exp()` is usually enough. + above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough. """ def __init__(self, observation_space, action_space, learning_rate, net_arch=None, device='cpu', diff --git a/torchy_baselines/td3/td3.py b/torchy_baselines/td3/td3.py index c2cf27836..b2a7dbe23 100644 --- a/torchy_baselines/td3/td3.py +++ b/torchy_baselines/td3/td3.py @@ -51,7 +51,7 @@ class TD3(OffPolicyRLModel): :param create_eval_env: (bool) Whether to create a second environment that will be used for evaluating the agent periodically. (Only available when passing string for the environment) :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug + :param verbose: (int) the verbosity level: 0 no output, 1 info, 2 debug :param seed: (int) Seed for the pseudo random generators :param device: (str or th.device) Device (cpu, cuda, ...) on which the code should be run. Setting it to auto, the code will be run on the GPU if possible.
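
Editor's note: the patches above rename the callback entry point from ``__call__`` to ``on_step()``, introduce the ``MaybeCallback`` type alias, and extend ``EvalCallback`` (automatic ``DummyVecEnv`` wrapping, ``render`` parameter, child callbacks via ``callback_on_new_best``). The following is a minimal usage sketch of that API, written against the class and method names visible in these diffs; the environment id, reward threshold, and timestep counts are arbitrary illustration values, and the snippet is an outline of intended usage rather than part of any commit.

    import gym

    from torchy_baselines import SAC
    from torchy_baselines.common.callbacks import (BaseCallback, EvalCallback,
                                                   StopTrainingOnRewardThreshold)


    class ProgressCallback(BaseCallback):
        """Custom callback: the model calls ``on_step()`` after every ``env.step()``,
        which in turn dispatches to ``_on_step()``."""

        def _on_step(self) -> bool:
            if self.verbose > 0 and self.n_calls % 1000 == 0:
                print(f"{self.num_timesteps} timesteps so far")
            # Returning False would stop training early
            return True


    # EvalCallback wraps a plain gym.Env into a DummyVecEnv internally (see the diff above).
    eval_env = gym.make('Pendulum-v0')
    # Child callback triggered by its parent EvalCallback when a new best model is found.
    stop_callback = StopTrainingOnRewardThreshold(reward_threshold=-200, verbose=1)
    eval_callback = EvalCallback(eval_env, callback_on_new_best=stop_callback,
                                 eval_freq=500, deterministic=True, render=False)

    model = SAC('MlpPolicy', 'Pendulum-v0', verbose=1)
    # ``callback`` is a MaybeCallback: None, a callable, a BaseCallback, or a list of callbacks
    # (a list is chained through CallbackList).
    model.learn(total_timesteps=5000, callback=[ProgressCallback(verbose=1), eval_callback])
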