Improve code and bump version
araffin committed Mar 12, 2020
1 parent 765d8fc commit 70e601c
Showing 11 changed files with 47 additions and 45 deletions.
2 changes: 1 addition & 1 deletion setup.py
@@ -47,7 +47,7 @@
license="MIT",
long_description="",
long_description_content_type='text/markdown',
version="0.2.2",
version="0.2.3",
)

# python setup.py sdist
12 changes: 11 additions & 1 deletion tests/test_vec_normalize.py
@@ -3,7 +3,7 @@
import numpy as np

from torchy_baselines.common.running_mean_std import RunningMeanStd
from torchy_baselines.common.vec_env import DummyVecEnv, VecNormalize, VecFrameStack, sync_envs_normalization
from torchy_baselines.common.vec_env import DummyVecEnv, VecNormalize, VecFrameStack, sync_envs_normalization, unwrap_vec_normalize
from torchy_baselines import CEMRL, SAC, TD3

ENV_ID = 'Pendulum-v0'
@@ -132,9 +132,17 @@ def test_offpolicy_normalization(model_class):

def test_sync_vec_normalize():
env = DummyVecEnv([make_env])

assert unwrap_vec_normalize(env) is None

env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)

assert isinstance(unwrap_vec_normalize(env), VecNormalize)

env = VecFrameStack(env, 1)

assert isinstance(unwrap_vec_normalize(env), VecNormalize)

eval_env = DummyVecEnv([make_env])
eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)
eval_env = VecFrameStack(eval_env, 1)
@@ -146,10 +154,12 @@ def test_sync_vec_normalize():

obs = env.reset()
original_obs = env.get_original_obs()
dummy_rewards = np.random.rand(10)
# Normalization must be different
assert not np.allclose(obs, eval_env.normalize_obs(original_obs))

sync_envs_normalization(env, eval_env)

# Now they must be synced
assert np.allclose(obs, eval_env.normalize_obs(original_obs))
assert np.allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards))
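
The new assertions above exercise the `unwrap_vec_normalize` helper imported in this commit. Roughly, it walks down the chain of vectorized wrappers until it finds a `VecNormalize` instance (or returns `None`), while `sync_envs_normalization` does a similar walk to copy the running statistics from the training env onto the evaluation env. A minimal sketch, assuming each wrapper exposes the wrapped env as `.venv` and that the statistics live in `obs_rms`/`ret_rms` (these attribute names are assumptions, not shown in this diff):

```python
from copy import deepcopy
from typing import Optional

from torchy_baselines.common.vec_env import VecNormalize


def unwrap_vec_normalize_sketch(env) -> Optional[VecNormalize]:
    """Walk down the wrapper chain and return the VecNormalize layer, if any (sketch)."""
    env_tmp = env
    while hasattr(env_tmp, 'venv'):
        if isinstance(env_tmp, VecNormalize):
            return env_tmp
        env_tmp = env_tmp.venv
    return None


def sync_envs_normalization_sketch(env, eval_env) -> None:
    """Copy running statistics from the training env onto the eval env (sketch).

    Assumes both envs are wrapped in the same order, e.g. VecFrameStack(VecNormalize(...)).
    """
    env_tmp, eval_env_tmp = env, eval_env
    while hasattr(env_tmp, 'venv'):
        if isinstance(env_tmp, VecNormalize):
            eval_env_tmp.obs_rms = deepcopy(env_tmp.obs_rms)
            eval_env_tmp.ret_rms = deepcopy(env_tmp.ret_rms)
        env_tmp, eval_env_tmp = env_tmp.venv, eval_env_tmp.venv
```
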
2 changes: 1 addition & 1 deletion torchy_baselines/__init__.py
@@ -4,4 +4,4 @@
from torchy_baselines.sac import SAC
from torchy_baselines.td3 import TD3

__version__ = "0.2.2"
__version__ = "0.2.3"
2 changes: 1 addition & 1 deletion torchy_baselines/a2c/a2c.py
@@ -43,7 +43,7 @@ class A2C(PPO):
:param create_eval_env: (bool) Whether to create a second environment that will be
used for evaluating the agent periodically. (Only available when passing string for the environment)
:param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
:param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
:param verbose: (int) the verbosity level: 0 no output, 1 info, 2 debug
:param seed: (int) Seed for the pseudo random generators
:param device: (str or th.device) Device (cpu, cuda, ...) on which the code should be run.
Setting it to auto, the code will be run on the GPU if possible.
2 changes: 1 addition & 1 deletion torchy_baselines/cem_rl/cem_rl.py
@@ -50,7 +50,7 @@ class CEMRL(TD3):
:param create_eval_env: (bool) Whether to create a second environment that will be
used for evaluating the agent periodically. (Only available when passing string for the environment)
:param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
:param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
:param verbose: (int) the verbosity level: 0 no output, 1 info, 2 debug
:param seed: (int) Seed for the pseudo random generators
:param device: (str or th.device) Device (cpu, cuda, ...) on which the code should be run.
Setting it to auto, the code will be run on the GPU if possible.
4 changes: 2 additions & 2 deletions torchy_baselines/ppo/policies.py
@@ -32,9 +32,9 @@ class PPOPolicy(BasePolicy):
:param sde_net_arch: ([int]) Network architecture for extracting features
when using SDE. If None, the latent features from the policy will be used.
Pass an empty list to use the states as features.
:param use_expln: (bool) Use `expln()` function instead of `exp()` to ensure
:param use_expln: (bool) Use ``expln()`` function instead of ``exp()`` to ensure
a positive standard deviation (cf paper). It allows to keep variance
above zero and prevent it from growing too fast. In practice, `exp()` is usually enough.
above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough.
:param squash_output: (bool) Whether to squash the output using a tanh function,
this allows to ensure boundaries when using SDE.
"""
6 changes: 3 additions & 3 deletions torchy_baselines/ppo/ppo.py
@@ -67,7 +67,7 @@ class PPO(BaseRLModel):
:param create_eval_env: (bool) Whether to create a second environment that will be
used for evaluating the agent periodically. (Only available when passing string for the environment)
:param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
:param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
:param verbose: (int) the verbosity level: 0 no output, 1 info, 2 debug
:param seed: (int) Seed for the pseudo random generators
:param device: (str or th.device) Device (cpu, cuda, ...) on which the code should be run.
Setting it to auto, the code will be run on the GPU if possible.
@@ -226,8 +226,8 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None:
values, log_prob, entropy = self.policy.evaluate_actions(rollout_data.observations, actions)
values = values.flatten()
# Normalize advantage
advantages = (rollout_data.advantages - rollout_data.advantages.mean()) / (
rollout_data.advantages.std() + 1e-8)
advantages = rollout_data.advantages
advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

# ratio between old and new policy, should be one at the first iteration
ratio = th.exp(log_prob - rollout_data.old_log_prob)
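
As a quick sanity check on the rewritten normalization (illustration only, not library code): the ``1e-8`` term only matters when every advantage in the batch is identical, which would otherwise divide by zero.

```python
import torch as th

advantages = th.tensor([1.0, 2.0, 3.0, 4.0])
normalized = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
print(normalized)  # approximately tensor([-1.1619, -0.3873,  0.3873,  1.1619])
```
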
46 changes: 19 additions & 27 deletions torchy_baselines/sac/policies.py
@@ -28,9 +28,9 @@ class Actor(BaseNetwork):
:param sde_net_arch: ([int]) Network architecture for extracting features
when using SDE. If None, the latent features from the policy will be used.
Pass an empty list to use the states as features.
:param use_expln: (bool) Use `expln()` function instead of `exp()` when using SDE to ensure
:param use_expln: (bool) Use ``expln()`` function instead of ``exp()`` when using SDE to ensure
a positive standard deviation (cf paper). It allows to keep variance
above zero and prevent it from growing too fast. In practice, `exp()` is usually enough.
above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough.
:param clip_mean: (float) Clip the mean output when using SDE to avoid numerical instability.
"""
def __init__(self, obs_dim: int,
@@ -75,8 +75,8 @@ def get_std(self) -> th.Tensor:
"""
Retrieve the standard deviation of the action distribution.
Only useful when using SDE.
It corresponds to `th.exp(log_std)` in the normal case,
but is slightly different when using `expln` function
It corresponds to ``th.exp(log_std)`` in the normal case,
but is slightly different when using ``expln`` function
(cf StateDependentNoiseDistribution doc).
:return: (th.Tensor)
@@ -96,43 +96,35 @@ def reset_noise(self, batch_size: int = 1) -> None:
def _get_latent(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
latent_pi = self.latent_pi(obs)

if self.sde_feature_extractor is not None:
latent_sde = self.sde_feature_extractor(obs)
else:
latent_sde = latent_pi
latent_sde = self.sde_feature_extractor(obs) if self.sde_feature_extractor is not None else latent_pi

return latent_pi, latent_sde

def get_action_dist_params(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
latent_pi, latent_sde = self._get_latent(obs)
mean_actions = self.mu(latent_pi)

if self.use_sde:
mean_actions, log_std = self.mu(latent_pi), self.log_std
log_std = self.log_std
else:
mean_actions, log_std = self.mu(latent_pi), self.log_std(latent_pi)
log_std = self.log_std(latent_pi)
# Original Implementation to cap the standard deviation
log_std = th.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX)
return mean_actions, log_std, latent_sde

def forward(self, obs: th.Tensor, deterministic: bool = False) -> th.Tensor:
mean_actions, log_std, latent_sde = self.get_action_dist_params(obs)
if self.use_sde:
# Note: the action is squashed
action, _ = self.action_dist.proba_distribution(mean_actions, log_std, latent_sde,
deterministic=deterministic)
else:
# Note: the action is squashed
action, _ = self.action_dist.proba_distribution(mean_actions, log_std,
deterministic=deterministic)
kwargs = dict(latent_sde=latent_sde) if self.use_sde else {}
# Note: the action is squashed
action, _ = self.action_dist.proba_distribution(mean_actions, log_std,
deterministic=deterministic, **kwargs)
return action

def action_log_prob(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
mean_actions, log_std, latent_sde = self.get_action_dist_params(obs)

if self.use_sde:
action, log_prob = self.action_dist.log_prob_from_params(mean_actions, self.log_std, latent_sde)
else:
action, log_prob = self.action_dist.log_prob_from_params(mean_actions, log_std)
return action, log_prob
kwargs = dict(latent_sde=latent_sde) if self.use_sde else {}
# return action and associated log prob
return self.action_dist.log_prob_from_params(mean_actions, log_std, **kwargs)


class Critic(BaseNetwork):
@@ -178,9 +170,9 @@ class SACPolicy(BasePolicy):
:param sde_net_arch: ([int]) Network architecture for extracting features
when using SDE. If None, the latent features from the policy will be used.
Pass an empty list to use the states as features.
:param use_expln: (bool) Use `expln()` function instead of `exp()` when using SDE to ensure
:param use_expln: (bool) Use ``expln()`` function instead of ``exp()`` when using SDE to ensure
a positive standard deviation (cf paper). It allows to keep variance
above zero and prevent it from growing too fast. In practice, `exp()` is usually enough.
above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough.
:param clip_mean: (float) Clip the mean output when using SDE to avoid numerical instability.
"""
def __init__(self, observation_space: gym.spaces.Space,
@@ -239,7 +231,7 @@ def make_critic(self) -> Critic:
return Critic(**self.net_args).to(self.device)

def forward(self, obs: th.Tensor) -> th.Tensor:
return self.actor(obs)
return self.predict(obs, deterministic=False)

def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
return self.actor.forward(observation, deterministic)
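
The Actor refactor above routes the optional ``latent_sde`` argument through ``**kwargs`` so the plain squashed-Gaussian and gSDE branches share a single call site. For readers unfamiliar with what ``log_prob_from_params`` computes in the plain Gaussian case, here is a self-contained sketch in raw PyTorch (the ``-20``/``2`` bounds mirror the usual ``LOG_STD_MIN``/``LOG_STD_MAX`` values and are an assumption here, as is the ``1e-6`` tanh-correction constant):

```python
import torch as th

LOG_STD_MIN, LOG_STD_MAX = -20.0, 2.0


def squashed_gaussian_sample(mean_actions: th.Tensor, log_std: th.Tensor, eps: float = 1e-6):
    """Sample a tanh-squashed action and its log-probability (sketch, not library code)."""
    log_std = th.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX)
    std = th.exp(log_std)
    gaussian_actions = mean_actions + std * th.randn_like(mean_actions)
    actions = th.tanh(gaussian_actions)
    log_prob = th.distributions.Normal(mean_actions, std).log_prob(gaussian_actions)
    # Change-of-variables correction for the tanh squashing
    log_prob -= th.log(1.0 - actions ** 2 + eps)
    return actions, log_prob.sum(dim=-1)


actions, log_prob = squashed_gaussian_sample(th.zeros(1, 2), th.zeros(1, 2))
```
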
2 changes: 1 addition & 1 deletion torchy_baselines/sac/sac.py
@@ -56,7 +56,7 @@ class SAC(OffPolicyRLModel):
:param create_eval_env: (bool) Whether to create a second environment that will be
used for evaluating the agent periodically. (Only available when passing string for the environment)
:param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
:param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
:param verbose: (int) the verbosity level: 0 no output, 1 info, 2 debug
:param seed: (int) Seed for the pseudo random generators
:param device: (str or th.device) Device (cpu, cuda, ...) on which the code should be run.
Setting it to auto, the code will be run on the GPU if possible.
12 changes: 6 additions & 6 deletions torchy_baselines/td3/policies.py
@@ -25,9 +25,9 @@ class Actor(BaseNetwork):
:param sde_net_arch: ([int]) Network architecture for extracting features
when using SDE. If None, the latent features from the policy will be used.
Pass an empty list to use the states as features.
:param use_expln: (bool) Use `expln()` function instead of `exp()` when using SDE to ensure
:param use_expln: (bool) Use ``expln()`` function instead of ``exp()`` when using SDE to ensure
a positive standard deviation (cf paper). It allows to keep variance
above zero and prevent it from growing too fast. In practice, `exp()` is usually enough.
above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough.
"""
def __init__(self,
obs_dim: int,
@@ -80,8 +80,8 @@ def get_std(self) -> torch.Tensor:
"""
Retrieve the standard deviation of the action distribution.
Only useful when using SDE.
It corresponds to `th.exp(log_std)` in the normal case,
but is slightly different when using `expln` function
It corresponds to ``th.exp(log_std)`` in the normal case,
but is slightly different when using ``expln`` function
(cf StateDependentNoiseDistribution doc).
:return: (th.Tensor)
@@ -206,9 +206,9 @@ class TD3Policy(BasePolicy):
:param sde_net_arch: ([int]) Network architecture for extracting features
when using SDE. If None, the latent features from the policy will be used.
Pass an empty list to use the states as features.
:param use_expln: (bool) Use `expln()` function instead of `exp()` when using SDE to ensure
:param use_expln: (bool) Use ``expln()`` function instead of ``exp()`` when using SDE to ensure
a positive standard deviation (cf paper). It allows to keep variance
above zero and prevent it from growing too fast. In practice, `exp()` is usually enough.
above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough.
"""
def __init__(self, observation_space, action_space,
learning_rate, net_arch=None, device='cpu',
2 changes: 1 addition & 1 deletion torchy_baselines/td3/td3.py
@@ -51,7 +51,7 @@ class TD3(OffPolicyRLModel):
:param create_eval_env: (bool) Whether to create a second environment that will be
used for evaluating the agent periodically. (Only available when passing string for the environment)
:param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
:param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
:param verbose: (int) the verbosity level: 0 no output, 1 info, 2 debug
:param seed: (int) Seed for the pseudo random generators
:param device: (str or th.device) Device (cpu, cuda, ...) on which the code should be run.
Setting it to auto, the code will be run on the GPU if possible.