Improve code and bump version
araffin committed Mar 12, 2020
1 parent 765d8fc commit 70e601c
Showing 11 changed files with 47 additions and 45 deletions.
2 changes: 1 addition & 1 deletion setup.py
@@ -47,7 +47,7 @@
license="MIT",
long_description="",
long_description_content_type='text/markdown',
version="0.2.2",
version="0.2.3",
)

# python setup.py sdist
12 changes: 11 additions & 1 deletion tests/test_vec_normalize.py
@@ -3,7 +3,7 @@
import numpy as np

from torchy_baselines.common.running_mean_std import RunningMeanStd
from torchy_baselines.common.vec_env import DummyVecEnv, VecNormalize, VecFrameStack, sync_envs_normalization
from torchy_baselines.common.vec_env import DummyVecEnv, VecNormalize, VecFrameStack, sync_envs_normalization, unwrap_vec_normalize
from torchy_baselines import CEMRL, SAC, TD3

ENV_ID = 'Pendulum-v0'
@@ -132,9 +132,17 @@ def test_offpolicy_normalization(model_class):

def test_sync_vec_normalize():
env = DummyVecEnv([make_env])

assert unwrap_vec_normalize(env) is None

env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)

assert isinstance(unwrap_vec_normalize(env), VecNormalize)

env = VecFrameStack(env, 1)

assert isinstance(unwrap_vec_normalize(env), VecNormalize)

eval_env = DummyVecEnv([make_env])
eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)
eval_env = VecFrameStack(eval_env, 1)
@@ -146,10 +154,12 @@ def test_sync_vec_normalize():

obs = env.reset()
original_obs = env.get_original_obs()
dummy_rewards = np.random.rand(10)
# Normalization must be different
assert not np.allclose(obs, eval_env.normalize_obs(original_obs))

sync_envs_normalization(env, eval_env)

# Now they must be synced
assert np.allclose(obs, eval_env.normalize_obs(original_obs))
assert np.allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards))
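
The new assertions above exercise the `unwrap_vec_normalize` helper imported in this commit. Roughly, it walks down the chain of vectorized wrappers until it finds a `VecNormalize` instance (or returns `None`), while `sync_envs_normalization` does a similar walk to copy the running statistics from the training env onto the evaluation env. A minimal sketch, assuming each wrapper exposes the wrapped env as `.venv` and that the statistics live in `obs_rms`/`ret_rms` (these attribute names are assumptions, not shown in this diff):

```python
from copy import deepcopy
from typing import Optional

from torchy_baselines.common.vec_env import VecNormalize


def unwrap_vec_normalize_sketch(env) -> Optional[VecNormalize]:
    """Walk down the wrapper chain and return the VecNormalize layer, if any (sketch)."""
    env_tmp = env
    while hasattr(env_tmp, 'venv'):
        if isinstance(env_tmp, VecNormalize):
            return env_tmp
        env_tmp = env_tmp.venv
    return None


def sync_envs_normalization_sketch(env, eval_env) -> None:
    """Copy running statistics from the training env onto the eval env (sketch).

    Assumes both envs are wrapped in the same order, e.g. VecFrameStack(VecNormalize(...)).
    """
    env_tmp, eval_env_tmp = env, eval_env
    while hasattr(env_tmp, 'venv'):
        if isinstance(env_tmp, VecNormalize):
            eval_env_tmp.obs_rms = deepcopy(env_tmp.obs_rms)
            eval_env_tmp.ret_rms = deepcopy(env_tmp.ret_rms)
        env_tmp, eval_env_tmp = env_tmp.venv, eval_env_tmp.venv
```
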
2 changes: 1 addition & 1 deletion torchy_baselines/__init__.py
@@ -4,4 +4,4 @@
from torchy_baselines.sac import SAC
from torchy_baselines.td3 import TD3

__version__ = "0.2.2"
__version__ = "0.2.3"
2 changes: 1 addition & 1 deletion torchy_baselines/a2c/a2c.py
@@ -43,7 +43,7 @@ class A2C(PPO):
:param create_eval_env: (bool) Whether to create a second environment that will be
used for evaluating the agent periodically. (Only available when passing string for the environment)
:param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
:param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
:param verbose: (int) the verbosity level: 0 no output, 1 info, 2 debug
:param seed: (int) Seed for the pseudo random generators
:param device: (str or th.device) Device (cpu, cuda, ...) on which the code should be run.
Setting it to auto, the code will be run on the GPU if possible.
2 changes: 1 addition & 1 deletion torchy_baselines/cem_rl/cem_rl.py
@@ -50,7 +50,7 @@ class CEMRL(TD3):
:param create_eval_env: (bool) Whether to create a second environment that will be
used for evaluating the agent periodically. (Only available when passing string for the environment)
:param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
:param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
:param verbose: (int) the verbosity level: 0 no output, 1 info, 2 debug
:param seed: (int) Seed for the pseudo random generators
:param device: (str or th.device) Device (cpu, cuda, ...) on which the code should be run.
Setting it to auto, the code will be run on the GPU if possible.
4 changes: 2 additions & 2 deletions torchy_baselines/ppo/policies.py
@@ -32,9 +32,9 @@ class PPOPolicy(BasePolicy):
:param sde_net_arch: ([int]) Network architecture for extracting features
when using SDE. If None, the latent features from the policy will be used.
Pass an empty list to use the states as features.
:param use_expln: (bool) Use `expln()` function instead of `exp()` to ensure
:param use_expln: (bool) Use ``expln()`` function instead of ``exp()`` to ensure
a positive standard deviation (cf paper). It allows to keep variance
above zero and prevent it from growing too fast. In practice, `exp()` is usually enough.
above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough.
:param squash_output: (bool) Whether to squash the output using a tanh function,
this allows to ensure boundaries when using SDE.
"""
6 changes: 3 additions & 3 deletions torchy_baselines/ppo/ppo.py
@@ -67,7 +67,7 @@ class PPO(BaseRLModel):
:param create_eval_env: (bool) Whether to create a second environment that will be
used for evaluating the agent periodically. (Only available when passing string for the environment)
:param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
:param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
:param verbose: (int) the verbosity level: 0 no output, 1 info, 2 debug
:param seed: (int) Seed for the pseudo random generators
:param device: (str or th.device) Device (cpu, cuda, ...) on which the code should be run.
Setting it to auto, the code will be run on the GPU if possible.
@@ -226,8 +226,8 @@ def train(self, gradient_steps: int, batch_size: int = 64) -> None:
values, log_prob, entropy = self.policy.evaluate_actions(rollout_data.observations, actions)
values = values.flatten()
# Normalize advantage
advantages = (rollout_data.advantages - rollout_data.advantages.mean()) / (
rollout_data.advantages.std() + 1e-8)
advantages = rollout_data.advantages
advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

# ratio between old and new policy, should be one at the first iteration
ratio = th.exp(log_prob - rollout_data.old_log_prob)
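
As a quick sanity check on the rewritten normalization (illustration only, not library code): the ``1e-8`` term only matters when every advantage in the batch is identical, which would otherwise divide by zero.

```python
import torch as th

advantages = th.tensor([1.0, 2.0, 3.0, 4.0])
normalized = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
print(normalized)  # approximately tensor([-1.1619, -0.3873,  0.3873,  1.1619])
```
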
46 changes: 19 additions & 27 deletions torchy_baselines/sac/policies.py
@@ -28,9 +28,9 @@ class Actor(BaseNetwork):
:param sde_net_arch: ([int]) Network architecture for extracting features
when using SDE. If None, the latent features from the policy will be used.
Pass an empty list to use the states as features.
:param use_expln: (bool) Use `expln()` function instead of `exp()` when using SDE to ensure
:param use_expln: (bool) Use ``expln()`` function instead of ``exp()`` when using SDE to ensure
a positive standard deviation (cf paper). It allows to keep variance
above zero and prevent it from growing too fast. In practice, `exp()` is usually enough.
above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough.
:param clip_mean: (float) Clip the mean output when using SDE to avoid numerical instability.
"""
def __init__(self, obs_dim: int,
@@ -75,8 +75,8 @@ def get_std(self) -> th.Tensor:
"""
Retrieve the standard deviation of the action distribution.
Only useful when using SDE.
It corresponds to `th.exp(log_std)` in the normal case,
but is slightly different when using `expln` function
It corresponds to ``th.exp(log_std)`` in the normal case,
but is slightly different when using ``expln`` function
(cf StateDependentNoiseDistribution doc).
:return: (th.Tensor)
@@ -96,43 +96,35 @@ def reset_noise(self, batch_size: int = 1) -> None:
def _get_latent(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
latent_pi = self.latent_pi(obs)

if self.sde_feature_extractor is not None:
latent_sde = self.sde_feature_extractor(obs)
else:
latent_sde = latent_pi
latent_sde = self.sde_feature_extractor(obs) if self.sde_feature_extractor is not None else latent_pi

return latent_pi, latent_sde

def get_action_dist_params(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
latent_pi, latent_sde = self._get_latent(obs)
mean_actions = self.mu(latent_pi)

if self.use_sde:
mean_actions, log_std = self.mu(latent_pi), self.log_std
log_std = self.log_std
else:
mean_actions, log_std = self.mu(latent_pi), self.log_std(latent_pi)
log_std = self.log_std(latent_pi)
# Original Implementation to cap the standard deviation
log_std = th.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX)
return mean_actions, log_std, latent_sde

def forward(self, obs: th.Tensor, deterministic: bool = False) -> th.Tensor:
mean_actions, log_std, latent_sde = self.get_action_dist_params(obs)
if self.use_sde:
# Note: the action is squashed
action, _ = self.action_dist.proba_distribution(mean_actions, log_std, latent_sde,
deterministic=deterministic)
else:
# Note: the action is squashed
action, _ = self.action_dist.proba_distribution(mean_actions, log_std,
deterministic=deterministic)
kwargs = dict(latent_sde=latent_sde) if self.use_sde else {}
# Note: the action is squashed
action, _ = self.action_dist.proba_distribution(mean_actions, log_std,
deterministic=deterministic, **kwargs)
return action

def action_log_prob(self, obs: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
mean_actions, log_std, latent_sde = self.get_action_dist_params(obs)

if self.use_sde:
action, log_prob = self.action_dist.log_prob_from_params(mean_actions, self.log_std, latent_sde)
else:
action, log_prob = self.action_dist.log_prob_from_params(mean_actions, log_std)
return action, log_prob
kwargs = dict(latent_sde=latent_sde) if self.use_sde else {}
# return action and associated log prob
return self.action_dist.log_prob_from_params(mean_actions, log_std, **kwargs)


class Critic(BaseNetwork):
@@ -178,9 +170,9 @@ class SACPolicy(BasePolicy):
:param sde_net_arch: ([int]) Network architecture for extracting features
when using SDE. If None, the latent features from the policy will be used.
Pass an empty list to use the states as features.
:param use_expln: (bool) Use `expln()` function instead of `exp()` when using SDE to ensure
:param use_expln: (bool) Use ``expln()`` function instead of ``exp()`` when using SDE to ensure
a positive standard deviation (cf paper). It allows to keep variance
above zero and prevent it from growing too fast. In practice, `exp()` is usually enough.
above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough.
:param clip_mean: (float) Clip the mean output when using SDE to avoid numerical instability.
"""
def __init__(self, observation_space: gym.spaces.Space,
@@ -239,7 +231,7 @@ def make_critic(self) -> Critic:
return Critic(**self.net_args).to(self.device)

def forward(self, obs: th.Tensor) -> th.Tensor:
return self.actor(obs)
return self.predict(obs, deterministic=False)

def predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor:
return self.actor.forward(observation, deterministic)
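
The Actor refactor above routes the optional ``latent_sde`` argument through ``**kwargs`` so the plain squashed-Gaussian and gSDE branches share a single call site. For readers unfamiliar with what ``log_prob_from_params`` computes in the plain Gaussian case, here is a self-contained sketch in raw PyTorch (the ``-20``/``2`` bounds mirror the usual ``LOG_STD_MIN``/``LOG_STD_MAX`` values and are an assumption here, as is the ``1e-6`` tanh-correction constant):

```python
import torch as th

LOG_STD_MIN, LOG_STD_MAX = -20.0, 2.0


def squashed_gaussian_sample(mean_actions: th.Tensor, log_std: th.Tensor, eps: float = 1e-6):
    """Sample a tanh-squashed action and its log-probability (sketch, not library code)."""
    log_std = th.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX)
    std = th.exp(log_std)
    gaussian_actions = mean_actions + std * th.randn_like(mean_actions)
    actions = th.tanh(gaussian_actions)
    log_prob = th.distributions.Normal(mean_actions, std).log_prob(gaussian_actions)
    # Change-of-variables correction for the tanh squashing
    log_prob -= th.log(1.0 - actions ** 2 + eps)
    return actions, log_prob.sum(dim=-1)


actions, log_prob = squashed_gaussian_sample(th.zeros(1, 2), th.zeros(1, 2))
```
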
2 changes: 1 addition & 1 deletion torchy_baselines/sac/sac.py
@@ -56,7 +56,7 @@ class SAC(OffPolicyRLModel):
:param create_eval_env: (bool) Whether to create a second environment that will be
used for evaluating the agent periodically. (Only available when passing string for the environment)
:param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
:param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
:param verbose: (int) the verbosity level: 0 no output, 1 info, 2 debug
:param seed: (int) Seed for the pseudo random generators
:param device: (str or th.device) Device (cpu, cuda, ...) on which the code should be run.
Setting it to auto, the code will be run on the GPU if possible.
12 changes: 6 additions & 6 deletions torchy_baselines/td3/policies.py
@@ -25,9 +25,9 @@ class Actor(BaseNetwork):
:param sde_net_arch: ([int]) Network architecture for extracting features
when using SDE. If None, the latent features from the policy will be used.
Pass an empty list to use the states as features.
:param use_expln: (bool) Use `expln()` function instead of `exp()` when using SDE to ensure
:param use_expln: (bool) Use ``expln()`` function instead of ``exp()`` when using SDE to ensure
a positive standard deviation (cf paper). It allows to keep variance
above zero and prevent it from growing too fast. In practice, `exp()` is usually enough.
above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough.
"""
def __init__(self,
obs_dim: int,
@@ -80,8 +80,8 @@ def get_std(self) -> torch.Tensor:
"""
Retrieve the standard deviation of the action distribution.
Only useful when using SDE.
It corresponds to `th.exp(log_std)` in the normal case,
but is slightly different when using `expln` function
It corresponds to ``th.exp(log_std)`` in the normal case,
but is slightly different when using ``expln`` function
(cf StateDependentNoiseDistribution doc).
:return: (th.Tensor)
@@ -206,9 +206,9 @@ class TD3Policy(BasePolicy):
:param sde_net_arch: ([int]) Network architecture for extracting features
when using SDE. If None, the latent features from the policy will be used.
Pass an empty list to use the states as features.
:param use_expln: (bool) Use `expln()` function instead of `exp()` when using SDE to ensure
:param use_expln: (bool) Use ``expln()`` function instead of ``exp()`` when using SDE to ensure
a positive standard deviation (cf paper). It allows to keep variance
above zero and prevent it from growing too fast. In practice, `exp()` is usually enough.
above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough.
"""
def __init__(self, observation_space, action_space,
learning_rate, net_arch=None, device='cpu',
2 changes: 1 addition & 1 deletion torchy_baselines/td3/td3.py
@@ -51,7 +51,7 @@ class TD3(OffPolicyRLModel):
:param create_eval_env: (bool) Whether to create a second environment that will be
used for evaluating the agent periodically. (Only available when passing string for the environment)
:param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
:param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
:param verbose: (int) the verbosity level: 0 no output, 1 info, 2 debug
:param seed: (int) Seed for the pseudo random generators
:param device: (str or th.device) Device (cpu, cuda, ...) on which the code should be run.
Setting it to auto, the code will be run on the GPU if possible.