PPO #272

Merged
merged 86 commits into dev from ppo_spaces
Mar 28, 2023

Changes from 67 commits
86 commits
dae80b6
Adding initial PPO Code
kshitijkg May 3, 2022
f817ce9
Added buffer sampling and solved some bugs
kshitijkg May 3, 2022
4de8417
ppo agent: device and type errors fixed
sriyash421 May 3, 2022
bc5817e
ppo updater fixed
sriyash421 May 4, 2022
f422b36
ppo config updated
May 4, 2022
d1b7ee2
Updated Hive to use gym spaces instead of raw tuples to represent act…
dapatil211 Apr 20, 2022
81ac508
Updated tests to affect api change of 1106ec2
dapatil211 Apr 20, 2022
e2c1133
Adding initial PPO Code
kshitijkg May 3, 2022
6eb1e4e
Added buffer sampling and solved some bugs
kshitijkg May 3, 2022
2277abd
ppo agent: device and type errors fixed
sriyash421 May 3, 2022
d780683
ppo updater fixed
sriyash421 May 4, 2022
e0cc263
ppo config updated
May 4, 2022
ab5ef03
ppo replay added
sriyash421 May 6, 2022
301e77c
ppo replay conflict
sriyash421 May 6, 2022
6459606
ppo replay fixed
sriyash421 May 6, 2022
babfcce
ppo agent updated
sriyash421 May 6, 2022
c0f9039
ppo agent and config updated
sriyash421 May 6, 2022
a973cb3
ppo code running but buggy
May 9, 2022
0c78e2d
cartpole working
May 12, 2022
f5271bf
ppo configs
May 18, 2022
ad3ed9b
ppo net fixed
May 24, 2022
0f63802
merge dev
May 24, 2022
f598433
atari configs added
May 24, 2022
a3c3c1c
ppo_nets done
May 24, 2022
47e106f
ppo_replay done
May 24, 2022
12a4b73
ppo env wrappers added
May 24, 2022
277b9bc
ppo agent done
May 24, 2022
9417e05
configs done
May 24, 2022
d3616a7
stack size > 1 handled temporarily
May 27, 2022
9de6e3a
linting fixed
sriyash421 May 27, 2022
c151b7b
Merge branch 'dev' into ppo_spaces
sriyash421 Jun 27, 2022
7201e97
last batch drop fix
sriyash421 Jun 27, 2022
45a9da4
config changes
sriyash421 Jun 29, 2022
4ea7527
Merge branch 'ppo_spaces' of github.com:chandar-lab/RLHive into ppo_s…
sriyash421 Jun 29, 2022
a9848e1
shared network added
sriyash421 Jul 7, 2022
3adcb73
Merge branch 'dev' into ppo_spaces
sriyash421 Jul 7, 2022
54996f3
reward wrapper added
sriyash421 Jul 13, 2022
fa9297b
linting fixed
sriyash421 Jul 13, 2022
2ac73ba
Merge branch 'dev' into ppo_spaces
sriyash421 Jul 13, 2022
3d11136
Merge branch 'dev' into ppo_spaces
sriyash421 Jul 29, 2022
4f5b4b8
docs fixed
sriyash421 Aug 28, 2022
cc4aab8
replay changed
sriyash421 Aug 28, 2022
2d55afa
update loop
sriyash421 Aug 28, 2022
dcf2aa7
type specification
sriyash421 Aug 28, 2022
936c3b1
env wrappers registered
sriyash421 Aug 28, 2022
04d8692
linting fixed
sriyash421 Aug 29, 2022
360dc00
Merge branch 'dev' into ppo_spaces
kshitijkg Sep 25, 2022
b1613cd
Removed one off transition, cleaned up replay buffer
kshitijkg Sep 25, 2022
bdcd11e
Fixed linter issues
kshitijkg Sep 25, 2022
5a8e2da
wrapper error fixed
sriyash421 Sep 29, 2022
a54377a
added vars to dict; fixed long lines and var names; moved wrapper reg…
sriyash421 Oct 11, 2022
9680185
config fixed
sriyash421 Oct 13, 2022
2c9295f
added normalisation and fixed log
sriyash421 Oct 13, 2022
767f96c
norm file added
sriyash421 Oct 14, 2022
b4f2ea1
norm bug fixed
sriyash421 Nov 3, 2022
58f5ec2
rew norm updated
sriyash421 Nov 11, 2022
306faea
fixes
sriyash421 Nov 11, 2022
35d6aeb
fixing norm bug; config
sriyash421 Nov 23, 2022
7d31faf
config fixes
sriyash421 Nov 23, 2022
b84722e
obs norm
sriyash421 Nov 24, 2022
a4c1692
hardcoded wrappers added
sriyash421 Nov 24, 2022
11ccb21
normaliser shape fixed
sriyash421 Dec 6, 2022
0991e84
rew shape fixed; norm structure updated
sriyash421 Dec 6, 2022
c7f42a1
rew norm
sriyash421 Dec 6, 2022
84d933e
configs and wrapper fixed
sriyash421 Dec 7, 2022
3f01532
merge dev
sriyash421 Dec 19, 2022
54799c2
Merge branch 'dev' into ppo_spaces
sriyash421 Dec 19, 2022
8fb9902
Fixed formatting and naming
kshitijkg Jan 30, 2023
bd5c587
Added env wrapper logic
kshitijkg Jan 30, 2023
697a78c
Merging dev
kshitijkg Jan 30, 2023
a1e77fa
Renamed PPO Replay Buffer to On Policy Replay buffer
kshitijkg Jan 30, 2023
031f462
Made PPO Stateless Agent
kshitijkg Jan 30, 2023
28733ec
Fixed linting issues
kshitijkg Jan 30, 2023
8885a89
Minor modifications
kshitijkg Feb 7, 2023
0e42146
Fixed changed
kshitijkg Feb 8, 2023
d785c85
Formatting and minor changes
kshitijkg Mar 2, 2023
4946874
Merge branch 'dev' into ppo_spaces
dapatil211 Mar 20, 2023
308f111
Refactored Advantage Computation
kshitijkg Mar 21, 2023
543fc74
Reformating with black
kshitijkg Mar 21, 2023
43c3fb1
Renaming
kshitijkg Mar 21, 2023
4d82f99
Refactored Normalization code
kshitijkg Mar 21, 2023
e7d08d5
Added saving and loading of state dict for normalizers
kshitijkg Mar 21, 2023
aba7c49
Fixed multiplayer replay buffer for PPO
kshitijkg Mar 21, 2023
000c4e4
Fixed minor bug
kshitijkg Mar 22, 2023
3d6d076
Renamed file
kshitijkg Mar 22, 2023
aabeed0
Added lr annealing
dapatil211 Mar 28, 2023
2 changes: 2 additions & 0 deletions hive/agents/__init__.py
@@ -4,6 +4,7 @@
from hive.agents.dqn import DQNAgent
from hive.agents.drqn import DRQNAgent
from hive.agents.legal_moves_rainbow import LegalMovesRainbowAgent
from hive.agents.ppo import PPOAgent
from hive.agents.rainbow import RainbowDQNAgent
from hive.agents.random import RandomAgent
from hive.agents.td3 import TD3
@@ -16,6 +17,7 @@
"DQNAgent": DQNAgent,
"DRQNAgent": DRQNAgent,
"LegalMovesRainbowAgent": LegalMovesRainbowAgent,
"PPOAgent": PPOAgent,
"RainbowDQNAgent": RainbowDQNAgent,
"RandomAgent": RandomAgent,
"TD3": TD3,
405 changes: 405 additions & 0 deletions hive/agents/ppo.py

Large diffs are not rendered by default.

151 changes: 151 additions & 0 deletions hive/agents/qnets/normalizer.py
@@ -0,0 +1,151 @@
from typing import Tuple

import numpy as np

from hive.utils.registry import Registrable, registry

# taken from https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_normalize.py
class MeanStd:
"""Tracks the mean, variance and count of values."""

# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
def __init__(self, epsilon=1e-4, shape=()):
"""Tracks the mean, variance and count of values."""
self.mean = np.zeros(shape, "float64")
self.var = np.ones(shape, "float64")
self.count = epsilon

def update(self, x):
"""Updates the mean, var and count from a batch of samples."""
batch_mean = np.mean(x, axis=0)
batch_var = np.var(x, axis=0)
batch_count = x.shape[0]
self.update_from_moments(batch_mean, batch_var, batch_count)

def update_from_moments(self, batch_mean, batch_var, batch_count):
"""Updates from batch mean, variance and count moments."""
self.mean, self.var, self.count = self.update_mean_var_count_from_moments(
self.mean, self.var, self.count, batch_mean, batch_var, batch_count
)

def update_mean_var_count_from_moments(
self, mean, var, count, batch_mean, batch_var, batch_count
):
"""Updates the mean, var and count using the previous mean, var, count and batch values."""
delta = batch_mean - mean
tot_count = count + batch_count

new_mean = mean + delta * batch_count / tot_count
m_a = var * count
m_b = batch_var * batch_count
M2 = m_a + m_b + np.square(delta) * count * batch_count / tot_count
new_var = M2 / tot_count
new_count = tot_count

return new_mean, new_var, new_count


class BaseNormalizationFn(object):
"""Implements the base normalization function."""

def __init__(self, *args, **kwds):
pass

def __call__(self, *args, **kwds):
raise NotImplementedError

def update(self, *args, **kwds):
raise NotImplementedError


class ObservationNormalizationFn(BaseNormalizationFn):
"""Implements a normalization function. Transforms output by
normalising the input data by the running :obj:`mean` and
:obj:`std`, and clipping the normalised data on :obj:`clip`
"""

def __init__(
self, shape: Tuple[int, ...], epsilon: float = 1e-4, clip: np.float32 = np.inf
):
"""
Args:
shape (tuple[int]): The shape of the input data.
epsilon (float): Minimum value of variance to avoid division by 0.
clip (np.float32): The clip value for the normalised data.
"""
super().__init__()
self.obs_rms = MeanStd(epsilon, shape)
self._shape = shape
self._epsilon = epsilon
self._clip = clip

def __call__(self, obs):
obs = np.array([obs])
obs = ((obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self._epsilon))[0]
if self._clip is not None:
obs = np.clip(obs, -self._clip, self._clip)
return obs

def update(self, obs):
self.obs_rms.update(obs)


class RewardNormalizationFn(BaseNormalizationFn):
"""Implements a normalization function. Transforms output by
normalising the input data by the running :obj:`mean` and
:obj:`std`, and clipping the normalised data on :obj:`clip`
"""

def __init__(self, gamma: float, epsilon: float = 1e-4, clip: np.float32 = np.inf):
"""
Args:
gamma (float): discount factor for the agent.
epsilon (float): minimum value of variance to avoid division by 0.
clip (np.float32): The clip value for the normalised data.
"""
super().__init__()
self.return_rms = MeanStd(epsilon, ())
self._epsilon = epsilon
self._clip = clip
self._gamma = gamma
self._returns = np.zeros(1)

def __call__(self, rew):
rew = np.array([rew])
rew = (rew / np.sqrt(self.return_rms.var + self._epsilon))[0]
if self._clip is not None:
rew = np.clip(rew, -self._clip, self._clip)
return rew

def update(self, rew, done):
self._returns = self._returns * self._gamma + rew
self.return_rms.update(self._returns)
self._returns *= 1 - done


class NormalizationFn(Registrable):
"""A wrapper for callables that produce normalization functions.

These wrapped callables can be partially initialized through configuration
files or command line arguments.
"""

@classmethod
def type_name(cls):
"""
Returns:
"norm_fn"
"""
return "norm_fn"


registry.register_all(
NormalizationFn,
{
"BaseNormalization": BaseNormalizationFn,
"RewardNormalization": RewardNormalizationFn,
"ObservationNormalization": ObservationNormalizationFn,
},
)

get_norm_fn = getattr(registry, f"get_{NormalizationFn.type_name()}")
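
The normalizer file above has no RLHive-specific dependencies beyond the registry, so it can be exercised directly against NumPy. A minimal usage sketch (the import path follows the file added in this diff; the shapes, seed, and constants are made up for illustration):

import numpy as np

from hive.agents.qnets.normalizer import (
    MeanStd,
    ObservationNormalizationFn,
    RewardNormalizationFn,
)

rng = np.random.default_rng(0)
batch = rng.normal(loc=2.0, scale=3.0, size=(1000, 4))

# MeanStd tracks running statistics; after one large batch they are close to
# the batch statistics themselves (mean ~ batch.mean(axis=0), var ~ batch.var(axis=0)).
rms = MeanStd(shape=(4,))
rms.update(batch)

# Observation normalization: update with batches of observations, then normalize
# individual observations to roughly zero mean and unit variance, clipped to [-10, 10].
obs_norm = ObservationNormalizationFn(shape=(4,), clip=10.0)
obs_norm.update(batch)
normalized_obs = obs_norm(batch[0])

# Reward normalization: rewards are scaled by the standard deviation of the
# running discounted return, which is updated with each (reward, done) pair.
rew_norm = RewardNormalizationFn(gamma=0.99, clip=10.0)
for t in range(100):
    reward, done = float(rng.normal()), t == 99
    rew_norm.update(reward, done)
    scaled_reward = rew_norm(reward)
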
116 changes: 116 additions & 0 deletions hive/agents/qnets/ppo_nets.py
@@ -0,0 +1,116 @@
from typing import Tuple, Union
import gym
from gym.spaces import Box, Discrete
import numpy as np
import torch

from hive.agents.qnets.base import FunctionApproximator
from hive.agents.qnets.utils import calculate_output_dim


class CategoricalHead(torch.nn.Module):
"""A module that implements a discrete actor head. It uses the ouput from the
:obj:`actor_net`, and adds creates a :py:class:`~torch.distributions.categorical.Categorical`
object to compute the action distribution."""

def __init__(
self, feature_dim: Tuple[int], action_space: gym.spaces.Discrete
) -> None:
"""
Args:
feature_dim: Expected output shape of the actor network.
action_space: The discrete action space of the environment.
"""
super().__init__()
self.network = torch.nn.Linear(feature_dim, action_space.n)
self.distribution = torch.distributions.categorical.Categorical

def forward(self, x):
logits = self.network(x)
return self.distribution(logits=logits)


class GaussianPolicyHead(torch.nn.Module):
"""A module that implements a continuous actor head. It uses the output from the
:obj:`actor_net` and state independent learnable parameter :obj:`policy_logstd` to
create a :py:class:`~torch.distributions.normal.Normal` object to compute
the action distribution."""

def __init__(self, feature_dim: Tuple[int], action_space: gym.spaces.Box) -> None:
"""
Args:
feature_dim: Expected output shape of the actor network.
action_space: The continuous action space of the environment.
"""
super().__init__()
self._action_shape = action_space.shape
self.policy_mean = torch.nn.Sequential(
torch.nn.Linear(feature_dim, np.prod(self._action_shape))
)
self.policy_logstd = torch.nn.Parameter(
torch.zeros(1, np.prod(action_space.shape))
)
self.distribution = torch.distributions.normal.Normal

def forward(self, x):
_mean = self.policy_mean(x)
_std = self.policy_logstd.repeat(x.shape[0], 1).exp()
distribution = self.distribution(
torch.reshape(_mean, (x.size(0), *self._action_shape)),
torch.reshape(_std, (x.size(0), *self._action_shape)),
)
return distribution


class PPOActorCriticNetwork(torch.nn.Module):
"""A module that implements the PPO actor and critic computation. It puts together the
:obj:`representation_network`, :obj:`actor_net` and :obj:`critic_net`, then adds two final
:py:class:`~torch.nn.Linear` layers to compute the action and state value."""

def __init__(
self,
representation_network: torch.nn.Module,
actor_net: FunctionApproximator,
critic_net: FunctionApproximator,
network_output_dim: Union[int, Tuple[int]],
action_space: Union[Box, Discrete],
continuous_action: bool,
) -> None:
super().__init__()
self._network = representation_network
self._continuous_action = continuous_action
if actor_net is None:
actor_network = torch.nn.Identity()
else:
actor_network = actor_net(network_output_dim)
feature_dim = np.prod(calculate_output_dim(actor_network, network_output_dim))
actor_head = GaussianPolicyHead if self._continuous_action else CategoricalHead

self.actor = torch.nn.Sequential(
actor_network,
torch.nn.Flatten(),
actor_head(feature_dim, action_space),
)

if critic_net is None:
critic_network = torch.nn.Identity()
else:
critic_network = critic_net(network_output_dim)
feature_dim = np.prod(calculate_output_dim(critic_network, network_output_dim))
self.critic = torch.nn.Sequential(
critic_network,
torch.nn.Flatten(),
torch.nn.Linear(feature_dim, 1),
)

def forward(self, x, action=None):
hidden_state = self._network(x)
distribution = self.actor(hidden_state)
value = self.critic(hidden_state)
if action is None:
action = distribution.sample()

logprob, entropy = distribution.log_prob(action), distribution.entropy()
if self._continuous_action:
logprob, entropy = logprob.sum(dim=-1), entropy.sum(dim=-1)
return action, logprob, entropy, value
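
The two policy heads above are self-contained torch modules, so their output distributions can be checked with dummy features. A short sketch (the import path follows the file added in this diff; the feature size, batch size, and action spaces are illustrative only):

import gym
import torch

from hive.agents.qnets.ppo_nets import CategoricalHead, GaussianPolicyHead

features = torch.randn(8, 64)  # a batch of 8 feature vectors from the actor trunk

# Discrete actions: the head maps features to logits and returns a Categorical distribution.
discrete_head = CategoricalHead(64, gym.spaces.Discrete(4))
dist = discrete_head(features)
actions = dist.sample()            # shape (8,)
logprobs = dist.log_prob(actions)  # shape (8,)

# Continuous actions: a learned mean plus a state-independent log-std parameter.
continuous_head = GaussianPolicyHead(64, gym.spaces.Box(low=-1.0, high=1.0, shape=(2,)))
dist = continuous_head(features)
actions = dist.sample()            # shape (8, 2)
logprobs = dist.log_prob(actions)  # shape (8, 2); summed over the last dim by the agent network
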
2 changes: 2 additions & 0 deletions hive/agents/qnets/utils.py
@@ -1,5 +1,7 @@
import math
from typing import Tuple

import numpy as np
import torch

from hive.utils.registry import registry
64 changes: 64 additions & 0 deletions hive/configs/atari/ppo.yml
@@ -0,0 +1,64 @@
run_name: &run_name 'atari-ppo'
train_steps: 10000000
test_frequency: 250000
test_episodes: 10
max_steps_per_episode: 27000
stack_size: &stack_size 4
save_dir: 'experiment'
saving_schedule:
name: 'PeriodicSchedule'
kwargs:
off_value: False
on_value: True
period: 1000000
environment:
name: 'AtariEnv'
kwargs:
env_name: 'Breakout'

agent:
name: 'PPOAgent'
kwargs:
representation_net:
name: 'ConvNetwork'
kwargs:
channels: [32, 64, 64]
kernel_sizes: [8, 4, 3]
strides: [4, 2, 1]
paddings: [2, 2, 1]
mlp_layers: [512]
optimizer_fn:
name: 'Adam'
kwargs:
lr: .00025
init_fn:
name: 'orthogonal'
replay_buffer:
name: 'PPOReplayBuffer'
kwargs:
stack_size: *stack_size
use_gae: True
gae_lambda: .95
discount_rate: .99
grad_clip: .5
clip_coef: .1
ent_coef: .0
clip_vloss: True
vf_coef: .5
transitions_per_update: 4096
num_epochs_per_update: 4
normalize_advantages: True
batch_size: 256
device: 'cuda'
id: 'agent'
# List of logger configs used.
loggers:
-
name: ChompLogger
-
name: WandbLogger
kwargs:
project: Hive
name: *run_name
resume: "allow"
start_method: "fork"
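
For reference, the agent keys in this config (clip_coef, vf_coef, ent_coef, normalize_advantages) parameterize the standard PPO clipped surrogate objective. Below is a generic PyTorch sketch of that objective, not the actual hive/agents/ppo.py implementation (whose diff is not rendered above); clip_vloss would additionally clip the value-loss term.

import torch


def ppo_loss(
    new_logprobs,   # log pi_new(a|s) for the sampled actions
    old_logprobs,   # log pi_old(a|s) stored when the transitions were collected
    advantages,     # advantage estimates (GAE with gae_lambda / discount_rate above)
    values,         # critic predictions for the sampled states
    returns,        # value targets (advantages + old values)
    entropy,        # per-sample policy entropy
    clip_coef=0.1,
    vf_coef=0.5,
    ent_coef=0.0,
    normalize_advantages=True,
):
    if normalize_advantages:
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    ratio = torch.exp(new_logprobs - old_logprobs)
    # Clipped surrogate: take the pessimistic (maximum) of the two negated objectives.
    policy_loss = torch.max(
        -advantages * ratio,
        -advantages * torch.clamp(ratio, 1 - clip_coef, 1 + clip_coef),
    ).mean()
    value_loss = 0.5 * ((values - returns) ** 2).mean()
    return policy_loss + vf_coef * value_loss - ent_coef * entropy.mean()
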