Add GAIL algorithm #315

Merged · 40 commits · Aug 24, 2021

Commits
f381e7b
[IBR-2091] Add gail algorithm
isk03276 Jun 21, 2021
9f03a1d
[IBR-2068] Modify standard deviation of gaussian action in ppo
isk03276 Jun 21, 2021
7276000
Merge branch 'hotfix/improve_ppo_algorithm' into feature/add_gail_alg…
isk03276 Jun 23, 2021
bab3893
[IBR-2091] Improve gail algorithm
isk03276 Jun 23, 2021
2a8d24f
[IBR-2068] Add ppo algorithm for discrete action
isk03276 Jun 24, 2021
3ede275
[IBR-2068] Add shared backbone for actor critic
isk03276 Jun 25, 2021
ac1d8dc
[IBR-2068] Fix gpu oom bug
isk03276 Jun 25, 2021
b7373c5
[IBR-2068] Tuning hyper-parameters for ppo
isk03276 Jun 28, 2021
1c254a3
[IBR-2068] Modify multi env
isk03276 Jun 28, 2021
fcb648d
Merge branch 'hotfix/improve_ppo_algorithm' into feature/add_gail_alg…
isk03276 Jun 28, 2021
9747ee7
[IBR-2091] Modify input size of discriminator network
isk03276 Jun 28, 2021
3ebaf46
[IBR-2068] Modify learner for shared actor critic
isk03276 Jun 28, 2021
aeaf96a
Merge branch 'hotfix/improve_ppo_algorithm' into feature/add_gail_alg…
isk03276 Jun 28, 2021
e2cfd2a
[IBR-2091] Add forward_backbone and forward_head function
isk03276 Jul 1, 2021
557f0a2
[IBR-2091] Change threshold for determining discriminator accuracy
isk03276 Jul 1, 2021
8856de0
[IBR-2068] Rollback ppo config
isk03276 Jul 1, 2021
d05cc0a
[IBR-2068] Merge with master branch
isk03276 Jul 5, 2021
50f76aa
Merge branch 'hotfix/improve_ppo_algorithm' into feature/add_gail_alg…
isk03276 Jul 5, 2021
8cc7d05
[IBR-2068] Add ppo with discrete action
isk03276 Jul 7, 2021
f64b5f2
Merge branch 'hotfix/improve_ppo_algorithm' into feature/add_gail_alg…
isk03276 Jul 7, 2021
1bf052d
[IBR-2068]Remove retain_graph option
isk03276 Jul 8, 2021
1eee287
Merge branch 'hotfix/improve_ppo_algorithm' into feature/add_gail_alg…
isk03276 Jul 8, 2021
850542a
[IBR-2097] Remove retain_graph option
isk03276 Jul 8, 2021
7ded89e
[IBR-2091] Add discriminator class
isk03276 Jul 12, 2021
efaf298
Merge branch 'master' of https://github.com/medipixel/rl_algorithms i…
isk03276 Jul 13, 2021
61efa98
[IBR-2091] Modify action embedder config
isk03276 Jul 13, 2021
356cba4
[IBR-2091] Modify/Add comments
isk03276 Jul 13, 2021
294837d
[IBR-2091] Modify pylint
isk03276 Jul 13, 2021
8eaeb2e
[IBR-2091] Convet action type to numpy array in select_action function
isk03276 Jul 20, 2021
eadd1ff
[IBR-2069] Modify hidden activation function
isk03276 Jul 28, 2021
7d5ac5a
Merge branch 'feature/modify_hidden_activation' into feature/add_gail…
isk03276 Jul 28, 2021
7913708
Merge branch 'master' into feature/add_gail_algorithm
jiseongHAN Aug 19, 2021
5daf253
[IBR-2097] Modify readme
isk03276 Aug 23, 2021
922222b
Merge branch 'feature/add_gail_algorithm' of https://github.com/medip…
isk03276 Aug 23, 2021
7651f83
Merge branch 'master' into feature/add_gail_algorithm
isk03276 Aug 23, 2021
28365c3
[IBR-2097] Modify readme
isk03276 Aug 23, 2021
d91bbab
[IBR-2069] Modify readme file
isk03276 Aug 23, 2021
beaa4ad
[IBR-2097] Modfiy readme
isk03276 Aug 23, 2021
0832e7b
Update version 1.1.0 to 1.2.0
jiseongHAN Aug 23, 2021
eba07fd
update ray to 1.3.0
jiseongHAN Aug 23, 2021
10 changes: 10 additions & 0 deletions README.md
@@ -72,6 +72,7 @@ This project follows the [all-contributors](https://github.com/all-contributors/
10. [Recurrent Replay DQN (R2D1)](https://github.com/medipixel/rl_algorithms/tree/master/rl_algorithms/recurrent)
11. [Distributed Prioritized Experience Replay (Ape-X)](https://github.com/medipixel/rl_algorithms/tree/master/rl_algorithms/common/apex)
12. [Policy Distillation](https://github.com/medipixel/rl_algorithms/tree/master/rl_algorithms/distillation)
13. [Generative Adversarial Imitation Learning (GAIL)](https://github.com/medipixel/rl_algorithms/tree/master/rl_algorithms/gail)

## Performance

@@ -139,6 +140,14 @@ See <a href="https://app.wandb.ai/medipixel_rl/LunarLanderContinuous-v2/reports/
</p>
</details>

<details><summary><b>LunarLanderContinuous-v2: PPO, SAC, GAIL</b></summary>
<p><br>
See <a href="https://wandb.ai/chaehyeuk-lee/LunarLanderContinuous-v2?workspace=user-chaehyeuk-lee">W&B log</a> for more details. (The performance is measured on the commit <a href="https://github.com/medipixel/rl_algorithms/commit/922222b2e249f1f14bdf1a28c9f0f00752e49907">9e897ad</a>)

![lunarlandercontinuous-v2_gail](https://user-images.githubusercontent.com/23740495/130401442-8b668975-8760-4a79-b757-1c1e9a9c4e47.png)
</p>
</details>

#### Reacher-v2

We reproduced the performance of **DDPG**, **TD3**, and **SAC** on Reacher-v2 (Mujoco). They reach scores of around -3.5 to -4.5.
@@ -313,3 +322,4 @@ To cite this repository in publications:
19. [Steven Kapturowski et al., "Recurrent Experience Replay in Distributed Reinforcement Learning." in International Conference on Learning Representations https://openreview.net/forum?id=r1lyTjAqYX, 2019.](https://openreview.net/forum?id=r1lyTjAqYX)
20. [Horgan et al., "Distributed Prioritized Experience Replay." in International Conference on Learning Representations, 2018](https://arxiv.org/pdf/1803.00933.pdf)
21. [Simonyan et al., "Deep Inside Convolutional Networks: Visualising Image Classification Models and Saliency Maps", 2013](https://arxiv.org/pdf/1312.6034.pdf)
22. [Ho et al., "Generative adversarial imitation learning", 2016](https://arxiv.org/abs/1606.03476)
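
For quick orientation on the newly cited paper (entry 22): GAIL trains a policy π against a discriminator D over state-action pairs by approximately solving the saddle-point problem below, where π_E is the expert policy, H(π) is the causal entropy, and λ its weight. This is a sketch of the objective as stated by Ho et al., 2016, not something defined in this PR:

$$\min_{\pi}\ \max_{D}\ \mathbb{E}_{\pi}\big[\log D(s,a)\big] + \mathbb{E}_{\pi_E}\big[\log\big(1 - D(s,a)\big)\big] - \lambda H(\pi)$$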
59 changes: 59 additions & 0 deletions configs/lunarlander_continuous_v2/gail_ppo.yaml
@@ -0,0 +1,59 @@
type: "GAILPPOAgent"
hyper_params:
gamma: 0.99
tau: 0.95
batch_size: 128
max_epsilon: 0.2
min_epsilon: 0.2
epsilon_decay_period: 1500
w_value: 1.0
w_entropy: 0.001
gradient_clip_ac: 0.5
gradient_clip_cr: 1.0
epoch: 10
rollout_len: 1024
n_workers: 4
use_clipped_value_loss: False
standardize_advantage: True
gail_reward_weight: 1.0
demo_path: "data/lunarlander_continuous_demo.pkl"

learner_cfg:
type: "GAILPPOLearner"
backbone:
actor:
critic:
discriminator:
shared_actor_critic:
head:
actor:
type: "GaussianDist"
configs:
hidden_sizes: [256, 256]
output_activation: "identity"
fixed_logstd: True
critic:
type: "MLP"
configs:
hidden_sizes: [256, 256]
output_size: 1
output_activation: "identity"
discriminator:
type: "MLP"
configs:
hidden_sizes: [256, 256]
output_size: 1
output_activation: "identity"
aciton_embedder:
type: "MLP"
configs:
hidden_sizes: []
output_size: 16
output_activation: "identity"

optim_cfg:
lr_actor: 0.0003
lr_critic: 0.001
lr_discriminator: 0.0003
weight_decay: 0.0
discriminator_acc_threshold : 0.8
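
To make the discriminator-related settings above concrete (the single-output "identity" discriminator head, gail_reward_weight, and discriminator_acc_threshold), here is a minimal GAIL-style sketch of a discriminator update and the imitation reward. It is illustrative only and not the code added in this PR; the function names, label convention (expert = 1, policy = 0), and tensor shapes are assumptions:

    import torch
    import torch.nn.functional as F

    def discriminator_step(discriminator, optimizer, policy_sa, expert_sa):
        """One BCE update: push expert (state, action) pairs toward label 1, policy pairs toward 0."""
        expert_logits = discriminator(expert_sa)  # (batch, 1) raw scores from the "identity" output head
        policy_logits = discriminator(policy_sa)
        loss = F.binary_cross_entropy_with_logits(
            expert_logits, torch.ones_like(expert_logits)
        ) + F.binary_cross_entropy_with_logits(
            policy_logits, torch.zeros_like(policy_logits)
        )
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accuracy in the spirit of discriminator_acc_threshold: a learner can pause
        # discriminator updates once it separates expert from policy data too easily.
        with torch.no_grad():
            acc = 0.5 * (
                (torch.sigmoid(expert_logits) > 0.5).float().mean()
                + (torch.sigmoid(policy_logits) < 0.5).float().mean()
            )
        return loss.item(), acc.item()

    def gail_reward(discriminator, policy_sa, reward_weight=1.0):
        """Imitation reward fed to the PPO update, scaled by gail_reward_weight."""
        with torch.no_grad():
            d = torch.sigmoid(discriminator(policy_sa))  # probability the pair looks expert-like
        return reward_weight * -torch.log(1.0 - d + 1e-8)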
2 changes: 1 addition & 1 deletion requirements.txt
@@ -10,7 +10,7 @@ tqdm

# for distributed learning
redis==3.5.3 # for ray
ray==1.2.0
ray==1.3.0
pyzmq==20.0.0
pyarrow==3.0.0

4 changes: 4 additions & 0 deletions rl_algorithms/__init__.py
@@ -21,6 +21,8 @@
from .fd.dqn_learner import DQfDLearner
from .fd.sac_agent import SACfDAgent
from .fd.sac_learner import SACfDLearner
from .gail.agent import GAILPPOAgent
from .gail.learner import GAILPPOLearner
from .ppo.agent import PPOAgent
from .ppo.learner import PPOLearner
from .recurrent.dqn_agent import R2D1Agent
@@ -45,6 +47,7 @@
    "PPOAgent",
    "SACAgent",
    "TD3Agent",
    "GAILPPOAgent",
    "A2CLearner",
    "BCDDPGLearner",
    "BCSACLearner",
@@ -56,6 +59,7 @@
    "PPOLearner",
    "SACLearner",
    "TD3Learner",
    "GAILPPOLearner",
    "R2D1Learner",
    "LunarLanderContinuousHER",
    "ReacherHER",
60 changes: 60 additions & 0 deletions rl_algorithms/common/buffer/gail_buffer.py
@@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
"""Demo buffer for GAIL algorithm."""

import pickle
from typing import List, Tuple

import numpy as np
import torch

from rl_algorithms.common.abstract.buffer import BaseBuffer

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class GAILBuffer(BaseBuffer):
    """Buffer to store expert states and actions.

    Attributes:
        obs_buf (np.ndarray): observations
        acts_buf (np.ndarray): actions
    """

    def __init__(self, dataset_path: str):
        """Initialize a Buffer.

        Args:
            dataset_path (str): path of the demo dataset
        """
        self.obs_buf: np.ndarray = None
        self.acts_buf: np.ndarray = None

        self.load_demo(dataset_path)

    def load_demo(self, dataset_path: str):
        """Load demo data."""
        with open(dataset_path, "rb") as f:
            demo = list(pickle.load(f))
        demo = np.array(demo)
        self.obs_buf = np.array(list(map(np.array, demo[:, 0])))
        self.acts_buf = np.array(list(map(np.array, demo[:, 1])))

    def add(self):
        pass

    def sample(self, batch_size, indices: List[int] = None) -> Tuple[np.ndarray, ...]:
        """Randomly sample a batch of experiences from memory."""
        assert 0 < batch_size < len(self)

        if indices is None:
            indices = np.random.choice(len(self), size=batch_size)

        states = self.obs_buf[indices]
        actions = self.acts_buf[indices]

        return torch.Tensor(states).to(device), torch.Tensor(actions).to(device)

    def __len__(self) -> int:
        """Return the current size of internal memory."""
        return len(self.obs_buf)
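
A small usage sketch for the buffer above (illustrative only, not part of this PR's diff). It assumes the demo pickle referenced by demo_path in gail_ppo.yaml already exists and stores a sequence of (state, action) pairs, which is the layout load_demo expects:

    from rl_algorithms.common.buffer.gail_buffer import GAILBuffer

    # Load the expert demonstrations referenced in configs/lunarlander_continuous_v2/gail_ppo.yaml.
    demo_buffer = GAILBuffer("data/lunarlander_continuous_demo.pkl")
    print(len(demo_buffer))  # number of expert (state, action) pairs

    # Draw a mini-batch for a discriminator update; tensors are moved to GPU if available.
    states, actions = demo_buffer.sample(batch_size=128)
    print(states.shape, actions.shape)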
1 change: 1 addition & 0 deletions rl_algorithms/gail/__init__.py
@@ -0,0 +1 @@
"""Empty."""