Remove phrase 'Abstract' from abstract class names.
darthegg committed Apr 11, 2019 · 2 parents 17ef9a7 + ad9665f · commit 66a8f4a
Showing 28 changed files with 505 additions and 243 deletions.
13 changes: 9 additions & 4 deletions README.md
@@ -30,8 +30,8 @@ We are warmly welcoming external contributors! :)
6. [Behaviour Cloning (BC with DDPG, SAC)](https://github.com/medipixel/rl_algorithms/tree/master/algorithms/bc)
7. [Prioritized Experience Replay (PER with DDPG)](https://github.com/medipixel/rl_algorithms/tree/master/algorithms/per)
8. [From Demonstrations (DDPGfD, SACfD, DQfD)](https://github.com/medipixel/rl_algorithms/tree/master/algorithms/fd)
9. [Rainbow DQN (without NoisyNet)](https://github.com/medipixel/rl_algorithms/tree/master/algorithms/dqn)
10. [Rainbow IQN (without DuelingNet & NoisyNet)](https://github.com/medipixel/rl_algorithms/tree/master/algorithms/dqn)
9. [Rainbow DQN](https://github.com/medipixel/rl_algorithms/tree/master/algorithms/dqn)
10. [Rainbow IQN (without DuelingNet)](https://github.com/medipixel/rl_algorithms/tree/master/algorithms/dqn) - DuelingNet [degrades performance](https://github.com/medipixel/rl_algorithms/pull/137)

## Getting started
We have tested each algorithm on some of the following environments.
@@ -109,6 +109,10 @@ python <run-file> -h
- `--load-from <save-file-path>`
- Load the saved models and optimizers at the beginning.

### Class Diagram
Class diagram, drawn at commit [e447f3e](https://github.com/medipixel/rl_algorithms/commit/e447f3e743f6f85505f2275b646e46f0adcf8f89). It will not be updated frequently.
![rl_algorithms_cls](https://user-images.githubusercontent.com/14961526/55703648-26022a80-5a15-11e9-8099-9bbfdffcb96d.png)

### W&B for logging
We use [W&B](https://www.wandb.com/) for logging network parameters and other metrics. For more details, read the [W&B tutorial](https://docs.wandb.com/docs/started.html).
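For readers new to W&B, here is a minimal, hedged sketch of the kind of logging call the agents make (the project name below is a placeholder, not the repository's actual configuration):

```python
import wandb

# Hypothetical project name; the repository configures its own run settings.
wandb.init(project="rl_algorithms_example")

for episode in range(3):
    score = float(episode)       # placeholder metric
    wandb.log({"score": score})  # mirrors the wandb.log(...) calls in the diffs below
```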

@@ -128,5 +132,6 @@ We use [W&B](https://www.wandb.com/) for logging of network parameters and other
12. [Z. Wang et al., "Dueling Network Architectures for Deep Reinforcement Learning." arXiv preprint arXiv:1511.06581, 2015.](https://arxiv.org/pdf/1511.06581.pdf)
13. [T. Hester et al., "Deep Q-learning from Demonstrations." arXiv preprint arXiv:1704.03732, 2017.](https://arxiv.org/pdf/1704.03732.pdf)
14. [M. G. Bellemare et al., "A Distributional Perspective on Reinforcement Learning." arXiv preprint arXiv:1707.06887, 2017.](https://arxiv.org/pdf/1707.06887.pdf)
15. [M. Hessel et al., "Rainbow: Combining Improvements in Deep Reinforcement Learning." arXiv preprint arXiv:1710.02298, 2017.](https://arxiv.org/pdf/1710.02298.pdf)
16. [W. Dabney et al., "Implicit Quantile Networks for Distributional Reinforcement Learning." arXiv preprint arXiv:1806.06923, 2018.](https://arxiv.org/pdf/1806.06923.pdf)
15. [M. Fortunato et al., "Noisy Networks for Exploration." arXiv preprint arXiv:1706.10295, 2017.](https://arxiv.org/pdf/1706.10295.pdf)
16. [M. Hessel et al., "Rainbow: Combining Improvements in Deep Reinforcement Learning." arXiv preprint arXiv:1710.02298, 2017.](https://arxiv.org/pdf/1710.02298.pdf)
17. [W. Dabney et al., "Implicit Quantile Networks for Distributional Reinforcement Learning." arXiv preprint arXiv:1806.06923, 2018.](https://arxiv.org/pdf/1806.06923.pdf)
13 changes: 7 additions & 6 deletions algorithms/bc/sac_agent.py
@@ -115,8 +115,9 @@ def _add_transition_to_memory(self, transition: Tuple[np.ndarray, ...]):

def update_model(self) -> Tuple[torch.Tensor, ...]:
"""Train the model after each episode."""
experiences = self.memory.sample()
demos = self.demo_memory.sample()
self.update_step += 1

experiences, demos = self.memory.sample(), self.demo_memory.sample()

states, actions, rewards, next_states, dones = experiences
demo_states, demo_actions, _, _, _ = demos
@@ -169,7 +170,7 @@ def update_model(self) -> Tuple[torch.Tensor, ...]:
vf_loss.backward()
self.vf_optimizer.step()

if self.total_step % self.hyper_params["DELAYED_UPDATE"] == 0:
if self.update_step % self.hyper_params["POLICY_UPDATE_FREQ"] == 0:
# bc loss
qf_mask = torch.gt(
self.qf_1(demo_states, demo_actions),
@@ -223,7 +224,7 @@ def update_model(self) -> Tuple[torch.Tensor, ...]:
)

def write_log(
self, i: int, loss: np.ndarray, score: float = 0.0, delayed_update: int = 1
self, i: int, loss: np.ndarray, score: float = 0.0, policy_update_freq: int = 1
):
"""Write log about loss and score"""
total_loss = loss.sum()
@@ -238,7 +239,7 @@ def write_log(
self.total_step,
score,
total_loss,
loss[0] * delayed_update, # actor loss
loss[0] * policy_update_freq, # actor loss
loss[1], # qf_1 loss
loss[2], # qf_2 loss
loss[3], # vf loss
@@ -252,7 +253,7 @@ def write_log(
{
"score": score,
"total loss": total_loss,
"actor loss": loss[0] * delayed_update,
"actor loss": loss[0] * policy_update_freq,
"qf_1 loss": loss[1],
"qf_2 loss": loss[2],
"vf loss": loss[3],
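The hunk above ties the less frequent actor/BC update to a dedicated `update_step` counter and renames the hyperparameter from `DELAYED_UPDATE` to `POLICY_UPDATE_FREQ`. A minimal sketch of that delayed-update pattern, with placeholder update bodies (not the repository's exact code):

```python
class DelayedPolicyUpdateSketch:
    """Illustrates the counter-based delayed policy update used above."""

    def __init__(self, policy_update_freq: int = 2):
        self.policy_update_freq = policy_update_freq
        self.update_step = 0  # counts update_model() calls, not environment steps

    def update_model(self):
        self.update_step += 1

        self._update_critics()  # value/critic losses are applied on every call

        # actor (and BC) losses are applied only every `policy_update_freq` calls
        if self.update_step % self.policy_update_freq == 0:
            self._update_actor()

    def _update_critics(self):
        pass  # placeholder

    def _update_actor(self):
        pass  # placeholder
```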
2 changes: 0 additions & 2 deletions algorithms/common/abstract/reward_fn.py
@@ -13,8 +13,6 @@ class RewardFn(ABC):
"""Abstract class for computing reward.
New compute_reward class should redefine __call__()
Attributes:
"""

@abstractmethod
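For context, `RewardFn` is an abstract callable that subclasses implement via `__call__`. A hedged sketch of how such a class might be defined and subclassed; the `__call__` signature below is illustrative, since the hunk does not show the repository's exact one:

```python
from abc import ABC, abstractmethod

import numpy as np


class RewardFnSketch(ABC):
    """Abstract callable for computing a (shaped) reward."""

    @abstractmethod
    def __call__(self, state: np.ndarray, action: np.ndarray) -> np.float64:
        raise NotImplementedError


class NegativeDistanceReward(RewardFnSketch):
    """Example subclass: reward is the negative L2 norm of the state."""

    def __call__(self, state: np.ndarray, action: np.ndarray) -> np.float64:
        return np.float64(-np.linalg.norm(state))


reward_fn = NegativeDistanceReward()
print(reward_fn(np.array([3.0, 4.0]), np.array([0.0])))  # -5.0
```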
3 changes: 2 additions & 1 deletion algorithms/common/buffer/segment_tree.py
@@ -102,7 +102,8 @@ def sum(self, start: int = 0, end: int = 0) -> float:

def retrieve(self, upperbound: float) -> int:
"""Find the highest index `i` such that the sum of the first `i` leaves does not exceed `upperbound`"""
assert 0 <= upperbound <= self.sum() + 1e-5
# TODO: Check assert case and fix bug
assert 0 <= upperbound <= self.sum() + 1e-5, "upperbound: {}".format(upperbound)

idx = 1

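The assertion above bounds `upperbound` by the tree's total sum because `retrieve` walks down a binary sum tree consuming that budget. A hedged sketch of the standard array-backed sum-tree retrieval (leaves stored at indices `capacity .. 2*capacity - 1`); attribute names in the repository's class may differ:

```python
def retrieve_sketch(tree: list, capacity: int, upperbound: float) -> int:
    """Return the leaf (data) index whose cumulative sum is the first to exceed `upperbound`."""
    assert 0 <= upperbound <= tree[1] + 1e-5, "upperbound: {}".format(upperbound)

    idx = 1  # start at the root, which stores the total sum
    while idx < capacity:  # internal node
        left = 2 * idx
        if tree[left] > upperbound:
            idx = left                # the answer lies in the left subtree
        else:
            upperbound -= tree[left]  # skip the mass of the left subtree
            idx = left + 1            # descend into the right subtree
    return idx - capacity             # convert tree index to data index


# Tiny usage example: capacity 4, priorities [1, 2, 3, 4], total sum 10.
tree = [0, 10, 3, 7, 1, 2, 3, 4]
print(retrieve_sketch(tree, capacity=4, upperbound=3.5))  # -> 2
```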
2 changes: 1 addition & 1 deletion algorithms/common/env/atari_wrappers.py
@@ -8,7 +8,7 @@

import cv2
import gym
from gym import spaces
import gym.spaces as spaces
import numpy as np

os.environ.setdefault("PATH", "")
35 changes: 21 additions & 14 deletions algorithms/common/networks/mlp.py
@@ -31,6 +31,14 @@ def concat(
return in_concat


def init_layer_uniform(layer: nn.Linear, init_w: float = 3e-3) -> nn.Linear:
"""Initialize the layer's parameters uniformly in [-init_w, init_w]"""
layer.weight.data.uniform_(-init_w, init_w)
layer.bias.data.uniform_(-init_w, init_w)

return layer


class MLP(nn.Module):
"""Baseline of Multilayer perceptron.
@@ -53,9 +61,10 @@ def __init__(
hidden_sizes: list,
hidden_activation: Callable = F.relu,
output_activation: Callable = identity,
linear_layer: nn.Module = nn.Linear,
use_output_layer: bool = True,
n_category: int = -1,
init_w: float = 3e-3,
init_fn: Callable = init_layer_uniform,
):
"""Initialization.
@@ -65,9 +74,10 @@ def __init__(
hidden_sizes (list): number of hidden layers
hidden_activation (function): activation function of hidden layers
output_activation (function): activation function of output layer
linear_layer (nn.Module): linear layer of mlp
use_output_layer (bool): whether or not to use the last layer
n_category (int): category number (-1 if the action is continuous)
init_w (float): weight initialization bound for the last layer
init_fn (Callable): weight initialization function for the last layer
"""
super(MLP, self).__init__()
@@ -77,23 +87,23 @@ def __init__(
self.output_size = output_size
self.hidden_activation = hidden_activation
self.output_activation = output_activation
self.linear_layer = linear_layer
self.use_output_layer = use_output_layer
self.n_category = n_category

# set hidden layers
self.hidden_layers: list = []
in_size = self.input_size
for i, next_size in enumerate(hidden_sizes):
fc = nn.Linear(in_size, next_size)
fc = self.linear_layer(in_size, next_size)
in_size = next_size
self.__setattr__("hidden_fc{}".format(i), fc)
self.hidden_layers.append(fc)

# set output layers
if self.use_output_layer:
self.output_layer = nn.Linear(in_size, output_size)
self.output_layer.weight.data.uniform_(-init_w, init_w)
self.output_layer.bias.data.uniform_(-init_w, init_w)
self.output_layer = self.linear_layer(in_size, output_size)
self.output_layer = init_fn(self.output_layer)
else:
self.output_layer = identity
self.output_activation = identity
@@ -137,7 +147,7 @@ def __init__(
mu_activation: Callable = torch.tanh,
log_std_min: float = -20,
log_std_max: float = 2,
init_w: float = 3e-3,
init_fn: Callable = init_layer_uniform,
):
"""Initialization."""
super(GaussianDist, self).__init__(
@@ -155,13 +165,11 @@ def __init__(

# set log_std layer
self.log_std_layer = nn.Linear(in_size, output_size)
self.log_std_layer.weight.data.uniform_(-init_w, init_w)
self.log_std_layer.bias.data.uniform_(-init_w, init_w)
self.log_std_layer = init_fn(self.log_std_layer)

# set mean layer
self.mu_layer = nn.Linear(in_size, output_size)
self.mu_layer.weight.data.uniform_(-init_w, init_w)
self.mu_layer.bias.data.uniform_(-init_w, init_w)
self.mu_layer = init_fn(self.mu_layer)

def get_dist_params(self, x: torch.Tensor) -> Tuple[torch.Tensor, ...]:
"""Return Gaussian distribution parameters."""
@@ -229,7 +237,7 @@ def __init__(
output_size: int,
hidden_sizes: list,
hidden_activation: Callable = F.relu,
init_w: float = 3e-3,
init_fn: Callable = init_layer_uniform,
):
"""Initialization."""
super(CategoricalDist, self).__init__(
@@ -244,8 +252,7 @@ def __init__(

# set the last (logit) layer
self.last_layer = nn.Linear(in_size, output_size)
self.last_layer.weight.data.uniform_(-init_w, init_w)
self.last_layer.bias.data.uniform_(-init_w, init_w)
self.last_layer = init_fn(self.last_layer)

def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, ...]:
"""Forward method implementation."""
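The refactor above replaces the hard-coded `init_w` bound with an injectable `init_fn` and lets callers swap `nn.Linear` via `linear_layer` (e.g. for a NoisyNet-style layer). A hedged usage sketch; `MLP` and `init_layer_uniform` are the names from this file, while the sizes and the custom initializer below are illustrative:

```python
import torch.nn as nn
import torch.nn.functional as F

from algorithms.common.networks.mlp import MLP  # the module shown above


def init_layer_zero_bias(layer: nn.Linear, init_w: float = 3e-3) -> nn.Linear:
    """Custom initializer with the same signature as init_layer_uniform."""
    layer.weight.data.uniform_(-init_w, init_w)
    layer.bias.data.zero_()
    return layer


# Default: plain nn.Linear layers, uniform init of the output layer.
mlp = MLP(input_size=4, output_size=2, hidden_sizes=[64, 64])

# Same architecture with a custom output-layer initializer injected via init_fn.
mlp_custom = MLP(
    input_size=4,
    output_size=2,
    hidden_sizes=[64, 64],
    hidden_activation=F.relu,
    init_fn=init_layer_zero_bias,
)
# `linear_layer` can be swapped in the same way, e.g. for a NoisyLinear-style
# module that takes (in_features, out_features) in its constructor.
```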
6 changes: 4 additions & 2 deletions algorithms/common/noise.py
@@ -15,21 +15,23 @@ class GaussianNoise:

def __init__(
self,
action_dim: int,
min_sigma: float = 1.0,
max_sigma: float = 1.0,
decay_period: int = 1000000,
):
"""Initialization."""
self.action_dim = action_dim
self.max_sigma = max_sigma
self.min_sigma = min_sigma
self.decay_period = decay_period

def sample(self, action_size: int, t: int = 0) -> float:
def sample(self, t: int = 0) -> np.ndarray:
"""Get an action with gaussian noise."""
sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(
1.0, t / self.decay_period
)
return np.random.normal(0, sigma, size=action_size)
return np.random.normal(0, sigma, size=self.action_dim)


class OUNoise:
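With `action_dim` moved into the constructor, callers no longer pass the action size on every `sample()` call. A short usage sketch (the dimension and sigma values are illustrative):

```python
import numpy as np

from algorithms.common.noise import GaussianNoise  # the module shown above

noise = GaussianNoise(action_dim=2, min_sigma=0.1, max_sigma=1.0, decay_period=1000000)

action = np.zeros(2)
for t in range(3):
    noisy_action = action + noise.sample(t)  # sigma decays from max_sigma toward min_sigma
    print(noisy_action.shape)                # (2,)
```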
33 changes: 26 additions & 7 deletions algorithms/dqn/agent.py
@@ -7,13 +7,15 @@
https://arxiv.org/pdf/1509.06461.pdf (Double DQN)
https://arxiv.org/pdf/1511.05952.pdf (PER)
https://arxiv.org/pdf/1511.06581.pdf (Dueling)
https://arxiv.org/pdf/1706.10295.pdf (NoisyNet)
https://arxiv.org/pdf/1707.06887.pdf (C51)
https://arxiv.org/pdf/1710.02298.pdf (Rainbow)
https://arxiv.org/pdf/1806.06923.pdf (IQN)
"""

import argparse
import datetime
import os
import time
from typing import Tuple

import gym
@@ -191,7 +193,7 @@ def _get_dqn_loss(
gamma=gamma,
)

def update_model(self) -> torch.Tensor:
def update_model(self) -> Tuple[torch.Tensor, torch.Tensor]:
"""Train the model after each episode."""
# 1 step loss
experiences_1 = self.memory.sample(self.beta)
@@ -239,6 +241,10 @@ def update_model(self) -> torch.Tensor:
fraction = min(float(self.i_episode) / self.args.episode_num, 1.0)
self.beta = self.beta + fraction * (1.0 - self.beta)

if self.hyper_params["USE_NOISY_NET"]:
self.dqn.reset_noise()
self.dqn_target.reset_noise()

return loss.data, q_values.mean().data

def load_params(self, path: str):
@@ -263,11 +269,11 @@ def save_params(self, n_episode: int):

Agent.save_params(self, params, n_episode)

def write_log(self, i: int, loss: np.ndarray, score: float):
def write_log(self, i: int, loss: np.ndarray, score: float, avg_time_cost: float):
"""Write log about loss and score"""
print(
"[INFO] episode %d, episode step: %d, total step: %d, total score: %f\n"
"epsilon: %f, loss: %f, avg q-value: %f at %s\n"
"epsilon: %f, loss: %f, avg q-value: %f (spent %.6f sec/step)\n"
% (
i,
self.episode_step,
@@ -276,12 +282,20 @@ def write_log(self, i: int, loss: np.ndarray, score: float):
self.epsilon,
loss[0],
loss[1],
datetime.datetime.now(),
avg_time_cost,
)
)

if self.args.log:
wandb.log({"score": score, "dqn loss": loss[0], "epsilon": self.epsilon})
wandb.log(
{
"score": score,
"epsilon": self.epsilon,
"dqn loss": loss[0],
"avg q values": loss[1],
"time per each step": avg_time_cost,
}
)

# pylint: disable=no-self-use, unnecessary-pass
def pretrain(self):
Expand Down Expand Up @@ -312,6 +326,8 @@ def train(self):
done = False
score = 0

t_begin = time.time()

while not done:
if self.args.render and self.i_episode >= self.args.render_after:
self.env.render()
@@ -334,9 +350,12 @@
state = next_state
score += reward

t_end = time.time()
avg_time_cost = (t_end - t_begin) / self.episode_step

if losses:
avg_loss = np.vstack(losses).mean(axis=0)
self.write_log(self.i_episode, avg_loss, score)
self.write_log(self.i_episode, avg_loss, score, avg_time_cost)

if self.i_episode % self.args.save_period == 0:
self.save_params(self.i_episode)
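The `USE_NOISY_NET` branch above draws a fresh exploration-noise sample after each model update by calling `reset_noise()` on the online and target networks. For context, here is a hedged sketch of a factorised-Gaussian noisy layer in the style of Fortunato et al. (reference 15 in the README); the repository's own layer may differ in detail:

```python
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class NoisyLinear(nn.Module):
    """Sketch of a factorised-Gaussian noisy linear layer (NoisyNet)."""

    def __init__(self, in_features: int, out_features: int, std_init: float = 0.5):
        super(NoisyLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.std_init = std_init

        self.weight_mu = nn.Parameter(torch.empty(out_features, in_features))
        self.weight_sigma = nn.Parameter(torch.empty(out_features, in_features))
        self.register_buffer("weight_epsilon", torch.empty(out_features, in_features))

        self.bias_mu = nn.Parameter(torch.empty(out_features))
        self.bias_sigma = nn.Parameter(torch.empty(out_features))
        self.register_buffer("bias_epsilon", torch.empty(out_features))

        self.reset_parameters()
        self.reset_noise()

    def reset_parameters(self):
        mu_range = 1.0 / math.sqrt(self.in_features)
        self.weight_mu.data.uniform_(-mu_range, mu_range)
        self.weight_sigma.data.fill_(self.std_init / math.sqrt(self.in_features))
        self.bias_mu.data.uniform_(-mu_range, mu_range)
        self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.out_features))

    @staticmethod
    def _scale_noise(size: int) -> torch.Tensor:
        x = torch.randn(size)
        return x.sign().mul_(x.abs().sqrt_())

    def reset_noise(self):
        """Draw a fresh factorised noise sample; called once per model update."""
        epsilon_in = self._scale_noise(self.in_features)
        epsilon_out = self._scale_noise(self.out_features)
        self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in))
        self.bias_epsilon.copy_(epsilon_out)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.training:
            weight = self.weight_mu + self.weight_sigma * self.weight_epsilon
            bias = self.bias_mu + self.bias_sigma * self.bias_epsilon
        else:  # evaluation uses the mean parameters only
            weight, bias = self.weight_mu, self.bias_mu
        return F.linear(x, weight, bias)
```

Such a layer could then be injected into the refactored `MLP` above through its `linear_layer` argument.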