Remove phrase 'Abstract' from abstract class names.
darthegg committed Apr 11, 2019 · 2 parents 17ef9a7 + ad9665f · commit 66a8f4a
Showing 28 changed files with 505 additions and 243 deletions.
13 changes: 9 additions & 4 deletions README.md
@@ -30,8 +30,8 @@ We are warmly welcoming external contributors! :)
6. [Behaviour Cloning (BC with DDPG, SAC)](https://github.com/medipixel/rl_algorithms/tree/master/algorithms/bc)
7. [Prioritized Experience Replay (PER with DDPG)](https://github.com/medipixel/rl_algorithms/tree/master/algorithms/per)
8. [From Demonstrations (DDPGfD, SACfD, DQfD)](https://github.com/medipixel/rl_algorithms/tree/master/algorithms/fd)
9. [Rainbow DQN (without NoisyNet)](https://github.com/medipixel/rl_algorithms/tree/master/algorithms/dqn)
10. [Rainbow IQN (without DuelingNet & NoisyNet)](https://github.com/medipixel/rl_algorithms/tree/master/algorithms/dqn)
9. [Rainbow DQN](https://github.com/medipixel/rl_algorithms/tree/master/algorithms/dqn)
10. [Rainbow IQN (without DuelingNet)](https://github.com/medipixel/rl_algorithms/tree/master/algorithms/dqn) - DuelingNet [degrades performance](https://github.com/medipixel/rl_algorithms/pull/137)

## Getting started
We have tested each algorithm on some of the following environments.
@@ -109,6 +109,10 @@ python <run-file> -h
- `--load-from <save-file-path>`
- Load the saved models and optimizers at the beginning.

### Class Diagram
Class diagram, drawn at commit [e447f3e](https://github.com/medipixel/rl_algorithms/commit/e447f3e743f6f85505f2275b646e46f0adcf8f89). It will not be updated frequently.
![rl_algorithms_cls](https://user-images.githubusercontent.com/14961526/55703648-26022a80-5a15-11e9-8099-9bbfdffcb96d.png)

### W&B for logging
We use [W&B](https://www.wandb.com/) for logging network parameters and other metrics. For more details, read the [W&B tutorial](https://docs.wandb.com/docs/started.html).
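For readers new to W&B, here is a minimal, hedged sketch of the kind of logging call the agents make (the project name below is a placeholder, not the repository's actual configuration):

```python
import wandb

# Hypothetical project name; the repository configures its own run settings.
wandb.init(project="rl_algorithms_example")

for episode in range(3):
    score = float(episode)       # placeholder metric
    wandb.log({"score": score})  # mirrors the wandb.log(...) calls in the diffs below
```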

@@ -128,5 +132,6 @@ We use [W&B](https://www.wandb.com/) for logging of network parameters and other
12. [Z. Wang et al., "Dueling Network Architectures for Deep Reinforcement Learning." arXiv preprint arXiv:1511.06581, 2015.](https://arxiv.org/pdf/1511.06581.pdf)
13. [T. Hester et al., "Deep Q-learning from Demonstrations." arXiv preprint arXiv:1704.03732, 2017.](https://arxiv.org/pdf/1704.03732.pdf)
14. [M. G. Bellemare et al., "A Distributional Perspective on Reinforcement Learning." arXiv preprint arXiv:1707.06887, 2017.](https://arxiv.org/pdf/1707.06887.pdf)
15. [M. Hessel et al., "Rainbow: Combining Improvements in Deep Reinforcement Learning." arXiv preprint arXiv:1710.02298, 2017.](https://arxiv.org/pdf/1710.02298.pdf)
16. [W. Dabney et al., "Implicit Quantile Networks for Distributional Reinforcement Learning." arXiv preprint arXiv:1806.06923, 2018.](https://arxiv.org/pdf/1806.06923.pdf)
15. [M. Fortunato et al., "Noisy Networks for Exploration." arXiv preprint arXiv:1706.10295, 2017.](https://arxiv.org/pdf/1706.10295.pdf)
16. [M. Hessel et al., "Rainbow: Combining Improvements in Deep Reinforcement Learning." arXiv preprint arXiv:1710.02298, 2017.](https://arxiv.org/pdf/1710.02298.pdf)
17. [W. Dabney et al., "Implicit Quantile Networks for Distributional Reinforcement Learning." arXiv preprint arXiv:1806.06923, 2018.](https://arxiv.org/pdf/1806.06923.pdf)
13 changes: 7 additions & 6 deletions algorithms/bc/sac_agent.py
@@ -115,8 +115,9 @@ def _add_transition_to_memory(self, transition: Tuple[np.ndarray, ...]):

def update_model(self) -> Tuple[torch.Tensor, ...]:
"""Train the model after each episode."""
experiences = self.memory.sample()
demos = self.demo_memory.sample()
self.update_step += 1

experiences, demos = self.memory.sample(), self.demo_memory.sample()

states, actions, rewards, next_states, dones = experiences
demo_states, demo_actions, _, _, _ = demos
@@ -169,7 +170,7 @@ def update_model(self) -> Tuple[torch.Tensor, ...]:
vf_loss.backward()
self.vf_optimizer.step()

if self.total_step % self.hyper_params["DELAYED_UPDATE"] == 0:
if self.update_step % self.hyper_params["POLICY_UPDATE_FREQ"] == 0:
# bc loss
qf_mask = torch.gt(
self.qf_1(demo_states, demo_actions),
@@ -223,7 +224,7 @@ def update_model(self) -> Tuple[torch.Tensor, ...]:
)

def write_log(
self, i: int, loss: np.ndarray, score: float = 0.0, delayed_update: int = 1
self, i: int, loss: np.ndarray, score: float = 0.0, policy_update_freq: int = 1
):
"""Write log about loss and score"""
total_loss = loss.sum()
@@ -238,7 +239,7 @@ def write_log(
self.total_step,
score,
total_loss,
loss[0] * delayed_update, # actor loss
loss[0] * policy_update_freq, # actor loss
loss[1], # qf_1 loss
loss[2], # qf_2 loss
loss[3], # vf loss
@@ -252,7 +253,7 @@ def write_log(
{
"score": score,
"total loss": total_loss,
"actor loss": loss[0] * delayed_update,
"actor loss": loss[0] * policy_update_freq,
"qf_1 loss": loss[1],
"qf_2 loss": loss[2],
"vf loss": loss[3],
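The hunk above ties the less frequent actor/BC update to a dedicated `update_step` counter and renames the hyperparameter from `DELAYED_UPDATE` to `POLICY_UPDATE_FREQ`. A minimal sketch of that delayed-update pattern, with placeholder update bodies (not the repository's exact code):

```python
class DelayedPolicyUpdateSketch:
    """Illustrates the counter-based delayed policy update used above."""

    def __init__(self, policy_update_freq: int = 2):
        self.policy_update_freq = policy_update_freq
        self.update_step = 0  # counts update_model() calls, not environment steps

    def update_model(self):
        self.update_step += 1

        self._update_critics()  # value/critic losses are applied on every call

        # actor (and BC) losses are applied only every `policy_update_freq` calls
        if self.update_step % self.policy_update_freq == 0:
            self._update_actor()

    def _update_critics(self):
        pass  # placeholder

    def _update_actor(self):
        pass  # placeholder
```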
2 changes: 0 additions & 2 deletions algorithms/common/abstract/reward_fn.py
@@ -13,8 +13,6 @@ class RewardFn(ABC):
"""Abstract class for computing reward.
New compute_reward class should redefine __call__()
Attributes:
"""

@abstractmethod
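For context, `RewardFn` is an abstract callable that subclasses implement via `__call__`. A hedged sketch of how such a class might be defined and subclassed; the `__call__` signature below is illustrative, since the hunk does not show the repository's exact one:

```python
from abc import ABC, abstractmethod

import numpy as np


class RewardFnSketch(ABC):
    """Abstract callable for computing a (shaped) reward."""

    @abstractmethod
    def __call__(self, state: np.ndarray, action: np.ndarray) -> np.float64:
        raise NotImplementedError


class NegativeDistanceReward(RewardFnSketch):
    """Example subclass: reward is the negative L2 norm of the state."""

    def __call__(self, state: np.ndarray, action: np.ndarray) -> np.float64:
        return np.float64(-np.linalg.norm(state))


reward_fn = NegativeDistanceReward()
print(reward_fn(np.array([3.0, 4.0]), np.array([0.0])))  # -5.0
```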
3 changes: 2 additions & 1 deletion algorithms/common/buffer/segment_tree.py
@@ -102,7 +102,8 @@ def sum(self, start: int = 0, end: int = 0) -> float:

def retrieve(self, upperbound: float) -> int:
"""Find the highest index `i` such that the sum of the first `i` leaves does not exceed `upperbound`"""
assert 0 <= upperbound <= self.sum() + 1e-5
# TODO: Check assert case and fix bug
assert 0 <= upperbound <= self.sum() + 1e-5, "upperbound: {}".format(upperbound)

idx = 1

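The assertion above bounds `upperbound` by the tree's total sum because `retrieve` walks down a binary sum tree consuming that budget. A hedged sketch of the standard array-backed sum-tree retrieval (leaves stored at indices `capacity .. 2*capacity - 1`); attribute names in the repository's class may differ:

```python
def retrieve_sketch(tree: list, capacity: int, upperbound: float) -> int:
    """Return the leaf (data) index whose cumulative sum is the first to exceed `upperbound`."""
    assert 0 <= upperbound <= tree[1] + 1e-5, "upperbound: {}".format(upperbound)

    idx = 1  # start at the root, which stores the total sum
    while idx < capacity:  # internal node
        left = 2 * idx
        if tree[left] > upperbound:
            idx = left                # the answer lies in the left subtree
        else:
            upperbound -= tree[left]  # skip the mass of the left subtree
            idx = left + 1            # descend into the right subtree
    return idx - capacity             # convert tree index to data index


# Tiny usage example: capacity 4, priorities [1, 2, 3, 4], total sum 10.
tree = [0, 10, 3, 7, 1, 2, 3, 4]
print(retrieve_sketch(tree, capacity=4, upperbound=3.5))  # -> 2
```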
2 changes: 1 addition & 1 deletion algorithms/common/env/atari_wrappers.py
@@ -8,7 +8,7 @@

import cv2
import gym
from gym import spaces
import gym.spaces as spaces
import numpy as np

os.environ.setdefault("PATH", "")
35 changes: 21 additions & 14 deletions algorithms/common/networks/mlp.py
@@ -31,6 +31,14 @@ def concat(
return in_concat


def init_layer_uniform(layer: nn.Linear, init_w: float = 3e-3) -> nn.Linear:
"""Initialize the layer's parameters uniformly in [-init_w, init_w]"""
layer.weight.data.uniform_(-init_w, init_w)
layer.bias.data.uniform_(-init_w, init_w)

return layer


class MLP(nn.Module):
"""Baseline of Multilayer perceptron.
@@ -53,9 +61,10 @@ def __init__(
hidden_sizes: list,
hidden_activation: Callable = F.relu,
output_activation: Callable = identity,
linear_layer: nn.Module = nn.Linear,
use_output_layer: bool = True,
n_category: int = -1,
init_w: float = 3e-3,
init_fn: Callable = init_layer_uniform,
):
"""Initialization.
@@ -65,9 +74,10 @@ def __init__(
hidden_sizes (list): number of hidden layers
hidden_activation (function): activation function of hidden layers
output_activation (function): activation function of output layer
linear_layer (nn.Module): linear layer of mlp
use_output_layer (bool): whether or not to use the last layer
n_category (int): category number (-1 if the action is continuous)
init_w (float): weight initialization bound for the last layer
init_fn (Callable): weight initialization function for the last layer
"""
super(MLP, self).__init__()
@@ -77,23 +87,23 @@ def __init__(
self.output_size = output_size
self.hidden_activation = hidden_activation
self.output_activation = output_activation
self.linear_layer = linear_layer
self.use_output_layer = use_output_layer
self.n_category = n_category

# set hidden layers
self.hidden_layers: list = []
in_size = self.input_size
for i, next_size in enumerate(hidden_sizes):
fc = nn.Linear(in_size, next_size)
fc = self.linear_layer(in_size, next_size)
in_size = next_size
self.__setattr__("hidden_fc{}".format(i), fc)
self.hidden_layers.append(fc)

# set output layers
if self.use_output_layer:
self.output_layer = nn.Linear(in_size, output_size)
self.output_layer.weight.data.uniform_(-init_w, init_w)
self.output_layer.bias.data.uniform_(-init_w, init_w)
self.output_layer = self.linear_layer(in_size, output_size)
self.output_layer = init_fn(self.output_layer)
else:
self.output_layer = identity
self.output_activation = identity
@@ -137,7 +147,7 @@ def __init__(
mu_activation: Callable = torch.tanh,
log_std_min: float = -20,
log_std_max: float = 2,
init_w: float = 3e-3,
init_fn: Callable = init_layer_uniform,
):
"""Initialization."""
super(GaussianDist, self).__init__(
@@ -155,13 +165,11 @@ def __init__(

# set log_std layer
self.log_std_layer = nn.Linear(in_size, output_size)
self.log_std_layer.weight.data.uniform_(-init_w, init_w)
self.log_std_layer.bias.data.uniform_(-init_w, init_w)
self.log_std_layer = init_fn(self.log_std_layer)

# set mean layer
self.mu_layer = nn.Linear(in_size, output_size)
self.mu_layer.weight.data.uniform_(-init_w, init_w)
self.mu_layer.bias.data.uniform_(-init_w, init_w)
self.mu_layer = init_fn(self.mu_layer)

def get_dist_params(self, x: torch.Tensor) -> Tuple[torch.Tensor, ...]:
"""Return Gaussian distribution parameters."""
@@ -229,7 +237,7 @@ def __init__(
output_size: int,
hidden_sizes: list,
hidden_activation: Callable = F.relu,
init_w: float = 3e-3,
init_fn: Callable = init_layer_uniform,
):
"""Initialization."""
super(CategoricalDist, self).__init__(
@@ -244,8 +252,7 @@ def __init__(

# set the last (logit) layer
self.last_layer = nn.Linear(in_size, output_size)
self.last_layer.weight.data.uniform_(-init_w, init_w)
self.last_layer.bias.data.uniform_(-init_w, init_w)
self.last_layer = init_fn(self.last_layer)

def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, ...]:
"""Forward method implementation."""
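The refactor above replaces the hard-coded `init_w` bound with an injectable `init_fn` and lets callers swap `nn.Linear` via `linear_layer` (e.g. for a NoisyNet-style layer). A hedged usage sketch; `MLP` and `init_layer_uniform` are the names from this file, while the sizes and the custom initializer below are illustrative:

```python
import torch.nn as nn
import torch.nn.functional as F

from algorithms.common.networks.mlp import MLP  # the module shown above


def init_layer_zero_bias(layer: nn.Linear, init_w: float = 3e-3) -> nn.Linear:
    """Custom initializer with the same signature as init_layer_uniform."""
    layer.weight.data.uniform_(-init_w, init_w)
    layer.bias.data.zero_()
    return layer


# Default: plain nn.Linear layers, uniform init of the output layer.
mlp = MLP(input_size=4, output_size=2, hidden_sizes=[64, 64])

# Same architecture with a custom output-layer initializer injected via init_fn.
mlp_custom = MLP(
    input_size=4,
    output_size=2,
    hidden_sizes=[64, 64],
    hidden_activation=F.relu,
    init_fn=init_layer_zero_bias,
)
# `linear_layer` can be swapped in the same way, e.g. for a NoisyLinear-style
# module that takes (in_features, out_features) in its constructor.
```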
6 changes: 4 additions & 2 deletions algorithms/common/noise.py
@@ -15,21 +15,23 @@ class GaussianNoise:

def __init__(
self,
action_dim: int,
min_sigma: float = 1.0,
max_sigma: float = 1.0,
decay_period: int = 1000000,
):
"""Initialization."""
self.action_dim = action_dim
self.max_sigma = max_sigma
self.min_sigma = min_sigma
self.decay_period = decay_period

def sample(self, action_size: int, t: int = 0) -> float:
def sample(self, t: int = 0) -> np.ndarray:
"""Get an action with gaussian noise."""
sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(
1.0, t / self.decay_period
)
return np.random.normal(0, sigma, size=action_size)
return np.random.normal(0, sigma, size=self.action_dim)


class OUNoise:
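With `action_dim` moved into the constructor, callers no longer pass the action size on every `sample()` call. A short usage sketch (the dimension and sigma values are illustrative):

```python
import numpy as np

from algorithms.common.noise import GaussianNoise  # the module shown above

noise = GaussianNoise(action_dim=2, min_sigma=0.1, max_sigma=1.0, decay_period=1000000)

action = np.zeros(2)
for t in range(3):
    noisy_action = action + noise.sample(t)  # sigma decays from max_sigma toward min_sigma
    print(noisy_action.shape)                # (2,)
```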
33 changes: 26 additions & 7 deletions algorithms/dqn/agent.py
@@ -7,13 +7,15 @@
https://arxiv.org/pdf/1509.06461.pdf (Double DQN)
https://arxiv.org/pdf/1511.05952.pdf (PER)
https://arxiv.org/pdf/1511.06581.pdf (Dueling)
https://arxiv.org/pdf/1706.10295.pdf (NoisyNet)
https://arxiv.org/pdf/1707.06887.pdf (C51)
https://arxiv.org/pdf/1710.02298.pdf (Rainbow)
https://arxiv.org/pdf/1806.06923.pdf (IQN)
"""

import argparse
import datetime
import os
import time
from typing import Tuple

import gym
@@ -191,7 +193,7 @@ def _get_dqn_loss(
gamma=gamma,
)

def update_model(self) -> torch.Tensor:
def update_model(self) -> Tuple[torch.Tensor, torch.Tensor]:
"""Train the model after each episode."""
# 1 step loss
experiences_1 = self.memory.sample(self.beta)
@@ -239,6 +241,10 @@ def update_model(self) -> torch.Tensor:
fraction = min(float(self.i_episode) / self.args.episode_num, 1.0)
self.beta = self.beta + fraction * (1.0 - self.beta)

if self.hyper_params["USE_NOISY_NET"]:
self.dqn.reset_noise()
self.dqn_target.reset_noise()

return loss.data, q_values.mean().data

def load_params(self, path: str):
@@ -263,11 +269,11 @@ def save_params(self, n_episode: int):

Agent.save_params(self, params, n_episode)

def write_log(self, i: int, loss: np.ndarray, score: float):
def write_log(self, i: int, loss: np.ndarray, score: float, avg_time_cost: float):
"""Write log about loss and score"""
print(
"[INFO] episode %d, episode step: %d, total step: %d, total score: %f\n"
"epsilon: %f, loss: %f, avg q-value: %f at %s\n"
"epsilon: %f, loss: %f, avg q-value: %f (spent %.6f sec/step)\n"
% (
i,
self.episode_step,
@@ -276,12 +282,20 @@ def write_log(self, i: int, loss: np.ndarray, score: float):
self.epsilon,
loss[0],
loss[1],
datetime.datetime.now(),
avg_time_cost,
)
)

if self.args.log:
wandb.log({"score": score, "dqn loss": loss[0], "epsilon": self.epsilon})
wandb.log(
{
"score": score,
"epsilon": self.epsilon,
"dqn loss": loss[0],
"avg q values": loss[1],
"time per each step": avg_time_cost,
}
)

# pylint: disable=no-self-use, unnecessary-pass
def pretrain(self):
Expand Down Expand Up @@ -312,6 +326,8 @@ def train(self):
done = False
score = 0

t_begin = time.time()

while not done:
if self.args.render and self.i_episode >= self.args.render_after:
self.env.render()
@@ -334,9 +350,12 @@
state = next_state
score += reward

t_end = time.time()
avg_time_cost = (t_end - t_begin) / self.episode_step

if losses:
avg_loss = np.vstack(losses).mean(axis=0)
self.write_log(self.i_episode, avg_loss, score)
self.write_log(self.i_episode, avg_loss, score, avg_time_cost)

if self.i_episode % self.args.save_period == 0:
self.save_params(self.i_episode)
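The `USE_NOISY_NET` branch above draws a fresh exploration-noise sample after each model update by calling `reset_noise()` on the online and target networks. For context, here is a hedged sketch of a factorised-Gaussian noisy layer in the style of Fortunato et al. (reference 15 in the README); the repository's own layer may differ in detail:

```python
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class NoisyLinear(nn.Module):
    """Sketch of a factorised-Gaussian noisy linear layer (NoisyNet)."""

    def __init__(self, in_features: int, out_features: int, std_init: float = 0.5):
        super(NoisyLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.std_init = std_init

        self.weight_mu = nn.Parameter(torch.empty(out_features, in_features))
        self.weight_sigma = nn.Parameter(torch.empty(out_features, in_features))
        self.register_buffer("weight_epsilon", torch.empty(out_features, in_features))

        self.bias_mu = nn.Parameter(torch.empty(out_features))
        self.bias_sigma = nn.Parameter(torch.empty(out_features))
        self.register_buffer("bias_epsilon", torch.empty(out_features))

        self.reset_parameters()
        self.reset_noise()

    def reset_parameters(self):
        mu_range = 1.0 / math.sqrt(self.in_features)
        self.weight_mu.data.uniform_(-mu_range, mu_range)
        self.weight_sigma.data.fill_(self.std_init / math.sqrt(self.in_features))
        self.bias_mu.data.uniform_(-mu_range, mu_range)
        self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.out_features))

    @staticmethod
    def _scale_noise(size: int) -> torch.Tensor:
        x = torch.randn(size)
        return x.sign().mul_(x.abs().sqrt_())

    def reset_noise(self):
        """Draw a fresh factorised noise sample; called once per model update."""
        epsilon_in = self._scale_noise(self.in_features)
        epsilon_out = self._scale_noise(self.out_features)
        self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in))
        self.bias_epsilon.copy_(epsilon_out)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.training:
            weight = self.weight_mu + self.weight_sigma * self.weight_epsilon
            bias = self.bias_mu + self.bias_sigma * self.bias_epsilon
        else:  # evaluation uses the mean parameters only
            weight, bias = self.weight_mu, self.bias_mu
        return F.linear(x, weight, bias)
```

Such a layer could then be injected into the refactored `MLP` above through its `linear_layer` argument.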