diff --git a/README.md b/README.md index 04e387e1..b706af5f 100644 --- a/README.md +++ b/README.md @@ -30,8 +30,8 @@ We are warmly welcoming external contributors! :) 6. [Behaviour Cloning (BC with DDPG, SAC)](https://github.com/medipixel/rl_algorithms/tree/master/algorithms/bc) 7. [Prioritized Experience Replay (PER with DDPG)](https://github.com/medipixel/rl_algorithms/tree/master/algorithms/per) 8. [From Demonstrations (DDPGfD, SACfD, DQfD)](https://github.com/medipixel/rl_algorithms/tree/master/algorithms/fd) -9. [Rainbow DQN (without NoisyNet)](https://github.com/medipixel/rl_algorithms/tree/master/algorithms/dqn) -10. [Rainbow IQN (without DuelingNet & NoisyNet)](https://github.com/medipixel/rl_algorithms/tree/master/algorithms/dqn) +9. [Rainbow DQN](https://github.com/medipixel/rl_algorithms/tree/master/algorithms/dqn) +10. [Rainbow IQN (without DuelingNet)](https://github.com/medipixel/rl_algorithms/tree/master/algorithms/dqn) - DuelingNet [degrades performance](https://github.com/medipixel/rl_algorithms/pull/137) ## Getting started We have tested each algorithm on some of the following environments. @@ -109,6 +109,10 @@ python -h - `--load-from ` - Load the saved models and optimizers at the beginning. +### Class Diagram +Class diagram drawn on [e447f3e](https://github.com/medipixel/rl_algorithms/commit/e447f3e743f6f85505f2275b646e46f0adcf8f89). This won't be frequently updated. +![rl_algorithms_cls](https://user-images.githubusercontent.com/14961526/55703648-26022a80-5a15-11e9-8099-9bbfdffcb96d.png) + ### W&B for logging We use [W&B](https://www.wandb.com/) for logging of network parameters and others. For more details, read [W&B tutorial](https://docs.wandb.com/docs/started.html). @@ -128,5 +132,6 @@ We use [W&B](https://www.wandb.com/) for logging of network parameters and other 12. [Z. Wang et al., "Dueling Network Architectures for Deep Reinforcement Learning." arXiv preprint arXiv:1511.06581, 2015.](https://arxiv.org/pdf/1511.06581.pdf) 13. [T. Hester et al., "Deep Q-learning from Demonstrations." arXiv preprint arXiv:1704.03732, 2017.](https://arxiv.org/pdf/1704.03732.pdf) 14. [M. G. Bellemare et al., "A Distributional Perspective on Reinforcement Learning." arXiv preprint arXiv:1707.06887, 2017.](https://arxiv.org/pdf/1707.06887.pdf) -15. [M. Hessel et al., "Rainbow: Combining Improvements in Deep Reinforcement Learning." arXiv preprint arXiv:1710.02298, 2017.](https://arxiv.org/pdf/1710.02298.pdf) -16. [W. Dabney et al., "Implicit Quantile Networks for Distributional Reinforcement Learning." arXiv preprint arXiv:1806.06923, 2018.](https://arxiv.org/pdf/1806.06923.pdf) +15. [M. Fortunato et al., "Noisy Networks for Exploration." arXiv preprint arXiv:1706.10295, 2017.](https://arxiv.org/pdf/1706.10295.pdf) +16. [M. Hessel et al., "Rainbow: Combining Improvements in Deep Reinforcement Learning." arXiv preprint arXiv:1710.02298, 2017.](https://arxiv.org/pdf/1710.02298.pdf) +17. [W. Dabney et al., "Implicit Quantile Networks for Distributional Reinforcement Learning." 
arXiv preprint arXiv:1806.06923, 2018.](https://arxiv.org/pdf/1806.06923.pdf) diff --git a/algorithms/bc/sac_agent.py b/algorithms/bc/sac_agent.py index d77b6e04..8b532016 100644 --- a/algorithms/bc/sac_agent.py +++ b/algorithms/bc/sac_agent.py @@ -115,8 +115,9 @@ def _add_transition_to_memory(self, transition: Tuple[np.ndarray, ...]): def update_model(self) -> Tuple[torch.Tensor, ...]: """Train the model after each episode.""" - experiences = self.memory.sample() - demos = self.demo_memory.sample() + self.update_step += 1 + + experiences, demos = self.memory.sample(), self.demo_memory.sample() states, actions, rewards, next_states, dones = experiences demo_states, demo_actions, _, _, _ = demos @@ -169,7 +170,7 @@ def update_model(self) -> Tuple[torch.Tensor, ...]: vf_loss.backward() self.vf_optimizer.step() - if self.total_step % self.hyper_params["DELAYED_UPDATE"] == 0: + if self.update_step % self.hyper_params["POLICY_UPDATE_FREQ"] == 0: # bc loss qf_mask = torch.gt( self.qf_1(demo_states, demo_actions), @@ -223,7 +224,7 @@ def update_model(self) -> Tuple[torch.Tensor, ...]: ) def write_log( - self, i: int, loss: np.ndarray, score: float = 0.0, delayed_update: int = 1 + self, i: int, loss: np.ndarray, score: float = 0.0, policy_update_freq: int = 1 ): """Write log about loss and score""" total_loss = loss.sum() @@ -238,7 +239,7 @@ def write_log( self.total_step, score, total_loss, - loss[0] * delayed_update, # actor loss + loss[0] * policy_update_freq, # actor loss loss[1], # qf_1 loss loss[2], # qf_2 loss loss[3], # vf loss @@ -252,7 +253,7 @@ def write_log( { "score": score, "total loss": total_loss, - "actor loss": loss[0] * delayed_update, + "actor loss": loss[0] * policy_update_freq, "qf_1 loss": loss[1], "qf_2 loss": loss[2], "vf loss": loss[3], diff --git a/algorithms/common/abstract/reward_fn.py b/algorithms/common/abstract/reward_fn.py index a64134c7..532865df 100644 --- a/algorithms/common/abstract/reward_fn.py +++ b/algorithms/common/abstract/reward_fn.py @@ -13,8 +13,6 @@ class RewardFn(ABC): """Abstract class for computing reward. 
New compute_reward class should redefine __call__() - Attributes: - """ @abstractmethod diff --git a/algorithms/common/buffer/segment_tree.py b/algorithms/common/buffer/segment_tree.py index 0e2d44cf..30c60f2f 100644 --- a/algorithms/common/buffer/segment_tree.py +++ b/algorithms/common/buffer/segment_tree.py @@ -102,7 +102,8 @@ def sum(self, start: int = 0, end: int = 0) -> float: def retrieve(self, upperbound: float) -> int: """Find the highest index `i` about upper bound in the tree""" - assert 0 <= upperbound <= self.sum() + 1e-5 + # TODO: Check assert case and fix bug + assert 0 <= upperbound <= self.sum() + 1e-5, "upperbound: {}".format(upperbound) idx = 1 diff --git a/algorithms/common/env/atari_wrappers.py b/algorithms/common/env/atari_wrappers.py index 8a4ffee5..d3e65f8e 100644 --- a/algorithms/common/env/atari_wrappers.py +++ b/algorithms/common/env/atari_wrappers.py @@ -8,7 +8,7 @@ import cv2 import gym -from gym import spaces +import gym.spaces as spaces import numpy as np os.environ.setdefault("PATH", "") diff --git a/algorithms/common/networks/mlp.py b/algorithms/common/networks/mlp.py index 66ac125f..4734c393 100644 --- a/algorithms/common/networks/mlp.py +++ b/algorithms/common/networks/mlp.py @@ -31,6 +31,14 @@ def concat( return in_concat +def init_layer_uniform(layer: nn.Linear, init_w: float = 3e-3) -> nn.Linear: + """Init uniform parameters on the single layer""" + layer.weight.data.uniform_(-init_w, init_w) + layer.bias.data.uniform_(-init_w, init_w) + + return layer + + class MLP(nn.Module): """Baseline of Multilayer perceptron. @@ -53,9 +61,10 @@ def __init__( hidden_sizes: list, hidden_activation: Callable = F.relu, output_activation: Callable = identity, + linear_layer: nn.Module = nn.Linear, use_output_layer: bool = True, n_category: int = -1, - init_w: float = 3e-3, + init_fn: Callable = init_layer_uniform, ): """Initialization. 
@@ -65,9 +74,10 @@ def __init__( hidden_sizes (list): number of hidden layers hidden_activation (function): activation function of hidden layers output_activation (function): activation function of output layer + linear_layer (nn.Module): linear layer of mlp use_output_layer (bool): whether or not to use the last layer n_category (int): category number (-1 if the action is continuous) - init_w (float): weight initialization bound for the last layer + init_fn (Callable): weight initialization function bound for the last layer """ super(MLP, self).__init__() @@ -77,6 +87,7 @@ def __init__( self.output_size = output_size self.hidden_activation = hidden_activation self.output_activation = output_activation + self.linear_layer = linear_layer self.use_output_layer = use_output_layer self.n_category = n_category @@ -84,16 +95,15 @@ def __init__( self.hidden_layers: list = [] in_size = self.input_size for i, next_size in enumerate(hidden_sizes): - fc = nn.Linear(in_size, next_size) + fc = self.linear_layer(in_size, next_size) in_size = next_size self.__setattr__("hidden_fc{}".format(i), fc) self.hidden_layers.append(fc) # set output layers if self.use_output_layer: - self.output_layer = nn.Linear(in_size, output_size) - self.output_layer.weight.data.uniform_(-init_w, init_w) - self.output_layer.bias.data.uniform_(-init_w, init_w) + self.output_layer = self.linear_layer(in_size, output_size) + self.output_layer = init_fn(self.output_layer) else: self.output_layer = identity self.output_activation = identity @@ -137,7 +147,7 @@ def __init__( mu_activation: Callable = torch.tanh, log_std_min: float = -20, log_std_max: float = 2, - init_w: float = 3e-3, + init_fn: Callable = init_layer_uniform, ): """Initialization.""" super(GaussianDist, self).__init__( @@ -155,13 +165,11 @@ def __init__( # set log_std layer self.log_std_layer = nn.Linear(in_size, output_size) - self.log_std_layer.weight.data.uniform_(-init_w, init_w) - self.log_std_layer.bias.data.uniform_(-init_w, init_w) + self.log_std_layer = init_fn(self.log_std_layer) # set mean layer self.mu_layer = nn.Linear(in_size, output_size) - self.mu_layer.weight.data.uniform_(-init_w, init_w) - self.mu_layer.bias.data.uniform_(-init_w, init_w) + self.mu_layer = init_fn(self.mu_layer) def get_dist_params(self, x: torch.Tensor) -> Tuple[torch.Tensor, ...]: """Return gausian distribution parameters.""" @@ -229,7 +237,7 @@ def __init__( output_size: int, hidden_sizes: list, hidden_activation: Callable = F.relu, - init_w: float = 3e-3, + init_fn: Callable = init_layer_uniform, ): """Initialization.""" super(CategoricalDist, self).__init__( @@ -244,8 +252,7 @@ def __init__( # set log_std layer self.last_layer = nn.Linear(in_size, output_size) - self.last_layer.weight.data.uniform_(-init_w, init_w) - self.last_layer.bias.data.uniform_(-init_w, init_w) + self.last_layer = init_fn(self.last_layer) def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, ...]: """Forward method implementation.""" diff --git a/algorithms/common/noise.py b/algorithms/common/noise.py index b6470524..25f9190d 100644 --- a/algorithms/common/noise.py +++ b/algorithms/common/noise.py @@ -15,21 +15,23 @@ class GaussianNoise: def __init__( self, + action_dim: int, min_sigma: float = 1.0, max_sigma: float = 1.0, decay_period: int = 1000000, ): """Initialization.""" + self.action_dim = action_dim self.max_sigma = max_sigma self.min_sigma = min_sigma self.decay_period = decay_period - def sample(self, action_size: int, t: int = 0) -> float: + def sample(self, t: int = 0) -> float: """Get 
an action with gaussian noise.""" sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min( 1.0, t / self.decay_period ) - return np.random.normal(0, sigma, size=action_size) + return np.random.normal(0, sigma, size=self.action_dim) class OUNoise: diff --git a/algorithms/dqn/agent.py b/algorithms/dqn/agent.py index dd81c76b..17884606 100644 --- a/algorithms/dqn/agent.py +++ b/algorithms/dqn/agent.py @@ -7,13 +7,15 @@ https://arxiv.org/pdf/1509.06461.pdf (Double DQN) https://arxiv.org/pdf/1511.05952.pdf (PER) https://arxiv.org/pdf/1511.06581.pdf (Dueling) + https://arxiv.org/pdf/1706.10295.pdf (NoisyNet) https://arxiv.org/pdf/1707.06887.pdf (C51) + https://arxiv.org/pdf/1710.02298.pdf (Rainbow) https://arxiv.org/pdf/1806.06923.pdf (IQN) """ import argparse -import datetime import os +import time from typing import Tuple import gym @@ -191,7 +193,7 @@ def _get_dqn_loss( gamma=gamma, ) - def update_model(self) -> torch.Tensor: + def update_model(self) -> Tuple[torch.Tensor, torch.Tensor]: """Train the model after each episode.""" # 1 step loss experiences_1 = self.memory.sample(self.beta) @@ -239,6 +241,10 @@ def update_model(self) -> torch.Tensor: fraction = min(float(self.i_episode) / self.args.episode_num, 1.0) self.beta = self.beta + fraction * (1.0 - self.beta) + if self.hyper_params["USE_NOISY_NET"]: + self.dqn.reset_noise() + self.dqn_target.reset_noise() + return loss.data, q_values.mean().data def load_params(self, path: str): @@ -263,11 +269,11 @@ def save_params(self, n_episode: int): Agent.save_params(self, params, n_episode) - def write_log(self, i: int, loss: np.ndarray, score: float): + def write_log(self, i: int, loss: np.ndarray, score: float, avg_time_cost: float): """Write log about loss and score""" print( "[INFO] episode %d, episode step: %d, total step: %d, total score: %f\n" - "epsilon: %f, loss: %f, avg q-value: %f at %s\n" + "epsilon: %f, loss: %f, avg q-value: %f (spent %.6f sec/step)\n" % ( i, self.episode_step, @@ -276,12 +282,20 @@ def write_log(self, i: int, loss: np.ndarray, score: float): self.epsilon, loss[0], loss[1], - datetime.datetime.now(), + avg_time_cost, ) ) if self.args.log: - wandb.log({"score": score, "dqn loss": loss[0], "epsilon": self.epsilon}) + wandb.log( + { + "score": score, + "epsilon": self.epsilon, + "dqn loss": loss[0], + "avg q values": loss[1], + "time per each step": avg_time_cost, + } + ) # pylint: disable=no-self-use, unnecessary-pass def pretrain(self): @@ -312,6 +326,8 @@ def train(self): done = False score = 0 + t_begin = time.time() + while not done: if self.args.render and self.i_episode >= self.args.render_after: self.env.render() @@ -334,9 +350,12 @@ def train(self): state = next_state score += reward + t_end = time.time() + avg_time_cost = (t_end - t_begin) / self.episode_step + if losses: avg_loss = np.vstack(losses).mean(axis=0) - self.write_log(self.i_episode, avg_loss, score) + self.write_log(self.i_episode, avg_loss, score, avg_time_cost) if self.i_episode % self.args.save_period == 0: self.save_params(self.i_episode) diff --git a/algorithms/dqn/linear.py b/algorithms/dqn/linear.py new file mode 100644 index 00000000..b2907013 --- /dev/null +++ b/algorithms/dqn/linear.py @@ -0,0 +1,114 @@ +# -*- coding: utf-8 -*- +"""Linear module for dqn algorithms + +- Author: Kh Kim +- Contact: kh.kim@medipixel.io +""" + +import math + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + +class 
NoisyLinear(nn.Module): + """Noisy linear module for NoisyNet. + + References: + https://github.com/higgsfield/RL-Adventure/blob/master/5.noisy%20dqn.ipynb + https://github.com/Kaixhin/Rainbow/blob/master/model.py + + Attributes: + in_features (int): input size of linear module + out_features (int): output size of linear module + std_init (float): initial std value + weight_mu (nn.Parameter): mean value weight parameter + weight_sigma (nn.Parameter): std value weight parameter + bias_mu (nn.Parameter): mean value bias parameter + bias_sigma (nn.Parameter): std value bias parameter + + """ + + def __init__(self, in_features: int, out_features: int, std_init: float = 0.5): + """Initialization.""" + super(NoisyLinear, self).__init__() + self.in_features = in_features + self.out_features = out_features + self.std_init = std_init + + self.weight_mu = nn.Parameter(torch.Tensor(out_features, in_features)) + self.weight_sigma = nn.Parameter(torch.Tensor(out_features, in_features)) + self.register_buffer("weight_epsilon", torch.Tensor(out_features, in_features)) + + self.bias_mu = nn.Parameter(torch.Tensor(out_features)) + self.bias_sigma = nn.Parameter(torch.Tensor(out_features)) + self.register_buffer("bias_epsilon", torch.Tensor(out_features)) + + self.reset_parameters() + self.reset_noise() + + def reset_parameters(self): + """Reset trainable network parameters (factorized gaussian noise).""" + mu_range = 1 / math.sqrt(self.in_features) + self.weight_mu.data.uniform_(-mu_range, mu_range) + self.weight_sigma.data.fill_(self.std_init / math.sqrt(self.in_features)) + self.bias_mu.data.uniform_(-mu_range, mu_range) + self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.out_features)) + + @staticmethod + def scale_noise(size: int) -> torch.Tensor: + """Set scale to make noise (factorized gaussian noise).""" + x = torch.FloatTensor(np.random.normal(loc=0.0, scale=1.0, size=size)) + + return x.sign().mul(x.abs().sqrt()) + + def reset_noise(self): + """Make new noise.""" + epsilon_in = self.scale_noise(self.in_features) + epsilon_out = self.scale_noise(self.out_features) + + # outer product + self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in)) + self.bias_epsilon.copy_(epsilon_out) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward method implementation. + + We don't use separate statements on train / eval mode. + It doesn't show remarkable difference of performance. + """ + return F.linear( + x, + self.weight_mu + self.weight_sigma * self.weight_epsilon, + self.bias_mu + self.bias_sigma * self.bias_epsilon, + ) + + +class NoisyLinearConstructor: + """Constructor class for changing hyper parameters of NoisyLinear. 
+ + Attributes: + std_init (float): initial std value + + """ + + def __init__(self, std_init: float = 0.5): + """Initialization.""" + self.std_init = std_init + + def __call__(self, in_features: int, out_features: int) -> NoisyLinear: + """Return NoisyLinear instance set hyper parameters""" + return NoisyLinear(in_features, out_features, self.std_init) + + +class NoisyMLPHandler: + """Includes methods to handle noisy linear.""" + + def reset_noise(self): + """Re-sample noise""" + for _, module in self.named_children(): + module.reset_noise() diff --git a/algorithms/dqn/networks.py b/algorithms/dqn/networks.py index a6848750..4ab3c5a4 100644 --- a/algorithms/dqn/networks.py +++ b/algorithms/dqn/networks.py @@ -14,12 +14,13 @@ import torch.nn.functional as F from algorithms.common.networks.cnn import CNN -from algorithms.common.networks.mlp import MLP +from algorithms.common.networks.mlp import MLP, init_layer_uniform +from algorithms.dqn.linear import NoisyMLPHandler device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -class DuelingMLP(MLP): +class DuelingMLP(MLP, NoisyMLPHandler): """Multilayer perceptron with dueling construction.""" def __init__( @@ -28,7 +29,8 @@ def __init__( output_size: int, hidden_sizes: list, hidden_activation: Callable = F.relu, - init_w: float = 3e-3, + linear_layer: nn.Module = nn.Linear, + init_fn: Callable = init_layer_uniform, ): """Initialization.""" super(DuelingMLP, self).__init__( @@ -36,21 +38,20 @@ def __init__( output_size=output_size, hidden_sizes=hidden_sizes, hidden_activation=hidden_activation, + linear_layer=linear_layer, use_output_layer=False, ) in_size = hidden_sizes[-1] # set advantage layer - self.advantage_hidden_layer = nn.Linear(in_size, in_size) - self.advantage_layer = nn.Linear(in_size, output_size) - self.advantage_layer.weight.data.uniform_(-init_w, init_w) - self.advantage_layer.bias.data.uniform_(-init_w, init_w) + self.advantage_hidden_layer = self.linear_layer(in_size, in_size) + self.advantage_layer = self.linear_layer(in_size, output_size) + self.advantage_layer = init_fn(self.advantage_layer) # set value layer - self.value_hidden_layer = nn.Linear(in_size, in_size) - self.value_layer = nn.Linear(in_size, 1) - self.value_layer.weight.data.uniform_(-init_w, init_w) - self.value_layer.bias.data.uniform_(-init_w, init_w) + self.value_hidden_layer = self.linear_layer(in_size, in_size) + self.value_layer = self.linear_layer(in_size, 1) + self.value_layer = init_fn(self.value_layer) def _forward_dueling(self, x: torch.Tensor) -> torch.Tensor: adv_x = self.hidden_activation(self.advantage_hidden_layer(x)) @@ -81,8 +82,12 @@ def forward_(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: out = self.fc_layers.forward_(x) return out + def reset_noise(self): + """Re-sample noise for fc layers.""" + self.fc_layers.reset_noise() -class C51DuelingMLP(MLP): + +class C51DuelingMLP(MLP, NoisyMLPHandler): """Multilayered perceptron for C51 with dueling construction.""" def __init__( @@ -94,7 +99,8 @@ def __init__( v_min: int = -10, v_max: int = 10, hidden_activation: Callable = F.relu, - init_w: float = 3e-3, + linear_layer: nn.Module = nn.Linear, + init_fn: Callable = init_layer_uniform, ): """Initialization.""" super(C51DuelingMLP, self).__init__( @@ -102,6 +108,7 @@ def __init__( output_size=action_size, hidden_sizes=hidden_sizes, hidden_activation=hidden_activation, + linear_layer=linear_layer, use_output_layer=False, ) in_size = hidden_sizes[-1] @@ -111,16 +118,14 @@ def __init__( self.v_min, self.v_max = 
v_min, v_max # set advantage layer - self.advantage_hidden_layer = nn.Linear(in_size, in_size) - self.advantage_layer = nn.Linear(in_size, self.output_size) - self.advantage_layer.weight.data.uniform_(-init_w, init_w) - self.advantage_layer.bias.data.uniform_(-init_w, init_w) + self.advantage_hidden_layer = self.linear_layer(in_size, in_size) + self.advantage_layer = self.linear_layer(in_size, self.output_size) + self.advantage_layer = init_fn(self.advantage_layer) # set value layer - self.value_hidden_layer = nn.Linear(in_size, in_size) - self.value_layer = nn.Linear(in_size, self.atom_size) - self.value_layer.weight.data.uniform_(-init_w, init_w) - self.value_layer.bias.data.uniform_(-init_w, init_w) + self.value_hidden_layer = self.linear_layer(in_size, in_size) + self.value_layer = self.linear_layer(in_size, self.atom_size) + self.value_layer = init_fn(self.value_layer) def forward_(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """Get distribution for atoms.""" @@ -160,8 +165,12 @@ def forward_( out = self.fc_layers.forward_(x, n_tau_samples) return out + def reset_noise(self): + """Re-sample noise for fc layers.""" + self.fc_layers.reset_noise() + -class IQNMLP(MLP): +class IQNMLP(MLP, NoisyMLPHandler): """Multilayered perceptron for IQN with dueling construction. Reference: https://github.com/google/dopamine @@ -175,7 +184,8 @@ def __init__( n_quantiles: int, quantile_embedding_dim: int, hidden_activation: Callable = F.relu, - init_w: float = 3e-3, + linear_layer: nn.Module = nn.Linear, + init_fn: Callable = init_layer_uniform, ): """Initialization.""" super(IQNMLP, self).__init__( @@ -183,6 +193,8 @@ def __init__( output_size=output_size, hidden_sizes=hidden_sizes, hidden_activation=hidden_activation, + linear_layer=linear_layer, + init_fn=init_fn, ) IQNMLP.n_quantiles = n_quantiles @@ -191,9 +203,10 @@ def __init__( self.output_size = output_size # set quantile_net layer - self.quantile_fc_layer = nn.Linear(self.quantile_embedding_dim, self.input_size) - self.quantile_fc_layer.weight.data.uniform_(-init_w, init_w) - self.quantile_fc_layer.bias.data.uniform_(-init_w, init_w) + self.quantile_fc_layer = self.linear_layer( + self.quantile_embedding_dim, self.input_size + ) + self.quantile_fc_layer = init_fn(self.quantile_fc_layer) def forward_( self, state: torch.Tensor, n_tau_samples: int = None diff --git a/algorithms/dqn/utils.py b/algorithms/dqn/utils.py index 53d46e15..ac7d3bc5 100644 --- a/algorithms/dqn/utils.py +++ b/algorithms/dqn/utils.py @@ -45,7 +45,7 @@ def calculate_iqn_loss( # Get the indices of the maximium Q-value across the action dimension. # Shape of replay_next_qt_argmax: (n_tau_prime_samples x batch_size) x 1. - next_actions = target_model(next_states).argmax(dim=1) + next_actions = model(next_states).argmax(dim=1) # double Q next_actions = next_actions[:, None] next_actions = next_actions.repeat(n_tau_prime_samples, 1) @@ -145,7 +145,11 @@ def calculate_c51_loss( delta_z = float(v_max - v_min) / (atom_size - 1) with torch.no_grad(): + # According to noisynet paper, + # it resamples noisynet parameters on online network when using double q + # but we don't because there is no remarkable difference in performance. 
next_actions = model.forward_(next_states)[1].argmax(1) + next_dist = target_model.forward_(next_states)[0] next_dist = next_dist[range(batch_size), next_actions] @@ -155,11 +159,6 @@ def calculate_c51_loss( l = b.floor().long() # noqa: E741 u = b.ceil().long() - # Fix disappearing probability mass when l = b = u (b is int) - # taken from https://github.com/Kaixhin/Rainbow - l[(u > 0) * (l == u)] -= 1 # noqa: E741 - u[(l < (atom_size - 1)) * (l == u)] += 1 # noqa: E741 - offset = ( torch.linspace(0, (batch_size - 1) * atom_size, batch_size) .long() @@ -194,7 +193,11 @@ def calculate_dqn_loss( states, actions, rewards, next_states, dones = experiences[:5] q_values = model(states) + # According to noisynet paper, + # it resamples noisynet parameters on online network when using double q + # but we don't because there is no remarkable difference in performance. next_q_values = model(next_states) + next_target_q_values = target_model(next_states) curr_q_value = q_values.gather(1, actions.long().unsqueeze(1)) diff --git a/algorithms/fd/ddpg_agent.py b/algorithms/fd/ddpg_agent.py index cd690dba..8ee0f124 100644 --- a/algorithms/fd/ddpg_agent.py +++ b/algorithms/fd/ddpg_agent.py @@ -160,3 +160,4 @@ def pretrain(self): avg_loss = np.vstack(pretrain_loss).mean(axis=0) pretrain_loss.clear() self.write_log(0, avg_loss, 0) + print("[INFO] Pre-Train Complete!\n") diff --git a/algorithms/fd/dqn_agent.py b/algorithms/fd/dqn_agent.py index af305621..ac5441da 100644 --- a/algorithms/fd/dqn_agent.py +++ b/algorithms/fd/dqn_agent.py @@ -3,15 +3,11 @@ - Author: Kh Kim, Curt Park - Contact: kh.kim@medipixel.io, curt.park@medipixel.io -- Paper: https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf (DQN) - https://arxiv.org/pdf/1509.06461.pdf (Double DQN) - https://arxiv.org/pdf/1511.05952.pdf (PER) - https://arxiv.org/pdf/1511.06581.pdf (Dueling) - https://arxiv.org/pdf/1704.03732.pdf (DQfD) +- Paper: https://arxiv.org/pdf/1704.03732.pdf (DQfD) """ -import datetime import pickle +import time from typing import Tuple import numpy as np @@ -27,7 +23,7 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -class DQNfDAgent(DQNAgent): +class DQfDAgent(DQNAgent): """DQN interacting with environment. 
Attribute: @@ -142,6 +138,10 @@ def update_model(self) -> Tuple[torch.Tensor, ...]: fraction = min(float(self.i_episode) / self.args.episode_num, 1.0) self.beta = self.beta + fraction * (1.0 - self.beta) + if self.hyper_params["USE_NOISY_NET"]: + self.dqn.reset_noise() + self.dqn_target.reset_noise() + return ( loss.data, dq_loss.data, @@ -150,12 +150,14 @@ def update_model(self) -> Tuple[torch.Tensor, ...]: n_demo, ) - def write_log(self, i: int, avg_loss: np.ndarray, score: float = 0.0): + def write_log( + self, i: int, avg_loss: np.ndarray, score: float, avg_time_cost: float + ): """Write log about loss and score""" print( "[INFO] episode %d, episode step: %d, total step: %d, total score: %f\n" "epsilon: %f, total loss: %f, dq loss: %f, supervised loss: %f\n" - "avg q values: %f, demo num in minibatch: %d at %s\n" + "avg q values: %f, demo num in minibatch: %d (spent %.6f sec/step)\n" % ( i, self.episode_step, @@ -167,7 +169,7 @@ def write_log(self, i: int, avg_loss: np.ndarray, score: float = 0.0): avg_loss[2], avg_loss[3], avg_loss[4], - datetime.datetime.now(), + avg_time_cost, ) ) @@ -179,6 +181,9 @@ def write_log(self, i: int, avg_loss: np.ndarray, score: float = 0.0): "total loss": avg_loss[0], "dq loss": avg_loss[1], "supervised loss": avg_loss[2], + "avg q values": avg_loss[3], + "demo num in minibatch": avg_loss[4], + "time per each step": avg_time_cost, } ) @@ -187,11 +192,14 @@ def pretrain(self): pretrain_loss = list() print("[INFO] Pre-Train %d step." % self.hyper_params["PRETRAIN_STEP"]) for i_step in range(1, self.hyper_params["PRETRAIN_STEP"] + 1): + t_begin = time.time() loss = self.update_model() + t_end = time.time() pretrain_loss.append(loss) # for logging # logging if i_step == 1 or i_step % 100 == 0: avg_loss = np.vstack(pretrain_loss).mean(axis=0) pretrain_loss.clear() - self.write_log(0, avg_loss) + self.write_log(0, avg_loss, 0.0, t_end - t_begin) + print("[INFO] Pre-Train Complete!\n") diff --git a/algorithms/fd/sac_agent.py b/algorithms/fd/sac_agent.py index 30178c76..08429bbe 100644 --- a/algorithms/fd/sac_agent.py +++ b/algorithms/fd/sac_agent.py @@ -79,6 +79,8 @@ def _add_transition_to_memory(self, transition: Tuple[np.ndarray, ...]): # pylint: disable=too-many-statements def update_model(self) -> Tuple[torch.Tensor, ...]: """Train the model after each episode.""" + self.update_step += 1 + experiences = self.memory.sample(self.beta) states, actions, rewards, next_states, dones, weights, indices, eps_d = ( experiences @@ -149,7 +151,7 @@ def update_model(self) -> Tuple[torch.Tensor, ...]: vf_loss.backward() self.vf_optimizer.step() - if self.total_step % self.hyper_params["DELAYED_UPDATE"] == 0: + if self.update_step % self.hyper_params["POLICY_UPDATE_FREQ"] == 0: # actor loss advantage = q_pred - v_pred.detach() actor_loss_element_wise = alpha * log_prob - advantage @@ -212,5 +214,9 @@ def pretrain(self): avg_loss = np.vstack(pretrain_loss).mean(axis=0) pretrain_loss.clear() self.write_log( - 0, avg_loss, 0, delayed_update=self.hyper_params["DELAYED_UPDATE"] + 0, + avg_loss, + 0, + policy_update_freq=self.hyper_params["POLICY_UPDATE_FREQ"], ) + print("[INFO] Pre-Train Complete!\n") diff --git a/algorithms/per/ddpg_agent.py b/algorithms/per/ddpg_agent.py index 3821fd6d..5bfc6349 100644 --- a/algorithms/per/ddpg_agent.py +++ b/algorithms/per/ddpg_agent.py @@ -18,7 +18,7 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -class DDPGPERAgent(DDPGAgent): +class PERDDPGAgent(DDPGAgent): """ActorCritic interacting with environment. 
Attributes: diff --git a/algorithms/sac/agent.py b/algorithms/sac/agent.py index 0791ca03..fa1c5212 100644 --- a/algorithms/sac/agent.py +++ b/algorithms/sac/agent.py @@ -47,6 +47,7 @@ class SACAgent(Agent): hyper_params (dict): hyper-parameters total_step (int): total step numbers episode_step (int): step number of the current episode + update_step (int): step number of updates i_episode (int): current episode number """ @@ -80,6 +81,7 @@ def __init__( self.curr_state = np.zeros((1,)) self.total_step = 0 self.episode_step = 0 + self.update_step = 0 self.i_episode = 0 # automatic entropy tuning @@ -153,6 +155,8 @@ def _add_transition_to_memory(self, transition: Tuple[np.ndarray, ...]): def update_model(self) -> Tuple[torch.Tensor, ...]: """Train the model after each episode.""" + self.update_step += 1 + experiences = self.memory.sample() states, actions, rewards, next_states, dones = experiences new_actions, log_prob, pre_tanh_value, mu, std = self.actor(states) @@ -203,7 +207,7 @@ def update_model(self) -> Tuple[torch.Tensor, ...]: vf_loss.backward() self.vf_optimizer.step() - if self.total_step % self.hyper_params["DELAYED_UPDATE"] == 0: + if self.update_step % self.hyper_params["POLICY_UPDATE_FREQ"] == 0: # actor loss advantage = q_pred - v_pred.detach() actor_loss = (alpha * log_prob - advantage).mean() @@ -280,7 +284,7 @@ def save_params(self, n_episode: int): Agent.save_params(self, params, n_episode) def write_log( - self, i: int, loss: np.ndarray, score: float = 0.0, delayed_update: int = 1 + self, i: int, loss: np.ndarray, score: float = 0.0, policy_update_freq: int = 1 ): """Write log about loss and score""" total_loss = loss.sum() @@ -295,7 +299,7 @@ def write_log( self.total_step, score, total_loss, - loss[0] * delayed_update, # actor loss + loss[0] * policy_update_freq, # actor loss loss[1], # qf_1 loss loss[2], # qf_2 loss loss[3], # vf loss @@ -308,7 +312,7 @@ def write_log( { "score": score, "total loss": total_loss, - "actor loss": loss[0] * delayed_update, + "actor loss": loss[0] * policy_update_freq, "qf_1 loss": loss[1], "qf_2 loss": loss[2], "vf loss": loss[3], @@ -359,7 +363,10 @@ def train(self): if loss_episode: avg_loss = np.vstack(loss_episode).mean(axis=0) self.write_log( - self.i_episode, avg_loss, score, self.hyper_params["DELAYED_UPDATE"] + self.i_episode, + avg_loss, + score, + self.hyper_params["POLICY_UPDATE_FREQ"], ) if self.i_episode % self.args.save_period == 0: diff --git a/algorithms/td3/agent.py b/algorithms/td3/agent.py index 63ecefc8..c7e982df 100644 --- a/algorithms/td3/agent.py +++ b/algorithms/td3/agent.py @@ -29,20 +29,20 @@ class TD3Agent(Agent): Attributes: memory (ReplayBuffer): replay memory - noise (GaussianNoise): random noise for exploration + exploration_noise (GaussianNoise): random noise for exploration + target_policy_noise (GaussianNoise): random noise for target values actor (nn.Module): actor model to select actions - critic_1 (nn.Module): critic model to predict state values - critic_2 (nn.Module): critic model to predict state values + critic1 (nn.Module): critic model to predict state values + critic2 (nn.Module): critic model to predict state values critic_target1 (nn.Module): target critic model to predict state values critic_target2 (nn.Module): target critic model to predict state values actor_target (nn.Module): target actor model to select actions - critic_optimizer (Optimizer): optimizer for training critic - actor_optimizer (Optimizer): optimizer for training actor + critic_optim (Optimizer): optimizer for training 
critic + actor_optim (Optimizer): optimizer for training actor hyper_params (dict): hyper-parameters curr_state (np.ndarray): temporary storage of the current state - total_step (int): total step numbers - update_step (int): train step numbers - episode_step (int): step number of the current episode + total_steps (int): total step numbers + episode_steps (int): step number of the current episode """ @@ -53,7 +53,8 @@ def __init__( hyper_params: dict, models: tuple, optims: tuple, - noise: GaussianNoise, + exploration_noise: GaussianNoise, + target_policy_noise: GaussianNoise, ): """Initialization. @@ -63,22 +64,24 @@ def __init__( hyper_params (dict): hyper-parameters models (tuple): models including actor and critic optims (tuple): optimizers for actor and critic - noise (GaussianNoise): random noise for exploration + exploration_noise (GaussianNoise): random noise for exploration + target_policy_noise (GaussianNoise): random noise for target values """ Agent.__init__(self, env, args) self.actor, self.actor_target = models[0:2] - self.critic_1, self.critic_2 = models[2:4] + self.critic1, self.critic2 = models[2:4] self.critic_target1, self.critic_target2 = models[4:6] - self.actor_optimizer = optims[0] - self.critic_optimizer = optims[1] + self.actor_optim = optims[0] + self.critic_optim = optims[1] self.hyper_params = hyper_params self.curr_state = np.zeros((1,)) - self.noise = noise - self.total_step = 0 - self.update_step = 0 - self.episode_step = 0 + self.exploration_noise = exploration_noise + self.target_policy_noise = target_policy_noise + self.total_steps = 0 + self.episode_steps = 0 + self.update_steps = 0 self.i_episode = 0 # load the optimizer and model parameters @@ -98,102 +101,93 @@ def select_action(self, state: np.ndarray) -> np.ndarray: self.curr_state = state - if self.total_step < random_action_count and not self.args.test: + if self.total_steps < random_action_count and not self.args.test: return self.env.action_space.sample() state = torch.FloatTensor(state).to(device) - selected_action = self.actor(state) + selected_action = self.actor(state).detach().cpu().numpy() if not self.args.test: - action_size = selected_action.size() - selected_action += torch.FloatTensor( - self.noise.sample(action_size, self.total_step) - ).to(device) - selected_action = torch.clamp(selected_action, -1.0, 1.0) + noise = self.exploration_noise.sample() + selected_action = np.clip(selected_action + noise, -1.0, 1.0) - return selected_action.detach().cpu().numpy() + return selected_action def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool]: """Take an action and return the response of the env.""" - self.total_step += 1 - self.episode_step += 1 + self.total_steps += 1 + self.episode_steps += 1 next_state, reward, done, _ = self.env.step(action) - # if last state is not terminal state in episode, done is false - done_bool = ( - 0.0 if self.episode_step == self.args.max_episode_steps else float(done) - ) if not self.args.test: + # if last state is not terminal state in episode, done is false + done_bool = ( + False if self.episode_steps == self.args.max_episode_steps else done + ) self.memory.add(self.curr_state, action, reward, next_state, done_bool) return next_state, reward, done def update_model( - self, - experiences: Tuple[ - torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor - ], + self, experiences: Tuple[torch.Tensor, ...] 
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """Train the model after each episode.""" - self.update_step += 1 + self.update_steps += 1 states, actions, rewards, next_states, dones = experiences masks = 1 - dones # get actions with noise - noise_std, noise_clip = ( - self.hyper_params["TARGET_SMOOTHING_NOISE_STD"], - self.hyper_params["TARGET_SMOOTHING_NOISE_CLIP"], + noise = torch.FloatTensor(self.target_policy_noise.sample()).to(device) + clipped_noise = torch.clamp( + noise, + -self.hyper_params["TARGET_POLICY_NOISE_CLIP"], + self.hyper_params["TARGET_POLICY_NOISE_CLIP"], ) - next_actions = self.actor_target(next_states) - noise = next_actions.data.normal_(0, noise_std).to(device) - noise = noise.clamp(-noise_clip, noise_clip) - next_actions += noise - next_actions = next_actions.clamp(-1.0, 1.0) + next_actions = (self.actor_target(next_states) + clipped_noise).clamp(-1.0, 1.0) # min (Q_1', Q_2') - next_states_actions = torch.cat((next_states, next_actions), dim=-1) - next_values1 = self.critic_target1(next_states_actions) - next_values2 = self.critic_target2(next_states_actions) + next_values1 = self.critic_target1(next_states, next_actions) + next_values2 = self.critic_target2(next_states, next_actions) next_values = torch.min(next_values1, next_values2) # G_t = r + gamma * v(s_{t+1}) if state != Terminal # = r otherwise curr_returns = rewards + self.hyper_params["GAMMA"] * next_values * masks - curr_returns = curr_returns.to(device).detach() + curr_returns = curr_returns.detach() # critic loss - states_actions = torch.cat((states, actions), dim=-1) - values1 = self.critic_1(states_actions) - values2 = self.critic_2(states_actions) - critic_loss1 = F.mse_loss(values1, curr_returns) - critic_loss2 = F.mse_loss(values2, curr_returns) - critic_loss = critic_loss1 + critic_loss2 + values1 = self.critic1(states, actions) + values2 = self.critic2(states, actions) + critic1_loss = F.mse_loss(values1, curr_returns) + critic2_loss = F.mse_loss(values2, curr_returns) # train critic - self.critic_optimizer.zero_grad() + critic_loss = critic1_loss + critic2_loss + self.critic_optim.zero_grad() critic_loss.backward() - self.critic_optimizer.step() + self.critic_optim.step() - if self.update_step % self.hyper_params["DELAYED_UPDATE"] == 0: - # train actor + if self.update_steps % self.hyper_params["POLICY_UPDATE_FREQ"] == 0: + # policy loss actions = self.actor(states) - states_actions = torch.cat((states, actions), dim=-1) - actor_loss = -self.critic_1(states_actions).mean() - self.actor_optimizer.zero_grad() + actor_loss = -self.critic1(states, actions).mean() + + # train actor + self.actor_optim.zero_grad() actor_loss.backward() - self.actor_optimizer.step() + self.actor_optim.step() # update target networks tau = self.hyper_params["TAU"] - common_utils.soft_update(self.critic_1, self.critic_target1, tau) - common_utils.soft_update(self.critic_2, self.critic_target2, tau) + common_utils.soft_update(self.critic1, self.critic_target1, tau) + common_utils.soft_update(self.critic2, self.critic_target2, tau) common_utils.soft_update(self.actor, self.actor_target, tau) else: actor_loss = torch.zeros(1) - return actor_loss.data, critic_loss1.data, critic_loss2.data + return actor_loss.data, critic1_loss.data, critic2_loss.data def load_params(self, path: str): """Load model and optimizer parameters.""" @@ -202,14 +196,14 @@ def load_params(self, path: str): return params = torch.load(path) - self.critic_1.load_state_dict(params["critic_1"]) - self.critic_2.load_state_dict(params["critic_2"]) + 
self.critic1.load_state_dict(params["critic1"]) + self.critic2.load_state_dict(params["critic2"]) self.critic_target1.load_state_dict(params["critic_target1"]) self.critic_target2.load_state_dict(params["critic_target2"]) - self.critic_optimizer.load_state_dict(params["critic_optim"]) + self.critic_optim.load_state_dict(params["critic_optim"]) self.actor.load_state_dict(params["actor"]) self.actor_target.load_state_dict(params["actor_target"]) - self.actor_optimizer.load_state_dict(params["actor_optim"]) + self.actor_optim.load_state_dict(params["actor_optim"]) print("[INFO] loaded the model and optimizer from", path) def save_params(self, n_episode: int): @@ -217,30 +211,31 @@ def save_params(self, n_episode: int): params = { "actor": self.actor.state_dict(), "actor_target": self.actor_target.state_dict(), - "actor_optim": self.actor_optimizer.state_dict(), - "critic_1": self.critic_1.state_dict(), - "critic_2": self.critic_2.state_dict(), + "actor_optim": self.actor_optim.state_dict(), + "critic1": self.critic1.state_dict(), + "critic2": self.critic2.state_dict(), "critic_target1": self.critic_target1.state_dict(), "critic_target2": self.critic_target2.state_dict(), - "critic_optim": self.critic_optimizer.state_dict(), + "critic_optim": self.critic_optim.state_dict(), } Agent.save_params(self, params, n_episode) def write_log( - self, i: int, loss: np.ndarray, score: float = 0.0, delayed_update: int = 1 + self, i: int, loss: np.ndarray, score: float = 0.0, policy_update_freq: int = 1 ): """Write log about loss and score""" total_loss = loss.sum() print( - "[INFO] episode %d total score: %d, total_step: %d, total loss: %f\n" - "actor_loss: %.3f critic_1_loss: %.3f critic_2_loss: %.3f\n" + "[INFO] episode %d total score: %d, episode_step: %d, total_step: %d\n" + "total loss: %f actor_loss: %.3f critic1_loss: %.3f critic2_loss: %.3f\n" % ( i, score, - self.total_step, + self.episode_steps, + self.total_steps, total_loss, - loss[0] * delayed_update, # actor loss + loss[0] * policy_update_freq, # actor loss loss[1], # critic1 loss loss[2], # critic2 loss ) @@ -251,9 +246,9 @@ def write_log( { "score": score, "total loss": total_loss, - "actor loss": loss[0] * delayed_update, - "critic_1 loss": loss[1], - "critic_2 loss": loss[2], + "actor loss": loss[0] * policy_update_freq, + "critic1 loss": loss[1], + "critic2 loss": loss[2], } ) @@ -263,14 +258,14 @@ def train(self): if self.args.log: wandb.init() wandb.config.update(self.hyper_params) - # wandb.watch([self.actor, self.critic_1, self.critic_2], log="parameters") + # wandb.watch([self.actor, self.critic1, self.critic2], log="parameters") for self.i_episode in range(1, self.args.episode_num + 1): state = self.env.reset() done = False score = 0 loss_episode = list() - self.episode_step = 0 + self.episode_steps = 0 while not done: if self.args.render and self.i_episode >= self.args.render_after: @@ -282,9 +277,7 @@ def train(self): state = next_state score += reward - # training - if len(self.memory) >= self.hyper_params["BATCH_SIZE"]: - for _ in range(self.hyper_params["EPOCH"]): + if len(self.memory) >= self.hyper_params["BATCH_SIZE"]: experiences = self.memory.sample() loss = self.update_model(experiences) loss_episode.append(loss) # for logging @@ -293,7 +286,10 @@ def train(self): if loss_episode: avg_loss = np.vstack(loss_episode).mean(axis=0) self.write_log( - self.i_episode, avg_loss, score, self.hyper_params["DELAYED_UPDATE"] + self.i_episode, + avg_loss, + score, + self.hyper_params["POLICY_UPDATE_FREQ"], ) if self.i_episode % 
self.args.save_period == 0: self.save_params(self.i_episode) diff --git a/examples/lunarlander_continuous_v2/bc-sac.py b/examples/lunarlander_continuous_v2/bc-sac.py index e3258b58..6f9d0748 100644 --- a/examples/lunarlander_continuous_v2/bc-sac.py +++ b/examples/lunarlander_continuous_v2/bc-sac.py @@ -31,7 +31,7 @@ "LR_QF1": 3e-4, "LR_QF2": 3e-4, "LR_ENTROPY": 3e-4, - "DELAYED_UPDATE": 2, + "POLICY_UPDATE_FREQ": 2, "BUFFER_SIZE": int(1e6), "BATCH_SIZE": 512, "DEMO_BATCH_SIZE": 64, diff --git a/examples/lunarlander_continuous_v2/per-ddpg.py b/examples/lunarlander_continuous_v2/per-ddpg.py index 33769669..4a7c0564 100644 --- a/examples/lunarlander_continuous_v2/per-ddpg.py +++ b/examples/lunarlander_continuous_v2/per-ddpg.py @@ -13,7 +13,7 @@ from algorithms.common.networks.mlp import MLP from algorithms.common.noise import OUNoise -from algorithms.per.ddpg_agent import DDPGPERAgent +from algorithms.per.ddpg_agent import PERDDPGAgent device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") @@ -104,7 +104,7 @@ def run(env: gym.Env, args: argparse.Namespace, state_dim: int, action_dim: int) optims = (actor_optim, critic_optim) # create an agent - agent = DDPGPERAgent(env, args, hyper_params, models, optims, noise) + agent = PERDDPGAgent(env, args, hyper_params, models, optims, noise) # run if args.test: diff --git a/examples/lunarlander_continuous_v2/sac.py b/examples/lunarlander_continuous_v2/sac.py index 344d6c9a..6654cc10 100644 --- a/examples/lunarlander_continuous_v2/sac.py +++ b/examples/lunarlander_continuous_v2/sac.py @@ -22,20 +22,20 @@ "GAMMA": 0.99, "TAU": 5e-3, "W_ENTROPY": 1e-3, - "W_MEAN_REG": 1e-3, - "W_STD_REG": 1e-3, + "W_MEAN_REG": 0.0, + "W_STD_REG": 0.0, "W_PRE_ACTIVATION_REG": 0.0, "LR_ACTOR": 3e-4, "LR_VF": 3e-4, "LR_QF1": 3e-4, "LR_QF2": 3e-4, "LR_ENTROPY": 3e-4, - "DELAYED_UPDATE": 2, + "POLICY_UPDATE_FREQ": 2, "BUFFER_SIZE": int(1e6), - "BATCH_SIZE": 512, + "BATCH_SIZE": 128, "AUTO_ENTROPY_TUNING": True, "WEIGHT_DECAY": 0.0, - "INITIAL_RANDOM_ACTION": 5000, + "INITIAL_RANDOM_ACTION": int(1e4), "MULTIPLE_LEARN": 1, } diff --git a/examples/lunarlander_continuous_v2/sacfd.py b/examples/lunarlander_continuous_v2/sacfd.py index 665c9407..b2209d86 100644 --- a/examples/lunarlander_continuous_v2/sacfd.py +++ b/examples/lunarlander_continuous_v2/sacfd.py @@ -34,7 +34,7 @@ "W_MEAN_REG": 1e-3, "W_STD_REG": 1e-3, "W_PRE_ACTIVATION_REG": 0.0, - "DELAYED_UPDATE": 2, + "POLICY_UPDATE_FREQ": 2, "PRETRAIN_STEP": 100, "MULTIPLE_LEARN": 2, # multiple learning updates "LAMBDA1": 1.0, # N-step return weight diff --git a/examples/lunarlander_continuous_v2/td3.py b/examples/lunarlander_continuous_v2/td3.py index b73a05c5..a0ccf39e 100644 --- a/examples/lunarlander_continuous_v2/td3.py +++ b/examples/lunarlander_continuous_v2/td3.py @@ -11,7 +11,7 @@ import torch import torch.optim as optim -from algorithms.common.networks.mlp import MLP +from algorithms.common.networks.mlp import MLP, FlattenMLP from algorithms.common.noise import GaussianNoise from algorithms.td3.agent import TD3Agent @@ -21,19 +21,15 @@ hyper_params = { "GAMMA": 0.99, "TAU": 5e-3, - "TARGET_SMOOTHING_NOISE_STD": 0.2, - "TARGET_SMOOTHING_NOISE_CLIP": 0.5, - "DELAYED_UPDATE": 2, - "BUFFER_SIZE": int(1e5), - "BATCH_SIZE": 128, + "BUFFER_SIZE": int(1e6), + "BATCH_SIZE": 100, "LR_ACTOR": 1e-3, - "LR_CRITIC_1": 1e-3, - "LR_CRITIC_2": 1e-3, - "GAUSSIAN_NOISE_MIN_SIGMA": 0.1, - "GAUSSIAN_NOISE_MAX_SIGMA": 0.1, - "GAUSSIAN_NOISE_DECAY_PERIOD": 1000000, - "WEIGHT_DECAY": 1e-6, - "EPOCH": 256, + "LR_CRITIC": 1e-3, + 
"WEIGHT_DECAY": 0.0, + "POLICY_UPDATE_FREQ": 2, + "EXPLORATION_NOISE": 0.1, + "TARGET_POLICY_NOISE": 0.2, + "TARGET_POLICY_NOISE_CLIP": 0.5, "INITIAL_RANDOM_ACTION": int(1e4), } @@ -58,6 +54,7 @@ def run(env: gym.Env, args: argparse.Namespace, state_dim: int, action_dim: int) hidden_sizes=hidden_sizes_actor, output_activation=torch.tanh, ).to(device) + actor_target = MLP( input_size=state_dim, output_size=action_dim, @@ -67,28 +64,35 @@ def run(env: gym.Env, args: argparse.Namespace, state_dim: int, action_dim: int) actor_target.load_state_dict(actor.state_dict()) # create critic - critic_1 = MLP( + critic1 = FlattenMLP( input_size=state_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes_critic, ).to(device) - critic_2 = MLP( + + critic2 = FlattenMLP( input_size=state_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes_critic, ).to(device) - critic_target1 = MLP( + + critic_target1 = FlattenMLP( input_size=state_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes_critic, ).to(device) - critic_target2 = MLP( + + critic_target2 = FlattenMLP( input_size=state_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes_critic, ).to(device) - critic_target1.load_state_dict(critic_1.state_dict()) - critic_target2.load_state_dict(critic_2.state_dict()) + + critic_target1.load_state_dict(critic1.state_dict()) + critic_target2.load_state_dict(critic2.state_dict()) + + # concat critic parameters to use one optim + critic_parameters = list(critic1.parameters()) + list(critic2.parameters()) # create optimizers actor_optim = optim.Adam( @@ -96,26 +100,32 @@ def run(env: gym.Env, args: argparse.Namespace, state_dim: int, action_dim: int) lr=hyper_params["LR_ACTOR"], weight_decay=hyper_params["WEIGHT_DECAY"], ) - critic_parameter = list(critic_1.parameters()) + list(critic_2.parameters()) + critic_optim = optim.Adam( - critic_parameter, - lr=hyper_params["LR_CRITIC_1"], + critic_parameters, + lr=hyper_params["LR_CRITIC"], weight_decay=hyper_params["WEIGHT_DECAY"], ) # noise instance to make randomness of action - noise = GaussianNoise( - hyper_params["GAUSSIAN_NOISE_MIN_SIGMA"], - hyper_params["GAUSSIAN_NOISE_MAX_SIGMA"], - hyper_params["GAUSSIAN_NOISE_DECAY_PERIOD"], + exploration_noise = GaussianNoise( + action_dim, hyper_params["EXPLORATION_NOISE"], hyper_params["EXPLORATION_NOISE"] + ) + + target_policy_noise = GaussianNoise( + action_dim, + hyper_params["TARGET_POLICY_NOISE"], + hyper_params["TARGET_POLICY_NOISE"], ) # make tuples to create an agent - models = (actor, actor_target, critic_1, critic_2, critic_target1, critic_target2) + models = (actor, actor_target, critic1, critic2, critic_target1, critic_target2) optims = (actor_optim, critic_optim) # create an agent - agent = TD3Agent(env, args, hyper_params, models, optims, noise) + agent = TD3Agent( + env, args, hyper_params, models, optims, exploration_noise, target_policy_noise + ) # run if args.test: diff --git a/examples/lunarlander_v2/dqfd.py b/examples/lunarlander_v2/dqfd.py index 3b373c96..91500585 100644 --- a/examples/lunarlander_v2/dqfd.py +++ b/examples/lunarlander_v2/dqfd.py @@ -9,10 +9,14 @@ import gym import torch +import torch.nn as nn import torch.optim as optim +from algorithms.common.helper_functions import identity +from algorithms.common.networks.mlp import init_layer_uniform +from algorithms.dqn.linear import NoisyLinearConstructor from algorithms.dqn.networks import C51DuelingMLP -from algorithms.fd.dqn_agent import DQNfDAgent +from algorithms.fd.dqn_agent import DQfDAgent device = torch.device("cuda:0" 
if torch.cuda.is_available() else "cpu") @@ -47,6 +51,9 @@ "V_MIN": -300, "V_MAX": 300, "ATOMS": 1530, + # NoisyNet + "USE_NOISY_NET": False, + "STD_INIT": 0.5, } @@ -63,6 +70,17 @@ def run(env: gym.Env, args: argparse.Namespace, state_dim: int, action_dim: int) # create model def get_fc_model(): hidden_sizes = [128, 64] + + # use noisy net + if hyper_params["USE_NOISY_NET"]: + linear_layer = NoisyLinearConstructor(hyper_params["STD_INIT"]) + init_fn = identity + hyper_params["MAX_EPSILON"] = 0.0 + hyper_params["MIN_EPSILON"] = 0.0 + else: + linear_layer = nn.Linear + init_fn = init_layer_uniform + model = C51DuelingMLP( input_size=state_dim, action_size=action_dim, @@ -70,7 +88,10 @@ def get_fc_model(): v_min=hyper_params["V_MIN"], v_max=hyper_params["V_MAX"], atom_size=hyper_params["ATOMS"], + linear_layer=linear_layer, + init_fn=init_fn, ).to(device) + return model dqn = get_fc_model() @@ -89,7 +110,7 @@ def get_fc_model(): models = (dqn, dqn_target) # create an agent - agent = DQNfDAgent(env, args, hyper_params, models, dqn_optim) + agent = DQfDAgent(env, args, hyper_params, models, dqn_optim) # run if args.test: diff --git a/examples/lunarlander_v2/dqn.py b/examples/lunarlander_v2/dqn.py index c824dfaf..435dd505 100644 --- a/examples/lunarlander_v2/dqn.py +++ b/examples/lunarlander_v2/dqn.py @@ -9,9 +9,13 @@ import gym import torch +import torch.nn as nn import torch.optim as optim +from algorithms.common.helper_functions import identity +from algorithms.common.networks.mlp import init_layer_uniform from algorithms.dqn.agent import DQNAgent +from algorithms.dqn.linear import NoisyLinearConstructor from algorithms.dqn.networks import C51DuelingMLP device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") @@ -43,6 +47,9 @@ "V_MIN": -300, "V_MAX": 300, "ATOMS": 1530, + # NoisyNet + "USE_NOISY_NET": True, + "STD_INIT": 0.5, } @@ -59,6 +66,17 @@ def run(env: gym.Env, args: argparse.Namespace, state_dim: int, action_dim: int) # create model def get_fc_model(): hidden_sizes = [128, 64] + + if hyper_params["USE_NOISY_NET"]: + # use noisy net + linear_layer = NoisyLinearConstructor(hyper_params["STD_INIT"]) + init_fn = identity + hyper_params["MAX_EPSILON"] = 0.0 + hyper_params["MIN_EPSILON"] = 0.0 + else: + linear_layer = nn.Linear + init_fn = init_layer_uniform + model = C51DuelingMLP( input_size=state_dim, action_size=action_dim, @@ -66,6 +84,8 @@ def get_fc_model(): v_min=hyper_params["V_MIN"], v_max=hyper_params["V_MAX"], atom_size=hyper_params["ATOMS"], + linear_layer=linear_layer, + init_fn=init_fn, ).to(device) return model diff --git a/examples/pong_no_frameskip_v4/dqn.py b/examples/pong_no_frameskip_v4/dqn.py index 074b987e..dfe8bd6a 100644 --- a/examples/pong_no_frameskip_v4/dqn.py +++ b/examples/pong_no_frameskip_v4/dqn.py @@ -9,10 +9,14 @@ import gym import torch +import torch.nn as nn import torch.optim as optim +from algorithms.common.helper_functions import identity from algorithms.common.networks.cnn import CNNLayer +from algorithms.common.networks.mlp import init_layer_uniform from algorithms.dqn.agent import DQNAgent +from algorithms.dqn.linear import NoisyLinearConstructor from algorithms.dqn.networks import IQNCNN, IQNMLP device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") @@ -46,6 +50,9 @@ "N_QUANTILE_SAMPLES": 32, "QUANTILE_EMBEDDING_DIM": 64, "KAPPA": 1.0, + # NoisyNet + "USE_NOISY_NET": True, + "STD_INIT": 0.5, } @@ -64,12 +71,24 @@ def get_cnn_model(): hidden_sizes = [512] action_dim = env.action_space.n + # use noisy net + if 
hyper_params["USE_NOISY_NET"]: + linear_layer = NoisyLinearConstructor(hyper_params["STD_INIT"]) + init_fn = identity + hyper_params["MAX_EPSILON"] = 0.0 + hyper_params["MIN_EPSILON"] = 0.0 + else: + linear_layer = nn.Linear + init_fn = init_layer_uniform + fc_model = IQNMLP( input_size=fc_input_size, output_size=action_dim, hidden_sizes=hidden_sizes, n_quantiles=hyper_params["N_QUANTILE_SAMPLES"], quantile_embedding_dim=hyper_params["QUANTILE_EMBEDDING_DIM"], + linear_layer=linear_layer, + init_fn=init_fn, ).to(device) # create a model diff --git a/examples/reacher_v2/bc-sac.py b/examples/reacher_v2/bc-sac.py index 5773eb95..94f9b01e 100644 --- a/examples/reacher_v2/bc-sac.py +++ b/examples/reacher_v2/bc-sac.py @@ -31,7 +31,7 @@ "LR_QF1": 3e-4, "LR_QF2": 3e-4, "LR_ENTROPY": 3e-4, - "DELAYED_UPDATE": 2, + "POLICY_UPDATE_FREQ": 2, "BUFFER_SIZE": int(1e6), "BATCH_SIZE": 512, "DEMO_BATCH_SIZE": 64, diff --git a/examples/reacher_v2/sac.py b/examples/reacher_v2/sac.py index 3f2607d2..3cab21b1 100644 --- a/examples/reacher_v2/sac.py +++ b/examples/reacher_v2/sac.py @@ -30,7 +30,7 @@ "LR_QF1": 3e-4, "LR_QF2": 3e-4, "LR_ENTROPY": 3e-4, - "DELAYED_UPDATE": 2, + "POLICY_UPDATE_FREQ": 2, "BUFFER_SIZE": int(1e6), "BATCH_SIZE": 512, "AUTO_ENTROPY_TUNING": True, diff --git a/examples/reacher_v2/td3.py b/examples/reacher_v2/td3.py index c488afad..6854f293 100644 --- a/examples/reacher_v2/td3.py +++ b/examples/reacher_v2/td3.py @@ -11,7 +11,7 @@ import torch import torch.optim as optim -from algorithms.common.networks.mlp import MLP +from algorithms.common.networks.mlp import MLP, FlattenMLP from algorithms.common.noise import GaussianNoise from algorithms.td3.agent import TD3Agent @@ -19,21 +19,17 @@ # hyper parameters hyper_params = { - "GAMMA": 0.99, + "GAMMA": 0.95, "TAU": 5e-3, - "TARGET_SMOOTHING_NOISE_STD": 0.2, - "TARGET_SMOOTHING_NOISE_CLIP": 0.5, - "DELAYED_UPDATE": 2, "BUFFER_SIZE": int(1e6), "BATCH_SIZE": 100, "LR_ACTOR": 1e-3, - "LR_CRITIC_1": 1e-3, - "LR_CRITIC_2": 1e-3, - "GAUSSIAN_NOISE_MIN_SIGMA": 0.1, - "GAUSSIAN_NOISE_MAX_SIGMA": 0.1, - "GAUSSIAN_NOISE_DECAY_PERIOD": 1000000, - "WEIGHT_DECAY": 1e-6, - "EPOCH": 50, + "LR_CRITIC": 1e-3, + "WEIGHT_DECAY": 0.0, + "POLICY_UPDATE_FREQ": 2, + "EXPLORATION_NOISE": 0.1, + "TARGET_POLICY_NOISE": 0.2, + "TARGET_POLICY_NOISE_CLIP": 0.5, "INITIAL_RANDOM_ACTION": int(1e4), } @@ -58,37 +54,46 @@ def run(env: gym.Env, args: argparse.Namespace, state_dim: int, action_dim: int) hidden_sizes=hidden_sizes_actor, output_activation=torch.tanh, ).to(device) + actor_target = MLP( input_size=state_dim, output_size=action_dim, hidden_sizes=hidden_sizes_actor, output_activation=torch.tanh, ).to(device) + actor_target.load_state_dict(actor.state_dict()) # create critic - critic_1 = MLP( + critic1 = FlattenMLP( input_size=state_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes_critic, ).to(device) - critic_2 = MLP( + + critic2 = FlattenMLP( input_size=state_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes_critic, ).to(device) - critic_target1 = MLP( + + critic_target1 = FlattenMLP( input_size=state_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes_critic, ).to(device) - critic_target2 = MLP( + + critic_target2 = FlattenMLP( input_size=state_dim + action_dim, output_size=1, hidden_sizes=hidden_sizes_critic, ).to(device) - critic_target1.load_state_dict(critic_1.state_dict()) - critic_target2.load_state_dict(critic_2.state_dict()) + + critic_target1.load_state_dict(critic1.state_dict()) + 
critic_target2.load_state_dict(critic2.state_dict()) + + # concat critic parameters to use one optim + critic_parameters = list(critic1.parameters()) + list(critic2.parameters()) # create optimizers actor_optim = optim.Adam( @@ -96,26 +101,32 @@ def run(env: gym.Env, args: argparse.Namespace, state_dim: int, action_dim: int) lr=hyper_params["LR_ACTOR"], weight_decay=hyper_params["WEIGHT_DECAY"], ) - critic_parameter = list(critic_1.parameters()) + list(critic_2.parameters()) + critic_optim = optim.Adam( - critic_parameter, - lr=hyper_params["LR_CRITIC_1"], + critic_parameters, + lr=hyper_params["LR_CRITIC"], weight_decay=hyper_params["WEIGHT_DECAY"], ) # noise instance to make randomness of action - noise = GaussianNoise( - hyper_params["GAUSSIAN_NOISE_MIN_SIGMA"], - hyper_params["GAUSSIAN_NOISE_MAX_SIGMA"], - hyper_params["GAUSSIAN_NOISE_DECAY_PERIOD"], + exploration_noise = GaussianNoise( + action_dim, hyper_params["EXPLORATION_NOISE"], hyper_params["EXPLORATION_NOISE"] + ) + + target_policy_noise = GaussianNoise( + action_dim, + hyper_params["TARGET_POLICY_NOISE"], + hyper_params["TARGET_POLICY_NOISE"], ) # make tuples to create an agent - models = (actor, actor_target, critic_1, critic_2, critic_target1, critic_target2) + models = (actor, actor_target, critic1, critic2, critic_target1, critic_target2) optims = (actor_optim, critic_optim) # create an agent - agent = TD3Agent(env, args, hyper_params, models, optims, noise) + agent = TD3Agent( + env, args, hyper_params, models, optims, exploration_noise, target_policy_noise + ) # run if args.test:
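
Reviewer note (not part of the patch): below is a minimal usage sketch of the new `NoisyLinear` / `NoisyLinearConstructor` added in `algorithms/dqn/linear.py`, showing how the example configs swap it in for `nn.Linear`. It assumes only that this repository is on `PYTHONPATH` and PyTorch is installed; it illustrates the factorized-noise layer exactly as defined in this diff, and is not an official example script.

```python
import torch

from algorithms.dqn.linear import NoisyLinear, NoisyLinearConstructor

# A single noisy layer: effective weights are mu + sigma * epsilon, where
# epsilon is factorized Gaussian noise (outer product of per-input and
# per-output noise vectors), as built in NoisyLinear.reset_noise().
layer = NoisyLinear(in_features=4, out_features=2, std_init=0.5)
x = torch.randn(1, 4)

out_before = layer(x)
layer.reset_noise()           # re-sample epsilon
out_after = layer(x)          # same input, different (noisy) output
print(out_before, out_after)

# The constructor is what the example scripts pass as `linear_layer`
# (together with init_fn=identity), so MLP-based networks such as
# C51DuelingMLP or IQNMLP build NoisyLinear layers instead of nn.Linear
# and can re-sample noise after each update via NoisyMLPHandler.reset_noise().
make_noisy = NoisyLinearConstructor(std_init=0.5)
hidden = make_noisy(4, 16)    # equivalent to NoisyLinear(4, 16, std_init=0.5)
print(type(hidden).__name__)  # NoisyLinear
```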