Merge pull request #331 from kengz/reward
Reward preprocessing in wrapper; retire Atari-specific memories
kengz authored May 8, 2019
2 parents 779cc1e + 6b564bc commit 83dcba7
Showing 56 changed files with 175 additions and 144 deletions.
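
In short: reward preprocessing (sign-clipping or scaling) moves out of the Atari-specific memory classes and into the env wrapper layer, driven by the env spec's reward_scale field, which now also accepts 'sign'. A minimal sketch of the rule the new wrapper code applies (standalone NumPy; scale_reward is an illustrative name, not an SLM Lab function):

```python
import numpy as np

def scale_reward(reward, reward_scale):
    '''Illustrative restatement: 'sign' clips to -1/0/+1, a float multiplies, None is a no-op.'''
    if reward_scale is None:
        return reward
    if reward_scale == 'sign':
        return np.sign(reward)
    return reward * reward_scale

assert scale_reward(7.5, 'sign') == 1.0
assert scale_reward(-3.0, 0.5) == -1.5
assert scale_reward(2.0, None) == 2.0
```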
19 changes: 0 additions & 19 deletions slm_lab/agent/memory/onpolicy.py
@@ -144,22 +144,3 @@ def sample(self):
'dones' : dones}
'''
return super().sample()


class OnPolicyAtariReplay(OnPolicyReplay):
'''
Preprocesses an state to be the concatenation of the last four states, after converting the 210 x 160 x 3 image to 84 x 84 x 1 grayscale image, and clips all rewards to [-10, 10] as per "Playing Atari with Deep Reinforcement Learning", Mnih et al, 2013
Note: Playing Atari with Deep RL clips the rewards to + / - 1
Otherwise the same as OnPolicyReplay memory
'''

def add_experience(self, state, action, reward, next_state, done):
# clip reward, done here to minimize change to only training data data
super().add_experience(state, action, np.sign(reward), next_state, done)


class OnPolicyAtariBatchReplay(OnPolicyBatchReplay, OnPolicyAtariReplay):
'''
OnPolicyBatchReplay with Atari concat
'''
pass
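
Note that the retired add_experience clipped rewards with np.sign, i.e. to -1, 0, or +1, which is exactly the 'sign' mode of the new wrapper; the deleted docstring's mention of clipping to [-10, 10] never matched the code. A quick check:

```python
import numpy as np

rewards = np.array([-200.0, -0.5, 0.0, 3.0, 17.0])
print(np.sign(rewards))  # [-1. -1.  0.  1.  1.]
```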
7 changes: 1 addition & 6 deletions slm_lab/agent/memory/prioritized.py
@@ -1,4 +1,4 @@
from slm_lab.agent.memory.replay import Replay, AtariReplay
from slm_lab.agent.memory.replay import Replay
from slm_lab.lib import util
from slm_lab.lib.decorator import lab_api
import numpy as np
@@ -175,8 +175,3 @@ def update_priorities(self, errors):
self.priorities[idx] = p
for p, i in zip(priorities, self.tree_idxs):
self.tree.update(i, p)


class AtariPrioritizedReplay(PrioritizedReplay, AtariReplay):
'''Make a Atari PrioritizedReplay via nice multi-inheritance (python magic)'''
pass
11 changes: 0 additions & 11 deletions slm_lab/agent/memory/replay.py
@@ -151,14 +151,3 @@ def sample_idxs(self, batch_size):
if self.use_cer: # add the latest sample
batch_idxs[-1] = self.head
return batch_idxs


class AtariReplay(Replay):
'''
Preprocesses an state to be the concatenation of the last four states, after converting the 210 x 160 x 3 image to 84 x 84 x 1 grayscale image, and clips all rewards to [-10, 10] as per "Playing Atari with Deep Reinforcement Learning", Mnih et al, 2013
Note: Playing Atari with Deep RL clips the rewards to + / - 1
'''

def add_experience(self, state, action, reward, next_state, done):
# clip reward, done here to minimize change to only training data data
super().add_experience(state, action, np.sign(reward), next_state, done)
8 changes: 2 additions & 6 deletions slm_lab/env/openai.py
@@ -33,9 +33,9 @@ def __init__(self, spec, e=None, env_space=None):
try_register_env(spec) # register if it's a custom gym env
seed = ps.get(spec, 'meta.random_seed')
if self.is_venv: # make vector environment
self.u_env = make_gym_venv(self.name, seed, self.frame_op, self.frame_op_len, self.num_envs)
self.u_env = make_gym_venv(self.name, seed, self.frame_op, self.frame_op_len, self.reward_scale, self.num_envs)
else:
self.u_env = make_gym_env(self.name, seed, self.frame_op, self.frame_op_len)
self.u_env = make_gym_env(self.name, seed, self.frame_op, self.frame_op_len, self.reward_scale)
self._set_attr_from_u_env(self.u_env)
self.max_t = self.max_t or self.u_env.spec.max_episode_steps
assert self.max_t is not None
@@ -58,8 +58,6 @@ def step(self, action):
if not self.is_discrete and self.action_dim == 1: # guard for continuous with action_dim 1, make array
action = np.expand_dims(action, axis=-1)
state, reward, done, info = self.u_env.step(action)
if self.reward_scale is not None:
reward *= self.reward_scale
if self.to_render:
self.u_env.render()
if not self.is_venv and self.clock.t > self.max_t:
@@ -100,8 +98,6 @@ def space_step(self, action_e):
state, reward, done, info = self.u_env.step(action)
if done:
state = self.u_env.reset()
if self.reward_scale is not None:
reward *= self.reward_scale
if self.to_render:
self.u_env.render()
if not self.is_venv and self.clock.t > self.max_t:
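With the inline reward *= self.reward_scale removed, OpenAIEnv relies on the wrapper stack built by make_gym_env / make_gym_venv to return already-scaled rewards. A hedged usage sketch of the updated factory signatures (assumes SLM Lab and its Atari dependencies are installed; the env name and values mirror the specs changed below):

```python
from slm_lab.env.wrapper import make_gym_env
from slm_lab.env.vec_env import make_gym_venv

# single env: concatenated frames, rewards sign-clipped by the wrapper layer
env = make_gym_env('PongNoFrameskip-v4', seed=0, frame_op='concat', frame_op_len=4, reward_scale='sign')

# vector env: same options; frame stacking and reward scaling happen once at the vector level
venv = make_gym_venv('PongNoFrameskip-v4', seed=0, frame_op='concat', frame_op_len=4, reward_scale='sign', num_envs=16)
```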
11 changes: 5 additions & 6 deletions slm_lab/env/unity.py
@@ -1,6 +1,7 @@
from gym import spaces
from slm_lab.env.base import BaseEnv, ENV_DATA_NAMES, set_gym_space_attr
from slm_lab.env.registration import get_env_path
from slm_lab.env.wrapper import try_scale_reward
from slm_lab.lib import logger, util
from slm_lab.lib.decorator import lab_api
from unityagents import brain, UnityEnvironment
@@ -141,8 +142,7 @@ def step(self, action):
env_info_a = self._get_env_info(env_info_dict, a)
state = env_info_a.states[b]
reward = env_info_a.rewards[b]
if self.reward_scale is not None:
reward *= self.reward_scale
reward = try_scale_reward(self, reward)
done = env_info_a.local_done[b]
if not self.is_venv and self.clock.t > self.max_t:
done = True
@@ -187,10 +187,9 @@ def space_step(self, action_e):
for (a, b), body in util.ndenumerate_nonan(self.body_e):
env_info_a = self._get_env_info(env_info_dict, a)
state_e[(a, b)] = env_info_a.states[b]
reward = env_info_a.rewards[b]
if self.reward_scale is not None:
reward *= self.reward_scale
reward_e[(a, b)] = reward
rewards = env_info_a.rewards[b]
rewards = try_scale_reward(self, rewards)
reward_e[(a, b)] = rewards
done_e[(a, b)] = env_info_a.local_done[b]
info_e = env_info_dict
self.done = (util.nonan_all(done_e) or self.clock.t > self.max_t)
15 changes: 9 additions & 6 deletions slm_lab/env/vec_env.py
@@ -4,7 +4,7 @@
from collections import OrderedDict
from functools import partial
from gym import spaces
from slm_lab.env.wrapper import make_gym_env
from slm_lab.env.wrapper import make_gym_env, try_scale_reward
from slm_lab.lib import logger
import contextlib
import ctypes
@@ -450,11 +450,13 @@ def _decode_obses(self, obs):
class VecFrameStack(VecEnvWrapper):
'''Frame stack wrapper for vector environment'''

def __init__(self, venv, frame_op, frame_op_len):
def __init__(self, venv, frame_op, frame_op_len, reward_scale=None):
self.venv = venv
assert frame_op == 'concat', 'VecFrameStack only supports concat frame_op for now'
self.frame_op = frame_op
self.frame_op_len = frame_op_len
self.reward_scale = reward_scale
self.sign_reward = self.reward_scale == 'sign'
self.spec = venv.spec
wos = venv.observation_space # wrapped ob space
self.shape_dim0 = wos.shape[0]
@@ -471,6 +473,7 @@ def step_wait(self):
if new:
self.stackedobs[i] = 0
self.stackedobs[:, -self.shape_dim0:] = obs
rews = try_scale_reward(self, rews)
return self.stackedobs.copy(), rews, news, infos

def reset(self):
@@ -480,17 +483,17 @@ def reset(self):
return self.stackedobs.copy()


def make_gym_venv(name, seed=0, frame_op=None, frame_op_len=None, num_envs=4):
def make_gym_venv(name, seed=0, frame_op=None, frame_op_len=None, reward_scale=None, num_envs=4):
'''General method to create any parallel vectorized Gym env; auto wraps Atari'''
venv = [
# don't stack on individual env, but stack as vector
partial(make_gym_env, name, seed + i, frame_op=None, frame_op_len=None)
# don't concat frame or clip reward on individual env; do that at vector level
partial(make_gym_env, name, seed + i, frame_op=None, frame_op_len=None, reward_scale=None)
for i in range(num_envs)
]
if len(venv) > 1:
venv = ShmemVecEnv(venv, context='fork')
else:
venv = DummyVecEnv(venv)
if frame_op is not None:
venv = VecFrameStack(venv, frame_op, frame_op_len)
venv = VecFrameStack(venv, frame_op, frame_op_len, reward_scale)
return venv
37 changes: 28 additions & 9 deletions slm_lab/env/wrapper.py
@@ -8,6 +8,17 @@
import numpy as np


def try_scale_reward(cls, reward):
'''Env class to scale reward and set raw_reward'''
if cls.reward_scale is not None:
cls.raw_reward = reward
if cls.sign_reward:
reward = np.sign(reward)
else:
reward *= cls.reward_scale
return reward


class NoopResetEnv(gym.Wrapper):
def __init__(self, env, noop_max=30):
'''
@@ -130,10 +141,19 @@ def reset(self, **kwargs):
return self.env.reset(**kwargs)


class ClipRewardEnv(gym.RewardWrapper):
class ScaleRewardEnv(gym.RewardWrapper):
def __init__(self, env, reward_scale):
'''
Rescale reward
@param (str,float):reward_scale If 'sign', use np.sign, else multiply with the specified float scale
'''
gym.Wrapper.__init__(self, env)
self.reward_scale = reward_scale
self.sign_reward = self.reward_scale == 'sign'

def reward(self, reward):
'''Atari reward, to -1, 0 or +1. Not usually used as SLM Lab memory class does the clipping'''
return np.sign(reward)
'''Set self.raw_reward for retrieving the original reward'''
return try_scale_reward(self, reward)


class PreprocessImage(gym.ObservationWrapper):
@@ -241,14 +261,12 @@ def wrap_atari(env):
return env


def wrap_deepmind(env, episode_life=True, clip_rewards=True, stack_len=None):
def wrap_deepmind(env, episode_life=True, stack_len=None):
'''Wrap Atari environment DeepMind-style'''
if episode_life:
env = EpisodicLifeEnv(env)
if 'FIRE' in env.unwrapped.get_action_meanings():
env = FireResetEnv(env)
if clip_rewards:
env = ClipRewardEnv(env)
env = PreprocessImage(env)
if stack_len is not None: # use concat for image (1, 84, 84)
env = FrameStack(env, 'concat', stack_len)
@@ -263,20 +281,21 @@ def wrap_image_env(env, stack_len=None):
return env


def make_gym_env(name, seed=None, frame_op=None, frame_op_len=None):
def make_gym_env(name, seed=None, frame_op=None, frame_op_len=None, reward_scale=None):
'''General method to create any Gym env; auto wraps Atari'''
env = gym.make(name)
if seed is not None:
env.seed(seed)
if 'NoFrameskip' in env.spec.id: # Atari
env = wrap_atari(env)
# no reward clipping to allow monitoring; Atari memory clips it
clip_rewards = False
episode_life = util.get_lab_mode() != 'eval'
env = wrap_deepmind(env, clip_rewards, episode_life, frame_op_len)
env = wrap_deepmind(env, episode_life, frame_op_len)
elif len(env.observation_space.shape) == 3: # image-state env
env = wrap_image_env(env, frame_op_len)
else: # vector-state env
if frame_op is not None:
env = FrameStack(env, frame_op, frame_op_len)
if reward_scale is not None:
env = ScaleRewardEnv(env, reward_scale)
return env
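
A minimal check of the new try_scale_reward helper, including the raw_reward side effect that monitor.py reads back below (assumes SLM Lab is importable; DummyEnv is an illustrative stand-in, not part of the library):

```python
import numpy as np
from slm_lab.env.wrapper import try_scale_reward

class DummyEnv:
    '''Stand-in exposing the two attributes try_scale_reward expects'''
    reward_scale = 'sign'
    sign_reward = True

env = DummyEnv()
rews = np.array([0.0, -4.0, 12.0])
print(try_scale_reward(env, rews))  # [ 0. -1.  1.]  scaled values used for training
print(env.raw_reward)               # [ 0. -4. 12.]  originals kept for monitoring
```

ScaleRewardEnv and VecFrameStack both delegate to this helper, so single and vector envs share the same scaling behavior.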
2 changes: 2 additions & 0 deletions slm_lab/experiment/monitor.py
@@ -140,6 +140,8 @@ def __init__(self, env, agent_spec, aeb=(0, 0, 0), aeb_space=None):

def update(self, state, action, reward, next_state, done):
'''Interface update method for body at agent.update()'''
if self.env.reward_scale is not None:
reward = self.env.u_env.raw_reward
if self.ckpt_total_reward is np.nan: # init
self.ckpt_total_reward = reward
else: # reset on epi_start, else keep adding. generalized for vec env
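The monitoring change above keeps reported returns in the env's original units: when reward_scale is set, the update() hook logs the wrapper's raw_reward instead of the scaled value the agent trains on. A hedged restatement of that check (reward_for_logging is an illustrative name):

```python
def reward_for_logging(reward_scale, scaled_reward, raw_reward):
    # mirror of the added check: log the raw reward whenever scaling is active
    return raw_reward if reward_scale is not None else scaled_reward

assert reward_for_logging('sign', 1.0, 17.0) == 17.0
assert reward_for_logging(None, 2.0, None) == 2.0
```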
3 changes: 2 additions & 1 deletion slm_lab/spec/experimental/a2c.json
@@ -798,7 +798,7 @@
"normalize_state": false
},
"memory": {
"name": "OnPolicyAtariReplay",
"name": "OnPolicyReplay",
},
"net": {
"type": "ConvNet",
@@ -833,6 +833,7 @@
"name": "PongNoFrameskip-v4",
"frame_op": "concat",
"frame_op_len": 4,
"reward_scale": "sign",
"max_t": null,
"max_tick": 10000000,
}],
3 changes: 2 additions & 1 deletion slm_lab/spec/experimental/a2c_pong.json
@@ -23,7 +23,7 @@
"normalize_state": false
},
"memory": {
"name": "OnPolicyAtariBatchReplay",
"name": "OnPolicyBatchReplay",
},
"net": {
"type": "ConvNet",
@@ -63,6 +63,7 @@
"name": "PongNoFrameskip-v4",
"frame_op": "concat",
"frame_op_len": 4,
"reward_scale": "sign",
"num_envs": 16,
"max_t": null,
"max_tick": 1e7
6 changes: 4 additions & 2 deletions slm_lab/spec/experimental/ddqn.json
@@ -379,7 +379,7 @@
"normalize_state": false
},
"memory": {
"name": "AtariReplay",
"name": "Replay",
"batch_size": 32,
"max_size": 250000,
"use_cer": true
@@ -421,6 +421,7 @@
"name": "BreakoutDeterministic-v4",
"frame_op": "concat",
"frame_op_len": 4,
"reward_scale": "sign",
"max_t": null,
"max_tick": 50000,
}],
@@ -459,7 +460,7 @@
"normalize_state": false
},
"memory": {
"name": "AtariReplay",
"name": "Replay",
"batch_size": 32,
"max_size": 250000,
"use_cer": true
@@ -501,6 +502,7 @@
"name": "BreakoutDeterministic-v4",
"frame_op": "concat",
"frame_op_len": 4,
"reward_scale": "sign",
"max_t": null,
"max_tick": 50000,
}],
3 changes: 2 additions & 1 deletion slm_lab/spec/experimental/ddqn_beamrider.json
@@ -21,7 +21,7 @@
"normalize_state": false
},
"memory": {
"name": "AtariReplay",
"name": "Replay",
"batch_size": 32,
"max_size": 200000,
"use_cer": false,
@@ -55,6 +55,7 @@
"name": "BeamRiderNoFrameskip-v4",
"frame_op": "concat",
"frame_op_len": 4,
"reward_scale": "sign",
"max_t": null,
"max_tick": 10000000
}],
3 changes: 2 additions & 1 deletion slm_lab/spec/experimental/ddqn_breakout.json
@@ -21,7 +21,7 @@
"normalize_state": false
},
"memory": {
"name": "AtariReplay",
"name": "Replay",
"batch_size": 32,
"max_size": 200000,
"use_cer": false
@@ -55,6 +55,7 @@
"name": "BreakoutNoFrameskip-v4",
"frame_op": "concat",
"frame_op_len": 4,
"reward_scale": "sign",
"max_t": null,
"max_tick": 10000000
}],
3 changes: 2 additions & 1 deletion slm_lab/spec/experimental/ddqn_enduro.json
@@ -21,7 +21,7 @@
"normalize_state": false
},
"memory": {
"name": "AtariReplay",
"name": "Replay",
"batch_size": 32,
"max_size": 200000,
"use_cer": false
@@ -55,6 +55,7 @@
"name": "EnduroNoFrameskip-v4",
"frame_op": "concat",
"frame_op_len": 4,
"reward_scale": "sign",
"max_t": null,
"max_tick": 10000000
}],
3 changes: 2 additions & 1 deletion slm_lab/spec/experimental/ddqn_mspacman.json
@@ -21,7 +21,7 @@
"normalize_state": false
},
"memory": {
"name": "AtariReplay",
"name": "Replay",
"batch_size": 32,
"max_size": 200000,
"use_cer": false
@@ -55,6 +55,7 @@
"name": "MsPacmanNoFrameskip-v4",
"frame_op": "concat",
"frame_op_len": 4,
"reward_scale": "sign",
"max_t": null,
"max_tick": 10000000
}],
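The spec updates above all follow the same two-line migration: the memory name drops its Atari-specific variant (OnPolicyAtariReplay to OnPolicyReplay, OnPolicyAtariBatchReplay to OnPolicyBatchReplay, AtariReplay to Replay) and the env block gains "reward_scale": "sign". An illustrative env fragment, written as a Python dict for brevity (values mirror the Pong specs; not a complete spec):

```python
env_spec = {
    'name': 'PongNoFrameskip-v4',
    'frame_op': 'concat',
    'frame_op_len': 4,
    'reward_scale': 'sign',  # replaces the clipping formerly done inside the Atari memory classes
    'max_t': None,
    'max_tick': 1e7,
}
```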

