Reward preprocessing in wrapper; retire Atari-specific memories #331

Merged 6 commits on May 8, 2019
slm_lab/agent/memory/onpolicy.py (0 additions, 19 deletions)
@@ -144,22 +144,3 @@ def sample(self):
'dones' : dones}
'''
return super().sample()


class OnPolicyAtariReplay(OnPolicyReplay):
'''
    Preprocesses a state to be the concatenation of the last four states, after converting the 210 x 160 x 3 image to an 84 x 84 x 1 grayscale image, and clips all rewards to [-10, 10] as per "Playing Atari with Deep Reinforcement Learning", Mnih et al., 2013
    Note: Playing Atari with Deep RL clips the rewards to +/-1
Otherwise the same as OnPolicyReplay memory
'''

def add_experience(self, state, action, reward, next_state, done):
        # clip reward; done here so that only the training data is changed
super().add_experience(state, action, np.sign(reward), next_state, done)


class OnPolicyAtariBatchReplay(OnPolicyBatchReplay, OnPolicyAtariReplay):
'''
OnPolicyBatchReplay with Atari concat
'''
pass
slm_lab/agent/memory/prioritized.py (1 addition, 6 deletions)
@@ -1,4 +1,4 @@
from slm_lab.agent.memory.replay import Replay, AtariReplay
from slm_lab.agent.memory.replay import Replay
from slm_lab.lib import util
from slm_lab.lib.decorator import lab_api
import numpy as np
@@ -175,8 +175,3 @@ def update_priorities(self, errors):
self.priorities[idx] = p
for p, i in zip(priorities, self.tree_idxs):
self.tree.update(i, p)


class AtariPrioritizedReplay(PrioritizedReplay, AtariReplay):
    '''Make an Atari PrioritizedReplay via nice multi-inheritance (python magic)'''
pass
slm_lab/agent/memory/replay.py (0 additions, 11 deletions)
@@ -151,14 +151,3 @@ def sample_idxs(self, batch_size):
if self.use_cer: # add the latest sample
batch_idxs[-1] = self.head
return batch_idxs


class AtariReplay(Replay):
'''
    Preprocesses a state to be the concatenation of the last four states, after converting the 210 x 160 x 3 image to an 84 x 84 x 1 grayscale image, and clips all rewards to [-10, 10] as per "Playing Atari with Deep Reinforcement Learning", Mnih et al., 2013
    Note: Playing Atari with Deep RL clips the rewards to +/-1
'''

def add_experience(self, state, action, reward, next_state, done):
        # clip reward; done here so that only the training data is changed
super().add_experience(state, action, np.sign(reward), next_state, done)
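
The Atari memory classes retired above only added one behavior: applying np.sign to the reward in add_experience (the batch and prioritized variants were pure multi-inheritance pass-throughs). After this PR the plain memories store rewards as received, and the sign-clipping is requested through the env spec instead, so the stored values are unchanged. A minimal standalone sketch of that equivalence (illustrative reward values, no SLM Lab imports):

```python
import numpy as np

rewards = [3.0, 0.0, -7.0]  # made-up raw env rewards

# before: AtariReplay.add_experience clipped on write
stored_by_old_memory = [np.sign(r) for r in rewards]

# after: with "reward_scale": "sign" in the env spec, try_scale_reward applies
# np.sign inside the env wrapper, so the plain Replay memory stores the same values
stored_by_new_memory = [np.sign(r) for r in rewards]

assert stored_by_old_memory == stored_by_new_memory == [1.0, 0.0, -1.0]
```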
slm_lab/env/openai.py (2 additions, 6 deletions)
@@ -33,9 +33,9 @@ def __init__(self, spec, e=None, env_space=None):
try_register_env(spec) # register if it's a custom gym env
seed = ps.get(spec, 'meta.random_seed')
if self.is_venv: # make vector environment
self.u_env = make_gym_venv(self.name, seed, self.frame_op, self.frame_op_len, self.num_envs)
self.u_env = make_gym_venv(self.name, seed, self.frame_op, self.frame_op_len, self.reward_scale, self.num_envs)
else:
self.u_env = make_gym_env(self.name, seed, self.frame_op, self.frame_op_len)
self.u_env = make_gym_env(self.name, seed, self.frame_op, self.frame_op_len, self.reward_scale)
self._set_attr_from_u_env(self.u_env)
self.max_t = self.max_t or self.u_env.spec.max_episode_steps
assert self.max_t is not None
@@ -58,8 +58,6 @@ def step(self, action):
if not self.is_discrete and self.action_dim == 1: # guard for continuous with action_dim 1, make array
action = np.expand_dims(action, axis=-1)
state, reward, done, info = self.u_env.step(action)
if self.reward_scale is not None:
reward *= self.reward_scale
if self.to_render:
self.u_env.render()
if not self.is_venv and self.clock.t > self.max_t:
@@ -100,8 +98,6 @@ def space_step(self, action_e):
state, reward, done, info = self.u_env.step(action)
if done:
state = self.u_env.reset()
if self.reward_scale is not None:
reward *= self.reward_scale
if self.to_render:
self.u_env.render()
if not self.is_venv and self.clock.t > self.max_t:
slm_lab/env/unity.py (5 additions, 6 deletions)
@@ -1,6 +1,7 @@
from gym import spaces
from slm_lab.env.base import BaseEnv, ENV_DATA_NAMES, set_gym_space_attr
from slm_lab.env.registration import get_env_path
from slm_lab.env.wrapper import try_scale_reward
from slm_lab.lib import logger, util
from slm_lab.lib.decorator import lab_api
from unityagents import brain, UnityEnvironment
@@ -141,8 +142,7 @@ def step(self, action):
env_info_a = self._get_env_info(env_info_dict, a)
state = env_info_a.states[b]
reward = env_info_a.rewards[b]
if self.reward_scale is not None:
reward *= self.reward_scale
reward = try_scale_reward(self, reward)
done = env_info_a.local_done[b]
if not self.is_venv and self.clock.t > self.max_t:
done = True
@@ -187,10 +187,9 @@ def space_step(self, action_e):
for (a, b), body in util.ndenumerate_nonan(self.body_e):
env_info_a = self._get_env_info(env_info_dict, a)
state_e[(a, b)] = env_info_a.states[b]
reward = env_info_a.rewards[b]
if self.reward_scale is not None:
reward *= self.reward_scale
reward_e[(a, b)] = reward
rewards = env_info_a.rewards[b]
rewards = try_scale_reward(self, rewards)
reward_e[(a, b)] = rewards
done_e[(a, b)] = env_info_a.local_done[b]
info_e = env_info_dict
self.done = (util.nonan_all(done_e) or self.clock.t > self.max_t)
slm_lab/env/vec_env.py (9 additions, 6 deletions)
@@ -4,7 +4,7 @@
from collections import OrderedDict
from functools import partial
from gym import spaces
from slm_lab.env.wrapper import make_gym_env
from slm_lab.env.wrapper import make_gym_env, try_scale_reward
from slm_lab.lib import logger
import contextlib
import ctypes
@@ -450,11 +450,13 @@ def _decode_obses(self, obs):
class VecFrameStack(VecEnvWrapper):
'''Frame stack wrapper for vector environment'''

def __init__(self, venv, frame_op, frame_op_len):
def __init__(self, venv, frame_op, frame_op_len, reward_scale=None):
self.venv = venv
assert frame_op == 'concat', 'VecFrameStack only supports concat frame_op for now'
self.frame_op = frame_op
self.frame_op_len = frame_op_len
self.reward_scale = reward_scale
self.sign_reward = self.reward_scale == 'sign'
self.spec = venv.spec
wos = venv.observation_space # wrapped ob space
self.shape_dim0 = wos.shape[0]
@@ -471,6 +473,7 @@ def step_wait(self):
if new:
self.stackedobs[i] = 0
self.stackedobs[:, -self.shape_dim0:] = obs
rews = try_scale_reward(self, rews)
return self.stackedobs.copy(), rews, news, infos

def reset(self):
Expand All @@ -480,17 +483,17 @@ def reset(self):
return self.stackedobs.copy()


def make_gym_venv(name, seed=0, frame_op=None, frame_op_len=None, num_envs=4):
def make_gym_venv(name, seed=0, frame_op=None, frame_op_len=None, reward_scale=None, num_envs=4):
'''General method to create any parallel vectorized Gym env; auto wraps Atari'''
venv = [
# don't stack on individual env, but stack as vector
partial(make_gym_env, name, seed + i, frame_op=None, frame_op_len=None)
        # don't stack frames or scale rewards on the individual envs; do that at the vector level
partial(make_gym_env, name, seed + i, frame_op=None, frame_op_len=None, reward_scale=None)
for i in range(num_envs)
]
if len(venv) > 1:
venv = ShmemVecEnv(venv, context='fork')
else:
venv = DummyVecEnv(venv)
if frame_op is not None:
venv = VecFrameStack(venv, frame_op, frame_op_len)
venv = VecFrameStack(venv, frame_op, frame_op_len, reward_scale)
return venv
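
On the vectorized path, reward scaling rides along with frame stacking: VecFrameStack now carries reward_scale/sign_reward and calls try_scale_reward on the batched rewards in step_wait, with make_gym_venv as the single entry point. A rough usage sketch, assuming the baselines-style VecEnv API (reset/step/close, shared action_space) that these wrappers implement; the env name and numbers are illustrative:

```python
from slm_lab.env.vec_env import make_gym_venv

# 4 Pong workers; frames are concatenated and rewards sign-clipped at the vector level
venv = make_gym_venv('PongNoFrameskip-v4', seed=0, frame_op='concat',
                     frame_op_len=4, reward_scale='sign', num_envs=4)
obs = venv.reset()
actions = [venv.action_space.sample() for _ in range(4)]
obs, rews, dones, infos = venv.step(actions)
# rews are now in {-1, 0, 1}; the pre-scaling rewards are kept on venv.raw_reward
venv.close()
```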
slm_lab/env/wrapper.py (28 additions, 9 deletions)
@@ -8,6 +8,17 @@
import numpy as np


def try_scale_reward(cls, reward):
    '''Scale the reward for an env (or vec-env wrapper) instance and store the unscaled value on raw_reward'''
if cls.reward_scale is not None:
cls.raw_reward = reward
if cls.sign_reward:
reward = np.sign(reward)
else:
reward *= cls.reward_scale
return reward


class NoopResetEnv(gym.Wrapper):
def __init__(self, env, noop_max=30):
'''
@@ -130,10 +141,19 @@ def reset(self, **kwargs):
return self.env.reset(**kwargs)


class ClipRewardEnv(gym.RewardWrapper):
class ScaleRewardEnv(gym.RewardWrapper):
def __init__(self, env, reward_scale):
'''
Rescale reward
        @param (str,float):reward_scale If 'sign', apply np.sign; otherwise multiply by the given float scale
'''
gym.Wrapper.__init__(self, env)
self.reward_scale = reward_scale
self.sign_reward = self.reward_scale == 'sign'

def reward(self, reward):
'''Atari reward, to -1, 0 or +1. Not usually used as SLM Lab memory class does the clipping'''
return np.sign(reward)
'''Set self.raw_reward for retrieving the original reward'''
return try_scale_reward(self, reward)


class PreprocessImage(gym.ObservationWrapper):
@@ -241,14 +261,12 @@ def wrap_atari(env):
return env


def wrap_deepmind(env, episode_life=True, clip_rewards=True, stack_len=None):
def wrap_deepmind(env, episode_life=True, stack_len=None):
'''Wrap Atari environment DeepMind-style'''
if episode_life:
env = EpisodicLifeEnv(env)
if 'FIRE' in env.unwrapped.get_action_meanings():
env = FireResetEnv(env)
if clip_rewards:
env = ClipRewardEnv(env)
env = PreprocessImage(env)
if stack_len is not None: # use concat for image (1, 84, 84)
env = FrameStack(env, 'concat', stack_len)
@@ -263,20 +281,21 @@ def wrap_image_env(env, stack_len=None):
return env


def make_gym_env(name, seed=None, frame_op=None, frame_op_len=None):
def make_gym_env(name, seed=None, frame_op=None, frame_op_len=None, reward_scale=None):
'''General method to create any Gym env; auto wraps Atari'''
env = gym.make(name)
if seed is not None:
env.seed(seed)
if 'NoFrameskip' in env.spec.id: # Atari
env = wrap_atari(env)
# no reward clipping to allow monitoring; Atari memory clips it
clip_rewards = False
episode_life = util.get_lab_mode() != 'eval'
env = wrap_deepmind(env, clip_rewards, episode_life, frame_op_len)
env = wrap_deepmind(env, episode_life, frame_op_len)
elif len(env.observation_space.shape) == 3: # image-state env
env = wrap_image_env(env, frame_op_len)
else: # vector-state env
if frame_op is not None:
env = FrameStack(env, frame_op, frame_op_len)
if reward_scale is not None:
env = ScaleRewardEnv(env, reward_scale)
return env
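
For a single non-Atari env, the same behavior comes from ScaleRewardEnv, which make_gym_env now appends when reward_scale is set: try_scale_reward multiplies (or sign-clips) the reward and stashes the untouched value on raw_reward. A small usage sketch, assuming a classic-control env such as CartPole-v0 is installed; its fixed +1 per-step reward makes the scaling easy to see:

```python
import gym
from slm_lab.env.wrapper import ScaleRewardEnv

env = ScaleRewardEnv(gym.make('CartPole-v0'), reward_scale=0.5)
env.reset()
state, reward, done, info = env.step(env.action_space.sample())
assert reward == 0.5          # scaled reward, what the agent and memory will see
assert env.raw_reward == 1.0  # original reward, kept for monitoring/logging

# reward_scale='sign' would return np.sign(reward) instead, reproducing what the
# retired Atari memories used to do in add_experience
env.close()
```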
slm_lab/experiment/monitor.py (2 additions, 0 deletions)
@@ -140,6 +140,8 @@ def __init__(self, env, agent_spec, aeb=(0, 0, 0), aeb_space=None):

def update(self, state, action, reward, next_state, done):
'''Interface update method for body at agent.update()'''
if self.env.reward_scale is not None:
reward = self.env.u_env.raw_reward
if self.ckpt_total_reward is np.nan: # init
self.ckpt_total_reward = reward
else: # reset on epi_start, else keep adding. generalized for vec env
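
The monitor piece closes the loop: because the wrappers keep the unscaled value on raw_reward, Body.update can log the true episode return while the agent trains on the scaled reward. A self-contained mock of that lookup (not SLM Lab's actual classes, just the two added lines in miniature):

```python
class MockWrappedEnv:
    raw_reward = -15.0       # unscaled reward that try_scale_reward stored this step

class MockEnv:
    reward_scale = 'sign'    # from the spec
    u_env = MockWrappedEnv()

env = MockEnv()
scaled_reward = -1.0         # what the agent/memory received

# mirror of the new lines in Body.update: log the raw value when scaling is on
reward_to_log = env.u_env.raw_reward if env.reward_scale is not None else scaled_reward
assert reward_to_log == -15.0
```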
slm_lab/spec/experimental/a2c.json (2 additions, 1 deletion)
@@ -798,7 +798,7 @@
"normalize_state": false
},
"memory": {
"name": "OnPolicyAtariReplay",
"name": "OnPolicyReplay",
},
"net": {
"type": "ConvNet",
@@ -833,6 +833,7 @@
"name": "PongNoFrameskip-v4",
"frame_op": "concat",
"frame_op_len": 4,
"reward_scale": "sign",
"max_t": null,
"max_tick": 10000000,
}],
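
Every spec file in this PR follows the same two-line migration: the memory name drops its Atari prefix and the env entry gains reward_scale. A pruned fragment of what the Atari sections now look like, written as a Python dict for illustration (not a complete runnable spec):

```python
spec_fragment = {
    'memory': {
        'name': 'OnPolicyReplay',   # was 'OnPolicyAtariReplay'
    },
    'env': [{
        'name': 'PongNoFrameskip-v4',
        'frame_op': 'concat',
        'frame_op_len': 4,
        'reward_scale': 'sign',     # replaces the memory-side np.sign clipping
        'max_t': None,
        'max_tick': 10000000,
    }],
}
```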
slm_lab/spec/experimental/a2c_pong.json (2 additions, 1 deletion)
@@ -23,7 +23,7 @@
"normalize_state": false
},
"memory": {
"name": "OnPolicyAtariBatchReplay",
"name": "OnPolicyBatchReplay",
},
"net": {
"type": "ConvNet",
@@ -63,6 +63,7 @@
"name": "PongNoFrameskip-v4",
"frame_op": "concat",
"frame_op_len": 4,
"reward_scale": "sign",
"num_envs": 16,
"max_t": null,
"max_tick": 1e7
slm_lab/spec/experimental/ddqn.json (4 additions, 2 deletions)
@@ -379,7 +379,7 @@
"normalize_state": false
},
"memory": {
"name": "AtariReplay",
"name": "Replay",
"batch_size": 32,
"max_size": 250000,
"use_cer": true
@@ -421,6 +421,7 @@
"name": "BreakoutDeterministic-v4",
"frame_op": "concat",
"frame_op_len": 4,
"reward_scale": "sign",
"max_t": null,
"max_tick": 50000,
}],
@@ -459,7 +460,7 @@
"normalize_state": false
},
"memory": {
"name": "AtariReplay",
"name": "Replay",
"batch_size": 32,
"max_size": 250000,
"use_cer": true
@@ -501,6 +502,7 @@
"name": "BreakoutDeterministic-v4",
"frame_op": "concat",
"frame_op_len": 4,
"reward_scale": "sign",
"max_t": null,
"max_tick": 50000,
}],
slm_lab/spec/experimental/ddqn_beamrider.json (2 additions, 1 deletion)
@@ -21,7 +21,7 @@
"normalize_state": false
},
"memory": {
"name": "AtariReplay",
"name": "Replay",
"batch_size": 32,
"max_size": 200000,
"use_cer": false,
@@ -55,6 +55,7 @@
"name": "BeamRiderNoFrameskip-v4",
"frame_op": "concat",
"frame_op_len": 4,
"reward_scale": "sign",
"max_t": null,
"max_tick": 10000000
}],
slm_lab/spec/experimental/ddqn_breakout.json (2 additions, 1 deletion)
@@ -21,7 +21,7 @@
"normalize_state": false
},
"memory": {
"name": "AtariReplay",
"name": "Replay",
"batch_size": 32,
"max_size": 200000,
"use_cer": false
@@ -55,6 +55,7 @@
"name": "BreakoutNoFrameskip-v4",
"frame_op": "concat",
"frame_op_len": 4,
"reward_scale": "sign",
"max_t": null,
"max_tick": 10000000
}],
slm_lab/spec/experimental/ddqn_enduro.json (2 additions, 1 deletion)
@@ -21,7 +21,7 @@
"normalize_state": false
},
"memory": {
"name": "AtariReplay",
"name": "Replay",
"batch_size": 32,
"max_size": 200000,
"use_cer": false
@@ -55,6 +55,7 @@
"name": "EnduroNoFrameskip-v4",
"frame_op": "concat",
"frame_op_len": 4,
"reward_scale": "sign",
"max_t": null,
"max_tick": 10000000
}],
slm_lab/spec/experimental/ddqn_mspacman.json (2 additions, 1 deletion)
@@ -21,7 +21,7 @@
"normalize_state": false
},
"memory": {
"name": "AtariReplay",
"name": "Replay",
"batch_size": 32,
"max_size": 200000,
"use_cer": false
@@ -55,6 +55,7 @@
"name": "MsPacmanNoFrameskip-v4",
"frame_op": "concat",
"frame_op_len": 4,
"reward_scale": "sign",
"max_t": null,
"max_tick": 10000000
}],