Fast eval option #391

Merged · 28 commits · Jul 28, 2019
Changes from 22 commits
Commits (28)
67e06ee
remove extra try_scale_rewards from vec_env
kengz Jul 25, 2019
8548f49
refactor env base attr setting
kengz Jul 25, 2019
1d91fb1
fix spec ref in env base frame attr infer
kengz Jul 26, 2019
29df219
move total_reward calc to env from body. update from env.step
kengz Jul 26, 2019
6e2823c
move update total_reward from util to env.base
kengz Jul 26, 2019
ad71c8b
add info arg to _track_total_reward
kengz Jul 26, 2019
67c91fb
update obs wrapper API method
kengz Jul 27, 2019
799f94e
reorder wrappers
kengz Jul 27, 2019
1171e7c
add a TrackReward env wrapper for all openai envs
kengz Jul 27, 2019
7b77f25
update lab env total_reward update for openai and unity
kengz Jul 27, 2019
3b54df1
return random breakout spec to breakout
kengz Jul 27, 2019
8f54917
use nan as default value for total_reward for rigor
kengz Jul 27, 2019
809834c
simplify eval ckpt and gen return
kengz Jul 27, 2019
9d1f84d
update body.total_reward log to source from env
kengz Jul 27, 2019
f34ba41
refactor body ckpt
kengz Jul 27, 2019
221c839
reorder env base methods
kengz Jul 27, 2019
463a4c0
default eval_frequency and log_frequency to 10k, simplify code
kengz Jul 27, 2019
e353fa6
add backward compatible meta.rigorous_eval spec var
kengz Jul 27, 2019
e4ead38
missed ppo lam 0.75 for unused pong search spec
kengz Jul 27, 2019
15df4da
add ppo qbert spec
kengz Jul 27, 2019
a932d79
expand row size for calc
kengz Jul 27, 2019
bd2ab5f
cleaner handle calc_df_row warning
kengz Jul 27, 2019
31b1011
mute nanmean warning
kengz Jul 27, 2019
41059d6
add qbert specs
kengz Jul 27, 2019
795897d
use fast eval for atari benchmarks
kengz Jul 27, 2019
bc401b3
remove env.raw_reward attr
kengz Jul 28, 2019
ad564fa
restore reward_scale from vec to low level
kengz Jul 28, 2019
0e93254
remove unused import
kengz Jul 28, 2019
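
For context on the spec-level knobs these commits touch, here is a hypothetical 'meta' excerpt in Python dict form (actual spec files are JSON under slm_lab/spec/; the keys and 10k defaults come from the base.py diff below, while the rigorous_eval value shown is only a placeholder):

# Hypothetical spec 'meta' excerpt for illustration only; real specs are JSON files.
meta = {
    'distributed': False,     # when truthy, max_frame is divided by max_session (see base.py diff)
    'max_session': 4,
    'eval_frequency': 10000,  # new default set in BaseEnv.__init__
    'log_frequency': 10000,   # new default; previously None, i.e. log at episode end
    'rigorous_eval': 0,       # backward-compatible spec var; in eval lab modes it is read into env.num_envs
}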
46 changes: 18 additions & 28 deletions slm_lab/agent/__init__.py
@@ -91,13 +91,9 @@ def __init__(self, env, agent_spec, aeb=(0, 0, 0)):
self.mean_entropy = np.nan
self.mean_grad_norm = np.nan

self.epi_start = True
self.ckpt_total_reward = np.nan
self.total_reward = 0 # init to 0, but dont ckpt before end of an epi
# total_reward_ma from eval for model checkpoint saves
self.best_total_reward_ma = -np.inf
self.total_reward_ma = np.nan
# store current and best reward_ma for model checkpointing and early termination if all the environments are solved
self.best_reward_ma = -np.inf
self.eval_reward_ma = np.nan

# dataframes to track data for analysis.analyze_session
# track training data per episode
@@ -123,9 +119,7 @@ def __init__(self, env, agent_spec, aeb=(0, 0, 0)):

def update(self, state, action, reward, next_state, done):
'''Interface update method for body at agent.update()'''
if hasattr(self.env.u_env, 'raw_reward'): # use raw_reward if reward is preprocessed
reward = self.env.u_env.raw_reward
self.ckpt_total_reward, self.total_reward, self.epi_start = util.update_total_reward(self.ckpt_total_reward, self.total_reward, self.epi_start, reward, done)
pass

def __str__(self):
return f'body: {util.to_json(util.get_class_attr(self))}'
@@ -135,6 +129,8 @@ def calc_df_row(self, env):
frame = self.env.clock.get('frame')
wall_t = env.clock.get_elapsed_wall_t()
fps = 0 if wall_t == 0 else frame / wall_t
with np.errstate(all='ignore'):
total_reward = np.nanmean(env.total_reward) # guard for vec env

# update debugging variables
if net_util.to_check_train_step():
@@ -150,7 +146,7 @@
'opt_step': self.env.clock.get('opt_step'),
'frame': frame,
'fps': fps,
'total_reward': np.nanmean(self.total_reward), # guard for vec env
'total_reward': total_reward,
'total_reward_ma': np.nan, # update outside
'loss': self.loss,
'lr': self.get_mean_lr(),
@@ -162,24 +158,18 @@
assert all(col in self.train_df.columns for col in row.index), f'Mismatched row keys: {row.index} vs df columns {self.train_df.columns}'
return row

def train_ckpt(self):
'''Checkpoint to update body.train_df data'''
row = self.calc_df_row(self.env)
# append efficiently to df
self.train_df.loc[len(self.train_df)] = row
# update current reward_ma
self.total_reward_ma = self.train_df[-viz.PLOT_MA_WINDOW:]['total_reward'].mean()
self.train_df.iloc[-1]['total_reward_ma'] = self.total_reward_ma

def eval_ckpt(self, eval_env, total_reward):
'''Checkpoint to update body.eval_df data'''
row = self.calc_df_row(eval_env)
row['total_reward'] = total_reward
# append efficiently to df
self.eval_df.loc[len(self.eval_df)] = row
# update current reward_ma
self.eval_reward_ma = self.eval_df[-viz.PLOT_MA_WINDOW:]['total_reward'].mean()
self.eval_df.iloc[-1]['total_reward_ma'] = self.eval_reward_ma
def ckpt(self, env, df_mode):
'''
Checkpoint to update body.train_df or eval_df data
@param OpenAIEnv|UnityEnv:env self.env or self.eval_env
@param str:df_mode 'train' or 'eval'
'''
row = self.calc_df_row(env)
df = getattr(self, f'{df_mode}_df')
df.loc[len(df)] = row # append efficiently to df
df.iloc[-1]['total_reward_ma'] = total_reward_ma = df[-viz.PLOT_MA_WINDOW:]['total_reward'].mean()
if df_mode == 'eval':
self.total_reward_ma = total_reward_ma

def get_mean_lr(self):
'''Gets the average current learning rate of the algorithm's nets.'''
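
For reference, a standalone sketch (not part of the diff) of the bookkeeping the merged ckpt method performs; PLOT_MA_WINDOW = 100 is an assumption here, the real constant lives in slm_lab.lib.viz:

import numpy as np
import pandas as pd

PLOT_MA_WINDOW = 100  # assumed value for illustration
eval_df = pd.DataFrame({'frame': [10000, 20000, 30000],
                        'total_reward': [1.0, 3.0, 5.0],
                        'total_reward_ma': [np.nan, np.nan, np.nan]})

# the same update ckpt performs after appending a row: moving average over the last window
total_reward_ma = eval_df['total_reward'].iloc[-PLOT_MA_WINDOW:].mean()
eval_df.loc[eval_df.index[-1], 'total_reward_ma'] = total_reward_ma
print(total_reward_ma)  # 3.0, the mean of (1, 3, 5); in 'eval' mode this also becomes body.total_reward_ma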
2 changes: 1 addition & 1 deletion slm_lab/agent/algorithm/actor_critic.py
@@ -297,7 +297,7 @@ def train(self):
loss = policy_loss + val_loss
# reset
self.to_train = 0
logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}')
logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.env.total_reward}, loss: {loss:g}')
return loss.item()
else:
return np.nan
2 changes: 1 addition & 1 deletion slm_lab/agent/algorithm/dqn.py
@@ -144,7 +144,7 @@ def train(self):
loss = total_loss / (self.training_iter * self.training_batch_iter)
# reset
self.to_train = 0
logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}')
logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.env.total_reward}, loss: {loss:g}')
return loss.item()
else:
return np.nan
2 changes: 1 addition & 1 deletion slm_lab/agent/algorithm/ppo.py
@@ -199,7 +199,7 @@ def train(self):
loss = total_loss / self.training_epoch / len(minibatches)
# reset
self.to_train = 0
logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}')
logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.env.total_reward}, loss: {loss:g}')
return loss.item()
else:
return np.nan
2 changes: 1 addition & 1 deletion slm_lab/agent/algorithm/reinforce.py
@@ -159,7 +159,7 @@ def train(self):
self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
# reset
self.to_train = 0
logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}')
logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.env.total_reward}, loss: {loss:g}')
return loss.item()
else:
return np.nan
2 changes: 1 addition & 1 deletion slm_lab/agent/algorithm/sarsa.py
@@ -144,7 +144,7 @@ def train(self):
self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
# reset
self.to_train = 0
logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}')
logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.env.total_reward}, loss: {loss:g}')
return loss.item()
else:
return np.nan
2 changes: 1 addition & 1 deletion slm_lab/agent/algorithm/sil.py
@@ -145,7 +145,7 @@ def train(self):
total_sil_loss += sil_loss
sil_loss = total_sil_loss / self.training_iter
loss = super_loss + sil_loss
logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}')
logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.env.total_reward}, loss: {loss:g}')
return loss.item()
else:
return np.nan
67 changes: 42 additions & 25 deletions slm_lab/env/base.py
@@ -6,7 +6,6 @@
import pydash as ps
import time

NUM_EVAL = 8
logger = logger.get_logger(__name__)


@@ -87,20 +86,20 @@ class BaseEnv(ABC):
'''

def __init__(self, spec):
self.done = False
self.env_spec = spec['env'][0] # idx 0 for single-env
# set default
util.set_attr(self, dict(
log_frequency=None, # default to log at epi done
eval_frequency=10000,
log_frequency=10000,
frame_op=None,
frame_op_len=None,
normalize_state=False,
reward_scale=None,
num_envs=1,
))
util.set_attr(self, spec['meta'], [
'log_frequency',
'eval_frequency',
'log_frequency',
])
util.set_attr(self, self.env_spec, [
'name',
@@ -112,28 +111,15 @@ def __init__(self, spec):
'max_t',
'max_frame',
])
seq_len = ps.get(spec, 'agent.0.net.seq_len')
if seq_len is not None: # infer if using RNN
self.frame_op = 'stack'
self.frame_op_len = seq_len
if util.in_eval_lab_modes(): # use singleton for eval
self.num_envs = NUM_EVAL
self.log_frequency = 10000 # dummy
if spec['meta']['distributed'] != False: # divide max_frame for distributed
self.max_frame = int(self.max_frame / spec['meta']['max_session'])
self.is_venv = (self.num_envs is not None and self.num_envs > 1)
if self.is_venv:
assert self.log_frequency is not None, f'Specify log_frequency when using venv'
self.clock_speed = 1 * (self.num_envs or 1) # tick with a multiple of num_envs to properly count frames
self.clock = Clock(self.max_frame, self.clock_speed)
# override if env is for eval
if util.in_eval_lab_modes():
self.num_envs = ps.get(spec, 'meta.rigorous_eval')
self.to_render = util.to_render()

def _set_attr_from_u_env(self, u_env):
'''Set the observation, action dimensions and action type from u_env'''
self.observation_space, self.action_space = self._get_spaces(u_env)
self.observable_dim = self._get_observable_dim(self.observation_space)
self.action_dim = self._get_action_dim(self.action_space)
self.is_discrete = self._is_discrete(self.action_space)
self._infer_frame_attr(spec)
self._infer_venv_attr()
self._set_clock()
self.done = False
self.total_reward = np.nan

def _get_spaces(self, u_env):
'''Helper to set the extra attributes to, and get, observation and action spaces'''
@@ -163,10 +149,41 @@ def _get_action_dim(self, action_space):
raise ValueError('action_space not recognized')
return action_dim

def _infer_frame_attr(self, spec):
'''Infer frame attributes'''
seq_len = ps.get(spec, 'agent.0.net.seq_len')
if seq_len is not None: # infer if using RNN
self.frame_op = 'stack'
self.frame_op_len = seq_len
if spec['meta']['distributed'] != False: # divide max_frame for distributed
self.max_frame = int(self.max_frame / spec['meta']['max_session'])

def _infer_venv_attr(self):
'''Infer vectorized env attributes'''
self.is_venv = (self.num_envs is not None and self.num_envs > 1)

def _is_discrete(self, action_space):
'''Check if an action space is discrete'''
return util.get_class_name(action_space) != 'Box'

def _set_clock(self):
self.clock_speed = 1 * (self.num_envs or 1) # tick with a multiple of num_envs to properly count frames
self.clock = Clock(self.max_frame, self.clock_speed)

def _set_attr_from_u_env(self, u_env):
'''Set the observation, action dimensions and action type from u_env'''
self.observation_space, self.action_space = self._get_spaces(u_env)
self.observable_dim = self._get_observable_dim(self.observation_space)
self.action_dim = self._get_action_dim(self.action_space)
self.is_discrete = self._is_discrete(self.action_space)

def _update_total_reward(self, info):
'''Extract total_reward from info (set in wrapper) into self.total_reward for single and vec env'''
if isinstance(info, dict):
self.total_reward = info['total_reward']
else: # vec env tuple of infos
self.total_reward = np.array([i['total_reward'] for i in info])

@abstractmethod
@lab_api
def reset(self):
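
A standalone illustration (not part of the diff) of why env.total_reward can be an array and how the np.nanmean guard in Body.calc_df_row reduces it; the values are made up:

import numpy as np

# Single env: info is one dict. Vec env: info is a tuple of per-env dicts, so the extracted
# total_reward is an array; envs that have not finished an episode yet report nan.
info_single = {'total_reward': 21.0}
info_vec = ({'total_reward': 18.0}, {'total_reward': np.nan},
            {'total_reward': 24.0}, {'total_reward': np.nan})

def extract_total_reward(info):
    '''Same branching as BaseEnv._update_total_reward.'''
    if isinstance(info, dict):
        return info['total_reward']
    return np.array([i['total_reward'] for i in info])

total_reward = extract_total_reward(info_vec)
with np.errstate(all='ignore'):  # mirrors the guard used in calc_df_row around nanmean
    print(np.nanmean(total_reward))  # 21.0: nan entries from unfinished episodes are ignored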
1 change: 1 addition & 0 deletions slm_lab/env/openai.py
@@ -60,6 +60,7 @@ def step(self, action):
if not self.is_discrete and self.action_dim == 1: # guard for continuous with action_dim 1, make array
action = np.expand_dims(action, axis=-1)
state, reward, done, info = self.u_env.step(action)
self._update_total_reward(info)
if self.to_render:
self.u_env.render()
if not self.is_venv and self.clock.t > self.max_t:
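
The info['total_reward'] consumed by OpenAIEnv.step above is produced by the TrackReward wrapper added in commit 1171e7c. A hedged, self-contained sketch of what such a wrapper can look like (the real implementation lives in slm_lab/env/wrapper.py and may differ in details; the classic 4-tuple gym step API of 2019 is assumed):

import gym
import numpy as np

class TrackReward(gym.Wrapper):
    '''Sketch of an episode-return tracker: accumulate reward, and once an episode is done,
    expose its return through info['total_reward'] (nan until the first episode completes).'''

    def __init__(self, env):
        super().__init__(env)
        self.tracked_reward = 0.0
        self.total_reward = np.nan  # matches the nan default used for env.total_reward

    def step(self, action):
        state, reward, done, info = self.env.step(action)
        self.tracked_reward += reward
        if done:
            self.total_reward = self.tracked_reward
            self.tracked_reward = 0.0  # reset for the next episode
        info.update({'total_reward': self.total_reward})
        return state, reward, done, info

# usage sketch: env = TrackReward(gym.make('BreakoutNoFrameskip-v4'))

The unity.py change below implements the same tracking inline, since Unity envs do not go through the gym wrapper stack.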
11 changes: 10 additions & 1 deletion slm_lab/env/unity.py
@@ -69,6 +69,8 @@ def __init__(self, spec):
self.patch_gym_spaces(self.u_env)
self._set_attr_from_u_env(self.u_env)
assert self.max_t is not None
self.tracked_reward = 0
self.total_reward = 0
logger.info(util.self_desc(self))

def patch_gym_spaces(self, u_env):
@@ -145,7 +147,14 @@ def step(self, action):
if not self.is_venv and self.clock.t > self.max_t:
done = True
self.done = done
return state, reward, done, env_info_a
info = {'env_info': env_info_a}
# track total_reward
self.tracked_reward += reward
if done:
self.total_reward = self.tracked_reward
self.tracked_reward = 0 # reset
info.update({'total_reward': self.total_reward})
return state, reward, done, info

@lab_api
def close(self):
1 change: 0 additions & 1 deletion slm_lab/env/vec_env.py
@@ -480,7 +480,6 @@ def step_wait(self):
if self.is_stack:
obs = np.expand_dims(obs, axis=1)
self.stackedobs[:, -self.shape_dim0:] = obs
rews = try_scale_reward(self, rews)
return self.stackedobs.copy(), rews, news, infos

def reset(self):