# train_offline.py (forked from ryanxhr/IVR)
# NOTE: web-page residue (navigation text and the 1-121 line-number gutter)
# that preceded the script in this copy has been dropped.
import os
from typing import Tuple
from pathlib import Path
import gym
import numpy as np
import tqdm
from absl import app, flags
from ml_collections import config_flags
from dataset_utils import Log
import wandb
import wrappers
from dataset_utils import D4RLDataset, split_into_trajectories
from evaluation import evaluate
from learner import Learner
# Command-line interface of the training script. Every option is registered
# on the global absl flag registry and read back through FLAGS.
FLAGS = flags.FLAGS

# --- Environment, output location and seeding -------------------------------
flags.DEFINE_string(
    name='env_name',
    default='halfcheetah-expert-v2',
    help='Environment name.',
)
flags.DEFINE_string(
    name='save_dir',
    default='./results/',
    help='Tensorboard logging dir.',
)
flags.DEFINE_integer(name='seed', default=42, help='Random seed.')

# --- Evaluation / logging cadence and optimisation budget --------------------
flags.DEFINE_integer(
    name='eval_episodes',
    default=10,
    help='Number of episodes used for evaluation.',
)
flags.DEFINE_integer(
    name='log_interval',
    default=1000,
    help='Logging interval.',
)
flags.DEFINE_integer(
    name='eval_interval',
    default=10000,
    help='Eval interval.',
)
flags.DEFINE_integer(
    name='batch_size',
    default=256,
    help='Mini batch size.',
)
flags.DEFINE_integer(
    name='max_steps',
    default=1_000_000,
    help='Number of training steps.',
)

# --- Miscellaneous and algorithm selection -----------------------------------
flags.DEFINE_string(
    name='mix_dataset',
    default='None',
    help='mix the dataset',
)
flags.DEFINE_boolean(
    name='tqdm',
    default=True,
    help='Use tqdm progress bar.',
)
flags.DEFINE_string(
    name='alg',
    default='SQL',
    help='the training algorithm',
)
flags.DEFINE_float(name='alpha', default=1.0, help='temperature')

# Hyperparameter file; lock_config=False is passed so the loaded config is
# left unlocked.
config_flags.DEFINE_config_file(
    'config',
    'default.py',
    'File path to the training hyperparameter configuration.',
    lock_config=False,
)
def normalize(dataset):
    """Rescale ``dataset.rewards`` in place by the spread of trajectory returns.

    The dataset is split into trajectories with ``split_into_trajectories``.
    Every reward is then divided by the difference between the highest and
    the lowest undiscounted trajectory return, and multiplied by 1000.

    Args:
        dataset: Object exposing ``observations``, ``actions``, ``rewards``,
            ``masks``, ``dones_float`` and ``next_observations``. Its
            ``rewards`` attribute is modified in place.

    Raises:
        ValueError: If the dataset yields no trajectories, or if every
            trajectory has the same return. In the latter case the scale
            would be zero and the division would corrupt the rewards
            instead of normalizing them.
    """
    trajs = split_into_trajectories(dataset.observations, dataset.actions,
                                    dataset.rewards, dataset.masks,
                                    dataset.dones_float,
                                    dataset.next_observations)
    if not trajs:
        raise ValueError(
            'Cannot normalize rewards: the dataset contains no trajectories.')

    def compute_returns(traj):
        """Return the undiscounted sum of rewards along one trajectory."""
        episode_return = 0
        # Each transition is a 6-tuple whose third entry is the reward.
        for _, _, rew, _, _, _ in traj:
            episode_return += rew
        return episode_return

    # Only the extreme returns are needed, so take min/max directly instead
    # of sorting the whole list of trajectories.
    returns = [compute_returns(traj) for traj in trajs]
    return_range = max(returns) - min(returns)
    if return_range == 0:
        raise ValueError(
            'Cannot normalize rewards: all trajectories have the same return, '
            'so the normalization range is zero.')

    # Kept as two in-place steps so the arithmetic (and rounding) matches the
    # original divide-then-scale sequence.
    dataset.rewards /= return_range
    dataset.rewards *= 1000.0
def make_env_and_dataset(env_name: str,
                         seed: int) -> Tuple[gym.Env, D4RLDataset]:
    """Build the seeded environment and its reward-adjusted D4RL dataset.

    Args:
        env_name: Gym environment id passed to ``gym.make``. It also selects
            the reward preprocessing applied to the dataset.
        seed: Seed applied to the environment and to its action and
            observation spaces.

    Returns:
        A ``(env, dataset)`` tuple. ``env`` is wrapped in
        ``wrappers.EpisodeMonitor`` and ``wrappers.SinglePrecision``.
        ``dataset`` is the ``D4RLDataset`` built from that environment, with
        rewards adjusted as follows:

        * names containing ``'antmaze'``: rewards shifted by ``-1.0``;
        * names containing ``'halfcheetah'``, ``'walker2d'`` or ``'hopper'``:
          rewards rescaled by ``normalize``;
        * anything else: rewards left untouched.
    """
    env = gym.make(env_name)
    env = wrappers.EpisodeMonitor(env)
    env = wrappers.SinglePrecision(env)

    env.seed(seed)
    env.action_space.seed(seed)
    env.observation_space.seed(seed)

    dataset = D4RLDataset(env)

    # Branch on the ``env_name`` argument rather than the global flag, so the
    # preprocessing always matches the environment that was actually created.
    if 'antmaze' in env_name:
        # See https://github.com/aviralkumar2907/CQL/blob/master/d4rl/examples/cql_antmaze_new.py#L22
        # but I found no difference between (x - 0.5) * 4 and x - 1.0
        dataset.rewards -= 1.0
    elif any(task in env_name
             for task in ('halfcheetah', 'walker2d', 'hopper')):
        normalize(dataset)

    return env, dataset
def main(_):
    """Run offline training with periodic metric logging and evaluation.

    Builds the environment, dataset and ``Learner`` from the command-line
    flags, starts a wandb run and a ``Log``, then performs
    ``FLAGS.max_steps`` updates on batches sampled from the dataset.

    Args:
        _: Unused positional argument supplied by ``app.run``.
    """
    env, dataset = make_env_and_dataset(FLAGS.env_name, FLAGS.seed)

    # Learner hyperparameters: the config file contents plus two flag values.
    kwargs = dict(FLAGS.config)
    kwargs.update(alpha=FLAGS.alpha, alg=FLAGS.alg)

    # One sampled observation/action, each with a leading axis added, is
    # handed to the Learner constructor.
    example_observation = env.observation_space.sample()[np.newaxis]
    example_action = env.action_space.sample()[np.newaxis]
    agent = Learner(FLAGS.seed,
                    example_observation,
                    example_action,
                    max_steps=FLAGS.max_steps,
                    **kwargs)

    # These two entries are added only after the Learner is built, so they
    # end up in the logged run configuration but not in the Learner kwargs.
    kwargs.update(seed=FLAGS.seed, env_name=FLAGS.env_name)

    wandb.init(
        project='project_name',
        entity='your_wandb_id',
        name=f"{FLAGS.env_name}",
        config=kwargs,
    )

    log = Log(Path('benchmark')/FLAGS.env_name, kwargs)
    log(f'Log dir: {log.dir}')

    progress = tqdm.tqdm(range(1, FLAGS.max_steps + 1),
                         smoothing=0.1,
                         disable=not FLAGS.tqdm)
    for step in progress:
        update_info = agent.update(dataset.sample(FLAGS.batch_size))

        if step % FLAGS.log_interval == 0:
            wandb.log(update_info, step)

        # Nothing more to do on steps that are not evaluation steps.
        if step % FLAGS.eval_interval != 0:
            continue

        normalized_return = evaluate(FLAGS.env_name, agent, env,
                                     FLAGS.eval_episodes)
        log.row({'normalized_return': normalized_return})
        wandb.log({'normalized_return': normalized_return}, step)
# Script entry point: when run directly, hand `main` to absl's `app.run`.
if __name__ == '__main__':
    app.run(main)