# obj.py
from os import path

import gym
import mlflow
import numpy as np
import torch
from mpi4py import MPI

import src.core.es as es
from src.core.noisetable import NoiseTable
from src.core.policy import Policy
from src.gym import gym_runner
from src.gym.training_result import TrainingResult, RewardResult
from src.nn.nn import FeedForward, BaseNet
from src.nn.optimizers import Adam
from src.utils import utils
from src.utils.rankers import CenteredRanker, EliteRanker
from src.utils.reporters import LoggerReporter, StdoutReporter, MLFlowReporter, DefaultMpiReporterSet
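
# Hypothetical sketch of the config this script expects. The field names below are taken
# directly from the cfg.* accesses in main(); the values and the file layout are
# illustrative only, since the real schema is whatever utils.load_config produces:
#
#   general:      name: test, seed: 0, mlflow: false, eps_per_policy: 1, gens: 100
#   env:          name: Hopper-v2, max_steps: 1000
#   policy:       layer_sizes: [256, 256], ac_std: 0.01, ac_std_decay: 1, ob_clip: 5,
#                 lr: 0.01, lr_decay: 1, lr_limit: 0.001, save_obs_chance: 0.01,
#                 load: <optional path to a saved Policy>
#   noise:        std: 0.02, std_decay: 1, std_limit: 0.01, tbl_size: 250000000
#   experimental: elite: 0, max_time_since_best: 10, explore_with_large_noise: false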


def main(cfg):
    comm: MPI.Comm = MPI.COMM_WORLD

    full_name = f'{cfg.env.name}-{cfg.general.name}'
    mlflow_reporter = MLFlowReporter(comm, cfg) if cfg.general.mlflow else None
    reporter = DefaultMpiReporterSet(comm, full_name,
                                     LoggerReporter(comm, full_name),
                                     StdoutReporter(comm),
                                     mlflow_reporter)

    env: gym.Env = gym.make(cfg.env.name)

    # seeding
    rs, my_seed, global_seed = utils.seed(comm, cfg.general.seed, env)
    all_seeds = comm.alltoall([my_seed] * comm.size)  # simply for saving the seeds used on each proc
    reporter.print(f'seeds:{all_seeds}')

    # initializing policy, optimizer, noise and env
    if 'load' in cfg.policy:
        policy: Policy = Policy.load(cfg.policy.load)
        nn: BaseNet = policy._module
    else:
        nn: BaseNet = FeedForward(cfg.policy.layer_sizes, torch.nn.Tanh(), env, cfg.policy.ac_std, cfg.policy.ob_clip)
        policy: Policy = Policy(nn, cfg.noise.std, Adam(len(Policy.get_flat(nn)), cfg.policy.lr))

    nt: NoiseTable = NoiseTable.create_shared(comm, cfg.noise.tbl_size, len(policy), reporter, global_seed)

    ranker = CenteredRanker()
    if 0 < cfg.experimental.elite < 1:
        ranker = EliteRanker(CenteredRanker(), cfg.experimental.elite)

    best_max_rew = -np.inf  # highest reward achieved in any generation

    def r_fn(model: torch.nn.Module, use_ac_noise=True) -> TrainingResult:
        # evaluation function: runs eps_per_policy episodes and averages the per-step rewards
        save_obs = rs.random() < cfg.policy.save_obs_chance
        rews = np.zeros(cfg.env.max_steps)
        for _ in range(max(1, cfg.general.eps_per_policy)):
            rew, behv, obs, steps = gym_runner.run_model(model, env, cfg.env.max_steps, rs if use_ac_noise else None)
            rews[:len(rew)] += np.array(rew)
        rews /= max(1, cfg.general.eps_per_policy)
        return RewardResult(rews.tolist(), behv,
                            obs if save_obs else np.array([np.zeros(env.observation_space.shape)]),
                            steps)

    time_since_best = 0
    noise_std_inc = 0.08

    for gen in range(cfg.general.gens):
        if cfg.general.mlflow:
            mlflow_reporter.set_active_run(0)
        reporter.start_gen()

        if cfg.noise.std_decay != 1:
            reporter.log({'noise std': policy.std})
        if cfg.policy.lr_decay != 1:
            reporter.log({'lr': policy.optim.lr})
        if cfg.policy.ac_std_decay != 1:
            reporter.log({'ac std': nn._action_std})

        tr, gen_obstat = es.step(cfg, comm, policy, nt, env, r_fn, rs, ranker, reporter)
        policy.update_obstat(gen_obstat)

        # decaying action noise std, parameter noise std and learning rate
        cfg.policy.ac_std = nn._action_std = nn._action_std * cfg.policy.ac_std_decay
        cfg.noise.std = policy.std = max(cfg.noise.std * cfg.noise.std_decay, cfg.noise.std_limit)
        cfg.policy.lr = policy.optim.lr = max(cfg.policy.lr * cfg.policy.lr_decay, cfg.policy.lr_limit)

        reporter.log({'obs recorded': policy.obstat.count})

        max_rew_ind = np.argmax(ranker.fits[:, 0])
        max_rew = ranker.fits[:, 0][max_rew_ind]

        time_since_best = 0 if max_rew > best_max_rew else time_since_best + 1
        reporter.log({'time since best': time_since_best})

        # increasing the noise std if the policy is stuck
        if time_since_best > cfg.experimental.max_time_since_best and cfg.experimental.explore_with_large_noise:
            cfg.noise.std = policy.std = policy.std + noise_std_inc

        if 0 < cfg.experimental.elite < 1:  # using the elite ranker extension
            if time_since_best > cfg.experimental.max_time_since_best and cfg.experimental.elite < 1:
                ranker.elite_percent = cfg.experimental.elite
            if time_since_best == 0:
                ranker.elite_percent = 1
            reporter.print(f'elite percent: {ranker.elite_percent}')

        # saving the evaluated policy if it obtained the best reward seen so far
        if max_rew > best_max_rew and comm.rank == 0:
            best_max_rew = max_rew
            coeff = 1 if max_rew_ind < ranker.n_fits_ranked // 2 else -1  # checking whether a pos or neg noise ind was used
            # TODO save this as a policy
            torch.save(policy.pheno(coeff * ranker.noise_inds[max_rew_ind % (ranker.n_fits_ranked // 2)]),
                       path.join('saved', full_name, 'weights', f'gen{gen}-rew{best_max_rew:0.0f}.pt'))
            reporter.print(f'saving max policy with rew:{best_max_rew:0.2f}')

        reporter.end_gen()

    mlflow.end_run()  # in the case where mlflow was used as a reporter, ending its run


if __name__ == '__main__':
    gym.logger.set_level(40)  # only show errors from gym
    config_file = utils.parse_args()
    config = utils.load_config(config_file)
    main(config)
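
# Example launch (hypothetical; the actual CLI arguments are defined by utils.parse_args
# and the config format by utils.load_config):
#
#   mpirun -np 8 python obj.py path/to/config.yml
#
# Each MPI rank runs main(), attaches to the noise table created by
# NoiseTable.create_shared and takes part in the distributed evaluation inside es.step.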