logger.py
from datetime import datetime as dt
import json
import os
import os.path as osp
import pickle
import time
from typing import Any, Dict, List

import numpy as np
import ray
import torch
from torch.utils.tensorboard import SummaryWriter

from games.game import GameHistory
from replay_buffer import ReplayBuffer
from self_play import SelfPlay
from shared_storage import SharedStorage


class Logger:
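    """Experiment logger.

    Saves the config, model checkpoints and the replay buffer to the log
    directory, and continuously reports training metrics from the shared
    storage worker to TensorBoard.
    """
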
def __init__(self, exp_name: str):
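        # Log under ./data/<exp_name>, or fall back to a timestamped
        # directory under /tmp when no experiment name is given.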
        self.logdir = (osp.join(os.getcwd(), 'data', exp_name)
                       if exp_name else f'/tmp/experiments/{dt.now()}')
        os.makedirs(self.logdir, exist_ok=True)

def save_config(self, config: Dict) -> None:
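        """Print the experiment config and write it to <logdir>/config.json.

        The visit_softmax_temperature_func entry is dropped first because a
        callable cannot be JSON-serialised.
        """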
if 'visit_softmax_temperature_func' in config:
del config['visit_softmax_temperature_func']
output = json.dumps(config, separators=(',', ':\t'), indent=4)
print('Experiment config:\n', output)
with open(osp.join(self.logdir, 'config.json'), 'w') as f:
f.write(output)

    def save_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
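        """Save the latest checkpoint dict to <logdir>/model.checkpoint."""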
torch.save(checkpoint, osp.join(self.logdir, 'model.checkpoint'))

    def save_replay_buffer(self,
replay_buffer: ReplayBuffer,
checkpoint: Dict[str, Any]) -> None:
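        """Pickle the replay buffer along with its game/step counters."""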
replay_buffer_path = osp.join(self.logdir, 'replay_buffer.pkl')
print(f'\n\nSaving replay buffer at {replay_buffer_path}')
        with open(replay_buffer_path, 'wb') as f:
            pickle.dump({
                'buffer': replay_buffer,
                'played_games': checkpoint['played_games'],
                'played_steps': checkpoint['played_steps'],
                'reanalysed_games': checkpoint['reanalysed_games']
            }, f)

def log_continuously(self,
config,
test_worker: SelfPlay,
shared_storage_worker: SharedStorage,
replay_buffer_worker: ReplayBuffer) -> None:
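        """Launch the test worker and stream training metrics to TensorBoard.

        The shared storage is polled roughly every 0.5 s until
        config.training_steps is reached (or the run is interrupted); a
        checkpoint is saved every config.checkpoint_interval training steps
        and the replay buffer is dumped on exit.
        """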
test_worker.play_continuously.remote(shared_storage_worker, None, test=True)
writer = SummaryWriter(self.logdir)
keys = [
'episode_length',
'episode_return',
'mean_value',
'lr',
'loss',
'value_loss',
'reward_loss',
'policy_loss',
'training_step',
'played_games',
'played_steps',
'reanalysed_games'
]
info = ray.get(shared_storage_worker.get_info.remote(keys))
last_step, counter = 0, 0
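        # Mirror the latest metrics from the shared storage to TensorBoard
        # until the target number of training steps is reached.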
try:
while info['training_step'] < config.training_steps:
info = ray.get(shared_storage_worker.get_info.remote(keys))
writer.add_scalar(
'1.Total_reward/1.Episode_return', info['episode_return'], counter
)
writer.add_scalar(
'1.Total_reward/2.Mean_value', info['mean_value'], counter
)
writer.add_scalar(
'1.Total_reward/3.Episode_length', info['episode_length'], counter
)
writer.add_scalar(
'2.Workers/1.Self_played_games', info['played_games'], counter
)
writer.add_scalar(
'2.Workers/2.Training_steps', info['training_step'], counter
)
writer.add_scalar(
'2.Workers/3.Self_played_steps', info['played_steps'], counter
)
writer.add_scalar(
'2.Workers/4.Reanalysed_games', info['reanalysed_games'], counter
)
writer.add_scalar(
'2.Workers/5.Training_steps_per_self_played_step_ratio',
info['training_step'] / max(1, info['played_steps']),
counter
)
writer.add_scalar('2.Workers/6.Learning_rate', info['lr'], counter)
writer.add_scalar('3.Loss/1.Total_weighted_loss', info['loss'], counter)
writer.add_scalar('3.Loss/2.Value_loss', info['value_loss'], counter)
writer.add_scalar('3.Loss/3.Reward_loss', info['reward_loss'], counter)
writer.add_scalar('3.Loss/4.Policy_loss', info['policy_loss'], counter)
print(f'\rEpisode return: {info["episode_return"]:.2f}. '
+ f'Training step: {info["training_step"]}/{config.training_steps}. '
+ f'Played games: {info["played_games"]}. '
+ f'Loss: {info["loss"]:.2f}', end="")
                # Save a checkpoint whenever a new checkpoint interval is
                # reached, remembering the step so the same checkpoint is not
                # written repeatedly while the training step stays unchanged
                # between polls.
                if info['training_step'] > last_step \
                        and info['training_step'] % config.checkpoint_interval == 0:
                    self.save_checkpoint(
                        ray.get(shared_storage_worker.get_checkpoint.remote())
                    )
                    last_step = info['training_step']
counter += 1
time.sleep(0.5)
except KeyboardInterrupt:
pass
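
        # Training finished or was interrupted: persist the replay buffer
        # together with the final checkpoint counters.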
self.save_replay_buffer(
ray.get(replay_buffer_worker.get_buffer.remote()),
ray.get(shared_storage_worker.get_checkpoint.remote())
)

    def log_result(self, config, histories: List[GameHistory]) -> None:
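        """Write per-game rewards to <logdir>/rewards.txt and print a summary.

        Single-player games report the mean return; two-player games report
        per-player win rates (assuming unit rewards for wins).
        """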
with open(osp.join(self.logdir, 'rewards.txt'), 'w') as f:
for history in histories:
f.write(','.join(map(str, history.rewards)) + '\n')
if config.players == 1:
result = np.mean([sum(history.rewards) for history in histories])
print('Result:', result)
else:
p1_wr = np.mean([
sum(reward for i, reward in enumerate(history.rewards)
if history.to_plays[i] == 0) for history in histories
])
p2_wr = np.mean([
sum(reward for i, reward in enumerate(history.rewards)
if history.to_plays[i] == 1) for history in histories
])
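            # Short pause before printing the final summary.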
time.sleep(1)
            print(f'P1 win rate: {p1_wr * 100:.2f}%\n'
                  f'P2 win rate: {p2_wr * 100:.2f}%\n'
                  f'Draw: {(1 - p1_wr - p2_wr) * 100:.2f}%')
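

if __name__ == '__main__':
    # Hypothetical usage sketch (not part of the original module): create a
    # logger for a named experiment and save a toy config. The experiment
    # name and config values below are illustrative only.
    logger = Logger('demo_experiment')
    logger.save_config({'training_steps': 1000, 'checkpoint_interval': 100})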