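"""run_agents.py

Entry points for training and evaluating the stock-trading RL agents
('Dumb', 'Hold', 'A2C', 'PPO'): single and repeated training runs, plus
quarterly "track training" of a pre-trained model on the evaluation period.
"""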
import os
import time
from datetime import datetime
import gym
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.monitor import Monitor
import global_var
import preprocessor as pp
import util
from agents import Agent, HoldAgent, A2CAgent, PPOAgent
from agents import agent_factory
from stock_eval_env_v2 import StockEvalEnvV2
from stock_train_env_v2 import StockTrainEnvV2


def make_train_env(data: pd.DataFrame, agent_name: str, log_path: str = None):
    """
    Build a training environment suited to the agent type and to whether logs should be written.

    :param data: daily stock data (convert it to dict form with to_daily_data first)
    :param agent_name: agent name, currently one of 'Dumb', 'Hold', 'A2C', 'PPO'
    :param log_path: directory for training logs; if None (default), no logs are saved
    """
# A2C and PPO support multi-environment training
if agent_name == 'A2C' or agent_name == 'PPO':
if log_path is not None:
env = make_vec_env(StockTrainEnvV2, n_envs=4,
env_kwargs={'data': data, 'verbose': 0},
monitor_dir=log_path)
else:
env = make_vec_env(StockTrainEnvV2, n_envs=4,
env_kwargs={'data': data, 'verbose': 0})
else:
if log_path is not None:
env = Monitor(StockTrainEnvV2(data, verbose=0), log_path)
else:
env = StockTrainEnvV2(data, verbose=0)
return env


def eval_agent_simple(agent: Agent, env_eval: gym.Env) -> float:
    """
    Evaluate an agent over one full round of trading in the simulated environment
    and return only its profit.

    :param agent: the Agent instance
    :param env_eval: the evaluation environment
    :return: the profit earned by the agent over one round of trading
    """
if isinstance(agent, HoldAgent):
agent.is_first_day = True
total_rewards = 0
state = env_eval.reset()
while True:
action = agent.act(state)
next_state, reward, done, _ = env_eval.step(action)
state = next_state
total_rewards += reward
if done:
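            # rewards are scaled by REWARD_SCALING inside the env; divide to recover the raw profit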
ret = total_rewards / global_var.REWARD_SCALING
break
return ret


def eval_agent(agent: Agent, env_eval: StockEvalEnvV2, output_path: str) -> float:
    """
    Evaluate an agent over one full round of trading in the simulated environment.
    Dumps the environment's history to csv files, plots daily-reward and total-asset
    curves against the baseline, and returns the profit.

    :param agent: the Agent instance
    :param env_eval: the evaluation environment
    :param output_path: directory for the output figures
    :return: the profit earned by the agent over one round of trading
    """
os.makedirs(output_path, exist_ok=True)
os.makedirs(output_path + 'env_memory', exist_ok=True)
# Eval input agent
total_rewards = 0
state = env_eval.reset()
while True:
action = agent.act(state)
next_state, reward, done, _ = env_eval.step(action)
state = next_state
total_rewards += reward
if done:
ret_agent = total_rewards / global_var.REWARD_SCALING
reward_memory_agent = env_eval.reward_memory
asset_memory_agent = [a[-1] for a in env_eval.asset_memory]
env_eval.plot_memory(output_path)
env_eval.dump_memory(output_path + 'env_memory/')
break
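    # Annualized return rate (%): the eval period normally runs from Jan 1 to Dec 31,
    # so the year difference of the two endpoint dates plus one gives the number of years.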
yearly_return_rate = 100 * ret_agent / global_var.INITIAL_BALANCE \
/ (util.get_year_diff(global_var.EVAL_START_DATE, global_var.EVAL_END_DATE) + 1)
print('RunAgent:',
'eval {} agent, average return {:.2f}, yearly return rate {:.2f}%'.format(agent.name,
ret_agent,
yearly_return_rate))
# Eval baseline agent
baseline_agent = agent_factory('Hold', env_eval)
total_rewards = 0
state = env_eval.reset()
while True:
action = baseline_agent.act(state)
next_state, reward, done, _ = env_eval.step(action)
state = next_state
total_rewards += reward
if done:
ret_baseline = total_rewards / global_var.REWARD_SCALING
reward_memory_baseline = env_eval.reward_memory
asset_memory_baseline = [a[-1] for a in env_eval.asset_memory]
break
yearly_return_rate = 100 * ret_baseline / global_var.INITIAL_BALANCE \
/ (util.get_year_diff(global_var.EVAL_START_DATE, global_var.EVAL_END_DATE) + 1)
print('RunAgent:',
'baseline {} agent, average return {:.2f}, yearly return rate {:.2f}%'.format(baseline_agent.name,
ret_baseline,
yearly_return_rate))
# Plot compare figure
trade_dates = [datetime.strptime(str(d), '%Y%m%d').date() for d in env_eval.full_dates][:-1]
util.plot_daily_compare(x=trade_dates, y1=asset_memory_baseline, y2=asset_memory_agent,
y1_label=f'{baseline_agent.name}(baseline)', y2_label=agent.name,
diff_y_scale=False, save_path=output_path + 'total_assets_compare.png')
util.plot_daily_compare(x=trade_dates, y1=reward_memory_baseline, y2=reward_memory_agent,
y1_label=f'{baseline_agent.name}(baseline)', y2_label=agent.name,
diff_y_scale=False, save_path=output_path + 'daily_reward_compare.png')
return ret_agent


def train_agent(data: pd.DataFrame, agent_name: str, train_timesteps: int = 10000,
                eval_episode: int = 1, model_save_path: str = None, log_path: str = None):
    """
    Train an agent once and evaluate its performance.

    :param data: the full preprocessed stock data
    :param agent_name: agent name, currently one of 'Dumb', 'Hold', 'A2C', 'PPO'
    :param train_timesteps: number of environment interaction steps for training
    :param eval_episode: number of evaluation episodes to run after training
    :param model_save_path: directory to save the model to; if None (default), the model is not saved
    :param log_path: directory for training logs; if None (default), no logs are saved
    """
data_train = pp.subdata_by_range(data, global_var.TRAIN_START_DATE, global_var.TRAIN_END_DATE)
data_eval = pp.subdata_by_range(data, global_var.EVAL_START_DATE, global_var.EVAL_END_DATE)
# Create and train the agent
if global_var.VERBOSE:
print('RunAgent:', f'training {agent_name} agent, timesteps {train_timesteps}')
env_train = make_train_env(data_train, agent_name, log_path)
agent = agent_factory(agent_name, env_train)
train_start_time = time.time() # record training time
agent.learn(timesteps=train_timesteps)
train_end_time = time.time()
    # Save the agent's model and plot the learning curve (optional)
if model_save_path is not None:
os.makedirs(model_save_path, exist_ok=True)
agent.save(model_save_path)
if log_path is not None:
util.plot_learning_curve(log_path, log_path + 'learning_curve.png')
# Evaluate the agent
agent.eval_mode()
env_eval = StockEvalEnvV2(data_eval, verbose=0)
if global_var.VERBOSE:
print('RunAgent:', f'evaluating trained {agent_name} agent for {eval_episode} episodes:')
returns = []
for i in range(eval_episode):
ret = eval_agent_simple(agent, env_eval)
print('RunAgent:', 'episode {:0>2d}/{:0>2d}, return {:.2f}'.format(i + 1, eval_episode, ret))
returns.append(ret)
return_mean, return_std = np.mean(returns), np.std(returns)
    # The eval period is normally set from Jan 1 of one year to Dec 31 of another,
    # so the year difference of the two endpoint dates plus one gives the total number of years.
yearly_return_rate = 100 * return_mean / global_var.INITIAL_BALANCE\
/ (util.get_year_diff(global_var.EVAL_START_DATE, global_var.EVAL_END_DATE) + 1)
print('RunAgent:',
'total {} episodes, average return {:.2f}, std {:.2f}, yearly return rate {:.2f}%'.format(eval_episode,
return_mean,
return_std,
yearly_return_rate))
    print('RunAgent:',
          '{} agent training time: {:.2f} minutes'.format(agent_name, (train_end_time - train_start_time) / 60))


def train_agent_ntimes(data: pd.DataFrame, agent_name: str, train_timesteps: int = 10000,
                       n_train: int = 10, eval_episode: int = 1,
                       model_save_path: str = None, log_path: str = None):
    """
    Train an agent n_train times and report its average performance.

    :param data: the full preprocessed stock data
    :param agent_name: agent name, currently one of 'Dumb', 'Hold', 'A2C', 'PPO'
    :param train_timesteps: number of environment interaction steps per training run
    :param n_train: number of training runs
    :param eval_episode: number of evaluation episodes per training run
    :param model_save_path: directory to save the models to; if None (default), nothing is saved;
        otherwise the n_train models are saved as 1.zip to n.zip under this directory
    :param log_path: directory for training logs; if None (default), nothing is saved;
        otherwise the logs of the n_train runs are saved under subdirectories 1 to n
    """
data_train = pp.subdata_by_range(data, global_var.TRAIN_START_DATE, global_var.TRAIN_END_DATE)
data_eval = pp.subdata_by_range(data, global_var.EVAL_START_DATE, global_var.EVAL_END_DATE)
if model_save_path is not None:
os.makedirs(model_save_path, exist_ok=True)
print('RunAgent:', f'models will be saved to {model_save_path}')
if global_var.VERBOSE:
print('RunAgent:', f'training and evaluating {agent_name} agent for {n_train} times, timesteps {train_timesteps}')
returns = []
train_elapsed_times = []
for i in range(n_train):
train_i_log_path = None
if log_path is not None:
train_i_log_path = log_path + f'{i+1}/'
os.makedirs(train_i_log_path, exist_ok=True)
# Train
env_train = make_train_env(data_train, agent_name, train_i_log_path)
agent = agent_factory(agent_name, env_train)
train_start_time = time.time()
agent.learn(timesteps=train_timesteps)
train_end_time = time.time()
train_elapsed_times.append((train_end_time - train_start_time) / 60)
if model_save_path is not None:
agent.save(model_save_path + f'{i+1}')
if log_path is not None:
util.plot_learning_curve(train_i_log_path, train_i_log_path + 'learning_curve.png')
# Eval
agent.eval_mode()
env_eval = StockEvalEnvV2(data_eval, verbose=0)
ret = 0
for _ in range(eval_episode):
ret += eval_agent_simple(agent, env_eval)
ret /= eval_episode
        print('RunAgent:', 'training run {:0>2d}/{}, average return {:.2f}'.format(i+1, n_train, ret))
returns.append(ret)
return_mean, return_std = np.mean(returns), np.std(returns)
yearly_return_rate = 100 * return_mean / global_var.INITIAL_BALANCE \
/ (util.get_year_diff(global_var.EVAL_START_DATE, global_var.EVAL_END_DATE) + 1)
    print('RunAgent:',
          'total {} training runs, average return {:.2f}, std {:.2f}, '
          'yearly return rate {:.2f}%'.format(n_train, return_mean, return_std, yearly_return_rate))
print('RunAgent:',
'{} agent average training time: {:.2f} minutes'.format(agent_name, np.mean(train_elapsed_times)))


def track_train_agent(data: pd.DataFrame, agent_name: str, track_train_timesteps: int,
                      base_model_path: str, output_path: str) -> float:
    """
    Starting from a pre-trained base model, continue training the model on each quarter's data
    right after trading through that quarter of the evaluation period, so that it can track
    recent market trends. Runs one train+eval pass, dumps the environment history, and plots
    the asset curves of the track-trained agent, the agent without track training, and the baseline.

    :param data: the full preprocessed stock data
    :param agent_name: agent name, currently one of 'Dumb', 'Hold', 'A2C', 'PPO'
    :param track_train_timesteps: number of interaction steps of continued training per time window
    :param base_model_path: path to the base model file
    :param output_path: directory for the output figures
    :return: the profit earned by the agent over one round of trading
    """
if global_var.VERBOSE:
        print('RunAgent:', f'track train {agent_name} agent on eval period, timesteps {track_train_timesteps}. '
                           f'Load base model from {base_model_path}, outputting eval result at {output_path}')
if not os.path.isfile(base_model_path):
raise ValueError('Specified base model file does not exist')
data_train = pp.subdata_by_range(data, global_var.EVAL_START_DATE, global_var.EVAL_END_DATE)
data_eval = pp.subdata_by_range(data, global_var.EVAL_START_DATE, global_var.EVAL_END_DATE)
env_train = StockTrainEnvV2(data_train, verbose=0)
env_eval = StockEvalEnvV2(data_eval, verbose=0)
os.makedirs(output_path, exist_ok=True)
###################### Track Trained Model Perf ######################
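    # Quarter boundary dates spanning the eval period; EVAL_END_DATE is shifted forward
    # by one day so that the final quarter is included.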
retrain_dates = util.get_quarter_dates(global_var.EVAL_START_DATE,
int(datetime.strftime(
datetime.strptime(str(global_var.EVAL_END_DATE), '%Y%m%d') + relativedelta(days=1),
'%Y%m%d')))
# load a pre-trained model
agent_track = agent_factory(agent_name, env_train)
# agent.load() will clear out the env so that the agent can't learn anymore
# therefore use set_parameters() to load the model
if isinstance(agent_track, A2CAgent) or isinstance(agent_track, PPOAgent):
agent_track.model.set_parameters(base_model_path)
state = env_eval.reset()
ret_agent_track = 0
for i in range(len(retrain_dates) - 1):
# perform trading on the next quarter
if not env_train.check_interval_valid(retrain_dates[i], retrain_dates[i + 1]):
# print(f'Quarter {retrain_dates[i]} to {retrain_dates[i + 1]} has no data, therefore skipped.')
continue
agent_track.eval_mode()
env_eval.reset_date(retrain_dates[i], retrain_dates[i + 1], is_last_section=(i == len(retrain_dates) - 2))
total_rewards = 0
while True:
action = agent_track.act(state)
next_state, reward, done, _ = env_eval.step(action)
state = next_state
total_rewards += reward
if done:
ret_agent_track += total_rewards / global_var.REWARD_SCALING
break
# continue training on the quarter's data after trading
if i != len(retrain_dates) - 2:
agent_track.train_mode()
env_train.reset_date(retrain_dates[i], retrain_dates[i + 1])
agent_track.learn(track_train_timesteps)
reward_memory_agent_track = env_eval.reward_memory
asset_memory_agent_track = [a[-1] for a in env_eval.asset_memory]
os.makedirs(output_path + 'env_memory', exist_ok=True)
env_eval.dump_memory(output_path + 'env_memory/')
yearly_return_rate = 100 * ret_agent_track / global_var.INITIAL_BALANCE \
/ (util.get_year_diff(global_var.EVAL_START_DATE, global_var.EVAL_END_DATE) + 1)
print('RunAgent:',
'Track trained agent average return {:.2f}, yearly return rate {:.2f}%'.format(ret_agent_track,
yearly_return_rate))
###################### Original Model Perf (No Track Train) ######################
env_eval.reset()
agent_original = agent_factory(agent_name, env_eval)
agent_original.load(base_model_path)
# agent_original.eval_mode()
total_rewards = 0
state = env_eval.reset()
while True:
action = agent_original.act(state)
next_state, reward, done, _ = env_eval.step(action)
state = next_state
total_rewards += reward
if done:
ret_agent_original = total_rewards / global_var.REWARD_SCALING
reward_memory_agent_original = env_eval.reward_memory
asset_memory_agent_original = [a[-1] for a in env_eval.asset_memory]
break
yearly_return_rate = 100 * ret_agent_original / global_var.INITIAL_BALANCE\
/ (util.get_year_diff(global_var.EVAL_START_DATE, global_var.EVAL_END_DATE) + 1)
print('RunAgent:', 'Original agent average return {:.2f}, yearly return rate {:.2f}%'.format(ret_agent_original,
yearly_return_rate))
###################### Baseline Perf (Hold Agent) ######################
env_eval.reset()
baseline_agent = agent_factory('Hold', env_eval)
total_rewards = 0
state = env_eval.reset()
while True:
action = baseline_agent.act(state)
next_state, reward, done, _ = env_eval.step(action)
state = next_state
total_rewards += reward
if done:
ret_baseline = total_rewards / global_var.REWARD_SCALING
reward_memory_baseline = env_eval.reward_memory
asset_memory_baseline = [a[-1] for a in env_eval.asset_memory]
break
yearly_return_rate = 100 * ret_baseline / global_var.INITIAL_BALANCE\
/ (util.get_year_diff(global_var.EVAL_START_DATE, global_var.EVAL_END_DATE) + 1)
print('RunAgent:', 'Baseline average return {:.2f}, yearly return rate {:.2f}%'.format(ret_baseline,
yearly_return_rate))
dates = [datetime.strptime(str(d), '%Y%m%d').date() for d in env_eval.full_dates][:-1]
util.plot_daily_multi_y(x=dates, ys=[asset_memory_agent_track, asset_memory_agent_original, asset_memory_baseline],
ys_label=[agent_name + f'(Track Step={track_train_timesteps})', agent_name + '(No Track Train)',
'Hold(Baseline)'],
save_path=output_path + 'total_assets_compare.png')
util.plot_daily_multi_y(x=dates,
ys=[reward_memory_agent_track, reward_memory_agent_original, reward_memory_baseline],
ys_label=[agent_name + f'(Track Step={track_train_timesteps})', agent_name + '(No Track Train)',
'Hold(Baseline)'],
save_path=output_path + 'daily_reward_compare.png')
return ret_agent_track


def track_train_agent_ntimes(data: pd.DataFrame, agent_name: str, track_train_timesteps: int,
                             base_model_path: str, n_train: int = 10):
    """
    Starting from a pre-trained base model, continue training the model on each quarter's data
    right after trading through that quarter of the evaluation period, so that it can track
    recent market trends. Repeats the whole procedure n_train times and reports the average
    performance (only the return figures are printed).

    :param data: the full preprocessed stock data
    :param agent_name: agent name, currently one of 'Dumb', 'Hold', 'A2C', 'PPO'
    :param track_train_timesteps: number of interaction steps of continued training per time window
    :param base_model_path: path to the base model file
    :param n_train: number of training runs
    """
if global_var.VERBOSE:
print('RunAgent:', f'track train {agent_name} agent on eval period for {n_train} times, timesteps {track_train_timesteps}. '
f'Load base model from {base_model_path}')
if not os.path.isfile(base_model_path):
raise ValueError('Specified base model file does not exist')
    # Training continues on the evaluation period, so data_train and data_eval cover the same range
data_train = pp.subdata_by_range(data, global_var.EVAL_START_DATE, global_var.EVAL_END_DATE)
data_eval = pp.subdata_by_range(data, global_var.EVAL_START_DATE, global_var.EVAL_END_DATE)
retrain_dates = util.get_quarter_dates(global_var.EVAL_START_DATE,
int(datetime.strftime(
datetime.strptime(str(global_var.EVAL_END_DATE), '%Y%m%d') + relativedelta(days=1),
'%Y%m%d')))
returns = []
for e in range(n_train):
        # Track training is split over multiple time windows, so vectorized envs and training logs cannot be used
env_train = StockTrainEnvV2(data_train, verbose=0)
        # load the pre-trained base model
agent = agent_factory(agent_name, env_train)
if isinstance(agent, A2CAgent) or isinstance(agent, PPOAgent):
agent.model.set_parameters(base_model_path)
        # evaluate and then continue training on each quarter
env_eval = StockEvalEnvV2(data_eval, verbose=0)
state = env_eval.reset()
ret = 0
for i in range(len(retrain_dates) - 1):
            # skip this quarter if it contains no data
if not env_train.check_interval_valid(retrain_dates[i], retrain_dates[i + 1]):
# print(f'Quarter {retrain_dates[i]} to {retrain_dates[i+1]} has no data, therefore skipped.')
continue
# perform trading on the next quarter
agent.eval_mode()
env_eval.reset_date(retrain_dates[i], retrain_dates[i+1], is_last_section=(i == len(retrain_dates)-2))
            # The state is continuous across sections, so eval_agent_simple cannot be used here:
            # it would reset the environment back to the initial state at each section boundary.
total_rewards = 0
while True:
action = agent.act(state)
next_state, reward, done, _ = env_eval.step(action)
state = next_state
total_rewards += reward
if done:
ret += total_rewards / global_var.REWARD_SCALING
break
# print('RunAgent:', f'quarter {retrain_dates[i]} to {retrain_dates[i+1]} ended, current return {ret:.2f}. Training on its data.')
# training on the quarter's data after trading
            if i != len(retrain_dates) - 2:  # no need to train on the last quarter
agent.train_mode()
env_train.reset_date(retrain_dates[i], retrain_dates[i + 1])
agent.learn(track_train_timesteps)
        print('RunAgent:', 'training run {:0>2d}/{}, avg return {:.2f}'.format(e + 1, n_train, ret))
returns.append(ret)
return_mean, return_std = np.mean(returns), np.std(returns)
yearly_return_rate = 100 * return_mean / global_var.INITIAL_BALANCE \
/ (util.get_year_diff(global_var.EVAL_START_DATE, global_var.EVAL_END_DATE) + 1)
print('RunAgent:',
          'total {} training runs, average return {:.2f}, std {:.2f},'
' yearly return rate {:.2f}%'.format(n_train, return_mean, return_std, yearly_return_rate))
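

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): shows how the
# entry points above fit together. `data` must be the full preprocessed stock
# DataFrame these functions expect; how it is loaded is project-specific, so
# that step, and all file paths below, are placeholders.
# ---------------------------------------------------------------------------
# if __name__ == '__main__':
#     data = ...  # load / preprocess the full stock data here
#     # single training run, saving the model and the training log
#     train_agent(data, 'PPO', train_timesteps=10000,
#                 model_save_path='./models/ppo/', log_path='./logs/ppo/')
#     # repeat training 10 times to estimate the variance of the results
#     train_agent_ntimes(data, 'PPO', train_timesteps=10000, n_train=10,
#                        model_save_path='./models/ppo_n/', log_path='./logs/ppo_n/')
#     # quarterly track training on the eval period, starting from a saved model
#     track_train_agent(data, 'PPO', track_train_timesteps=2000,
#                       base_model_path='./models/ppo_n/1.zip',
#                       output_path='./output/ppo_track/')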