runner.py
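"""
Experiment runner: trains a reinforcement-learning agent on a counterpoint
composition task, periodically evaluates it with a greedy (non-learning)
policy, and writes the evaluation returns to a CSV file.

Positional arguments choose the task (0 = OnePitchIsGood test task,
1 = SpeciesOneCounterpoint), the agent (0-6, mapped to an Approach in
main()), and the output directory; optional flags set the number of trials
and evaluations and the learning parameters.
"""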
import argparse
import os
import numpy as np
from cantus_firmi import cantus_firmi
from counterpoint.species.species_one import SpeciesOneCounterpoint
from counterpoint.test_tasks import OnePitchIsGood
from utilities.factories import make_environment_factory, make_agent_factory, Approach
from utilities.save_composition import save_composition
from utilities.trial_log import ExperimentLog

significance_level = 0.05


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("task", type=int)
    parser.add_argument("agent", type=int)
    parser.add_argument("outdir", type=str)
    parser.add_argument('-trials', type=int, default=1)
    parser.add_argument('-evaluations', type=int, default=10)
    parser.add_argument('-lamb', type=float, default=0.5)
    parser.add_argument('-alpha', type=float, default=0.4)
    parser.add_argument('-epsilon', type=float, default=0.3)
    parser.add_argument('-period', type=int, default=100)
    parser.add_argument('-unique-id', type=int)
    parser.add_argument('--log-evaluations', type=int)
    parser.add_argument("-history", type=int, default=3)
    parser.add_argument("--time-invariant-state", action='store_true', default=False)
    args = parser.parse_args()
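    # Example invocation (hypothetical values):
    #   python runner.py 1 0 results/ -trials 5 -evaluations 10 -period 100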
    task = args.task
    agent = args.agent
    num_trials = args.trials
    num_evaluations = args.evaluations
    evaluation_period = args.period
    output_dir = args.outdir
    history_length = args.history
    time_invariant_state = args.time_invariant_state
    alpha = args.alpha
    lamb = args.lamb
    epsilon = args.epsilon

    meter = cantus_firmi[0][1]
    key = cantus_firmi[0][2]

    if task == 0:
        environment_factory = make_environment_factory(meter, key, OnePitchIsGood, history_length,
                                                        time_invariant_state)
    elif task == 1:
        environment_factory = make_environment_factory(meter, key, SpeciesOneCounterpoint, history_length,
                                                        time_invariant_state)
    if agent == 0:
        approach = Approach.QLearning
    elif agent == 1:
        approach = Approach.Sarsa
    elif agent == 2:
        approach = Approach.TrueOnlineSarsaLambda
    elif agent == 3:
        approach = Approach.QNetwork
    elif agent == 4:
        approach = Approach.DDDQN
    elif agent == 5:
        approach = Approach.SarsaCMAC
    elif agent == 6:
        approach = Approach.SarsaLinear

    agent_factory = make_agent_factory(approach, epsilon=epsilon, alpha=alpha, lmbda=lamb,
                                       time_invariant=time_invariant_state)
    agent_name = str(approach)

    results = run_experiment(num_trials, num_evaluations, evaluation_period, agent_factory,
                             environment_factory, output_dir)
    save(agent_name, results, output_dir, args.unique_id)


def run_experiment(num_trials, num_evaluations, evaluation_period,
                   agent_factory, environment_factory, out_dir
                   ) -> ExperimentLog:
    """Run num_trials training runs, evaluating the agent every evaluation_period episodes."""
    series = [i * evaluation_period for i in range(0, num_evaluations)]
    log = ExperimentLog(series, significance_level)
    evaluation_num = 0
    for i in range(0, num_trials):
        print("trial " + str(i))
        # Train and periodically yield the value function
        for (num_episodes, table) in train_agent(evaluation_period,
                                                 num_evaluations,
                                                 agent_factory,
                                                 environment_factory):
            evaluation = evaluate(table, agent_factory, environment_factory,
                                  "Evaluation %d" % evaluation_num, out_dir)
            evaluation_num += 1
            print(" R: " + str(evaluation))
            log.observe(evaluation)
        log.observe_trial_end()
    return log


def evaluate(table, agent_factory, environment_factory, unique_name: str, out_dir: str) -> float:
    """Run a single episode with the given value function and return its cumulative reward."""
    domain, task = environment_factory()
    agent = agent_factory(domain, task, table)
    agent.value_function = table
    # Act greedily with learning disabled during evaluation
    agent.epsilon = 0.0
    agent.alpha = 0.0
    cumulative_reward = 0.0
    terminated = False
    current_step = 0
    while not terminated:
        current_step += 1
        agent.act()
        if task.stateisfinal(domain.get_current_state()):
            terminated = True
            cumulative_reward = agent.get_cumulative_reward()
            agent.episode_ended()
    save_composition(unique_name, agent.name, domain, out_dir)
    return cumulative_reward


def save(name: str, log: ExperimentLog, out_dir: str, unique_num: int = 0):
    """Write the experiment log to a CSV file in out_dir."""
    out_prefix = out_dir
    if not os.path.exists(out_prefix):
        os.makedirs(out_prefix)
    filename = str(log.n) + "_" + name + str(unique_num) + ".csv"
    full_out_path = os.path.join(out_prefix, filename)
    if log.n > 1:
        # With more than one trial, also write variance and confidence columns
        log.finalize_confidences()
        data = np.c_[(log.series, log.means, log.variances, log.confidences)]
        formats = ["%d", "%f", "%f", "%f"]
    else:
        data = np.c_[(log.series, log.means)]
        formats = ["%d", "%f"]
    np.savetxt(full_out_path, data,
               fmt=formats,
               delimiter=",")


def train_agent(evaluation_period, num_stops, agent_factory, environment_factory):
    """
    Trains an agent, periodically yielding the agent's value function (q-table).
    :param evaluation_period: number of training episodes between yields
    :param num_stops: total number of times to yield before stopping
    :param agent_factory: callable building an agent for a (domain, task) pair
    :param environment_factory: callable building a (domain, task) pair
    :return: generator of (episode number, value function) tuples
    """
    domain, task = environment_factory()
    agent = agent_factory(domain, task, None)
    reward = 0
    stops = 0
    for i in range(0, evaluation_period * num_stops):
        if i % evaluation_period == 0:
            stops += 1
            yield i, agent.value_function
            if num_stops == stops:
                return
        terminated = False
        current_step = 0
        while not terminated:
            current_step += 1
            agent.act()
            if task.stateisfinal(domain.get_current_state()):
                final_state = domain.get_current_state()
                reward = agent.get_cumulative_reward()
                agent.episode_ended()
                domain.reset()
                terminated = True
        if i % 10 == 0:
            print("%d: %f" % (i, reward))


if __name__ == '__main__':
    main()