run_ppo.py
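"""Train a PPO agent (OpenAI baselines ppo2) on the wrapped rllab point-maze environment.

Illustrative invocation (flags as defined in the argument parser below; values are examples only):

    python run_ppo.py --sampling_method good_starts --seed 42 --verbose
"""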
import argparse
import datetime
import os
import random
import time
import numpy as np
import tensorflow as tf
from baselines.ppo2 import ppo2
from env_wrapper_rllab_to_openai import WrappedPointMazeEnv
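
# NOTE: this script depends on the OpenAI baselines package (for ppo2) and on the local
# env_wrapper_rllab_to_openai module, which wraps the rllab point-maze environment in the
# gym-style interface that baselines expects.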


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--eval_runs', default=50, type=int, help='How many runs for evaluation during training.')
    parser.add_argument('--max_env_timestep', default=500, type=int, help='Maximum number of timesteps taken in an environment.')
    parser.add_argument('--do_rendering', default=False, action='store_true', help='Render the simulations.')
    parser.add_argument('--sampling_method', default='uniform', type=str,
                        help='Defines the sampling method. Can be "uniform", "all_previous", or "good_starts".')
    parser.add_argument('--steps_per_curriculum', default=50000, type=int,
                        help='The number of steps taken in the environment between curriculum adaptations '
                             '(for sampling methods "all_previous" and "good_starts"). '
                             'Also defines the interval of the evaluation runs.')
    parser.add_argument('--nsteps', default=50000, type=int,
                        help='Number of environment steps for the inner loop of the PPO algorithm.')
    parser.add_argument('--outer_iter', default=400, type=int,
                        help='Number of outer-loop iterations of the PPO training; '
                             'total_timesteps = outer_iter * steps_per_curriculum.')
    parser.add_argument('--save_interval', default=0, type=int,
                        help='Checkpoint interval passed through to ppo2.learn (0 disables periodic checkpoints).')
    parser.add_argument('--verbose', default=False, action='store_true', help='Print more information.')
    parser.add_argument('--seed', default=42, type=int, help='Random seed.')
    parser.add_argument('--sample_on_goal_area', default=False, action='store_true',
                        help='When True, the start points for sampling when using sampling methods "all_previous" and "good_starts" are taken from the whole goal area. When False, only the center of the goal area is used.')
    args = parser.parse_args()

    # get all arguments
    eval_runs = args.eval_runs
    max_env_timestep = args.max_env_timestep
    do_rendering = args.do_rendering
    sampling_method = args.sampling_method
    steps_per_curriculum = args.steps_per_curriculum
    nsteps = args.nsteps
    total_timesteps = args.outer_iter * steps_per_curriculum
    save_interval = args.save_interval
    verbose = args.verbose
    seed = args.seed
    sample_on_goal_area = args.sample_on_goal_area

    param_info = ("\nRunning 'run_ppo.py' with the following parameters: \n"
                  "\teval_runs: {}\n"
                  "\tmax_env_timestep: {}\n"
                  "\tdo_rendering: {}\n"
                  "\tsampling_method: {}\n"
                  "\tsteps_per_curriculum: {}\n"
                  "\tnsteps: {}\n"
                  "\touter_iter: {}\n"
                  "\ttotal_timesteps: {} (= outer_iter * steps_per_curriculum)\n"
                  "\tsave_interval: {}\n"
                  "\tverbose: {}\n"
                  "\tseed: {}\n"
                  "\tsample_on_goal_area: {}\n"
                  "".format(eval_runs, max_env_timestep, do_rendering, sampling_method,
                            steps_per_curriculum, nsteps, args.outer_iter,
                            total_timesteps, save_interval,
                            verbose, seed, sample_on_goal_area))
    print(param_info)

    # set random seed
    random.seed(seed)
    np.random.seed(seed)
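    # note: TensorFlow itself is not seeded here; the same seed is passed to ppo2.learn below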

    # create results directory
    if not os.path.exists("results"):
        os.mkdir("results")

    # create experiment directory
    # get datetime
    experiment_date = datetime.datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
    experiment_dir = os.path.join('results', "ppo_maze_{}_{}_{}".format(experiment_date, sampling_method, seed))
    os.mkdir(experiment_dir)

    # create directory for the model
    os.mkdir(os.path.join(experiment_dir, "model"))
    model_file_path = os.path.join(experiment_dir, 'model', 'model_{}_{}_{}'.format(experiment_date, sampling_method, seed))

    # to use evaluation see the instructions in env_wrapper_rllab_to_openai.evaluate
    # file names for the evaluation results
    eval_starts_file_name = os.path.join(experiment_dir, 'evaluation_starts_{}_{}_{}.json'.format(experiment_date, sampling_method, seed))
    eval_results_file_name = os.path.join(experiment_dir, 'evaluation_results_{}_{}_{}.json'.format(experiment_date, sampling_method, seed))

    # save config
    config_filename = os.path.join(experiment_dir, 'config_{}_{}_{}.txt'.format(experiment_date, sampling_method, seed))
    with open(config_filename, "w") as config_file:
        config_file.write(param_info)

    # initialize environment
    env = WrappedPointMazeEnv()
    env.post_init(eval_runs=eval_runs,
                  max_env_timestep=max_env_timestep,
                  do_rendering=do_rendering,
                  sampling_method=sampling_method,
                  steps_per_curriculum=steps_per_curriculum,
                  verbose=verbose,
                  sample_on_goal_area=sample_on_goal_area,
                  model_file_path=model_file_path,
                  eval_starts_file_name=eval_starts_file_name,
                  eval_results_file_name=eval_results_file_name)
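    # the wrapper handles curriculum start-state sampling, the periodic evaluation runs, and
    # saving the evaluation results and the model to the paths configured above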

    # train model
    model = ppo2.learn(network='mlp',
                       env=env,
                       save_interval=save_interval,
                       total_timesteps=total_timesteps,
                       nsteps=nsteps,
                       nminibatches=1,
                       num_layers=2,
                       num_hidden=64,
                       activation=tf.nn.relu,
                       gamma=0.998,
                       lr=0.01,
                       seed=seed)

    # Last evaluation run, including saving the evaluation results and the model.
    env.evaluate()
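
    # Assuming the standard baselines ppo2 API, a saved model could later be restored with
    # something like ppo2.learn(network='mlp', env=env, total_timesteps=0, load_path=model_file_path);
    # this is a sketch only, adjust it to how the wrapper actually saves the model.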