main.py

import os
import sys
import gym
import minerl
from pylab import *
import numpy as np
import tensorflow as tf
from reinforce import Reinforce
from reinforce_with_baseline import ReinforceWithBaseline
from state_space import new_treechop_state
from action_space import new_action_treechop
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

# Let TensorFlow allocate GPU memory on demand instead of reserving it all up front.
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

gpu_available = tf.test.is_gpu_available()
print("GPU Available: ", gpu_available)

def visualize_data(total_rewards):
    """
    Takes in an array of rewards from each episode and visualizes reward over episodes.

    :param total_rewards: List of cumulative rewards from all episodes
    """
    x_values = arange(0, len(total_rewards), 1)
    y_values = total_rewards
    plot(x_values, y_values)
    xlabel("episodes")
    ylabel("cumulative rewards")
    title("Reward by Episode")
    grid(True)
    show()

def discount(rewards, discount_factor=.99):
    """
    Takes in a list of rewards for each timestep in an episode and returns a list of
    the sum of discounted rewards for each timestep.

    :param rewards: List of rewards from an episode [r_{t1}, r_{t2}, ...]
    :param discount_factor: Gamma discounting factor to use, defaults to .99
    :return: discounted_rewards: list containing the sum of discounted rewards for each
             timestep in the original rewards list
    """
    discounted_rewards = [0.] * len(rewards)
    discounted_rewards[-1] = rewards[-1]
    # Work backwards through the episode: G_t = r_t + discount_factor * G_{t+1}.
    for i in reversed(range(len(rewards) - 1)):
        discounted_rewards[i] = discounted_rewards[i + 1] * discount_factor + rewards[i]
    return discounted_rewards
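
# Worked example of the recurrence above (illustrative numbers, not taken from the
# original code): with discount_factor = 0.5,
#   discount([1.0, 2.0, 3.0], 0.5)
# computes G_2 = 3.0, G_1 = 2.0 + 0.5 * 3.0 = 3.5, G_0 = 1.0 + 0.5 * 3.5 = 2.75,
# and so returns [2.75, 3.5, 3.0].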

def generate_trajectory(env, model):
    """
    Generates lists of states, actions, and rewards for one complete episode.

    :param env: The OpenAI Gym environment
    :param model: The model used to generate the actions
    :return: A tuple of lists (states, actions, rewards), where each list has length
             equal to the number of timesteps in the episode
    """
    states = []
    actions = []
    rewards = []
    state = env.reset()
    done = False

    while not done:
        # Convert the raw MineRL observation into the model's state representation.
        state = new_treechop_state(state)
        states.append(state)

        # The policy network outputs a probability distribution over actions;
        # sample an action index from it.
        model_input = np.array([state])
        prbs = model(model_input)
        distribution = prbs[0].numpy()
        action = np.random.choice(range(len(distribution)), p=distribution)
        actions.append(action)

        # Translate the sampled action index into a MineRL action dict and step the env.
        action_step = new_action_treechop(env.action_space.noop(), action)
        state, rwd, done, _ = env.step(action_step)
        rewards.append(rwd)

    return states, actions, rewards
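
# Sampling note for generate_trajectory above (hypothetical numbers for illustration):
# if the policy outputs distribution = [0.7, 0.2, 0.1], np.random.choice returns
# action index 0 with probability 0.7, index 1 with probability 0.2, and index 2 with
# probability 0.1, so exploration follows the policy's own action probabilities.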

def train(env, model):
    """
    Trains the model for one episode.

    Each call to this function generates a complete trajectory for one episode (lists of
    states, actions, and rewards seen/taken in the episode), and then trains on that data
    to minimize the model loss.

    :param env: The OpenAI Gym environment
    :param model: The model
    :return: The total reward for the episode
    """
    with tf.GradientTape() as tape:
        states, actions, rewards = generate_trajectory(env, model)
        states = np.array(states)
        discounted_rewards = discount(rewards)
        loss = model.loss(states, actions, discounted_rewards)

    gradients = tape.gradient(loss, model.trainable_variables)
    model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return tf.reduce_sum(rewards)
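
# Note: model.loss is defined in the reinforce / reinforce_with_baseline modules (not
# shown here). For plain REINFORCE it is assumed to implement the standard policy-
# gradient objective, roughly loss = -sum_t log(pi(a_t | s_t)) * G_t, where G_t are the
# discounted returns computed by discount(); the baseline variant is assumed to subtract
# a learned state-value estimate from G_t before weighting the log-probabilities.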

def main():
    if len(sys.argv) != 2 or sys.argv[1] not in {"REINFORCE", "REINFORCE_BASELINE"}:
        print("USAGE: python main.py <Model Type>")
        print("<Model Type>: [REINFORCE/REINFORCE_BASELINE]")
        sys.exit(1)

    env = gym.make("MineRLTreechop-v0")  # environment
    print("env", env)
    print("env.observation_space", env.observation_space)
    print("env.action_space", env.action_space)
    print("env.action_space.spaces", env.action_space.spaces)
    # state_size = env.observation_space.shape[0]
    num_actions = len(env.action_space.spaces) - 1

    # Initialize model
    if sys.argv[1] == "REINFORCE":
        model = Reinforce(state_size=None, num_actions=num_actions)
    elif sys.argv[1] == "REINFORCE_BASELINE":
        model = ReinforceWithBaseline(state_size=None, num_actions=num_actions)

    try:
        with tf.device('/device:GPU:1'):
            episodes = 650
            rewards = []
            for ep in range(episodes):
                print(ep, "/", episodes)
                rwd = train(env, model)
                print("[+] Episode", ep, "reward = ", rwd, ".")
                rewards.append(rwd)
            print("[+] Avg of last 50 rewards = ", np.mean(rewards[-50:]), ".")
            visualize_data(rewards)
    except RuntimeError as e:
        print(e)


if __name__ == "__main__":
    main()
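
# Example usage (assuming MineRL, Gym, and the local reinforce modules are available):
#   python main.py REINFORCE
#   python main.py REINFORCE_BASELINE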