environment.py
import numpy as np
import gym


class Env():
    """
    Test environment wrapper for CarRacing
    """

    def __init__(self):
        self.env = gym.make('CarRacing-v0')
        self.env.seed(0)
        self.reward_threshold = self.env.spec.reward_threshold
        self.action_repeat = 4   # repeat each action for 4 consecutive frames
        self.img_stack = 4       # stack 4 grayscale frames as one state

    def reset(self):
        self.counter = 0
        self.av_r = self.reward_memory()

        self.die = False
        img_rgb = self.env.reset()
        img_gray = self.rgb2gray(img_rgb)
        self.stack = [img_gray] * self.img_stack
        # (img_stack, 96, 96) -> (1, 96, 96, img_stack)
        stack_array = np.array(self.stack)
        stack_array = np.transpose(stack_array, axes=[1, 2, 0])
        stack_array = np.expand_dims(stack_array, axis=0)
        return stack_array

    def step(self, action):
        total_reward = 0
        for i in range(self.action_repeat):
            img_rgb, reward, die, _ = self.env.step(action)
            # don't penalize "die state"
            if die:
                reward += 100
            # green penalty: a mostly-green frame means the car is off the track
            if np.mean(img_rgb[:, :, 1]) > 185.0:
                reward -= 0.05
            total_reward += reward
            # if no reward recently, end the episode
            done = True if self.av_r(reward) <= -0.1 else False
            if done or die:
                break
        img_gray = self.rgb2gray(img_rgb)
        self.stack.pop(0)
        self.stack.append(img_gray)
        assert len(self.stack) == self.img_stack
        # (img_stack, 96, 96) -> (1, 96, 96, img_stack)
        stack_array = np.array(self.stack)
        stack_array = np.transpose(stack_array, axes=[1, 2, 0])
        stack_array = np.expand_dims(stack_array, axis=0)
        return stack_array, total_reward, done, die

    def render(self, *arg):
        self.env.render(*arg)

    @staticmethod
    def rgb2gray(rgb, norm=True):
        # luminance-weighted RGB-to-grayscale conversion
        gray = np.dot(rgb[..., :], [0.299, 0.587, 0.114])
        if norm:
            # normalize to roughly [-1, 1]
            gray = gray / 128. - 1.
        return gray

    @staticmethod
    def reward_memory():
        # running mean of the reward over the last 100 steps
        count = 0
        length = 100
        history = np.zeros(length)

        def memory(reward):
            nonlocal count
            history[count] = reward
            count = (count + 1) % length
            return np.mean(history)

        return memory
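

# Minimal usage sketch (not part of the original file): drive the wrapper with
# random actions for a few steps. Assumes the old gym API used above
# (CarRacing-v0, 4-tuple return from step) and a display available for render().
if __name__ == '__main__':
    env = Env()
    state = env.reset()                          # shape (1, 96, 96, img_stack)
    score = 0.0
    for _ in range(100):
        action = env.env.action_space.sample()   # random [steer, gas, brake]
        state, reward, done, die = env.step(action)
        env.render()
        score += reward
        if done or die:
            break
    print('score:', score)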