# baseline.py (forked from b4be1/gh_gym)
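# Minimal SAC baseline for a remote gym environment served over TCP: this
# script listens on a local port, waits for the remote environment process to
# connect, and then drives it through the pickle-over-socket protocol below.
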
import socket
import struct
import pickle
import numpy as np
import gym
from stable_baselines3 import SAC


class Connection:
    """Length-prefixed pickle framing over a TCP socket: each message is a
    4-byte little-endian length header followed by the pickled body."""

    def __init__(self, s):
        self._socket = s
        self._buffer = bytearray()

    def receive_object(self):
        # Read until the buffer holds the 4-byte header plus the full body.
        while len(self._buffer) < 4 or len(self._buffer) < struct.unpack("<L", self._buffer[:4])[0] + 4:
            new_bytes = self._socket.recv(4096)
            if len(new_bytes) == 0:
                return None  # peer closed the connection
            self._buffer += new_bytes
        length = struct.unpack("<L", self._buffer[:4])[0]
        body = self._buffer[4:length + 4]
        obj = pickle.loads(body)
        # Keep any bytes that already belong to the next message.
        self._buffer = self._buffer[length + 4:]
        return obj

    def send_object(self, d):
        # Protocol 2 keeps the stream readable for older Python peers.
        body = pickle.dumps(d, protocol=2)
        header = struct.pack("<L", len(body))
        # sendall() retries short writes; send() may transmit only part of the frame.
        self._socket.sendall(header + body)


class Env(gym.Env):
    """gym.Env proxy: accepts one TCP client (the remote environment process)
    and forwards reset/step/close calls to it."""

    def __init__(self, addr):
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.bind(addr)
        s.listen(1)
        # Block until the remote environment process connects.
        clientsocket, address = s.accept()
        self._socket = clientsocket
        self._conn = Connection(clientsocket)
        # Unknown until the first reset reply arrives.
        self.action_space = None
        self.observation_space = None

    def reset(self):
        self._conn.send_object("reset")
        msg = self._conn.receive_object()
        # The client describes its spaces as evaluable reprs such as
        # "Box(-1.0, 1.0, (3,), float32)", so evaluate them with the
        # gym.spaces constructors and numpy dtypes in scope.
        ns = dict(vars(gym.spaces), gym=gym, np=np,
                  float32=np.float32, float64=np.float64)
        self.action_space = eval(msg["info"]["action_space"], ns)
        self.observation_space = eval(msg["info"]["observation_space"], ns)
        return msg["observation"]

    def step(self, action):
        # Plain lists unpickle reliably across numpy versions.
        self._conn.send_object(action.tolist())
        msg = self._conn.receive_object()
        return msg["observation"], msg["reward"], msg["done"], msg["info"]

    def close(self):
        self._conn.send_object("close")
        self._socket.close()
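

# For reference, a sketch of the counterpart process this Env expects on the
# other end of the socket. It is hypothetical (the real client ships with the
# remote environment) and is never called here; it only illustrates the wire
# protocol implied by Env above. "Pendulum-v1" is a stand-in environment id.
def _example_remote_client(addr=("127.0.0.1", 50710)):
    local_env = gym.make("Pendulum-v1")  # placeholder for the real remote env
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.connect(addr)
    conn = Connection(s)
    while True:
        msg = conn.receive_object()
        if msg is None or msg == "close":
            break
        if msg == "reset":
            obs = local_env.reset()
            conn.send_object({
                "observation": obs,
                "info": {
                    # Sent as evaluable reprs; Env.reset() eval()s them.
                    "action_space": repr(local_env.action_space),
                    "observation_space": repr(local_env.observation_space),
                },
            })
        else:  # msg is an action as a plain list
            obs, rwd, done, info = local_env.step(np.array(msg))
            conn.send_object({"observation": obs, "reward": rwd,
                              "done": done, "info": info})
    s.close()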


addr = ("127.0.0.1", 50710)
env = Env(addr)
# The first reset fills in action_space/observation_space, which SAC needs
# before it can build its networks.
env.reset()

model = SAC('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=3000, log_interval=4)

# Roll out the trained policy for 300 steps and print the return of each
# completed episode.
cum_rwd = 0
obs = env.reset()
for i in range(300):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    cum_rwd += reward
    if done:
        obs = env.reset()
        print("Return = ", cum_rwd)
        cum_rwd = 0
env.close()