reinforce_discrete.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.utils as utils
class Policy(nn.Module):
    """Two-layer policy network that maps a state to action probabilities."""

    def __init__(self, hidden_size, num_inputs, action_space):
        super(Policy, self).__init__()
        self.action_space = action_space
        num_outputs = action_space.n
        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_outputs)

    def forward(self, inputs):
        x = F.relu(self.linear1(inputs))
        action_scores = self.linear2(x)
        # Softmax over the action dimension turns scores into probabilities.
        return F.softmax(action_scores, dim=1)
class REINFORCE:
    def __init__(self, hidden_size, num_inputs, action_space):
        self.action_space = action_space
        self.model = Policy(hidden_size, num_inputs, action_space)
        self.model = self.model.cuda()  # the agent assumes a CUDA-capable GPU
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
        self.model.train()
    def select_action(self, state):
        # Forward pass for action probabilities, then sample one action.
        probs = self.model(state.cuda())
        action = probs.multinomial(num_samples=1).data
        prob = probs[:, action[0, 0]].view(1, -1)
        log_prob = prob.log()
        # Policy entropy, used below as a small exploration bonus.
        entropy = -(probs * probs.log()).sum()
        return action[0], log_prob, entropy
    def update_parameters(self, rewards, log_probs, entropies, gamma):
        R = torch.zeros(1, 1)
        loss = 0
        # Walk the episode backwards, accumulating the discounted return R.
        for i in reversed(range(len(rewards))):
            R = gamma * R + rewards[i]
            # REINFORCE loss: -log pi(a|s) * R, minus an entropy bonus.
            loss = loss - (log_probs[i] * R.expand_as(log_probs[i]).cuda()).sum() \
                   - (0.0001 * entropies[i].cuda()).sum()
        loss = loss / len(rewards)

        self.optimizer.zero_grad()
        loss.backward()
        # Clip the gradient norm to keep policy updates stable.
        utils.clip_grad_norm_(self.model.parameters(), 40)
        self.optimizer.step()
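

# --- Usage sketch (not part of the original file) ---
# A minimal example of driving the REINFORCE agent above on a Gym
# environment. The environment name ("CartPole-v1"), the hyperparameters
# (hidden_size=128, gamma=0.99, episode/step counts), and the use of the
# classic `gym` API (reset() returning only the observation, step()
# returning a 4-tuple) are assumptions, not something defined in this file.
if __name__ == "__main__":
    import gym

    env = gym.make("CartPole-v1")
    agent = REINFORCE(hidden_size=128,
                      num_inputs=env.observation_space.shape[0],
                      action_space=env.action_space)

    for episode in range(100):
        state = torch.Tensor([env.reset()])
        log_probs, rewards, entropies = [], [], []
        for _ in range(1000):
            action, log_prob, entropy = agent.select_action(state)
            next_state, reward, done, _ = env.step(action.item())
            log_probs.append(log_prob)
            rewards.append(reward)
            entropies.append(entropy)
            state = torch.Tensor([next_state])
            if done:
                break
        # One policy-gradient update per finished episode.
        agent.update_parameters(rewards, log_probs, entropies, gamma=0.99)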