-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbc.py
34 lines (30 loc) · 1.2 KB
/
bc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
"""
TODO: MODIFY TO FILL IN YOUR BC IMPLEMENTATION
"""
import torch
import torch.optim as optim
import numpy as np
from utils import rollout
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def simulate_policy_bc(env, policy, expert_data, num_epochs=500, episode_length=50,
                       batch_size=32):
    """Train ``policy`` by behavior cloning on ``expert_data``.

    Flattens the expert trajectories into one (observation, action) dataset
    and minimizes the negative log-likelihood of the expert actions under the
    policy's action distribution with minibatch Adam.

    Args:
        env: Environment handle (unused here; kept for interface compatibility).
        policy: A ``torch.nn.Module``; calling ``policy(obs)`` is assumed to
            return a distribution exposing ``log_prob(actions)`` — TODO confirm
            against the policy class used by the caller.
        expert_data: Assumed to be a list of trajectory dicts with
            ``'observations'`` and ``'actions'`` arrays — TODO confirm against
            the data-collection code.
        num_epochs: Number of passes over the flattened dataset.
        episode_length: Nominal trajectory length (retained for backward
            compatibility; the actual sample count is measured from the data).
        batch_size: Minibatch size for each gradient step.

    Returns:
        List of per-epoch mean training losses (one float per epoch).
    """
    # Derive the device from the policy itself rather than relying on the
    # module-level `device` global, so the function works wherever the
    # caller placed the model.
    dev = next(policy.parameters()).device

    # Flatten all trajectories into one supervised-learning dataset.
    observations = torch.as_tensor(
        np.concatenate([path['observations'] for path in expert_data], axis=0),
        dtype=torch.float32, device=dev)
    actions = torch.as_tensor(
        np.concatenate([path['actions'] for path in expert_data], axis=0),
        dtype=torch.float32, device=dev)
    num_samples = observations.shape[0]

    optimizer = optim.Adam(list(policy.parameters()), lr=1e-4)
    # At least one batch per epoch, even if the dataset is smaller than
    # batch_size (guards against a zero-iteration inner loop).
    num_batches = max(num_samples // batch_size, 1)

    losses = []
    for epoch in range(num_epochs):
        # Fresh shuffle each epoch so minibatches are decorrelated.
        perm = torch.randperm(num_samples, device=dev)
        running_loss = 0.0
        for i in range(num_batches):
            idx = perm[i * batch_size:(i + 1) * batch_size]
            optimizer.zero_grad()
            # Maximize log-likelihood of expert actions == minimize NLL.
            dist = policy(observations[idx])
            loss = -dist.log_prob(actions[idx]).mean()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        # Normalize by the actual batch count (the original divided by a
        # hard-coded 10.), and record the epoch average rather than only
        # the final batch's loss.
        epoch_loss = running_loss / num_batches
        print('[%d] loss: %.8f' %
              (epoch, epoch_loss))
        losses.append(epoch_loss)
    return losses