Rl v3 hanging issue fix #455

Merged
merged 18 commits on Jan 14, 2022
27 changes: 0 additions & 27 deletions docker-compose.yml

This file was deleted.

4 changes: 2 additions & 2 deletions docker_files/dev.df
@@ -1,4 +1,4 @@
FROM ubuntu:18.04
FROM python:3.7.12-buster
WORKDIR /maro

# Install Apt packages
@@ -9,7 +9,7 @@ RUN apt-get install -y gcc
RUN apt-get install -y libcurl4 libcurl4-openssl-dev libssl-dev curl
RUN apt-get install -y libzmq3-dev
RUN apt-get install -y python3-pip
RUN apt-get install -y python3-dev libpython3.6-dev python-numpy
RUN apt-get install -y python3-dev libpython3.7-dev python-numpy
RUN rm -rf /var/lib/apt/lists/*

# Install Python packages
4 changes: 2 additions & 2 deletions examples/rl/cim/__init__.py
@@ -3,6 +3,6 @@

from .callbacks import post_collect, post_evaluate
from .env_sampler import agent2policy, get_env_sampler
from .policies_v2 import policy_func_dict
from .policy_trainer import policy_creator, trainer_creator

__all__ = ["agent2policy", "post_collect", "post_evaluate", "get_env_sampler", "policy_func_dict"]
__all__ = ["agent2policy", "post_collect", "post_evaluate", "get_env_sampler", "policy_creator", "trainer_creator"]
@@ -1,35 +1,24 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from typing import Dict

import torch
from torch.optim import Adam, RMSprop

from maro.rl_v3.model import DiscretePolicyNet, DiscreteQNet, FullyConnected, VNet
from maro.rl_v3.model import DiscretePolicyNet, FullyConnected, VNet
from maro.rl_v3.policy import DiscretePolicyGradient
from maro.rl_v3.training.algorithms import DiscreteActorCritic, DiscreteActorCriticParams

from .config import action_shaping_conf, state_dim

q_net_conf = {
"input_dim": state_dim,
"hidden_dims": [256, 128, 64, 32],
"output_dim": len(action_shaping_conf["action_space"]),
"activation": torch.nn.LeakyReLU,
"softmax": False,
"batch_norm": True,
"skip_connection": False,
"head": True,
"dropout_p": 0.0
}
q_net_optim_conf = (RMSprop, {"lr": 0.05})
actor_net_conf = {
"input_dim": state_dim,
"hidden_dims": [256, 128, 64],
"output_dim": len(action_shaping_conf["action_space"]),
"activation": torch.nn.Tanh,
"softmax": True,
"batch_norm": False,
"head": True
}
critic_net_conf = {
"input_dim": state_dim,
"hidden_dims": [256, 128, 64],
"output_dim": 1,
"activation": torch.nn.LeakyReLU,
@@ -41,45 +30,10 @@
critic_optim_conf = (RMSprop, {"lr": 0.001})


# #####################################################################################################################
class MyQNet(DiscreteQNet):
def __init__(self) -> None:
super(MyQNet, self).__init__(state_dim=q_net_conf["input_dim"], action_num=q_net_conf["output_dim"])
self._fc = FullyConnected(**q_net_conf)
self._optim = q_net_optim_conf[0](self._fc.parameters(), **q_net_optim_conf[1])

def _get_q_values_for_all_actions(self, states: torch.Tensor) -> torch.Tensor:
return self._fc(states)

def get_gradients(self, loss: torch.Tensor) -> Dict[str, torch.Tensor]:
self._optim.zero_grad()
loss.backward()
return {name: param.grad for name, param in self.named_parameters()}

def apply_gradients(self, grad: dict) -> None:
for name, param in self.named_parameters():
param.grad = grad[name]
self._optim.step()

def get_net_state(self) -> object:
return {"network": self.state_dict(), "optim": self._optim.state_dict()}

def set_net_state(self, net_state: object) -> None:
assert isinstance(net_state, dict)
self.load_state_dict(net_state["network"])
self._optim.load_state_dict(net_state["optim"])

def freeze(self) -> None:
self.freeze_all_parameters()

def unfreeze(self) -> None:
self.unfreeze_all_parameters()


class MyActorNet(DiscretePolicyNet):
def __init__(self) -> None:
super(MyActorNet, self).__init__(state_dim=actor_net_conf["input_dim"], action_num=actor_net_conf["output_dim"])
self._actor = FullyConnected(**actor_net_conf)
def __init__(self, state_dim: int, action_num: int) -> None:
super(MyActorNet, self).__init__(state_dim=state_dim, action_num=action_num)
self._actor = FullyConnected(input_dim=state_dim, output_dim=action_num, **actor_net_conf)
self._actor_optim = actor_optim_conf[0](self._actor.parameters(), **actor_optim_conf[1])

def _get_action_probs_impl(self, states: torch.Tensor) -> torch.Tensor:
@@ -113,9 +67,9 @@ def set_net_state(self, net_state: dict) -> None:


class MyCriticNet(VNet):
def __init__(self) -> None:
super(MyCriticNet, self).__init__(state_dim=critic_net_conf["input_dim"])
self._critic = FullyConnected(**critic_net_conf)
def __init__(self, state_dim: int) -> None:
super(MyCriticNet, self).__init__(state_dim=state_dim)
self._critic = FullyConnected(input_dim=state_dim, **critic_net_conf)
self._critic_optim = critic_optim_conf[0](self._critic.parameters(), **critic_optim_conf[1])

def _get_v_values(self, states: torch.Tensor) -> torch.Tensor:
@@ -146,3 +100,24 @@ def freeze(self) -> None:

def unfreeze(self) -> None:
self.unfreeze_all_parameters()


def get_discrete_policy_gradient(name: str, *, state_dim: int, action_num: int) -> DiscretePolicyGradient:
return DiscretePolicyGradient(name=name, policy_net=MyActorNet(state_dim, action_num))


def get_ac(name: str, *, state_dim: int) -> DiscreteActorCritic:
return DiscreteActorCritic(
name=name,
params=DiscreteActorCriticParams(
device="cpu",
get_v_critic_net_func=lambda: MyCriticNet(state_dim),
reward_discount=.0,
grad_iters=10,
critic_loss_cls=torch.nn.SmoothL1Loss,
min_logp=None,
lam=.0,
)
)
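A hedged usage sketch of the two factories above; the name strings and dimensions are placeholders rather than the real CIM configuration.

# Illustrative only: "ac_0.policy"/"ac_0" and the dimensions are assumed placeholders.
policy = get_discrete_policy_gradient("ac_0.policy", state_dim=256, action_num=21)
trainer = get_ac("ac_0", state_dim=256)
# Note that the critic is not constructed here: DiscreteActorCriticParams receives
# get_v_critic_net_func, so MyCriticNet(state_dim) is only built once the trainer needs it.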


92 changes: 92 additions & 0 deletions examples/rl/cim/algorithms/dqn.py
@@ -0,0 +1,92 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from typing import Dict

import torch
from torch.optim import RMSprop

from maro.rl_v3.exploration import MultiLinearExplorationScheduler, epsilon_greedy
from maro.rl_v3.model import DiscreteQNet, FullyConnected
from maro.rl_v3.policy import ValueBasedPolicy
from maro.rl_v3.training.algorithms import DQN, DQNParams


q_net_conf = {
"hidden_dims": [256, 128, 64, 32],
"activation": torch.nn.LeakyReLU,
"softmax": False,
"batch_norm": True,
"skip_connection": False,
"head": True,
"dropout_p": 0.0
}
q_net_optim_conf = (RMSprop, {"lr": 0.05})


class MyQNet(DiscreteQNet):
def __init__(self, state_dim: int, action_num: int) -> None:
super(MyQNet, self).__init__(state_dim=state_dim, action_num=action_num)
self._fc = FullyConnected(input_dim=state_dim, output_dim=action_num, **q_net_conf)
self._optim = q_net_optim_conf[0](self._fc.parameters(), **q_net_optim_conf[1])

def _get_q_values_for_all_actions(self, states: torch.Tensor) -> torch.Tensor:
return self._fc(states)

def get_gradients(self, loss: torch.Tensor) -> Dict[str, torch.Tensor]:
self._optim.zero_grad()
loss.backward()
return {name: param.grad for name, param in self.named_parameters()}

def apply_gradients(self, grad: dict) -> None:
for name, param in self.named_parameters():
param.grad = grad[name]
self._optim.step()

def get_net_state(self) -> object:
return {"network": self.state_dict(), "optim": self._optim.state_dict()}

def set_net_state(self, net_state: object) -> None:
assert isinstance(net_state, dict)
self.load_state_dict(net_state["network"])
self._optim.load_state_dict(net_state["optim"])

def freeze(self) -> None:
self.freeze_all_parameters()

def unfreeze(self) -> None:
self.unfreeze_all_parameters()


def get_value_based_policy(name: str, *, state_dim: int, action_num: int) -> ValueBasedPolicy:
return ValueBasedPolicy(
name=name,
q_net=MyQNet(state_dim, action_num),
exploration_strategy=(epsilon_greedy, {"epsilon": 0.4}),
exploration_scheduling_options=[(
"epsilon", MultiLinearExplorationScheduler, {
"splits": [(2, 0.32)],
"initial_value": 0.4,
"last_ep": 5,
"final_value": 0.0,
}
)],
warmup=100
)


def get_dqn(name: str) -> DQN:
return DQN(
name=name,
params=DQNParams(
device="cpu",
reward_discount=.0,
update_target_every=5,
num_epochs=10,
soft_update_coef=0.1,
double=False,
replay_memory_capacity=10000,
random_overwrite=False,
batch_size=32,
)
)
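A hedged wiring sketch for the two DQN factories; the key names and dimensions are placeholder assumptions. With the scheduler options above, epsilon appears to decay piecewise-linearly from 0.4 to 0.32 at episode 2 and on to 0.0 by episode 5.

from functools import partial

# Illustrative wiring only; the key names and dimensions are assumed placeholders.
policy_creator = {"dqn_0.policy": partial(get_value_based_policy, state_dim=256, action_num=21)}
trainer_creator = {"dqn_0": get_dqn}

policy = policy_creator["dqn_0.policy"]("dqn_0.policy")
trainer = trainer_creator["dqn_0"]("dqn_0")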
@@ -1,27 +1,25 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from functools import partial
from typing import Dict, List

import torch
from torch.optim import Adam, RMSprop

from maro.rl_v3.model import DiscretePolicyNet, FullyConnected, MultiQNet
from maro.rl_v3.policy import DiscretePolicyGradient
from maro.rl_v3.training.algorithms import DiscreteMADDPG, DiscreteMADDPGParams

from .config import action_shaping_conf, state_dim

actor_net_conf = {
"input_dim": state_dim,
"hidden_dims": [256, 128, 64],
"output_dim": len(action_shaping_conf["action_space"]),
"activation": torch.nn.Tanh,
"softmax": True,
"batch_norm": False,
"head": True
}
critic_conf = {
"state_dim": state_dim,
"action_dims": [1]
}
critic_net_conf = {
"input_dim": critic_conf["state_dim"] + sum(critic_conf["action_dims"]),
"hidden_dims": [256, 128, 64],
"output_dim": 1,
"activation": torch.nn.LeakyReLU,
@@ -35,9 +33,9 @@

# #####################################################################################################################
class MyActorNet(DiscretePolicyNet):
def __init__(self) -> None:
super(MyActorNet, self).__init__(state_dim=actor_net_conf["input_dim"], action_num=actor_net_conf["output_dim"])
self._actor = FullyConnected(**actor_net_conf)
def __init__(self, state_dim: int, action_num: int) -> None:
super(MyActorNet, self).__init__(state_dim=state_dim, action_num=action_num)
self._actor = FullyConnected(input_dim=state_dim, output_dim=action_num, **actor_net_conf)
self._actor_optim = actor_optim_conf[0](self._actor.parameters(), **actor_optim_conf[1])

def _get_action_probs_impl(self, states: torch.Tensor) -> torch.Tensor:
@@ -71,12 +69,9 @@ def set_net_state(self, net_state: dict) -> None:


class MyMultiCriticNet(MultiQNet):
def __init__(self) -> None:
super(MyMultiCriticNet, self).__init__(
state_dim=critic_conf["state_dim"],
action_dims=critic_conf["action_dims"]
)
self._critic = FullyConnected(**critic_net_conf)
def __init__(self, state_dim: int, action_dims: List[int]) -> None:
super(MyMultiCriticNet, self).__init__(state_dim=state_dim, action_dims=action_dims)
self._critic = FullyConnected(input_dim=state_dim + sum(action_dims), **critic_net_conf)
self._critic_optim = critic_optim_conf[0](self._critic.parameters(), **critic_optim_conf[1])

def _get_q_values(self, states: torch.Tensor, actions: List[torch.Tensor]) -> torch.Tensor:
@@ -107,3 +102,24 @@ def freeze(self) -> None:

def unfreeze(self) -> None:
self.unfreeze_all_parameters()


def get_multi_critic_net(state_dim: int, action_dims: List[int]) -> MyMultiCriticNet:
return MyMultiCriticNet(state_dim, action_dims)


def get_discrete_policy_gradient(name: str, *, state_dim: int, action_num: int) -> DiscretePolicyGradient:
return DiscretePolicyGradient(name=name, policy_net=MyActorNet(state_dim, action_num))


def get_maddpg(name: str, *, state_dim: int, action_dims: List[int]) -> DiscreteMADDPG:
return DiscreteMADDPG(
name=name,
params=DiscreteMADDPGParams(
device="cpu",
reward_discount=.0,
num_epoch=10,
get_q_critic_net_func=partial(get_multi_critic_net, state_dim, action_dims),
# shared_critic=True,
)
)
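A hedged sketch of how the MADDPG factories might be combined; agent names and dimensions are placeholder assumptions. get_maddpg already binds the critic factory via functools.partial, so the caller only supplies the trainer name.

from functools import partial

# Illustrative only: the names, state_dim and action_dims below are assumed placeholders.
state_dim, action_dims = 256, [1, 1]

policy_creator = {
    f"maddpg_{i}.policy": partial(get_discrete_policy_gradient, state_dim=state_dim, action_num=21)
    for i in range(len(action_dims))
}
trainer_creator = {"maddpg": partial(get_maddpg, state_dim=state_dim, action_dims=action_dims)}

trainer = trainer_creator["maddpg"]("maddpg")  # one trainer; its MultiQNet critic sees every agent's action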
11 changes: 2 additions & 9 deletions examples/rl/cim/callbacks.py
@@ -1,15 +1,8 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import time
from os import makedirs
from os.path import dirname, join, realpath

log_dir = join(dirname(realpath(__file__)), "log", str(time.time()))
makedirs(log_dir, exist_ok=True)


def post_collect(trackers, ep, segment):
def post_collect(trackers: list, ep: int, segment: int) -> None:
# print the env metric from each rollout worker
for tracker in trackers:
print(f"env summary (episode {ep}, segment {segment}): {tracker['env_metric']}")
@@ -21,7 +14,7 @@ def post_collect(trackers, ep, segment):
print(f"average env summary (episode {ep}, segment {segment}): {avg_metric}")


def post_evaluate(trackers, ep):
def post_evaluate(trackers: list, ep: int) -> None:
# print the env metric from each rollout worker
for tracker in trackers:
print(f"env summary (episode {ep}): {tracker['env_metric']}")