V0.3: Upgrade RL Workflow; Add RL Benchmarks; Update Package Version #588

Merged on Mar 30, 2023 · 589 commits (changes from all commits)

Commits
2a3e90d
call policy update only for AbsCorePolicy
Jinyu-W Jun 1, 2021
2e925ab
add limitation of AbsCorePolicy in Actor.collect()
Jinyu-W Jun 2, 2021
e3710e3
modify the supply_chain example to use the new rl toolkit architecture
Jinyu-W Jun 2, 2021
03c26b8
refined actor to return only experiences for policies that received n…
Jun 2, 2021
aff0f44
fix MsgKey issue in rollout_manager
Jinyu-W Jun 2, 2021
fba2ccf
Merge branch 'v0.2_rl_refinement' into v0.2_sc_0506_updated
Jinyu-W Jun 2, 2021
8c05562
fix typo in learner
Jinyu-W Jun 2, 2021
ce155d4
Merge branch 'v0.2_rl_refinement' into v0.2_sc_0506_updated
Jinyu-W Jun 2, 2021
cc0c555
call exit function for parallel rollout manager
Jinyu-W Jun 3, 2021
74ede26
Merge branch 'v0.2_rl_refinement' into v0.2_sc_0506_updated
Jinyu-W Jun 3, 2021
034e5bb
update supply chain example distributed training scripts
Jinyu-W Jun 3, 2021
13d7d9b
1. moved exploration scheduling to rollout manager; 2. fixed bug in l…
Jun 3, 2021
5337aa9
fixed merge conflicts
Jun 3, 2021
6dbfd36
reformat render
Jinyu-W Jun 3, 2021
146eeb6
fix supply chain business engine action type problem
Jinyu-W Jun 3, 2021
a80f6c3
reset supply chain example render figsize from 4 to 3
Jinyu-W Jun 3, 2021
731bc59
Add render to all modes of supply chain example
Jinyu-W Jun 3, 2021
ebe5065
fix OR policy typos
Jinyu-W Jun 3, 2021
0549a14
1. added parallel policy manager prototype; 2. used training ep for e…
Jun 4, 2021
023a9d3
refined parallel policy manager
Jun 9, 2021
5a57e01
updated rl/__init__.py
Jun 9, 2021
1f62f3a
fixed lint issues and CIM local learner bugs
Jun 9, 2021
6208d86
deleted unwanted supply_chain test files
Jun 9, 2021
11ca4be
revised default config for cim-dqn
Jun 9, 2021
36d4178
removed test_store.py as it is no longer needed
Jun 10, 2021
0fab08b
1. changed Actor class to rollout_worker function; 2. renamed algorit…
Jun 11, 2021
3b5faeb
updated figures
Jun 11, 2021
7911162
removed unwanted import
Jun 11, 2021
4f2182f
refactored CIM-DQN example
Jun 15, 2021
2b1541b
added MultiProcessRolloutManager and MultiProcessTrainingManager
Jun 16, 2021
6392fcf
updated doc
Jun 17, 2021
5089f7c
lint issue fix
Jun 18, 2021
41a7b27
lint issue fix
Jun 18, 2021
35cf25a
fixed import formatting
Jun 18, 2021
ceadf4f
[Feature] Prioritized Experience Replay (#355)
ysqyang Jun 18, 2021
248d1e4
rm AbsDecisionGenerator
Jun 18, 2021
721d91b
Merge branch 'v0.2_rl_refinement' of github.com:microsoft/maro into v…
Jun 18, 2021
85e304a
small fixes
Jun 18, 2021
2601970
bug fix
Jun 18, 2021
f72e884
reorganized training folder structure
Jun 20, 2021
4f4d5bb
fixed lint issues
Jun 20, 2021
96b9cce
fixed lint issues
Jun 20, 2021
78c225a
policy manager refined
Jun 21, 2021
9acae80
lint fix
Jun 21, 2021
424cabb
restructured CIM-dqn sync code
Jun 21, 2021
18f73f2
added policy version index and used it as a measure of experience sta…
Jun 22, 2021
49d93c2
lint issue fix
Jun 22, 2021
bc96c5e
lint issue fix
Jun 22, 2021
1bb4b56
switched log_dir and proxy_kwargs order
Jun 22, 2021
20c6385
cim example refinement
Jun 23, 2021
42c24ab
eval schedule sorted only when it's a list
Jun 28, 2021
8db90d5
eval schedule sorted only when it's a list
Jun 28, 2021
81f574a
update sc env wrapper
Jinyu-W Jun 28, 2021
5ad21e4
added docker scripts for cim-dqn
Jun 28, 2021
bb25e71
Merge branch 'master' into v0.2
Jinyu-W Jun 29, 2021
a56d4c2
refactored example folder structure and added workflow templates
Jun 29, 2021
2525327
fixed merge conflicts
Jun 29, 2021
f427b07
fixed lint issues
Jun 30, 2021
b8dc7e4
fixed lint issues
Jun 30, 2021
92a51da
fixed template bugs
Jun 30, 2021
31b68f3
removed unused imports
Jun 30, 2021
bab8128
refactoring sc in progress
Jun 30, 2021
f964924
simplified cim meta
Jun 30, 2021
f9ccf2a
updated sc code
Jun 30, 2021
5ad3e54
fixed build.sh path bug
Jun 30, 2021
916d8ad
refined sc and template code
Jun 30, 2021
06c1cd3
template refinement
Jun 30, 2021
c17557c
fixed merge conflicts
Jun 30, 2021
ff76caa
deleted obsolete svgs
Jul 1, 2021
4842d16
merged with remote
Jul 1, 2021
35e55a7
updated learner logs
Jul 1, 2021
ae1e93f
minor edits
Jul 1, 2021
04c53e6
refactored templates for easy merge with async PR
Jul 1, 2021
1315f04
added component names for rollout manager and policy manager
Jul 1, 2021
de40647
fixed incorrect position to add last episode to eval schedule
Jul 1, 2021
360240f
added max_lag option in templates
Jul 1, 2021
315e85f
formatting edit in docker_compose_yml script
Jul 1, 2021
953c873
moved local learner and early stopper outside sync_tools
Jul 1, 2021
ed9d44a
refactored rl toolkit folder structure
Jul 1, 2021
d2b433e
refactored rl toolkit folder structure
Jul 2, 2021
9f799d4
moved env_wrapper and agent_wrapper inside rl/learner
Jul 2, 2021
f8cccca
refined scripts
Jul 2, 2021
a4491a7
modified sc imports according to changes in rl toolkit folder structure
Jul 2, 2021
0906577
fixed typo in script
Jul 2, 2021
a13322b
changes needed for running sc
Jul 2, 2021
8ec0282
removed unwanted imports
Jul 2, 2021
56af26a
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_sc
Jul 2, 2021
894c376
config change for testing sc scenario
Jul 2, 2021
743e9f3
changes for perf testing
Jul 4, 2021
8e97adc
Asynchronous Training (#364)
ysqyang Jul 5, 2021
fac6006
renamed sync to synchronous and async to asynchronous to avoid confli…
Jul 5, 2021
0004dfe
fixed merge conflicts
Jul 5, 2021
60a7423
added missing policy version increment in LocalPolicyManager
Jul 5, 2021
c004697
Merge remote-tracking branch 'origin/v0.2_rl_refinement' into v0.2_rl…
Jul 5, 2021
a163554
refined rollout manager recv logic
Jul 5, 2021
803faad
removed a debugging print
Jul 5, 2021
0c10f36
moved supply_chain inside examples/rl
Jul 5, 2021
34b47a5
added sleep in distributed launcher to avoid hanging
Jul 6, 2021
c41ca35
updated api doc and rl toolkit doc
Jul 7, 2021
a2244b5
refined dynamic imports using importlib
Jul 7, 2021
edf9df4
Merge branch 'master' into v0.2
Jul 8, 2021
740efa7
1. moved policy update triggers to policy manager; 2. added version c…
Jul 8, 2021
c278693
fixed a few bugs and updated cim RL example
Jul 8, 2021
455751a
fixed a few more bugs
Jul 8, 2021
ef50957
resolved merge conflicts
Jul 8, 2021
9a04a99
Merge remote-tracking branch 'origin/v0.2' into v0.2_rl_refinement
Jul 9, 2021
746f0f9
added agent wrapper instantiation to workflows
Jul 9, 2021
18cd676
added agent wrapper instantiation to workflows
Jul 9, 2021
c5cf9df
removed abs_block and added max_prob option for DiscretePolicyNet and…
Jul 9, 2021
1f3b590
fixed incorrect get_ac_policy signature for CIM
Jul 9, 2021
dd017d3
moved exploration inside core policy
Jul 9, 2021
98d0961
added state to exploration call to support context-dependent exploration
Jul 11, 2021
17f1655
updated sc example according to RL toolkit changes
Jul 11, 2021
bbb6ba7
separated non_rl_policy_index and rl_policy_index in workflows
Jul 11, 2021
c70105d
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_sc
Jul 11, 2021
f004fba
modified sc example code according to workflow changes
Jul 11, 2021
2be9114
modified sc example code according to workflow changes
Jul 11, 2021
9b04ad5
added replay_agent_ids parameter to get_env_func for RL examples
Jul 12, 2021
c004323
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_sc
Jul 12, 2021
700b149
fixed a few bugs
Jul 12, 2021
b9afaef
added maro/simulator/scenarios/supply_chain as bind mount
Jul 12, 2021
87066c9
added post-step, post-collect, post-eval and post-update callbacks
Jul 14, 2021
f0a29ef
fixed lint issues
Jul 14, 2021
56fd2d6
fixed lint issues
Jul 14, 2021
cb533fa
fixed some bugs
Jul 15, 2021
d2d66cd
moved instantiation of policy manager inside simple learner
Jul 15, 2021
513ca40
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_sc
Jul 15, 2021
07fba7a
fixed env_wrapper get_reward signature
Jul 15, 2021
a9e6b11
minor edits
Jul 15, 2021
1d5c242
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_sc
Jul 15, 2021
8a84b11
removed get_eperience kwargs from env_wrapper
Jul 15, 2021
2cc0f7b
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_sc
Jul 15, 2021
ec338fb
1. renamed step_callback to post_step in env_wrapper; 2. added get_ev…
Jul 15, 2021
8f00dc7
Merge branch 'v0.2_rl_refinement' into v0.2_rl_refinement_sc
Jul 15, 2021
1c94b62
added rollout exp distribution option in RL examples
Jul 15, 2021
2b04cc0
fixed merge conflicts
Jul 15, 2021
36092c2
Merge branch 'v0.2_sc_0506_updated' into v0.2_sc
lihuoran Jul 16, 2021
b4f5afa
Merge branch 'v0.2_sc' into v0.2_rl_refinement_sc
lihuoran Jul 16, 2021
4252a0c
removed unwanted files
Jul 16, 2021
c2c1c62
1. made logger internal in learner; 2. removed logger creation in abs …
Jul 16, 2021
be82de3
fixed merge conflicts
Jul 16, 2021
0a05184
checked out supply chain test files from v0.2_sc
Jul 16, 2021
c7bca77
1. added missing model.eval() to choose_action; 2.added entropy featu…
Jul 19, 2021
81245dd
fixed a bug in ac entropy
Jul 19, 2021
e56e9b8
abbreviated coefficient to coeff
Jul 19, 2021
072d9de
removed -dqn from job name in rl example config
Jul 22, 2021
103eb40
added tmp patch to dev.df
Jul 22, 2021
9c5e135
renamed image name for running rl examples
Jul 22, 2021
d96aa44
added get_loss interface for core policies
Jul 28, 2021
1f37369
added policy manager in rl_toolkit.rst
Jul 30, 2021
12ac058
1. env_wrapper bug fix; 2. policy manager update logic refinement
Jul 30, 2021
fc14e66
refactored policy and algorithms
Aug 3, 2021
7702eba
policy interface redesigned
Aug 5, 2021
704c17f
refined policy interfaces
Aug 8, 2021
56a54cb
fixed typo
Aug 8, 2021
0b57d70
fixed bugs in refactored policy interface
Aug 9, 2021
cad2872
fixed some bugs
Aug 9, 2021
3ba96d4
refactoring in progress
Aug 11, 2021
5f6c47c
policy interface and policy manager redesigned
Aug 17, 2021
cb8a355
1. fixed bugs in ac and pg; 2. fixed bugs rl workflow scripts
Aug 18, 2021
f0222a7
fixed bug in distributed policy manager
Aug 18, 2021
c0a8480
fixed lint issues
Aug 18, 2021
3a10544
fixed lint issues
Aug 18, 2021
026bcd3
added scipy in setup
Aug 18, 2021
00df5d8
1. trimmed rollout manager code; 2. added option to docker scripts
Aug 19, 2021
8619408
updated api doc for policy manager
Aug 20, 2021
ca7b0d9
1. simplified rl/learning code structure; 2. fixed bugs in rl example…
Aug 23, 2021
aefd3b5
1. simplified rl example structure; 2. fixed lint issues
Aug 23, 2021
db99ce2
further rl toolkit code simplifications
Aug 25, 2021
b3a244d
more numpy-based optimization in RL toolkit
Aug 26, 2021
505cf4e
moved replay buffer inside policy
Aug 27, 2021
af1eed6
bug fixes
Aug 27, 2021
e924495
numpy optimization and associated refactoring
Aug 29, 2021
7c407a4
extracted shaping logic out of env_sampler
Aug 31, 2021
07a051b
fixed bug in CIM shaping and lint issues
Aug 31, 2021
6a027fa
preliminary implementation of parallel batch inference
Sep 1, 2021
fde7895
fixed bug in ddpg transition recording
Sep 2, 2021
b9010ef
put get_state, get_env_actions, get_reward back in EnvSampler
Sep 2, 2021
aa69409
simplified exploration and core model interfaces
Sep 5, 2021
2dbf3c3
bug fixes and doc update
Sep 6, 2021
f136e3c
added improve() interface for RLPolicy for single-thread support
Sep 6, 2021
92561f6
fixed simple policy manager bug
Sep 7, 2021
013d0fb
updated doc, rst, notebook
Sep 11, 2021
8f652b4
updated notebook
Sep 11, 2021
8dd708f
fixed lint issues
Sep 11, 2021
971fd04
fixed entropy bugs in ac.py
Sep 12, 2021
bf3cadb
reverted to simple policy manager as default
Sep 12, 2021
e89b6db
1. unified single-thread and distributed mode in learning_loop.py; 2.…
Sep 14, 2021
3738bd1
fixed lint issues and updated rl toolkit images
Sep 14, 2021
69c5a56
removed obsolete images
Sep 15, 2021
372d44c
Merge branch 'v0.2_rl_refinement' of github.com:microsoft/maro into v…
Sep 15, 2021
9030200
added back agent2policy for general workflow use
Sep 15, 2021
f2dd5c0
V0.2 rl refinement dist (#377)
buptchan Sep 16, 2021
1a70410
Merge branch 'v0.2_rl_refinement' of github.com:microsoft/maro into v…
Sep 17, 2021
5f70b65
added checkpointing for simple and multi-process policy managers
Sep 17, 2021
7b76dce
1. bug fixes in checkpointing; 2. removed version and max_lag in roll…
Sep 17, 2021
c1d9871
added missing set_state and get_state for CIM policies
Sep 17, 2021
fc59379
removed blank line
Sep 17, 2021
f23fdc6
updated RL workflow README
Sep 22, 2021
78a2cb8
Integrate `data_parallel` arguments into `worker_allocator` (#402)
buptchan Sep 22, 2021
f8f2e6a
1. simplified workflow config; 2. added comments to CIM shaping
Sep 22, 2021
0b5fcd1
lint issue fix
Sep 22, 2021
190802f
1. added algorithm type setting in CIM config; 2. added try-except cl…
Sep 22, 2021
6e941a4
1. moved post_step callback inside env sampler; 2. updated README for…
Sep 24, 2021
1edd4c4
refined README for CIM
Sep 24, 2021
2b4d4eb
VM scheduling with RL (#375)
ysqyang Sep 26, 2021
3a928b9
SC refinement (#397)
lihuoran Sep 26, 2021
ea7fdde
refined workflow scripts
Oct 9, 2021
c1f8faf
fixed bug in ParallelAgentWrapper
Oct 9, 2021
cf1430a
1. fixed lint issues; 2. refined main script in workflows
Oct 10, 2021
485ffd7
lint issue fix
Oct 10, 2021
4e1d37c
restored default config for rl example
Oct 10, 2021
5b21e67
Update rollout.py
ysqyang Oct 10, 2021
868bd53
refined env var processing in policy manager workflow
Oct 11, 2021
12ffd98
added hasattr check in agent wrapper
Oct 12, 2021
c0bae0b
updated docker_compose_yml.py
Oct 12, 2021
a5ddfd5
Minor refinement
lihuoran Oct 13, 2021
0f2f83e
Merge branch 'v0.2_rl_refinement' into v0.3
lihuoran Oct 14, 2021
6a1179c
Minor PR. Prepare to merge latest master branch into v0.3 branch. (#412)
lihuoran Dec 6, 2021
ff0f706
Merge latest master into v0.3 (#426)
lihuoran Dec 8, 2021
8a25f9e
Change `Env.set_seed()` logic (#456)
lihuoran Jan 24, 2022
526627c
Remove all SC related files (#473)
lihuoran Mar 4, 2022
696f5b5
RL Toolkit V3 (#471)
lihuoran Mar 7, 2022
7b3d78a
RL renaming v2 (#476)
lihuoran Mar 9, 2022
00fbcee
Cherry pick latest RL (#498)
lihuoran Mar 31, 2022
0e11ae9
Cherry pick RL changes from `sc_refinement` (latest commit: `2a4869`)…
lihuoran Apr 22, 2022
1219513
RL incremental refactor (#501)
lihuoran Apr 24, 2022
333986f
RL component bundle (#513)
lihuoran May 10, 2022
ae83ac0
Add method to get mapping of available tick to frame index (#415)
chaosddp May 16, 2022
10b9c02
Cherry pick from sc_refinement (#527)
lihuoran May 18, 2022
0d132cc
Refine `terminal` / `next_agent_state` logic (#531)
lihuoran May 25, 2022
a3dade7
Merge master into v0.3 (#536)
lihuoran May 31, 2022
3f74eb2
Merge master into v0.3 (#545)
lihuoran Jun 9, 2022
569d7b1
Merge branch 'master' into v0.3
Jinyu-W Jun 10, 2022
278a881
Merge branch 'master' into v0.3
Jinyu-W Jun 14, 2022
ed951f0
Update requirements. (#552)
lihuoran Jun 23, 2022
3e7a43b
Done (#554)
lihuoran Aug 26, 2022
8022cd9
Update requirements in example and notebook (#553)
lihuoran Aug 26, 2022
9fd91ff
Refine decision event logic (#559)
lihuoran Aug 31, 2022
135d2fc
Refine rl component bundle (#549)
lihuoran Dec 27, 2022
a783b57
merge master into v0.3
Dec 27, 2022
a4e3168
Merge branch 'master' into v0.3
Dec 27, 2022
eb6324c
Remove numpy data type (#571)
lihuoran Jan 11, 2023
214383f
RL benchmark on GYM (#575)
lihuoran Feb 6, 2023
b8a955e
Refine RL workflow & tune RL models under GYM (#577)
lihuoran Feb 17, 2023
f42d5b7
DDPG parameters update (#583)
Jinyu-W Feb 22, 2023
d859a4b
Update RL Benchmarks (#584)
Jinyu-W Mar 20, 2023
c6ed5c9
Merge branch 'master' into v0.3
Jinyu-W Mar 20, 2023
71157f8
Update Input Template of RL Policy to Improve Module Flexibility (#589)
Jinyu-W Mar 29, 2023
a5e1f57
update code version to 0.3.2a1
Mar 29, 2023
Files changed
2 changes: 2 additions & 0 deletions examples/cim/rl/algorithms/ac.py
@@ -11,6 +11,7 @@
actor_net_conf = {
"hidden_dims": [256, 128, 64],
"activation": torch.nn.Tanh,
"output_activation": torch.nn.Tanh,
"softmax": True,
"batch_norm": False,
"head": True,
@@ -19,6 +20,7 @@
"hidden_dims": [256, 128, 64],
"output_dim": 1,
"activation": torch.nn.LeakyReLU,
"output_activation": torch.nn.LeakyReLU,
"softmax": False,
"batch_norm": True,
"head": True,
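Note: the change here adds an `output_activation` entry alongside `activation` in the net config (the same edit recurs in the other algorithm configs in this PR). A minimal sketch of how such a config might be consumed — the builder below is an illustrative stand-in, not the library's real model-building code:

```python
import torch

def build_mlp(input_dim: int, output_dim: int, conf: dict) -> torch.nn.Sequential:
    layers = []
    dims = [input_dim] + conf["hidden_dims"]
    for in_dim, out_dim in zip(dims[:-1], dims[1:]):
        layers.append(torch.nn.Linear(in_dim, out_dim))
        layers.append(conf["activation"]())          # hidden-layer activation
    layers.append(torch.nn.Linear(dims[-1], output_dim))
    if conf.get("output_activation") is not None:
        layers.append(conf["output_activation"]())   # the newly configurable output activation
    if conf.get("softmax"):
        layers.append(torch.nn.Softmax(dim=-1))
    return torch.nn.Sequential(*layers)

net = build_mlp(32, 4, {
    "hidden_dims": [256, 128, 64],
    "activation": torch.nn.Tanh,
    "output_activation": torch.nn.Tanh,
    "softmax": True,
})
```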
1 change: 1 addition & 0 deletions examples/cim/rl/algorithms/dqn.py
@@ -12,6 +12,7 @@
q_net_conf = {
"hidden_dims": [256, 128, 64, 32],
"activation": torch.nn.LeakyReLU,
"output_activation": torch.nn.LeakyReLU,
"softmax": False,
"batch_norm": True,
"skip_connection": False,
2 changes: 2 additions & 0 deletions examples/cim/rl/algorithms/maddpg.py
@@ -14,6 +14,7 @@
actor_net_conf = {
"hidden_dims": [256, 128, 64],
"activation": torch.nn.Tanh,
"output_activation": torch.nn.Tanh,
"softmax": True,
"batch_norm": False,
"head": True,
@@ -22,6 +23,7 @@
"hidden_dims": [256, 128, 64],
"output_dim": 1,
"activation": torch.nn.LeakyReLU,
"output_activation": torch.nn.LeakyReLU,
"softmax": False,
"batch_norm": True,
"head": True,
26 changes: 20 additions & 6 deletions examples/cim/rl/env_sampler.py
@@ -90,11 +90,25 @@ def post_collect(self, info_list: list, ep: int) -> None:
for info in info_list:
print(f"env summary (episode {ep}): {info['env_metric']}")

# print the average env metric
if len(info_list) > 1:
metric_keys, num_envs = info_list[0]["env_metric"].keys(), len(info_list)
avg_metric = {key: sum(info["env_metric"][key] for info in info_list) / num_envs for key in metric_keys}
print(f"average env summary (episode {ep}): {avg_metric}")
# average env metric
metric_keys, num_envs = info_list[0]["env_metric"].keys(), len(info_list)
avg_metric = {key: sum(info["env_metric"][key] for info in info_list) / num_envs for key in metric_keys}
print(f"average env summary (episode {ep}): {avg_metric}")

self.metrics.update(avg_metric)
self.metrics = {k: v for k, v in self.metrics.items() if not k.startswith("val/")}

def post_evaluate(self, info_list: list, ep: int) -> None:
self.post_collect(info_list, ep)
# print the env metric from each rollout worker
for info in info_list:
print(f"env summary (episode {ep}): {info['env_metric']}")

# average env metric
metric_keys, num_envs = info_list[0]["env_metric"].keys(), len(info_list)
avg_metric = {key: sum(info["env_metric"][key] for info in info_list) / num_envs for key in metric_keys}
print(f"average env summary (episode {ep}): {avg_metric}")

self.metrics.update({"val/" + k: v for k, v in avg_metric.items()})

def monitor_metrics(self) -> float:
return -self.metrics["val/container_shortage"]
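The reworked callbacks above always average the per-worker env metrics (the old code only did so when more than one worker reported), cache training metrics in `self.metrics`, and store evaluation averages under a `val/` prefix so `monitor_metrics` can expose a single scalar. A condensed, self-contained illustration of that bookkeeping (the metric names and values below are made up):

```python
info_list = [
    {"env_metric": {"container_shortage": 120.0, "order_requirements": 2000.0}},
    {"env_metric": {"container_shortage": 100.0, "order_requirements": 2000.0}},
]

metric_keys, num_envs = info_list[0]["env_metric"].keys(), len(info_list)
avg_metric = {key: sum(info["env_metric"][key] for info in info_list) / num_envs for key in metric_keys}

metrics: dict = {}
# post_collect: keep only training-time metrics (drop stale "val/" entries).
metrics.update(avg_metric)
metrics = {k: v for k, v in metrics.items() if not k.startswith("val/")}
# post_evaluate: store the same averages under a "val/" prefix.
metrics.update({"val/" + k: v for k, v in avg_metric.items()})

# monitor_metrics: negate the shortage so that higher is better.
print(-metrics["val/container_shortage"])  # -110.0
```

Negating the shortage turns `monitor_metrics` into a "higher is better" signal, the convention an early stopper can maximize.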
2 changes: 1 addition & 1 deletion examples/cim/rl/rl_component_bundle.py
@@ -13,7 +13,7 @@

# Environments
learn_env = Env(**env_conf)
test_env = learn_env
test_env = Env(**env_conf)

# Agent, policy, and trainers
num_agents = len(learn_env.agent_idx_list)
2 changes: 1 addition & 1 deletion examples/rl/README.md
@@ -7,7 +7,7 @@ This folder contains scenarios that employ reinforcement learning. MARO's RL too
The entrance of a RL workflow is a YAML config file. For readers' convenience, we call this config file `config.yml` in the rest part of this doc. `config.yml` specifies the path of all necessary resources, definitions, and configurations to run the job. MARO provides a comprehensive template of the config file with detailed explanations (`maro/maro/rl/workflows/config/template.yml`). Meanwhile, MARO also provides several simple examples of `config.yml` under the current folder.

There are two ways to start the RL job:
- If you only need to have a quick look and try to start an out-of-box workflow, just run `python .\examples\rl\run_rl_example.py PATH_TO_CONFIG_YAML`. For example, `python .\examples\rl\run_rl_example.py .\examples\rl\cim.yml` will run the complete example RL training workflow of CIM scenario. If you only want to run the evaluation workflow, you could start the job with `--evaluate_only`.
- If you only need to have a quick look and try to start an out-of-box workflow, just run `python .\examples\rl\run.py PATH_TO_CONFIG_YAML`. For example, `python .\examples\rl\run.py .\examples\rl\cim.yml` will run the complete example RL training workflow of CIM scenario. If you only want to run the evaluation workflow, you could start the job with `--evaluate_only`.
- (**Require install MARO from source**) You could also start the job through MARO CLI. Use the command `maro local run [-c] path/to/your/config` to run in containerized (with `-c`) or non-containerized (without `-c`) environments. Similar, you could add `--evaluate_only` if you only need to run the evaluation workflow.

## Create Your Own Scenarios
9 changes: 5 additions & 4 deletions examples/rl/cim.yml
@@ -5,16 +5,17 @@
# Please refer to `maro/rl/workflows/config/template.yml` for the complete template and detailed explanations.

# Run this workflow by executing one of the following commands:
# - python .\examples\rl\run_rl_example.py .\examples\rl\cim.yml
# - (Requires installing MARO from source) maro local run .\examples\rl\cim.yml
# - python ./examples/rl/run.py ./examples/rl/cim.yml
# - (Requires installing MARO from source) maro local run ./examples/rl/cim.yml

job: cim_rl_workflow
scenario_path: "examples/cim/rl"
log_path: "log/rl_job/cim.txt"
log_path: "log/cim_rl/"
main:
num_episodes: 30 # Number of episodes to run. Each episode is one cycle of roll-out and training.
num_steps: null
eval_schedule: 5
early_stop_patience: 5
logging:
stdout: INFO
file: DEBUG
@@ -27,7 +28,7 @@ training:
load_path: null
load_episode: null
checkpointing:
path: "checkpoint/rl_job/cim"
path: "log/cim_rl/checkpoints"
interval: 5
logging:
stdout: INFO
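The new `early_stop_patience: 5` entry pairs with the `monitor_metrics` hook above. A minimal sketch of patience-based early stopping under that "higher is better" convention — an illustration of the semantics, not MARO's internal implementation (and note the real workflow only checks on the `eval_schedule`, not every episode):

```python
import random

def train_with_early_stopping(num_episodes: int, patience: int, run_episode, monitor) -> None:
    best, stale = float("-inf"), 0
    for ep in range(num_episodes):
        run_episode(ep)
        value = monitor()  # e.g. the negated container shortage from the env sampler
        if value > best:
            best, stale = value, 0
        else:
            stale += 1
        if stale >= patience:
            print(f"early stop at episode {ep}: no improvement in {patience} evaluations")
            break

# Demo with a dummy episode runner and a noisy monitor value.
train_with_early_stopping(30, 5, run_episode=lambda ep: None, monitor=lambda: random.random())
```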
10 changes: 5 additions & 5 deletions examples/rl/cim_distributed.yml
@@ -1,16 +1,16 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

# Example RL config file for CIM scenario.
# Example RL config file for CIM scenario (distributed version).
# Please refer to `maro/rl/workflows/config/template.yml` for the complete template and detailed explanations.

# Run this workflow by executing one of the following commands:
# - python .\examples\rl\run_rl_example.py .\examples\rl\cim.yml
# - (Requires installing MARO from source) maro local run .\examples\rl\cim.yml
# - python ./examples/rl/run.py ./examples/rl/cim_distributed.yml
# - (Requires installing MARO from source) maro local run ./examples/rl/cim_distributed.yml

job: cim_rl_workflow
scenario_path: "examples/cim/rl"
log_path: "log/rl_job/cim.txt"
log_path: "log/cim_rl/"
main:
num_episodes: 30 # Number of episodes to run. Each episode is one cycle of roll-out and training.
num_steps: null
@@ -35,7 +35,7 @@ training:
load_path: null
load_episode: null
checkpointing:
path: "checkpoint/rl_job/cim"
path: "log/cim_rl/checkpoints"
interval: 5
proxy:
host: "127.0.0.1"
File renamed without changes.
8 changes: 4 additions & 4 deletions examples/rl/vm_scheduling.yml
@@ -5,12 +5,12 @@
# Please refer to `maro/rl/workflows/config/template.yml` for the complete template and detailed explanations.

# Run this workflow by executing one of the following commands:
# - python .\examples\rl\run_rl_example.py .\examples\rl\vm_scheduling.yml
# - (Requires installing MARO from source) maro local run .\examples\rl\vm_scheduling.yml
# - python ./examples/rl/run.py ./examples/rl/vm_scheduling.yml
# - (Requires installing MARO from source) maro local run ./examples/rl/vm_scheduling.yml

job: vm_scheduling_rl_workflow
scenario_path: "examples/vm_scheduling/rl"
log_path: "log/rl_job/vm_scheduling.txt"
log_path: "log/vm_rl/"
main:
num_episodes: 30 # Number of episodes to run. Each episode is one cycle of roll-out and training.
num_steps: null
@@ -27,7 +27,7 @@ training:
load_path: null
load_episode: null
checkpointing:
path: "checkpoint/rl_job/vm_scheduling"
path: "log/vm_rl/checkpoints"
interval: 5
logging:
stdout: INFO
2 changes: 2 additions & 0 deletions examples/vm_scheduling/rl/algorithms/ac.py
@@ -11,6 +11,7 @@
actor_net_conf = {
"hidden_dims": [64, 32, 32],
"activation": torch.nn.LeakyReLU,
"output_activation": torch.nn.LeakyReLU,
"softmax": True,
"batch_norm": False,
"head": True,
@@ -19,6 +20,7 @@
critic_net_conf = {
"hidden_dims": [256, 128, 64],
"activation": torch.nn.LeakyReLU,
"output_activation": torch.nn.LeakyReLU,
"softmax": False,
"batch_norm": False,
"head": True,
1 change: 1 addition & 0 deletions examples/vm_scheduling/rl/algorithms/dqn.py
@@ -14,6 +14,7 @@
q_net_conf = {
"hidden_dims": [64, 128, 256],
"activation": torch.nn.LeakyReLU,
"output_activation": torch.nn.LeakyReLU,
"softmax": False,
"batch_norm": False,
"skip_connection": False,
2 changes: 1 addition & 1 deletion maro/__misc__.py
@@ -2,6 +2,6 @@
# Licensed under the MIT license.


__version__ = "0.3.1a2"
__version__ = "0.3.2a1"

__data_version__ = "0.2"
5 changes: 2 additions & 3 deletions maro/cli/data_pipeline/citi_bike.py
@@ -8,7 +8,6 @@
from enum import Enum

import geopy.distance
import numpy as np
import pandas as pd
from yaml import safe_load

@@ -320,7 +319,7 @@ def _process_distance(self, station_info: pd.DataFrame):
0,
index=station_info["station_index"],
columns=station_info["station_index"],
dtype=np.float,
dtype=float,
)
look_up_df = station_info[["latitude", "longitude"]]
return distance_adj.apply(
@@ -617,7 +616,7 @@ def _gen_distance(self, station_init: pd.DataFrame):
0,
index=station_init["station_index"],
columns=station_init["station_index"],
dtype=np.float,
dtype=float,
)
look_up_df = station_init[["latitude", "longitude"]]
distance_df = distance_adj.apply(
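The `np.float` → `float` substitutions track NumPy's deprecation of `np.float` in 1.20 and its removal in 1.24; since it was merely an alias of the builtin `float`, the replacement is behavior-preserving. For example (the station indices below are an illustrative stand-in for `station_info["station_index"]`):

```python
import pandas as pd

# np.float would raise AttributeError on NumPy >= 1.24; the builtin works identically.
station_index = [0, 1, 2]
distance_adj = pd.DataFrame(0, index=station_index, columns=station_index, dtype=float)
print(distance_adj.dtypes.unique())  # [dtype('float64')]
```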
5 changes: 3 additions & 2 deletions maro/cli/local/commands.py
@@ -61,7 +61,7 @@ def get_redis_conn(port=None):


# Functions executed on CLI commands
def run(conf_path: str, containerize: bool = False, evaluate_only: bool = False, **kwargs):
def run(conf_path: str, containerize: bool = False, seed: int = None, evaluate_only: bool = False, **kwargs):
# Load job configuration file
parser = ConfigParser(conf_path)
if containerize:
@@ -71,13 +71,14 @@ def run(conf_path: str, containerize: bool = False, evaluate_only: bool = False,
LOCAL_MARO_ROOT,
DOCKERFILE_PATH,
DOCKER_IMAGE_NAME,
seed=seed,
evaluate_only=evaluate_only,
)
except KeyboardInterrupt:
stop_rl_job_with_docker_compose(parser.config["job"], LOCAL_MARO_ROOT)
else:
try:
start_rl_job(parser, LOCAL_MARO_ROOT, evaluate_only=evaluate_only)
start_rl_job(parser, LOCAL_MARO_ROOT, seed=seed, evaluate_only=evaluate_only)
except KeyboardInterrupt:
sys.exit(1)

12 changes: 9 additions & 3 deletions maro/cli/local/utils.py
@@ -4,7 +4,7 @@
import os
import subprocess
from copy import deepcopy
from typing import List
from typing import List, Optional

import docker
import yaml
@@ -110,12 +110,15 @@ def exec(cmd: str, env: dict, debug: bool = False) -> subprocess.Popen:
def start_rl_job(
parser: ConfigParser,
maro_root: str,
seed: Optional[int],
evaluate_only: bool,
background: bool = False,
) -> List[subprocess.Popen]:
procs = [
exec(
f"python {script}" + ("" if not evaluate_only else " --evaluate_only"),
f"python {script}"
+ ("" if not evaluate_only else " --evaluate_only")
+ ("" if seed is None else f" --seed {seed}"),
format_env_vars({**env, "PYTHONPATH": maro_root}, mode="proc"),
debug=not background,
)
@@ -169,6 +172,7 @@ def start_rl_job_with_docker_compose(
context: str,
dockerfile_path: str,
image_name: str,
seed: Optional[int],
evaluate_only: bool,
) -> None:
common_spec = {
@@ -185,7 +189,9 @@
**deepcopy(common_spec),
**{
"container_name": component,
"command": f"python3 {script}" + ("" if not evaluate_only else " --evaluate_only"),
"command": f"python3 {script}"
+ ("" if not evaluate_only else " --evaluate_only")
+ ("" if seed is None else f" --seed {seed}"),
"environment": format_env_vars(env, mode="docker-compose"),
},
}
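Both launchers assemble the component command the same way: optional flags are appended only when set, so existing jobs run unchanged when no seed is given. A condensed sketch of that pattern (`script` and the values below are illustrative):

```python
from typing import Optional

def build_command(script: str, evaluate_only: bool, seed: Optional[int]) -> str:
    # Append each optional flag only when it is actually requested.
    return (
        f"python {script}"
        + ("" if not evaluate_only else " --evaluate_only")
        + ("" if seed is None else f" --seed {seed}")
    )

print(build_command("main.py", evaluate_only=False, seed=42))  # python main.py --seed 42
```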
8 changes: 7 additions & 1 deletion maro/rl/model/abs_net.py
@@ -4,7 +4,7 @@
from __future__ import annotations

from abc import ABCMeta
from typing import Any, Dict
from typing import Any, Dict, Optional

import torch.nn
from torch.optim import Optimizer
@@ -18,6 +18,8 @@ class AbsNet(torch.nn.Module, metaclass=ABCMeta):
def __init__(self) -> None:
super(AbsNet, self).__init__()

self._device: Optional[torch.device] = None

@property
def optim(self) -> Optimizer:
optim = getattr(self, "_optim", None)
@@ -119,3 +121,7 @@ def unfreeze_all_parameters(self) -> None:
"""Unfreeze all parameters."""
for p in self.parameters():
p.requires_grad = True

def to_device(self, device: torch.device) -> None:
self._device = device
self.to(device)
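`to_device` records the target device on the instance before delegating to the regular `torch.nn.Module.to()`, so later tensor construction can reuse `self._device`. A hypothetical usage sketch with an illustrative subclass (`MyNet` is not part of MARO):

```python
import torch

class MyNet(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self._device = None  # cached target device, as in AbsNet
        self.fc = torch.nn.Linear(8, 2)

    def to_device(self, device: torch.device) -> None:
        self._device = device
        self.to(device)

net = MyNet()
net.to_device(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
```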
17 changes: 13 additions & 4 deletions maro/rl/model/algorithm_nets/ac_based.py
@@ -43,14 +43,23 @@ class ContinuousACBasedNet(ContinuousPolicyNet, metaclass=ABCMeta):
- set_state(self, net_state: dict) -> None:
"""

def _get_actions_impl(self, states: torch.Tensor, exploring: bool) -> torch.Tensor:
actions, _ = self._get_actions_with_logps_impl(states, exploring)
def _get_actions_impl(self, states: torch.Tensor, exploring: bool, **kwargs) -> torch.Tensor:
actions, _ = self._get_actions_with_logps_impl(states, exploring, **kwargs)
return actions

def _get_actions_with_probs_impl(self, states: torch.Tensor, exploring: bool) -> Tuple[torch.Tensor, torch.Tensor]:
def _get_actions_with_probs_impl(
self,
states: torch.Tensor,
exploring: bool,
**kwargs,
) -> Tuple[torch.Tensor, torch.Tensor]:
# Not used in Actor-Critic or PPO
pass

def _get_states_actions_probs_impl(self, states: torch.Tensor, actions: torch.Tensor) -> torch.Tensor:
def _get_states_actions_probs_impl(self, states: torch.Tensor, actions: torch.Tensor, **kwargs) -> torch.Tensor:
# Not used in Actor-Critic or PPO
pass

def _get_random_actions_impl(self, states: torch.Tensor, **kwargs) -> torch.Tensor:
# Not used in Actor-Critic or PPO
pass
22 changes: 18 additions & 4 deletions maro/rl/model/algorithm_nets/ddpg.py
@@ -25,18 +25,32 @@ class ContinuousDDPGNet(ContinuousPolicyNet, metaclass=ABCMeta):
- set_state(self, net_state: dict) -> None:
"""

def _get_actions_with_probs_impl(self, states: torch.Tensor, exploring: bool) -> Tuple[torch.Tensor, torch.Tensor]:
def _get_actions_with_probs_impl(
self,
states: torch.Tensor,
exploring: bool,
**kwargs,
) -> Tuple[torch.Tensor, torch.Tensor]:
# Not used in DDPG
pass

def _get_actions_with_logps_impl(self, states: torch.Tensor, exploring: bool) -> Tuple[torch.Tensor, torch.Tensor]:
def _get_actions_with_logps_impl(
self,
states: torch.Tensor,
exploring: bool,
**kwargs,
) -> Tuple[torch.Tensor, torch.Tensor]:
# Not used in DDPG
pass

def _get_states_actions_probs_impl(self, states: torch.Tensor, actions: torch.Tensor) -> torch.Tensor:
def _get_states_actions_probs_impl(self, states: torch.Tensor, actions: torch.Tensor, **kwargs) -> torch.Tensor:
# Not used in DDPG
pass

def _get_states_actions_logps_impl(self, states: torch.Tensor, actions: torch.Tensor) -> torch.Tensor:
def _get_states_actions_logps_impl(self, states: torch.Tensor, actions: torch.Tensor, **kwargs) -> torch.Tensor:
# Not used in DDPG
pass

def _get_random_actions_impl(self, states: torch.Tensor, **kwargs) -> torch.Tensor:
# Not used in DDPG
pass
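The common thread in these two files is the added `**kwargs` on the `_get_actions*` family, which lets callers thread policy-specific context through the shared interface without touching every subclass. A toy illustration (not MARO's actual classes), where `noise_scale` is an invented kwarg:

```python
from abc import ABC, abstractmethod

import torch

class PolicyNetBase(ABC):
    def get_actions(self, states: torch.Tensor, exploring: bool, **kwargs) -> torch.Tensor:
        # Extra keyword context flows through untouched.
        return self._get_actions_impl(states, exploring, **kwargs)

    @abstractmethod
    def _get_actions_impl(self, states: torch.Tensor, exploring: bool, **kwargs) -> torch.Tensor:
        raise NotImplementedError

class NoisyNet(PolicyNetBase):
    def _get_actions_impl(self, states: torch.Tensor, exploring: bool, **kwargs) -> torch.Tensor:
        actions = states.mean(dim=1, keepdim=True)  # stand-in for a real forward pass
        if exploring:
            actions = actions + kwargs.get("noise_scale", 0.1) * torch.randn_like(actions)
        return actions

net = NoisyNet()
print(net.get_actions(torch.zeros(4, 8), exploring=True, noise_scale=0.5).shape)  # torch.Size([4, 1])
```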