Refine RL workflow & tune RL models under GYM #577

Merged 49 commits on Feb 17, 2023 (changes below are shown from 45 of the 49 commits)

Commits
6bc7f0b
PPO, SAC, DDPG passed
lihuoran Apr 12, 2022
b3f5aef
Explore in SAC
lihuoran Apr 12, 2022
5dab711
Test GYM on server
lihuoran Apr 22, 2022
211c06f
Sync server changes
lihuoran Jan 17, 2023
f92f7f1
Merge branch 'v0.3' into rl_benchmark_debug
lihuoran Jan 17, 2023
514250a
pre-commit
lihuoran Jan 17, 2023
fc0c02d
Ready to try on server
lihuoran Jan 17, 2023
9fcdf42
.
lihuoran Jan 17, 2023
01b5a94
.
lihuoran Jan 17, 2023
dd27eed
.
lihuoran Jan 17, 2023
1c8f258
.
lihuoran Jan 17, 2023
1aa1085
.
lihuoran Jan 17, 2023
148af38
Performance OK
lihuoran Jan 18, 2023
99ff7b9
Move to tests
lihuoran Jan 18, 2023
65ba1a1
Remove old versions
lihuoran Jan 18, 2023
f4a85b8
PPO done
lihuoran Jan 18, 2023
2349191
Start to test AC
lihuoran Jan 18, 2023
f6f7dae
Start to test SAC
lihuoran Jan 18, 2023
110fec4
SAC test passed
lihuoran Jan 28, 2023
2a1ccd5
Multiple round in evaluation
lihuoran Jan 28, 2023
c371220
Modify config.yml
lihuoran Jan 28, 2023
a65d902
Add Callbacks
lihuoran Jan 28, 2023
aa484f8
[wip] SAC performance not good
lihuoran Jan 30, 2023
84ec6e6
[wip] still not good
lihuoran Jan 30, 2023
0ceaac4
update for some PR comments; Add a MARKDOWN file (#576)
Jinyu-W Jan 31, 2023
aad41d9
Use FullyConnected to replace mlp
lihuoran Jan 31, 2023
8884231
Update action bound
lihuoran Jan 31, 2023
0a01fb1
Merge branch 'rl_benchmark_debug' into rl_workflow_refine
lihuoran Jan 31, 2023
0bd25ca
???
lihuoran Jan 31, 2023
8781dd6
Change gym env wrapper metrics logci
lihuoran Jan 31, 2023
7b9b698
Change gym env wrapper metrics logci
lihuoran Jan 31, 2023
52b4d1d
refine env_sampler.sample under step mode
lihuoran Feb 1, 2023
a3fea0d
Add DDPG. Performance not good...
lihuoran Feb 1, 2023
23f39d1
Add DDPG. Performance not good...
lihuoran Feb 1, 2023
9da8b90
wip
lihuoran Feb 1, 2023
fb11c31
Sounds like sac works
lihuoran Feb 1, 2023
d7d3282
Refactor file structure
lihuoran Feb 1, 2023
ea26275
Refactor file structure
lihuoran Feb 1, 2023
8881a1c
Refactor file structure
lihuoran Feb 1, 2023
b4db842
Pre-commit
lihuoran Feb 6, 2023
8874a65
Merge branch 'rl_benchmark_debug' into rl_workflow_refine
lihuoran Feb 6, 2023
2a7334b
Merge branch 'v0.3' into rl_workflow_refine
lihuoran Feb 6, 2023
eb7ae9b
Pre commit
lihuoran Feb 6, 2023
627b7d1
Minor refinement of CIM RL
lihuoran Feb 8, 2023
8386312
Jinyu/rl workflow refine (#578)
Jinyu-W Feb 8, 2023
b05c849
Resolve PR comments
lihuoran Feb 9, 2023
ab5e675
Compare PPO with spinning up (#579)
lihuoran Feb 9, 2023
e180f10
SAC Test parameters update (#580)
Jinyu-W Feb 13, 2023
9371949
Episode truncation & early stopping (#581)
lihuoran Feb 17, 2023
23 changes: 17 additions & 6 deletions examples/cim/rl/env_sampler.py
@@ -90,11 +90,22 @@ def post_collect(self, info_list: list, ep: int) -> None:
for info in info_list:
print(f"env summary (episode {ep}): {info['env_metric']}")

# print the average env metric
if len(info_list) > 1:
metric_keys, num_envs = info_list[0]["env_metric"].keys(), len(info_list)
avg_metric = {key: sum(info["env_metric"][key] for info in info_list) / num_envs for key in metric_keys}
print(f"average env summary (episode {ep}): {avg_metric}")
# average env metric
metric_keys, num_envs = info_list[0]["env_metric"].keys(), len(info_list)
avg_metric = {key: sum(info["env_metric"][key] for info in info_list) / num_envs for key in metric_keys}
print(f"average env summary (episode {ep}): {avg_metric}")

self.metrics.update(avg_metric)
self.metrics = {k: v for k, v in self.metrics.items() if not k.startswith("val/")}

def post_evaluate(self, info_list: list, ep: int) -> None:
self.post_collect(info_list, ep)
# print the env metric from each rollout worker
for info in info_list:
print(f"env summary (episode {ep}): {info['env_metric']}")

# average env metric
metric_keys, num_envs = info_list[0]["env_metric"].keys(), len(info_list)
avg_metric = {key: sum(info["env_metric"][key] for info in info_list) / num_envs for key in metric_keys}
print(f"average env summary (episode {ep}): {avg_metric}")

self.metrics.update({"val/" + k: v for k, v in avg_metric.items()})
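The reworked post_collect/post_evaluate pair above averages the env metrics reported by every rollout worker and stores the result in self.metrics, with evaluation results prefixed by "val/". The standalone sketch below illustrates just that averaging step; the helper name average_env_metrics is hypothetical and not part of the MARO API.

# Hypothetical helper illustrating the averaging performed in post_collect/post_evaluate above.
def average_env_metrics(info_list: list) -> dict:
    """Average the "env_metric" dict reported by each rollout worker."""
    metric_keys, num_envs = info_list[0]["env_metric"].keys(), len(info_list)
    return {key: sum(info["env_metric"][key] for info in info_list) / num_envs for key in metric_keys}

# post_collect overwrites the training metrics and drops stale "val/" entries;
# post_evaluate re-adds the averaged metrics under the "val/" prefix.
avg = average_env_metrics([{"env_metric": {"reward": 1.0}}, {"env_metric": {"reward": 3.0}}])
assert avg == {"reward": 2.0}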
2 changes: 1 addition & 1 deletion examples/cim/rl/rl_component_bundle.py
@@ -13,7 +13,7 @@

# Environments
learn_env = Env(**env_conf)
test_env = learn_env
test_env = Env(**env_conf)

# Agent, policy, and trainers
num_agents = len(learn_env.agent_idx_list)
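The bundle now constructs two independent Env instances instead of aliasing test_env to learn_env, matching the new assertion in AbsEnvSampler.__init__ (see the env_sampler.py diff below) that the training and evaluation environments must be different objects. A minimal sketch of the pattern; the env_conf values here are illustrative placeholders rather than the CIM defaults.

# Sketch only: two separate simulator instances so evaluation never perturbs training state.
from maro.simulator import Env

env_conf = {"scenario": "cim", "topology": "toy.4p_ssdd_l0.0", "durations": 560}  # illustrative values

learn_env = Env(**env_conf)  # used for roll-out / training episodes
test_env = Env(**env_conf)   # a distinct instance, used only for evaluation

assert learn_env is not test_env  # mirrors the check added in AbsEnvSampler.__init__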
2 changes: 1 addition & 1 deletion examples/rl/README.md
@@ -7,7 +7,7 @@ This folder contains scenarios that employ reinforcement learning. MARO's RL too
The entrance of a RL workflow is a YAML config file. For readers' convenience, we call this config file `config.yml` in the rest part of this doc. `config.yml` specifies the path of all necessary resources, definitions, and configurations to run the job. MARO provides a comprehensive template of the config file with detailed explanations (`maro/maro/rl/workflows/config/template.yml`). Meanwhile, MARO also provides several simple examples of `config.yml` under the current folder.

There are two ways to start the RL job:
- If you only need to have a quick look and try to start an out-of-box workflow, just run `python .\examples\rl\run_rl_example.py PATH_TO_CONFIG_YAML`. For example, `python .\examples\rl\run_rl_example.py .\examples\rl\cim.yml` will run the complete example RL training workflow of CIM scenario. If you only want to run the evaluation workflow, you could start the job with `--evaluate_only`.
- If you only need to have a quick look and try to start an out-of-box workflow, just run `python .\examples\rl\run.py PATH_TO_CONFIG_YAML`. For example, `python .\examples\rl\run.py .\examples\rl\cim.yml` will run the complete example RL training workflow of CIM scenario. If you only want to run the evaluation workflow, you could start the job with `--evaluate_only`.
- (**Require install MARO from source**) You could also start the job through MARO CLI. Use the command `maro local run [-c] path/to/your/config` to run in containerized (with `-c`) or non-containerized (without `-c`) environments. Similar, you could add `--evaluate_only` if you only need to run the evaluation workflow.

## Create Your Own Scenarios
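As a quick reference, the renamed entry script can also be invoked programmatically; the snippet below is a hedged sketch equivalent to the shell commands in the README hunk above (drop --evaluate_only to run the full training workflow).

# Sketch: launch the CIM example workflow from Python instead of the shell.
import subprocess

subprocess.run(["python", "examples/rl/run.py", "examples/rl/cim.yml", "--evaluate_only"], check=True)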
4 changes: 2 additions & 2 deletions examples/rl/cim.yml
@@ -10,7 +10,7 @@

job: cim_rl_workflow
scenario_path: "examples/cim/rl"
log_path: "log/rl_job/cim.txt"
log_path: "log/cim_rl/"
main:
num_episodes: 30 # Number of episodes to run. Each episode is one cycle of roll-out and training.
num_steps: null
@@ -27,7 +27,7 @@ training:
load_path: null
load_episode: null
checkpointing:
path: "checkpoint/rl_job/cim"
path: "log/cim_rl/checkpoints"
interval: 5
logging:
stdout: INFO
4 changes: 2 additions & 2 deletions examples/rl/cim_distributed.yml
@@ -10,7 +10,7 @@

job: cim_rl_workflow
Collaborator review comment on this line: TODO: runtime error

scenario_path: "examples/cim/rl"
log_path: "log/rl_job/cim.txt"
log_path: "log/cim_rl/"
main:
num_episodes: 30 # Number of episodes to run. Each episode is one cycle of roll-out and training.
num_steps: null
@@ -35,7 +35,7 @@ training:
load_path: null
load_episode: null
checkpointing:
path: "checkpoint/rl_job/cim"
path: "log/cim_rl/checkpoints"
interval: 5
proxy:
host: "127.0.0.1"
4 changes: 2 additions & 2 deletions examples/rl/vm_scheduling.yml
@@ -10,7 +10,7 @@

job: vm_scheduling_rl_workflow
scenario_path: "examples/vm_scheduling/rl"
log_path: "log/rl_job/vm_scheduling.txt"
log_path: "log/vm_rl/"
main:
num_episodes: 30 # Number of episodes to run. Each episode is one cycle of roll-out and training.
num_steps: null
@@ -27,7 +27,7 @@ training:
load_path: null
load_episode: null
checkpointing:
path: "checkpoint/rl_job/vm_scheduling"
path: "log/vm_rl/checkpoints"
interval: 5
logging:
stdout: INFO
2 changes: 1 addition & 1 deletion maro/rl/rl_component/rl_component_bundle.py
@@ -20,7 +20,7 @@ class RLComponentBundle:
If None, there will be no explicit device assignment.
policy_trainer_mapping (Dict[str, str], default=None): Policy-trainer mapping which identifying which trainer to
train each policy. If None, then a policy's trainer's name is the first segment of the policy's name,
seperated by dot. For example, "ppo_1.policy" is trained by "ppo_1". Only policies that provided in
separated by dot. For example, "ppo_1.policy" is trained by "ppo_1". Only policies that provided in
policy-trainer mapping are considered as trainable polices. Policies that not provided in policy-trainer
mapping will not be trained.
"""
9 changes: 7 additions & 2 deletions maro/rl/rollout/batch_env_sampler.py
@@ -189,8 +189,13 @@ def sample(
"info": [res["info"][0] for res in results],
}

def eval(self, policy_state: Dict[str, Dict[str, Any]] = None) -> dict:
req = {"type": "eval", "policy_state": policy_state, "index": self._ep} # -1 signals test
def eval(self, policy_state: Dict[str, Dict[str, Any]] = None, num_episodes: int = 1) -> dict:
req = {
"type": "eval",
"policy_state": policy_state,
"index": self._ep,
"num_eval_episodes": num_episodes,
} # -1 signals test
results = self._controller.collect(req, self._eval_parallelism)
return {
"info": [res["info"][0] for res in results],
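The evaluation request assembled above now carries num_eval_episodes so that each remote rollout worker can run several evaluation episodes per round. A hedged sketch of the request/response shape (field values are examples; the controller plumbing is omitted):

# Example request built by BatchEnvSampler.eval (values illustrative).
eval_request = {
    "type": "eval",
    "policy_state": None,     # None means each worker keeps the policy state it already holds
    "index": 7,               # current episode index, echoed back by the worker
    "num_eval_episodes": 5,   # new field: number of evaluation roll-outs per worker
}

# The controller gathers one reply per worker; the sampler keeps the first "info" entry of each.
results = [{"info": [{"env_metric": {"reward": 1.0}}]}, {"info": [{"env_metric": {"reward": 2.0}}]}]
merged = {"info": [res["info"][0] for res in results]}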
206 changes: 110 additions & 96 deletions maro/rl/rollout/env_sampler.py
@@ -252,6 +252,8 @@ def __init__(
agent_wrapper_cls: Type[AbsAgentWrapper] = SimpleAgentWrapper,
reward_eval_delay: int = None,
) -> None:
assert learn_env is not test_env, "Please use different envs for training and testing."

self._learn_env = learn_env
self._test_env = test_env

@@ -267,6 +269,7 @@ def __init__(
self._reward_eval_delay = reward_eval_delay

self._info: dict = {}
self.metrics: dict = {}

assert self._reward_eval_delay is None or self._reward_eval_delay >= 0

@@ -430,65 +433,71 @@ def sample(
Returns:
A dict that contains the collected experiences and additional information.
"""
# Init the env
self._switch_env(self._learn_env)
if self._end_of_episode:
self._reset()

# Update policy state if necessary
if policy_state is not None:
steps_to_go = num_steps
if policy_state is not None: # Update policy state if necessary
self.set_policy_state(policy_state)
self._switch_env(self._learn_env) # Init the env
self._agent_wrapper.explore() # Collect experience

# Collect experience
self._agent_wrapper.explore()
steps_to_go = float("inf") if num_steps is None else num_steps
while not self._end_of_episode and steps_to_go > 0:
# Get agent actions and translate them to env actions
action_dict = self._agent_wrapper.choose_actions(self._agent_state_dict)
env_action_dict = self._translate_to_env_action(action_dict, self._event)

# Store experiences in the cache
cache_element = CacheElement(
tick=self.env.tick,
event=self._event,
state=self._state,
agent_state_dict=self._select_trainable_agents(self._agent_state_dict),
action_dict=self._select_trainable_agents(action_dict),
env_action_dict=self._select_trainable_agents(env_action_dict),
# The following will be generated later
reward_dict={},
terminal_dict={},
next_state=None,
next_agent_state_dict={},
)
if self._end_of_episode:
self._reset()

# Update env and get new states (global & agent)
self._step(list(env_action_dict.values()))

if self._reward_eval_delay is None:
self._calc_reward(cache_element)
self._post_step(cache_element)
self._append_cache_element(cache_element)
steps_to_go -= 1
self._append_cache_element(None)

tick_bound = self.env.tick - (0 if self._reward_eval_delay is None else self._reward_eval_delay)
experiences: List[ExpElement] = []
while len(self._trans_cache) > 0 and self._trans_cache[0].tick <= tick_bound:
cache_element = self._trans_cache.pop(0)
# !: Here the reward calculation method requires the given tick is enough and must be used then.
if self._reward_eval_delay is not None:
self._calc_reward(cache_element)
self._post_step(cache_element)
experiences.append(cache_element.make_exp_element())

self._agent_last_index = {
k: v - len(experiences) for k, v in self._agent_last_index.items() if v >= len(experiences)
}
total_experiences = []
# If steps_to_go is None, run until the end of episode
# If steps_to_go is not None, run until we collect required number of steps
while (steps_to_go is None and not self._end_of_episode) or (steps_to_go is not None and steps_to_go > 0):
if self._end_of_episode:
self._reset()

while not self._end_of_episode and (steps_to_go is None or steps_to_go > 0):
# Get agent actions and translate them to env actions
action_dict = self._agent_wrapper.choose_actions(self._agent_state_dict)
env_action_dict = self._translate_to_env_action(action_dict, self._event)

# Store experiences in the cache
cache_element = CacheElement(
tick=self.env.tick,
event=self._event,
state=self._state,
agent_state_dict=self._select_trainable_agents(self._agent_state_dict),
action_dict=self._select_trainable_agents(action_dict),
env_action_dict=self._select_trainable_agents(env_action_dict),
# The following will be generated later
reward_dict={},
terminal_dict={},
next_state=None,
next_agent_state_dict={},
)

# Update env and get new states (global & agent)
self._step(list(env_action_dict.values()))

if self._reward_eval_delay is None:
self._calc_reward(cache_element)
self._post_step(cache_element)
self._append_cache_element(cache_element)
if steps_to_go is not None:
steps_to_go -= 1
self._append_cache_element(None)

tick_bound = self.env.tick - (0 if self._reward_eval_delay is None else self._reward_eval_delay)
experiences: List[ExpElement] = []
while len(self._trans_cache) > 0 and self._trans_cache[0].tick <= tick_bound:
cache_element = self._trans_cache.pop(0)
# !: Here the reward calculation method requires the given tick is enough and must be used then.
if self._reward_eval_delay is not None:
self._calc_reward(cache_element)
self._post_step(cache_element)
experiences.append(cache_element.make_exp_element())

self._agent_last_index = {
k: v - len(experiences) for k, v in self._agent_last_index.items() if v >= len(experiences)
}

total_experiences += experiences

return {
"end_of_episode": self._end_of_episode,
"experiences": [experiences],
"experiences": [total_experiences],
"info": [deepcopy(self._info)], # TODO: may have overhead issues. Leave to future work.
}
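In the rewritten sample() above, reaching the end of an episode no longer ends collection early: the environment is reset and roll-out continues until the requested number of steps has been gathered, with the experiences of all partial episodes concatenated into total_experiences. The control flow can be summarized by the following simplified sketch, where env and policy are hypothetical placeholders rather than MARO classes.

# Simplified, stand-alone sketch of the step-accumulation loop (placeholder env/policy, not MARO API).
def sample_steps(env, policy, num_steps=None):
    steps_to_go = num_steps  # None means "collect exactly one full episode"
    total_experiences = []
    while (steps_to_go is None and not env.done) or (steps_to_go is not None and steps_to_go > 0):
        if env.done:
            env.reset()  # an exhausted episode triggers a reset instead of stopping collection
        while not env.done and (steps_to_go is None or steps_to_go > 0):
            total_experiences.append(env.step(policy.act(env.state)))
            if steps_to_go is not None:
                steps_to_go -= 1
    return total_experiences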

@@ -514,50 +523,55 @@ def load_policy_state(self, path: str) -> List[str]:

return loaded

def eval(self, policy_state: Dict[str, Dict[str, Any]] = None) -> dict:
def eval(self, policy_state: Dict[str, Dict[str, Any]] = None, num_episodes: int = 1) -> dict:
self._switch_env(self._test_env)
self._reset()
if policy_state is not None:
self.set_policy_state(policy_state)

self._agent_wrapper.exploit()
while not self._end_of_episode:
action_dict = self._agent_wrapper.choose_actions(self._agent_state_dict)
env_action_dict = self._translate_to_env_action(action_dict, self._event)

# Store experiences in the cache
cache_element = CacheElement(
tick=self.env.tick,
event=self._event,
state=self._state,
agent_state_dict=self._select_trainable_agents(self._agent_state_dict),
action_dict=self._select_trainable_agents(action_dict),
env_action_dict=self._select_trainable_agents(env_action_dict),
# The following will be generated later
reward_dict={},
terminal_dict={},
next_state=None,
next_agent_state_dict={},
)
info_list = []

# Update env and get new states (global & agent)
self._step(list(env_action_dict.values()))

if self._reward_eval_delay is None: # TODO: necessary to calculate reward in eval()?
self._calc_reward(cache_element)
self._post_eval_step(cache_element)

self._append_cache_element(cache_element)
self._append_cache_element(None)

tick_bound = self.env.tick - (0 if self._reward_eval_delay is None else self._reward_eval_delay)
while len(self._trans_cache) > 0 and self._trans_cache[0].tick <= tick_bound:
cache_element = self._trans_cache.pop(0)
if self._reward_eval_delay is not None:
self._calc_reward(cache_element)
self._post_eval_step(cache_element)

return {"info": [self._info]}
for _ in range(num_episodes):
self._reset()
if policy_state is not None:
self.set_policy_state(policy_state)

self._agent_wrapper.exploit()
while not self._end_of_episode:
action_dict = self._agent_wrapper.choose_actions(self._agent_state_dict)
env_action_dict = self._translate_to_env_action(action_dict, self._event)

# Store experiences in the cache
cache_element = CacheElement(
tick=self.env.tick,
event=self._event,
state=self._state,
agent_state_dict=self._select_trainable_agents(self._agent_state_dict),
action_dict=self._select_trainable_agents(action_dict),
env_action_dict=self._select_trainable_agents(env_action_dict),
# The following will be generated later
reward_dict={},
terminal_dict={},
next_state=None,
next_agent_state_dict={},
)

# Update env and get new states (global & agent)
self._step(list(env_action_dict.values()))

if self._reward_eval_delay is None: # TODO: necessary to calculate reward in eval()?
self._calc_reward(cache_element)
self._post_eval_step(cache_element)

self._append_cache_element(cache_element)
self._append_cache_element(None)

tick_bound = self.env.tick - (0 if self._reward_eval_delay is None else self._reward_eval_delay)
while len(self._trans_cache) > 0 and self._trans_cache[0].tick <= tick_bound:
cache_element = self._trans_cache.pop(0)
if self._reward_eval_delay is not None:
self._calc_reward(cache_element)
self._post_eval_step(cache_element)

info_list.append(self._info)

return {"info": info_list}

@abstractmethod
def _post_step(self, cache_element: CacheElement) -> None:
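eval() now runs num_episodes complete evaluation roll-outs with greedy (exploit) actions and returns one info dict per episode, which post_evaluate averages into the "val/" metrics. A compressed sketch of that loop with hypothetical placeholder names:

# Compressed sketch of multi-episode evaluation (placeholder names, not the MARO API).
def evaluate(env, policy, num_episodes=1):
    info_list = []
    for _ in range(num_episodes):
        env.reset()
        while not env.done:
            env.step(policy.act(env.state, exploit=True))  # greedy actions during evaluation
        info_list.append(env.info)  # one metric summary per evaluation episode
    return {"info": info_list}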
2 changes: 1 addition & 1 deletion maro/rl/rollout/worker.py
@@ -59,7 +59,7 @@ def _compute(self, msg: list) -> None:
result = (
self._env_sampler.sample(policy_state=req["policy_state"], num_steps=req["num_steps"])
if req["type"] == "sample"
else self._env_sampler.eval(policy_state=req["policy_state"])
else self._env_sampler.eval(policy_state=req["policy_state"], num_episodes=req["num_eval_episodes"])
)
self._stream.send(pyobj_to_bytes({"result": result, "index": req["index"]}))
else:
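On the worker side, the same num_eval_episodes field is read back out of the request and forwarded to the env sampler. The paraphrased dispatch below shows the shape of the exchange; the standalone function is a sketch, not the actual rollout worker class.

# Paraphrased sketch of the worker-side dispatch shown above.
def handle_request(env_sampler, req: dict) -> dict:
    if req["type"] == "sample":
        result = env_sampler.sample(policy_state=req["policy_state"], num_steps=req["num_steps"])
    else:  # "eval"
        result = env_sampler.eval(policy_state=req["policy_state"], num_episodes=req["num_eval_episodes"])
    return {"result": result, "index": req["index"]}  # serialized and sent back over the stream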