
Action buffer #4612

Merged (89 commits), Nov 16, 2020
Changes from 76 commits

Commits (89)
ba8bdcf
add ActionSpec; test_simple_rl torch passes
andrewcoh Oct 20, 2020
6c93137
remove uneccesary type from set_actions
andrewcoh Oct 20, 2020
03f7e47
ignoring Instance of 'AbstractContextManager' has no 'enter_context' …
andrewcoh Oct 20, 2020
87d2049
fixing tensorflow tests
andrewcoh Oct 20, 2020
a889e8f
use proper spec in environment.py
andrewcoh Oct 20, 2020
24be76f
fix tf bc test
andrewcoh Oct 20, 2020
12fa45a
fix mlagents-envs tests
andrewcoh Oct 20, 2020
ff4f3b8
[bug-fix] Fix Gym and some Policy tests for ActionSpec (#4590)
Oct 21, 2020
3e807c6
remove *_action_* from function names
andrewcoh Oct 22, 2020
fed5c20
make_fake_trajectory/step take ActionSpec arg
andrewcoh Oct 22, 2020
d119c1a
remove ActionType
andrewcoh Oct 22, 2020
5a37dfe
remove self.action_spec from policy/bc
andrewcoh Oct 22, 2020
1e5e440
fix action_spec refs
andrewcoh Oct 22, 2020
194505e
Add __eq__ and __str__ to ActionSpec
andrewcoh Oct 22, 2020
4baaa7a
add static method to create continuous/discrete
andrewcoh Oct 22, 2020
60337b8
fix recurrent sac test
andrewcoh Oct 23, 2020
841110a
fix yamato
andrewcoh Oct 23, 2020
ebd50b2
resolve conflicts
andrewcoh Oct 23, 2020
2e4dcf2
Merge branch 'master' into develop-action-spec
andrewcoh Oct 23, 2020
1337d07
fix entropy_sum after merge
andrewcoh Oct 23, 2020
c05c40e
fix yamato
andrewcoh Oct 23, 2020
1b96170
moved type and shape checking into ActionSpec
andrewcoh Oct 23, 2020
c940d41
removed action_spec.size
andrewcoh Oct 23, 2020
b0d9a48
fix specs in torch util
andrewcoh Oct 23, 2020
d2bb5d0
fixed tests/ -> single validate_action func
andrewcoh Oct 23, 2020
ad144c3
make is_discrete/is_continuous strict
andrewcoh Oct 23, 2020
9090821
add docstrings
andrewcoh Oct 23, 2020
f23e395
rename make_x to creat_x/remove redundant properties
andrewcoh Oct 24, 2020
785848e
make validate action private
andrewcoh Oct 24, 2020
9af9ee9
fix advanced vis encoder simple rl
andrewcoh Oct 24, 2020
600d307
fix recurrent/advanced ppo tests
andrewcoh Oct 24, 2020
42bdfce
fix recurrent sac
andrewcoh Oct 24, 2020
754f5b8
reduce visual advanced steps
andrewcoh Oct 24, 2020
a8813fc
reduce recurrent step/increase batch size
andrewcoh Oct 25, 2020
64091cc
add ActionBuffers and utils
andrewcoh Oct 25, 2020
a8204bd
reduce steps_per_update recurrent sac
andrewcoh Oct 26, 2020
b5ca548
fix AgentExperience typing
andrewcoh Oct 26, 2020
ed11b10
recurrent sac passes locally but fails on CI for inexplicable reasons
andrewcoh Oct 26, 2020
442f29a
increase seq length
andrewcoh Oct 26, 2020
8733ec1
rename create random to random action
andrewcoh Oct 26, 2020
199d15b
rename create empty to empty action
andrewcoh Oct 26, 2020
00a824c
Merge branch 'develop-action-spec' into develop-action-buffer
andrewcoh Oct 26, 2020
b0ed241
Merge branch 'master' into develop-action-buffer
andrewcoh Oct 26, 2020
bfaa249
action buffer passes continuous
andrewcoh Oct 27, 2020
d927497
discrete runs/cont passes
andrewcoh Oct 27, 2020
0d33e1f
debugging discrete
andrewcoh Oct 27, 2020
8f06a67
2d discrete passes
andrewcoh Oct 27, 2020
da1c85a
sac continuous and discrete train
andrewcoh Oct 28, 2020
080f3eb
bc tests pass
andrewcoh Oct 29, 2020
f872359
torch reward providers all pass
andrewcoh Oct 29, 2020
5886f74
fixed bug in discrete
andrewcoh Oct 29, 2020
fe8fdd9
test_simple_rl/reward providers pass tf/torch
andrewcoh Oct 29, 2020
9479a65
ml-agents-envs pass
andrewcoh Nov 3, 2020
3a90973
Merge branch 'master' into develop-action-buffer
andrewcoh Nov 3, 2020
dbf819c
rename extract to from_dict
andrewcoh Nov 3, 2020
d1e2b97
agent processor tests
andrewcoh Nov 4, 2020
e87effe
fix demo loader tests
andrewcoh Nov 4, 2020
e0418dc
test_trajectory fixed
andrewcoh Nov 4, 2020
5f571a1
fixed recurrent prev_action issue
andrewcoh Nov 4, 2020
9089e63
fix test_tf_policy
andrewcoh Nov 5, 2020
f8d85fa
fix torch test_ppo
andrewcoh Nov 5, 2020
c21d223
fix torch utils test
andrewcoh Nov 5, 2020
f0f4249
discrete/contionuous unity envs train
andrewcoh Nov 5, 2020
d6eaf8d
agent processor tests
andrewcoh Nov 5, 2020
e9848b1
fix torch test policy
andrewcoh Nov 5, 2020
b25fc3d
remove unused import
andrewcoh Nov 6, 2020
10944f1
add docstrings to AgentAction and ActionLogProbs
andrewcoh Nov 6, 2020
6fcdd3f
revert demo
andrewcoh Nov 6, 2020
6d4738b
Remove print from ppo tf opti
andrewcoh Nov 6, 2020
5c8ec2d
rename to ActionTuple
andrewcoh Nov 9, 2020
0441118
Merge branch 'develop-action-buffer' of https://github.com/Unity-Tech…
andrewcoh Nov 9, 2020
86b6d71
Update ml-agents/mlagents/trainers/torch/utils.py
andrewcoh Nov 9, 2020
2bf004c
ActionTuple default is now np.array, not None
andrewcoh Nov 9, 2020
aaf6c59
fix set_actions_for_agent
andrewcoh Nov 9, 2020
056cf6d
fix action mask in trajectory
andrewcoh Nov 9, 2020
5691f60
Update ml-agents-envs/mlagents_envs/environment.py
andrewcoh Nov 9, 2020
b567fcd
revert demo
andrewcoh Nov 9, 2020
116580a
Merge branch 'develop-action-buffer' of https://github.com/Unity-Tech…
andrewcoh Nov 9, 2020
b152511
fix default random action
andrewcoh Nov 10, 2020
bb9988c
fix reward provider tests
andrewcoh Nov 10, 2020
c488e8e
add defaults to ActionTuple constructor
andrewcoh Nov 10, 2020
589907a
remove unused line in traj
andrewcoh Nov 10, 2020
c8ae8da
save only discrete actions as prev
andrewcoh Nov 10, 2020
c651ebc
update make_empty docstring
andrewcoh Nov 10, 2020
0dc4396
reuse action dict in torch policy for pre_action
andrewcoh Nov 10, 2020
434f210
add back removed part of test_envs
andrewcoh Nov 10, 2020
714b444
fix mock brain prev action
andrewcoh Nov 10, 2020
65d17fe
default ActionTuple to None
andrewcoh Nov 12, 2020
4fc60d5
default actions are np.array of shape (n_agents, 0)
andrewcoh Nov 12, 2020

98 changes: 63 additions & 35 deletions ml-agents-envs/mlagents_envs/base_env.py

@@ -244,6 +244,32 @@ def empty(spec: "BehaviorSpec") -> "TerminalSteps":
         )
 
 
+class ActionTuple:
+    """
+    An object whose fields correspond to actions of different types.
+    Continuous and discrete actions are numpy arrays of type float32 and
+    int32, respectively, and are type checked on construction.
+    Dimensions are (n_agents, continuous_size) and (n_agents, discrete_size),
+    respectively.
+    """
+
+    def __init__(self, continuous: np.ndarray, discrete: np.ndarray):

Review thread on the constructor:

Contributor: We need some constructor that will take only continuous or only discrete, so the user does not have to create an empty array when using only discrete or only continuous.

Contributor: Why can't the default be None, with the constructor assigning an empty array when None is specified? This is a common pattern for mutable default parameters.
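A minimal sketch of the pattern the reviewer is suggesting (hypothetical signature, not the code in this PR; defaults were later addressed in the "add defaults to ActionTuple constructor" commit):

    from typing import Optional
    import numpy as np

    class ActionTuple:
        def __init__(
            self,
            continuous: Optional[np.ndarray] = None,
            discrete: Optional[np.ndarray] = None,
        ):
            # Substitute an empty, correctly typed array when a field is omitted,
            # so callers with a purely continuous or purely discrete space do not
            # have to build the unused array themselves.
            if continuous is None:
                continuous = np.zeros((0, 0), dtype=np.float32)
            if discrete is None:
                discrete = np.zeros((0, 0), dtype=np.int32)
            self._continuous = continuous.astype(np.float32, copy=False)
            self._discrete = discrete.astype(np.int32, copy=False)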

+        if continuous.dtype != np.float32:
+            continuous = continuous.astype(np.float32, copy=False)
+        self._continuous = continuous
+        if discrete.dtype != np.int32:
+            discrete = discrete.astype(np.int32, copy=False)
+        self._discrete = discrete
+
+    @property
+    def continuous(self) -> np.ndarray:
+        return self._continuous
+
+    @property
+    def discrete(self) -> np.ndarray:
+        return self._discrete
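For orientation, a small usage sketch of the class as defined above (sizes are illustrative):

    import numpy as np

    # Mixed action space: 2 continuous dimensions, 3 discrete branches, 4 agents.
    continuous = np.random.uniform(-1.0, 1.0, size=(4, 2)).astype(np.float32)
    discrete = np.zeros((4, 3), dtype=np.int64)  # wrong dtype on purpose

    action = ActionTuple(continuous, discrete)
    assert action.continuous.shape == (4, 2)
    assert action.discrete.dtype == np.int32  # cast to int32 on construction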


 class ActionSpec(NamedTuple):
     """
     A NamedTuple containing utility functions and information about the action spaces

@@ -287,62 +313,61 @@ def discrete_size(self) -> int:
         """
         return len(self.discrete_branches)
 
-    def empty_action(self, n_agents: int) -> np.ndarray:
+    def empty_action(self, n_agents: int) -> ActionTuple:
         """
-        Generates a numpy array corresponding to an empty action (all zeros)
+        Generates an ActionTuple corresponding to an empty action (all zeros)
         for a number of agents.
         :param n_agents: The number of agents that will have actions generated
         """
-        if self.is_continuous():
-            return np.zeros((n_agents, self.continuous_size), dtype=np.float32)
-        return np.zeros((n_agents, self.discrete_size), dtype=np.int32)
+        continuous = np.zeros((n_agents, self.continuous_size), dtype=np.float32)
+        discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32)
+        return ActionTuple(continuous, discrete)
 
-    def random_action(self, n_agents: int) -> np.ndarray:
+    def random_action(self, n_agents: int) -> ActionTuple:
         """
-        Generates a numpy array corresponding to a random action (either discrete
+        Generates an ActionTuple corresponding to a random action (either discrete
         or continuous) for a number of agents.
         :param n_agents: The number of agents that will have actions generated
         """
-        if self.is_continuous():
-            action = np.random.uniform(
-                low=-1.0, high=1.0, size=(n_agents, self.continuous_size)
-            ).astype(np.float32)
-        else:
-            branch_size = self.discrete_branches
-            action = np.column_stack(
+        continuous = np.random.uniform(
+            low=-1.0, high=1.0, size=(n_agents, self.continuous_size)
+        )
+        discrete = np.array([])
+        if self.discrete_size > 0:
+            discrete = np.column_stack(
                 [
                     np.random.randint(
                         0,
-                        branch_size[i],  # type: ignore
+                        self.discrete_branches[i],  # type: ignore
                         size=(n_agents),
                         dtype=np.int32,
                     )
                     for i in range(self.discrete_size)
                 ]
             )
-        return action
+        return ActionTuple(continuous, discrete)

     def _validate_action(
-        self, actions: np.ndarray, n_agents: int, name: str
-    ) -> np.ndarray:
+        self, actions: ActionTuple, n_agents: int, name: str
+    ) -> ActionTuple:
         """
         Validates that action has the correct action dim
         for the correct number of agents and ensures the type.
         """
-        if self.continuous_size > 0:
-            _size = self.continuous_size
-        else:
-            _size = self.discrete_size
-        _expected_shape = (n_agents, _size)
-        if actions.shape != _expected_shape:
+        _expected_shape = (n_agents, self.continuous_size)
+        if actions.continuous.shape != _expected_shape:
             raise UnityActionException(
+                f"The behavior {name} needs a continuous input of dimension "
+                f"{_expected_shape} for (<number of agents>, <action size>) but "
+                f"received input of dimension {actions.continuous.shape}"
+            )
+        _expected_shape = (n_agents, self.discrete_size)
+        if actions.discrete.shape != _expected_shape:
+            raise UnityActionException(
-                f"The behavior {name} needs an input of dimension "
+                f"The behavior {name} needs a discrete input of dimension "
                 f"{_expected_shape} for (<number of agents>, <action size>) but "
-                f"received input of dimension {actions.shape}"
+                f"received input of dimension {actions.discrete.shape}"
             )
-        _expected_type = np.float32 if self.is_continuous() else np.int32
-        if actions.dtype != _expected_type:
-            actions = actions.astype(_expected_type)
         return actions
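To make the new behavior concrete, a quick sketch of the per-field validation (illustrative sizes and behavior name; _validate_action is called internally by set_actions):

    import numpy as np
    from mlagents_envs.base_env import ActionSpec, ActionTuple

    spec = ActionSpec.create_continuous(3)
    ok = ActionTuple(np.zeros((5, 3), dtype=np.float32), np.zeros((5, 0), dtype=np.int32))
    spec._validate_action(ok, n_agents=5, name="MyBehavior")  # passes

    bad = ActionTuple(np.zeros((5, 2), dtype=np.float32), np.zeros((5, 0), dtype=np.int32))
    # spec._validate_action(bad, 5, "MyBehavior") raises UnityActionException,
    # since the behavior needs a continuous input of dimension (5, 3).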

     @staticmethod

@@ -420,27 +445,30 @@ def behavior_specs(self) -> MappingType[str, BehaviorSpec]:
"""

@abstractmethod
def set_actions(self, behavior_name: BehaviorName, action: np.ndarray) -> None:
def set_actions(self, behavior_name: BehaviorName, action: ActionTuple) -> None:
"""
Sets the action for all of the agents in the simulation for the next
step. The Actions must be in the same order as the order received in
the DecisionSteps.
:param behavior_name: The name of the behavior the agents are part of
:param action: A two dimensional np.ndarray corresponding to the action
(either int or float)
:param action: ActionTuple tuple of continuous and/or discrete action.
Actions are np.arrays with dimensions (n_agents, continuous_size) and
(n_agents, discrete_size), respectively.
"""

     @abstractmethod
     def set_action_for_agent(
-        self, behavior_name: BehaviorName, agent_id: AgentId, action: np.ndarray
+        self, behavior_name: BehaviorName, agent_id: AgentId, action: ActionTuple
     ) -> None:
         """
         Sets the action for one of the agents in the simulation for the next
         step.
         :param behavior_name: The name of the behavior the agent is part of
         :param agent_id: The id of the agent the action is set for
-        :param action: A one dimensional np.ndarray corresponding to the action
-            (either int or float)
+        :param action: An ActionTuple of continuous and/or discrete actions.
+            Actions are np.arrays with dimensions (1, continuous_size) and
+            (1, discrete_size), respectively. The leading dimension of 1 is because
+            this action is meant for a single agent.
         """

     @abstractmethod
21 changes: 15 additions & 6 deletions ml-agents-envs/mlagents_envs/environment.py

@@ -18,6 +18,7 @@
     DecisionSteps,
     TerminalSteps,
     BehaviorSpec,
+    ActionTuple,
     BehaviorName,
     AgentId,
     BehaviorMapping,

@@ -236,7 +237,7 @@ def __init__(
 
         self._env_state: Dict[str, Tuple[DecisionSteps, TerminalSteps]] = {}
         self._env_specs: Dict[str, BehaviorSpec] = {}
-        self._env_actions: Dict[str, np.ndarray] = {}
+        self._env_actions: Dict[str, ActionTuple] = {}
         self._is_first_message = True
         self._update_behavior_specs(aca_output)

@@ -336,7 +337,7 @@ def _assert_behavior_exists(self, behavior_name: str) -> None:
                 f"agent group in the environment"
             )
 
-    def set_actions(self, behavior_name: BehaviorName, action: np.ndarray) -> None:
+    def set_actions(self, behavior_name: BehaviorName, action: ActionTuple) -> None:
         self._assert_behavior_exists(behavior_name)
         if behavior_name not in self._env_state:
             return

@@ -346,7 +347,7 @@ def set_actions(self, behavior_name: BehaviorName, action: np.ndarray) -> None:
         self._env_actions[behavior_name] = action
 
     def set_action_for_agent(
-        self, behavior_name: BehaviorName, agent_id: AgentId, action: np.ndarray
+        self, behavior_name: BehaviorName, agent_id: AgentId, action: ActionTuple
     ) -> None:
         self._assert_behavior_exists(behavior_name)
         if behavior_name not in self._env_state:

@@ -366,7 +367,10 @@ def set_action_for_agent(
                     agent_id
                 )
             ) from ie
-        self._env_actions[behavior_name][index] = action
+        if action_spec.continuous_size > 0:
+            self._env_actions[behavior_name].continuous[index] = action.continuous[0, :]
+        if action_spec.discrete_size > 0:
+            self._env_actions[behavior_name].discrete[index] = action.discrete[0, :]
 
     def get_steps(
         self, behavior_name: BehaviorName

@@ -410,15 +414,20 @@ def _close(self, timeout: Optional[int] = None) -> None:
 
     @timed
     def _generate_step_input(
-        self, vector_action: Dict[str, np.ndarray]
+        self, vector_action: Dict[str, ActionTuple]
     ) -> UnityInputProto:
         rl_in = UnityRLInputProto()
         for b in vector_action:
             n_agents = len(self._env_state[b][0])
             if n_agents == 0:
                 continue
             for i in range(n_agents):
-                action = AgentActionProto(vector_actions=vector_action[b][i])
+                # TODO: extend to AgentBuffers
Review thread on the TODO:

Contributor: What does this TODO mean?

Contributor (author): Meant as a TODO for the C# changes that change the proto to accept both continuous and discrete. Poorly worded on my part.

+                if vector_action[b].continuous is not None:
+                    _act = vector_action[b].continuous[i]
+                else:
+                    _act = vector_action[b].discrete[i]
+                action = AgentActionProto(vector_actions=_act)
                 rl_in.agent_actions[b].value.extend([action])
         rl_in.command = STEP
         rl_in.side_channel = bytes(
5 changes: 0 additions & 5 deletions ml-agents-envs/mlagents_envs/tests/test_envs.py

@@ -97,11 +97,6 @@ def test_step(mock_communicator, mock_launcher):
     env.step()
     with pytest.raises(UnityActionException):
         env.set_actions("RealFakeBrain", spec.action_spec.empty_action(n_agents - 1))
-    decision_steps, terminal_steps = env.get_steps("RealFakeBrain")
-    n_agents = len(decision_steps)
-    env.set_actions("RealFakeBrain", spec.action_spec.empty_action(n_agents) - 1)
-    env.step()
-
Review thread on the removed lines:

Contributor: Why was this piece of test removed?

Contributor: bump

     env.close()
     assert isinstance(decision_steps, DecisionSteps)
     assert isinstance(terminal_steps, TerminalSteps)
27 changes: 19 additions & 8 deletions ml-agents-envs/mlagents_envs/tests/test_steps.py

@@ -81,24 +81,35 @@ def test_specs():
     assert specs.discrete_branches == ()
     assert specs.discrete_size == 0
     assert specs.continuous_size == 3
-    assert specs.empty_action(5).shape == (5, 3)
-    assert specs.empty_action(5).dtype == np.float32
+    assert specs.empty_action(5).continuous.shape == (5, 3)
+    assert specs.empty_action(5).continuous.dtype == np.float32
 
     specs = ActionSpec.create_discrete((3,))
     assert specs.discrete_branches == (3,)
     assert specs.discrete_size == 1
     assert specs.continuous_size == 0
-    assert specs.empty_action(5).shape == (5, 1)
-    assert specs.empty_action(5).dtype == np.int32
+    assert specs.empty_action(5).discrete.shape == (5, 1)
+    assert specs.empty_action(5).discrete.dtype == np.int32
+
+    specs = ActionSpec(3, (3,))
+    assert specs.continuous_size == 3
+    assert specs.discrete_branches == (3,)
+    assert specs.discrete_size == 1
+    assert specs.empty_action(5).continuous.shape == (5, 3)
+    assert specs.empty_action(5).continuous.dtype == np.float32
+    assert specs.empty_action(5).discrete.shape == (5, 1)
+    assert specs.empty_action(5).discrete.dtype == np.int32
 
 
 def test_action_generator():
     # Continuous
     action_len = 30
     specs = ActionSpec.create_continuous(action_len)
-    zero_action = specs.empty_action(4)
+    zero_action = specs.empty_action(4).continuous
     assert np.array_equal(zero_action, np.zeros((4, action_len), dtype=np.float32))
-    random_action = specs.random_action(4)
-    print(specs.random_action(4))
+    random_action = specs.random_action(4).continuous
+    print(random_action)
     assert random_action.dtype == np.float32
     assert random_action.shape == (4, action_len)
     assert np.min(random_action) >= -1

@@ -107,10 +118,10 @@ def test_action_generator():
     # Discrete
     action_shape = (10, 20, 30)
     specs = ActionSpec.create_discrete(action_shape)
-    zero_action = specs.empty_action(4)
+    zero_action = specs.empty_action(4).discrete
     assert np.array_equal(zero_action, np.zeros((4, len(action_shape)), dtype=np.int32))
 
-    random_action = specs.random_action(4)
+    random_action = specs.random_action(4).discrete
     assert random_action.dtype == np.int32
     assert random_action.shape == (4, len(action_shape))
     assert np.min(random_action) >= 0
19 changes: 15 additions & 4 deletions ml-agents/mlagents/trainers/agent_processor.py

@@ -2,6 +2,7 @@
 from typing import List, Dict, TypeVar, Generic, Tuple, Any, Union
 from collections import defaultdict, Counter
 import queue
+import numpy as np
 
 from mlagents_envs.base_env import (
     DecisionSteps,

@@ -129,14 +130,24 @@ def _process_step(
         done = terminated  # Since this is an ongoing step
         interrupted = step.interrupted if terminated else False
         # Add the outputs of the last eval
-        action = stored_take_action_outputs["action"][idx]
+        action_dict = stored_take_action_outputs["action"]
+        action: Dict[str, np.ndarray] = {}
+        for act_type, act_array in action_dict.items():
+            action[act_type] = act_array[idx]
         if self.policy.use_continuous_act:
             action_pre = stored_take_action_outputs["pre_action"][idx]
         else:
             action_pre = None
-        action_probs = stored_take_action_outputs["log_probs"][idx]
+        action_probs_dict = stored_take_action_outputs["log_probs"]
+        action_probs: Dict[str, np.ndarray] = {}
+        for prob_type, prob_array in action_probs_dict.items():
+            action_probs[prob_type] = prob_array[idx]
+
         action_mask = stored_decision_step.action_mask
-        prev_action = self.policy.retrieve_previous_action([global_id])[0, :]
+        prev_action = self.policy.retrieve_previous_action([global_id])
+        prev_action_dict: Dict[str, np.ndarray] = {}
+        for _prev_act_type, _prev_act in prev_action.items():
+            prev_action_dict[_prev_act_type] = _prev_act[0, :]
         experience = AgentExperience(
             obs=obs,
             reward=step.reward,

@@ -145,7 +156,7 @@ def _process_step(
             action_probs=action_probs,
             action_pre=action_pre,
             action_mask=action_mask,
-            prev_action=prev_action,
+            prev_action=prev_action_dict,
             interrupted=interrupted,
             memory=memory,
         )
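For intuition, the per-agent slicing introduced above follows this pattern (key names and shapes are illustrative; the stored outputs are batched arrays keyed by action type):

    import numpy as np

    # Batch of 8 agents: one continuous array and one discrete array.
    outputs = {
        "continuous_action": np.zeros((8, 2), dtype=np.float32),
        "discrete_action": np.zeros((8, 3), dtype=np.int32),
    }
    idx = 5  # index of one agent within the batch
    per_agent = {name: batch[idx] for name, batch in outputs.items()}
    assert per_agent["continuous_action"].shape == (2,)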
2 changes: 1 addition & 1 deletion ml-agents/mlagents/trainers/buffer.py

@@ -22,7 +22,7 @@ class AgentBuffer(dict):
 
 class AgentBufferField(list):
     """
-    AgentBufferField is a list of numpy arrays. When an agent collects a field, you can add it to his
+    AgentBufferField is a list of numpy arrays. When an agent collects a field, you can add it to its
     AgentBufferField with the append method.
     """
 
9 changes: 8 additions & 1 deletion ml-agents/mlagents/trainers/demo_loader.py

@@ -66,7 +66,14 @@ def make_demo_buffer(
     for i, obs in enumerate(split_obs.visual_observations):
         demo_raw_buffer["visual_obs%d" % i].append(obs)
     demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
-    demo_raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
+    if behavior_spec.action_spec.is_continuous():
+        demo_raw_buffer["continuous_action"].append(
+            current_pair_info.action_info.vector_actions
+        )
+    else:
+        demo_raw_buffer["discrete_action"].append(
+            current_pair_info.action_info.vector_actions
+        )
     demo_raw_buffer["prev_action"].append(previous_action)
     if next_done:
         demo_raw_buffer.resequence_and_append(