HumanCompatibleAI · AdamGleave · Sep 8, 2023 · Aug 30, 2023 · Sep 5, 2023 · Sep 5, 2023
diff --git a/examples/train_dagger_atari_interactive_policy.py b/examples/train_dagger_atari_interactive_policy.py
@@ -0,0 +1,41 @@
+"""Training DAgger with an interactive policy that queries the user for actions.
+
+Note that this is a toy example that does not lead to training a reasonable policy.
+"""
+
+import tempfile
+
+import gym
+import numpy as np
+from stable_baselines3.common import vec_env
+
+from imitation.algorithms import bc, dagger
+from imitation.policies import interactive
+
+if __name__ == "__main__":
+    rng = np.random.default_rng(0)
+    # One Pong environment wrapped in a TimeLimit of 10, vectorized via DummyVecEnv.
+    env = vec_env.DummyVecEnv([lambda: gym.wrappers.TimeLimit(gym.make("Pong-v4"), 10)])
+    env.seed(0)
+    # The "expert" is an interactive policy that queries the user for actions.
+    expert = interactive.AtariInteractivePolicy(env)
+
+    bc_trainer = bc.BC(
+        observation_space=env.observation_space,
+        action_space=env.action_space,
+        rng=rng,
+    )
+    # DAgger's scratch directory is a temporary directory scoped to this `with`.
+    with tempfile.TemporaryDirectory(prefix="dagger_example_") as tmpdir:
+        dagger_trainer = dagger.SimpleDAggerTrainer(
+            venv=env,
+            scratch_dir=tmpdir,
+            expert_policy=expert,
+            bc_trainer=bc_trainer,
+            rng=rng,
+        )
+        dagger_trainer.train(
+            total_timesteps=20,
+            rollout_round_min_episodes=1,
+            rollout_round_min_timesteps=10,
+        )
diff --git a/setup.py b/setup.py
@@ -203,11 +203,14 @@ def get_local_version(version: "ScmVersion", time_format="%Y%m%d") -> str:
         "torch>=1.4.0",
         "tqdm",
         "scikit-learn>=0.21.2",
-        "seals>=0.1.5",
+        "seals~=0.1.5",
         STABLE_BASELINES3,
         "sacred>=0.8.4",
         "tensorboard>=1.14",
-        "huggingface_sb3>=2.2.1",
+        # TODO: remove once https://github.com/huggingface/huggingface_sb3/issues/37 is
+        #  fixed
+        "huggingface_sb3==2.2.5",
+        "optuna>=3.0.1",
         "datasets>=2.8.0",
     ],
     tests_require=TESTS_REQUIRE,

diff --git a/src/imitation/policies/base.py b/src/imitation/policies/base.py
@@ -13,11 +13,11 @@
 from imitation.util import networks
 
 
-class HardCodedPolicy(policies.BasePolicy, abc.ABC):
-    """Abstract class for hard-coded (non-trainable) policies."""
+class NonTrainablePolicy(policies.BasePolicy, abc.ABC):
+    """Abstract class for non-trainable (e.g. hard-coded or interactive) policies."""
 
     def __init__(self, observation_space: gym.Space, action_space: gym.Space):
-        """Builds HardcodedPolicy with specified observation and action space."""
+        """Builds NonTrainablePolicy with specified observation and action space."""
         super().__init__(
             observation_space=observation_space,
             action_space=action_space,
@@ -43,14 +43,14 @@ def forward(self, *args):
         raise NotImplementedError  # pragma: no cover
 
 
-class RandomPolicy(HardCodedPolicy):
+class RandomPolicy(NonTrainablePolicy):
     """Returns random actions."""
 
     def _choose_action(self, obs: np.ndarray) -> np.ndarray:
         return self.action_space.sample()
 
 
-class ZeroPolicy(HardCodedPolicy):
+class ZeroPolicy(NonTrainablePolicy):
     """Returns constant zero action."""
 
     def _choose_action(self, obs: np.ndarray) -> np.ndarray:

diff --git a/src/imitation/policies/interactive.py b/src/imitation/policies/interactive.py
@@ -0,0 +1,152 @@
+"""Interactive policies that query the user for actions."""
+
+import abc
+import collections
+import typing
+
+import gym
+import matplotlib.pyplot as plt
+import numpy as np
+from stable_baselines3.common import vec_env
+
+import imitation.policies.base as base_policies
+from imitation.util import util
+
+
+class DiscreteInteractivePolicy(base_policies.NonTrainablePolicy, abc.ABC):
+    """Abstract class for interactive policies with discrete actions.
+
+    For each query, the observation is rendered and then the action is provided
+    as a keyboard input.
+    """
+
+    def __init__(
+        self,
+        observation_space: gym.Space,
+        action_space: gym.Space,
+        action_keys_names: collections.OrderedDict,
+        clear_screen_on_query: bool = True,
+    ):
+        """Builds DiscreteInteractivePolicy.
+
+        Args:
+            observation_space: Observation space.
+            action_space: Action space.
+            action_keys_names: `OrderedDict` containing pairs (key, name) for every
+                action, in action-index order; key is what the user types in the
+                console, and name is a semantic action name.
+            clear_screen_on_query: If `True`, console will be cleared on every query.
+        """
+        super().__init__(
+            observation_space=observation_space,
+            action_space=action_space,
+        )
+
+        assert isinstance(action_space, gym.spaces.Discrete)
+        assert (
+            len(action_keys_names)
+            == len(set(action_keys_names.values()))
+            == action_space.n
+        )
+        # The position of a key in the ordered mapping is the index of its action.
+        self.action_keys_names = action_keys_names
+        self.action_key_to_index = {
+            k: i for i, k in enumerate(action_keys_names)
+        }
+        self.clear_screen_on_query = clear_screen_on_query
+
+    def _choose_action(self, obs: np.ndarray) -> np.ndarray:
+        if self.clear_screen_on_query:
+            util.clear_screen()
+        # Render, wait for a valid key, then clean up what the rendering created.
+        context = self._render(obs)
+        key = self._get_input_key()
+        self._clean_up(context)
+
+        return np.array([self.action_key_to_index[key]])
+
+    def _get_input_key(self) -> str:
+        """Obtains input key for action selection, re-prompting until it is valid."""
+        print(
+            "Please select an action. Possible choices in [ACTION_NAME:KEY] format:",
+            ", ".join([f"{n}:{k}" for k, n in self.action_keys_names.items()]),
+        )
+
+        key = input("Your choice (enter key):")
+        while key not in self.action_keys_names:
+            key = input("Invalid key, please try again! Your choice (enter key):")
+
+        return key
+
+    @abc.abstractmethod
+    def _render(self, obs: np.ndarray) -> typing.Optional[object]:
+        """Renders an observation, optionally returns a context for later cleanup."""
+
+    def _clean_up(self, context: object) -> None:
+        """Cleans up after the input has been captured, e.g. stops showing the image."""
+        pass
+
+
+class ImageObsDiscreteInteractivePolicy(DiscreteInteractivePolicy):
+    """DiscreteInteractivePolicy whose observations are displayed as images."""
+
+    def _render(self, obs: np.ndarray) -> plt.Figure:
+        image = self._prepare_obs_image(obs)
+
+        figure, axes = plt.subplots()
+        # For RGB images `cmap` is ignored.
+        axes.imshow(image, cmap="gray", vmin=0, vmax=255)
+        axes.axis("off")
+        figure.show()
+        return figure
+
+    def _clean_up(self, context: plt.Figure) -> None:
+        plt.close(context)
+
+    def _prepare_obs_image(self, obs: np.ndarray) -> np.ndarray:
+        """Converts `obs` into the image to display; the default is the identity."""
+        return obs
+
+
+ATARI_ACTION_NAMES_TO_KEYS = {  # action name -> key the user types to select it
+    "NOOP": "1",
+    "FIRE": "2",
+    "UP": "w",
+    "RIGHT": "d",
+    "LEFT": "a",
+    "DOWN": "x",
+    "UPRIGHT": "e",
+    "UPLEFT": "q",
+    "DOWNRIGHT": "c",
+    "DOWNLEFT": "z",
+    "UPFIRE": "t",
+    "RIGHTFIRE": "h",
+    "LEFTFIRE": "f",
+    "DOWNFIRE": "b",
+    "UPRIGHTFIRE": "y",
+    "UPLEFTFIRE": "r",
+    "DOWNRIGHTFIRE": "n",
+    "DOWNLEFTFIRE": "v",
+}
+
+
+class AtariInteractivePolicy(ImageObsDiscreteInteractivePolicy):
+    """Interactive policy for Atari environments."""
+
+    def __init__(self, env: typing.Union[gym.Env, vec_env.VecEnv], *args, **kwargs):
+        """Builds AtariInteractivePolicy from the spaces and action names of `env`."""
+        # For a VecEnv, the action meanings are queried from sub-environment 0.
+        if isinstance(env, gym.Env):
+            meanings = env.get_action_meanings()
+        else:
+            meanings = env.env_method("get_action_meanings", indices=[0])[0]
+        # Pair each action's console key with its name, keeping the action order.
+        key_name_pairs = [(ATARI_ACTION_NAMES_TO_KEYS[m], m) for m in meanings]
+
+        super().__init__(
+            env.observation_space,
+            env.action_space,
+            collections.OrderedDict(key_name_pairs),
+            *args,
+            **kwargs,
+        )
diff --git a/src/imitation/util/util.py b/src/imitation/util/util.py
@@ -460,3 +460,11 @@ def split_in_half(x: int) -> Tuple[int, int]:
     """
     half = x // 2
     return half, x - half
+
+
+def clear_screen() -> None:
+    """Clears the console screen."""
+    # Windows (`os.name == "nt"`) gets `cls`; every other platform gets `clear`.
+    is_windows = os.name == "nt"
+    command = "cls" if is_windows else "clear"
+    os.system(command)
diff --git a/tests/algorithms/test_mce_irl.py b/tests/algorithms/test_mce_irl.py
@@ -132,6 +132,8 @@ def test_infinite_horizon_error(random_mdp, rng):
 def test_policy_om_random_mdp(discount: float):
     """Test that optimal policy occupancy measure ("om") for a random MDP is sane."""
     mdp = gym.make("seals/Random-v0")
+    mdp.seed(0)
+
     V, Q, pi = mce_partition_fh(mdp, discount=discount)
     assert np.all(np.isfinite(V))
     assert np.all(np.isfinite(Q))

diff --git a/tests/policies/test_interactive.py b/tests/policies/test_interactive.py
@@ -0,0 +1,117 @@
+"""Tests interactive policies."""
+
+import collections
+from unittest import mock
+
+import gym
+import numpy as np
+import pytest
+from stable_baselines3.common import vec_env
+
+from imitation.policies import interactive
+
+ENVS = [
+    "CartPole-v0",
+]
+
+
+class NoRenderingDiscreteInteractivePolicy(interactive.DiscreteInteractivePolicy):
+    """DiscreteInteractivePolicy whose rendering step does nothing."""
+
+    def _render(self, obs: np.ndarray) -> None:
+        return None
+
+
+def _get_interactive_policy(env: vec_env.VecEnv):
+    # One ("k<i>", "n<i>") key/name pair per discrete action of `env`.
+    keys_to_names = collections.OrderedDict(
+        (f"k{idx}", f"n{idx}") for idx in range(env.action_space.n)
+    )
+
+    return NoRenderingDiscreteInteractivePolicy(
+        env.observation_space,
+        env.action_space,
+        keys_to_names,
+    )
+
+
+@pytest.mark.parametrize("env_name", ENVS)
+def test_interactive_policy(env_name: str):
+    """Test if correct actions are selected, as specified by input keys."""
+    env = vec_env.DummyVecEnv([lambda: gym.wrappers.TimeLimit(gym.make(env_name), 10)])
+    env.seed(0)
+
+    interactive_policy = _get_interactive_policy(env)
+    action_keys = list(interactive_policy.action_keys_names.keys())
+
+    obs = env.reset()
+    done = np.array([False])
+    # Stand-in for `input` that hands out the valid keys in order, cyclically.
+    class mock_input:
+        def __init__(self):
+            self.index = 0  # position in `action_keys` of the next valid key
+
+        def __call__(self, _):
+            # Sometimes insert incorrect keys, which should get ignored by the policy.
+            if np.random.uniform() < 0.5:
+                return "invalid"
+            key = action_keys[self.index]
+            self.index = (self.index + 1) % len(action_keys)
+            return key
+
+    with mock.patch("builtins.input", mock_input()):
+        requested_action = 0  # action index expected from the next query
+        while not done.all():
+            action, _ = interactive_policy.predict(obs)
+            assert isinstance(action, np.ndarray)
+            assert all(env.action_space.contains(a) for a in action)
+            assert action[0] == requested_action
+
+            obs, reward, done, info = env.step(action)
+            assert isinstance(obs, np.ndarray)
+            assert all(env.observation_space.contains(o) for o in obs)
+            assert isinstance(reward, np.ndarray)
+            assert isinstance(done, np.ndarray)
+            # The mock cycles through the keys, so the expected action does too.
+            requested_action = (requested_action + 1) % len(action_keys)
+
+
+@pytest.mark.parametrize("env_name", ENVS)
+def test_interactive_policy_input_validity(capsys, env_name: str):
+    """Test if appropriate feedback is given on the validity of the input."""
+    env = vec_env.DummyVecEnv([lambda: gym.wrappers.TimeLimit(gym.make(env_name), 10)])
+    env.seed(0)
+
+    interactive_policy = _get_interactive_policy(env)
+    action_keys = list(interactive_policy.action_keys_names.keys())
+
+    # Valid input key case
+    obs = env.reset()
+
+    def mock_input_valid(prompt):
+        print(prompt)  # echo the prompt so that capsys captures it
+        return action_keys[0]
+
+    with mock.patch("builtins.input", mock_input_valid):
+        interactive_policy.predict(obs)
+        stdout = capsys.readouterr().out
+        assert "Your choice" in stdout and "Invalid" not in stdout
+
+    # First invalid input key, then valid
+    obs = env.reset()
+
+    class mock_input_invalid_then_valid:
+        def __init__(self):
+            self.return_valid = False  # the first call answers with an invalid key
+
+        def __call__(self, prompt):
+            print(prompt)  # echo the prompt so that capsys captures it
+            if self.return_valid:
+                return action_keys[0]
+            self.return_valid = True
+            return "invalid"
+
+    with mock.patch("builtins.input", mock_input_invalid_then_valid()):
+        interactive_policy.predict(obs)
+        stdout = capsys.readouterr().out
+        assert "Your choice" in stdout and "Invalid" in stdout
diff --git a/tests/policies/test_policies.py b/tests/policies/test_policies.py
@@ -17,13 +17,13 @@
 SIMPLE_DISCRETE_ENV = "CartPole-v0"  # Discrete(2) action space
 SIMPLE_CONTINUOUS_ENV = "MountainCarContinuous-v0"  # Box(1) action space
 SIMPLE_ENVS = [SIMPLE_DISCRETE_ENV, SIMPLE_CONTINUOUS_ENV]
-HARDCODED_TYPES = ["random", "zero"]
+NONTRAINABLE_TYPES = ["random", "zero"]
 
 assert_equal = functools.partial(th.testing.assert_close, rtol=0, atol=0)
 
 
 @pytest.mark.parametrize("env_name", SIMPLE_ENVS)
-@pytest.mark.parametrize("policy_type", HARDCODED_TYPES)
+@pytest.mark.parametrize("policy_type", NONTRAINABLE_TYPES)
 def test_actions_valid(env_name, policy_type, rng):
     """Test output actions of our custom policies always lie in action space."""
     venv = util.make_vec_env(