HumanCompatibleAI · Rocamonde · Oct 4, 2022 · Aug 26, 2022 · Aug 26, 2022 · Aug 26, 2022
diff --git a/Makefile b/Makefile
diff --git a/setup.cfg b/setup.cfg
@@ -22,7 +22,7 @@ strictness=long
 [flake8]
 docstring-convention=google
 ignore = E203, W503
-max-line-length = 100
+max-line-length = 88
 
 [isort]
 line_length=88
@@ -38,7 +38,7 @@ inputs =
 	src/
 	tests/
 	setup.py
-python_version >= 3.7
+python_version >= 3.8
 
 [tool:pytest]
 markers =

diff --git a/setup.py b/setup.py
@@ -107,6 +107,7 @@ def get_readme() -> str:
     "flake8-docstrings",
     "flake8-isort",
     "isort",
+    "matplotlib",
     "mypy",
     "pydocstyle",
     "pytest",
@@ -137,7 +138,7 @@ def get_readme() -> str:
     packages=find_packages("src"),
     package_dir={"": "src"},
     package_data={"seals": ["py.typed"]},
-    install_requires=["gym", "numpy", "matplotlib"],
+    install_requires=["gym", "numpy"],
     tests_require=TESTS_REQUIRE,
     extras_require={
         # recommended packages for development

diff --git a/src/seals/base_envs.py b/src/seals/base_envs.py
@@ -101,7 +101,7 @@ def state(self) -> State:
     @state.setter
     def state(self, state: State):
         """Set current state."""
-        if self._cur_state is not None and self._cur_state not in self.state_space:
+        if state not in self.state_space:
             raise ValueError(f"{state} not in {self.state_space}")
         self._cur_state = state
 
@@ -130,8 +130,7 @@ def step(self, action: Action) -> Tuple[Observation, float, bool, dict]:
         old_state = self.state
         self.state = self.transition(self.state, action)
         obs = self.obs_from_state(self.state)
-        if obs not in self.observation_space:
-            raise ValueError(f"{obs} not in {self.observation_space}")
+        assert obs in self.observation_space
         reward = self.reward(old_state, action, self.state)
         self._n_actions_taken += 1
         done = self.terminal(self.state, self.n_actions_taken)
@@ -205,13 +204,13 @@ class BaseTabularModelPOMDP(ResettablePOMDP[int, Observation, int]):
 
     transition_matrix: np.ndarray
     reward_matrix: np.ndarray
-    observation_matrix: np.ndarray
+
+    state_space: spaces.Discrete
 
     def __init__(
         self,
         *,
         transition_matrix: np.ndarray,
-        observation_matrix: np.ndarray,
         reward_matrix: np.ndarray,
         horizon: float = np.inf,
         initial_state_dist: Optional[np.ndarray] = None,
@@ -221,8 +220,6 @@ def __init__(
         Args:
             transition_matrix: 3-D array with transition probabilities for a
                 given state-action pair, of shape `(n_states,n_actions,n_states)`.
-            observation_matrix: 2-D array with observation probabilities for a
-                given state, of shape `(n_states,n_observations)`.
             reward_matrix: 1-D, 2-D or 3-D array corresponding to rewards to a
                 given `(state, action, next_state)` triple. A 2-D array assumes
                 the `next_state` is not used in the reward, and a 1-D array
@@ -239,84 +236,62 @@ def __init__(
                 `initial_state_dist` have shapes different to specified above.
         """
         # The following matrices should conform to the shapes below:
-        # transition matrix: n_states x n_actions x n_states
-        # reward matrix: n_states x n_actions x n_states
-        #   OR n_states x n_actions
-        #   OR n_states
-        # observation matrix: n_states x n_observations
-        # initial state dist: n_states
-        # we want to make sure that the shapes are correct
 
-        if transition_matrix.shape[0] != transition_matrix.shape[2]:
+        # transition matrix: n_states x n_actions x n_states
+        n_states = transition_matrix.shape[0]
+        if n_states != transition_matrix.shape[2]:
             raise ValueError(
                 "Malformed transition_matrix:\n"
                 f"transition_matrix.shape: {transition_matrix.shape}\n"
-                f"{transition_matrix.shape[0]} != {transition_matrix.shape[2]}",
+                f"{n_states} != {transition_matrix.shape[2]}",
             )
 
+        # reward matrix: n_states x n_actions x n_states
+        #   OR n_states x n_actions
+        #   OR n_states
         if reward_matrix.shape != transition_matrix.shape[: len(reward_matrix.shape)]:
             raise ValueError(
                 "transition_matrix and reward_matrix are not compatible:\n"
                 f"transition_matrix.shape: {transition_matrix.shape}\n"
                 f"reward_matrix.shape: {reward_matrix.shape}",
             )
 
-        if observation_matrix.shape[0] != transition_matrix.shape[0]:
-            raise ValueError(
-                "transition_matrix and observation_matrix are not compatible:\n"
-                f"transition_matrix.shape[0]: {transition_matrix.shape[0]}\n"
-                f"observation_matrix.shape[0]: {observation_matrix.shape[0]}",
-            )
-
+        # initial state dist: n_states
         if initial_state_dist is None:
-            initial_state_dist = util.one_hot_encoding(0, transition_matrix.shape[0])
+            initial_state_dist = util.one_hot_encoding(0, n_states)
         if initial_state_dist.ndim != 1:
             raise ValueError(
                 "initial_state_dist has multiple dimensions:\n"
                 f"{initial_state_dist.ndim} != 1",
             )
-        if initial_state_dist.shape[0] != transition_matrix.shape[0]:
+        if initial_state_dist.shape[0] != n_states:
             raise ValueError(
                 "transition_matrix and initial_state_dist are not compatible:\n"
-                f"number of states = {transition_matrix.shape[0]}\n"
+                f"number of states = {n_states}\n"
                 f"len(initial_state_dist) = {len(initial_state_dist)}",
             )
 
         self.transition_matrix = transition_matrix
         self.reward_matrix = reward_matrix
-        self.observation_matrix = observation_matrix
         self._feature_matrix = None
         self.horizon = horizon
         self.initial_state_dist = initial_state_dist
 
         super().__init__(
-            state_space=self._construct_state_space(self.state_dim),
-            action_space=self._construct_action_space(self.action_dim),
-            observation_space=self._construct_obs_space(self.obs_dim, self.obs_dtype),
+            state_space=self._construct_state_space(),
+            action_space=self._construct_action_space(),
+            observation_space=self._construct_observation_space(),
         )
 
-    @staticmethod
-    def _construct_state_space(n_states: int) -> gym.Space:
-        return spaces.Discrete(n_states)
+    def _construct_state_space(self) -> gym.Space:
+        return spaces.Discrete(self.state_dim)
 
-    @staticmethod
-    def _construct_action_space(n_actions: int) -> gym.Space:
-        return spaces.Discrete(n_actions)
+    def _construct_action_space(self) -> gym.Space:
+        return spaces.Discrete(self.action_dim)
 
-    @staticmethod
-    def _construct_obs_space(obs_dim, obs_dtype) -> gym.Space:
-        try:
-            dtype_iinfo = np.iinfo(obs_dtype)
-            min_val, max_val = dtype_iinfo.min, dtype_iinfo.max
-        except ValueError:
-            min_val = -np.inf
-            max_val = np.inf
-        return spaces.Box(
-            low=min_val,
-            high=max_val,
-            shape=(obs_dim,),
-            dtype=obs_dtype,
-        )
+    @abc.abstractmethod
+    def _construct_observation_space(self) -> gym.Space:
+        pass  # pragma: no cover
 
     def initial_state(self) -> int:
         """Samples from the initial state distribution."""
@@ -346,8 +321,6 @@ def feature_matrix(self):
         """Matrix mapping states to feature vectors."""
         # Construct lazily to save memory in algorithms that don't need features.
         if self._feature_matrix is None:
-            # TODO(juan) Space() does not have an `n` attribute (?).
-            #  Are we hinting the wrong type?
             n_states = self.state_space.n
             self._feature_matrix = np.eye(n_states)
         return self._feature_matrix
@@ -362,16 +335,6 @@ def action_dim(self) -> int:
         """Number of action vectors (int)."""
         return self.transition_matrix.shape[1]
 
-    @property
-    def obs_dim(self) -> int:
-        """Size of observation vectors for this MDP."""
-        return self.observation_matrix.shape[1]
-
-    @property
-    def obs_dtype(self) -> int:
-        """Data type of observation vectors (e.g. np.float32)."""
-        return self.observation_matrix.dtype
-
 
 class TabularModelPOMDP(BaseTabularModelPOMDP[np.ndarray]):
     """Tabular model POMDP.
@@ -385,6 +348,50 @@ class TabularModelPOMDP(BaseTabularModelPOMDP[np.ndarray]):
     a vector with self.obs_dim entries.
     """
 
+    observation_matrix: np.ndarray
+
+    def __init__(
+        self,
+        *,
+        transition_matrix: np.ndarray,
+        observation_matrix: np.ndarray,
+        reward_matrix: np.ndarray,
+        horizon: float = np.inf,
+        initial_state_dist: Optional[np.ndarray] = None,
+    ):
+        """Initializes a tabular model POMDP."""
+        self.observation_matrix = observation_matrix
+        super().__init__(
+            transition_matrix=transition_matrix,
+            reward_matrix=reward_matrix,
+            horizon=horizon,
+            initial_state_dist=initial_state_dist,
+        )
+
+        # observation matrix: n_states x n_observations
+        if observation_matrix.shape[0] != self.state_dim:
+            raise ValueError(
+                "transition_matrix and observation_matrix are not compatible:\n"
+                f"transition_matrix.shape[0]: {self.state_dim}\n"
+                f"observation_matrix.shape[0]: {observation_matrix.shape[0]}",
+            )
+
+    def _construct_observation_space(self) -> gym.Space:
+        min_val: float
+        max_val: float
+        try:
+            dtype_iinfo = np.iinfo(self.obs_dtype)
+            min_val, max_val = dtype_iinfo.min, dtype_iinfo.max
+        except ValueError:
+            min_val = -np.inf
+            max_val = np.inf
+        return spaces.Box(
+            low=min_val,
+            high=max_val,
+            shape=(self.obs_dim,),
+            dtype=self.obs_dtype,
+        )
+
     def obs_from_state(self, state: int) -> np.ndarray:
         """Computes observation from state."""
         # Copy so it can't be mutated in-place (updates will be reflected in
@@ -393,6 +400,16 @@ def obs_from_state(self, state: int) -> np.ndarray:
         assert obs.ndim == 1, obs.shape
         return obs
 
+    @property
+    def obs_dim(self) -> int:
+        """Size of observation vectors for this POMDP."""
+        return self.observation_matrix.shape[1]
+
+    @property
+    def obs_dtype(self) -> np.dtype:
+        """Data type of observation vectors (e.g. np.float32)."""
+        return self.observation_matrix.dtype
+
 
 class TabularModelMDP(BaseTabularModelPOMDP[int]):
     """Tabular model MDP.
@@ -424,10 +441,11 @@ def __init__(
             reward_matrix=reward_matrix,
             horizon=horizon,
             initial_state_dist=initial_state_dist,
-            observation_matrix=np.eye(transition_matrix.shape[0]),
         )
-        self._observation_space = self._state_space
 
     def obs_from_state(self, state: int) -> int:
         """Identity since observation == state in an MDP."""
         return state
+
+    def _construct_observation_space(self) -> gym.Space:
+        return self._construct_state_space()
diff --git a/src/seals/diagnostics/__init__.py b/src/seals/diagnostics/__init__.py
@@ -67,3 +67,42 @@
     entry_point="seals.diagnostics.sort:SortEnv",
     max_episode_steps=6,
 )
+
+
+def register_cliff_world(suffix, kwargs):
+    """Register `seals/CliffWorld{suffix}-v0`, forwarding `kwargs` to gym.register."""
+    gym.register(
+        f"seals/CliffWorld{suffix}-v0",
+        entry_point="seals.diagnostics.cliff_world:CliffWorldEnv",
+        kwargs=kwargs,
+    )
+
+
+for width, height, horizon in [(7, 4, 9), (15, 6, 18), (100, 20, 110)]:
+    for use_xy in [False, True]:
+        use_xy_str = "XY" if use_xy else ""
+        register_cliff_world(
+            f"{width}x{height}{use_xy_str}",
+            kwargs={
+                "width": width,
+                "height": height,
+                "use_xy_obs": use_xy,
+                "horizon": horizon,
+            },
+        )
+
+# These parameter choices are somewhat arbitrary.
+# We anticipate most users will want to construct RandomTransitionEnv directly.
+gym.register(
+    "seals/Random-v0",
+    entry_point="seals.diagnostics.random_trans:RandomTransitionEnv",
+    kwargs={
+        "n_states": 16,
+        "n_actions": 3,
+        "branch_factor": 2,
+        "horizon": 20,
+        "random_obs": True,
+        "obs_dim": 5,
+        "generator_seed": 42,
+    },
+)