
Add ProcGoalEnv. #30

Merged · 6 commits · Jun 30, 2020

Changes from 4 commits

6 changes: 6 additions & 0 deletions src/seals/diagnostics/__init__.py
@@ -7,3 +7,9 @@
    entry_point="seals.diagnostics.risky_path:RiskyPathEnv",
    max_episode_steps=5,
)

gym.register(
    id="seals/ProcGoal-v0",
    entry_point="seals.diagnostics.proc_goal:ProcGoalEnv",
    max_episode_steps=20,
)
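
For context, a minimal usage sketch (not part of the diff) of the newly registered environment. It assumes the classic Gym reset/step API in use at the time of this PR; importing seals.diagnostics runs the gym.register calls above:

import gym
import seals.diagnostics  # noqa: F401  -- importing this module runs the gym.register calls

env = gym.make("seals/ProcGoal-v0")
obs = env.reset()  # 4-vector: (agent_x, agent_y, goal_x, goal_y)
obs, reward, done, info = env.step(env.action_space.sample())
print(obs, reward)  # reward is the negated Manhattan distance between agent and goal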
62 changes: 62 additions & 0 deletions src/seals/diagnostics/proc_goal.py
@@ -0,0 +1,62 @@
"""Large gridworld with random agent and goal position."""

from gym import spaces
import numpy as np

from seals import base_envs, util


class ProcGoalEnv(base_envs.ResettableMDP):
"""Large gridworld with random agent and goal position.

In this task, the agent starts at a random position in a large
grid, and must navigate to a goal randomly placed in a
neighborhood around the agent. The observation is a 4-dimensional
vector containing the (x,y) coordinates of the agent and the goal.
The reward at each timestep is the negative Manhattan distance
between the two positions. With a large enough grid, generalizing
is necessary to achieve good performance, since most initial
states will be unseen.
"""

    def __init__(self, bounds: int = 100, distance: int = 10):
        """Constructs environment.

        Args:
            bounds: the absolute values of the coordinates of the initial agent
                position are bounded by `bounds`. Increasing the value might make
                generalization harder.
            distance: initial distance between agent and goal.
        """
        self._bounds = bounds
        self._distance = distance

        super().__init__(
            state_space=spaces.Box(low=-np.inf, high=np.inf, shape=(4,)),
            action_space=spaces.Discrete(5),
        )

    def terminal(self, state: np.ndarray, n_actions_taken: int) -> bool:
        """Always returns False."""
        return False

    def initial_state(self) -> np.ndarray:
        """Samples random agent position and random goal."""
        pos = self.rand_state.randint(low=-self._bounds, high=self._bounds, size=(2,))

        # Split the distance between the two axes and pick a random sign per axis,
        # so the goal is exactly `self._distance` away from the agent in L1 norm.
        x_dist = self.rand_state.randint(self._distance)
        y_dist = self._distance - x_dist
        random_signs = 2 * self.rand_state.randint(2, size=2) - 1
        goal = pos + random_signs * (x_dist, y_dist)

        return np.concatenate([pos, goal]).astype(self.observation_space.dtype)

    def reward(self, state: np.ndarray, action: int, new_state: np.ndarray) -> float:
        """Negative L1 distance to goal."""
        return (-1) * np.sum(np.abs(state[2:] - state[:2]))

    def transition(self, state: np.ndarray, action: int) -> np.ndarray:
        """Returns next state according to grid."""
        pos, goal = state[:2], state[2:]
        next_pos = util.grid_transition_fn(pos, action)
        return np.concatenate([next_pos, goal])
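
To illustrate the observation layout and reward (a hypothetical check, not part of the PR): an agent at (3, -2) with the goal at (7, 4) gives the state vector [3, -2, 7, 4] and reward -(|7 - 3| + |4 - (-2)|) = -10.

import numpy as np

state = np.array([3.0, -2.0, 7.0, 4.0])  # hypothetical state: agent at (3, -2), goal at (7, 4)
reward = -np.sum(np.abs(state[2:] - state[:2]))  # same formula as ProcGoalEnv.reward
print(reward)  # -10.0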
38 changes: 37 additions & 1 deletion src/seals/util.py
@@ -1,6 +1,6 @@
"""Miscellaneous utilities."""

-from typing import Optional
+from typing import Optional, Tuple

import gym
import numpy as np
Expand Down Expand Up @@ -118,3 +118,39 @@ def sample_distribution(
def one_hot_encoding(pos: int, size: int) -> np.ndarray:
"""Returns a 1-D hot encoding of a given position and size."""
return np.eye(size)[pos]


Reviewer comment (Member) on `def grid_transition_fn(`:

I've commented on this before in #23 -- make sure you address those comments and merge the previous PR before this one.

def grid_transition_fn(
    state: np.ndarray,
    action: int,
    x_bounds: Tuple[float, float] = (-np.inf, np.inf),
    y_bounds: Tuple[float, float] = (-np.inf, np.inf),
):
"""Returns transition of a deterministic gridworld.

Agent is bounded in the region limited by x_bounds and y_bounds,
ends inclusive.

Actions:
0: Right
1: Down
2: Left
3: Up
4: Stay put
"""
    # (dx, dy) displacement for each action, in the order listed in the docstring.
    dirs = [
        (1, 0),
        (0, 1),
        (-1, 0),
        (0, -1),
        (0, 0),
    ]

    x, y = state
    dx, dy = dirs[action]

    next_x = np.clip(x + dx, *x_bounds)
    next_y = np.clip(y + dy, *y_bounds)
    next_state = np.array([next_x, next_y], dtype=state.dtype)

    return next_state
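
A quick, hypothetical check of the clipping behaviour (not part of the diff), assuming the PR's util.grid_transition_fn: an agent at the edge of the bounded region stays put when it tries to move outside it.

import numpy as np
from seals import util

state = np.array([2.0, 0.0])
# Action 0 (Right) would move to x = 3, but x_bounds clips it back to 2.
next_state = util.grid_transition_fn(state, action=0, x_bounds=(-2, 2), y_bounds=(-2, 2))
print(next_state)  # [2. 0.]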