Add ProcGoalEnv. #30

Merged · 6 commits · Jun 30, 2020
6 changes: 6 additions & 0 deletions src/seals/diagnostics/__init__.py
@@ -61,3 +61,9 @@
entry_point="seals.diagnostics.parabola:ParabolaEnv",
max_episode_steps=20,
)

gym.register(
id="seals/ProcGoal-v0",
entry_point="seals.diagnostics.proc_goal:ProcGoalEnv",
max_episode_steps=20,
)
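
For reference, once `seals` is imported this registration makes the new environment available through the standard Gym factory. A minimal rollout sketch (assuming the classic pre-0.26 `gym` step API, where `step` returns `(obs, reward, done, info)` and the `TimeLimit` wrapper registered here ends episodes after `max_episode_steps=20`):

```python
import gym
import seals  # noqa: F401  # importing seals runs the gym.register calls above

env = gym.make("seals/ProcGoal-v0")
obs = env.reset()  # 4-vector: (agent_x, agent_y, goal_x, goal_y)
done = False
while not done:  # TimeLimit sets done after 20 steps
    obs, reward, done, info = env.step(env.action_space.sample())
env.close()
```
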
62 changes: 62 additions & 0 deletions src/seals/diagnostics/proc_goal.py
@@ -0,0 +1,62 @@
"""Large gridworld with random agent and goal position."""

from gym import spaces
import numpy as np

from seals import base_envs, util


class ProcGoalEnv(base_envs.ResettableMDP):
"""Large gridworld with random agent and goal position.

In this task, the agent starts at a random position in a large
grid, and must navigate to a goal randomly placed in a
neighborhood around the agent. The observation is a 4-dimensional
vector containing the (x,y) coordinates of the agent and the goal.
The reward at each timestep is the negative Manhattan distance
between the two positions. With a large enough grid, generalizing
is necessary to achieve good performance, since most initial
states will be unseen.
"""

def __init__(self, bounds: int = 100, distance: int = 10):
"""Constructs environment.

Args:
bounds: the absolute values of the coordinates of the initial agent
position are bounded by `bounds`. Increasing the value might make
generalization harder.
distance: initial Manhattan (L1) distance between agent and goal.
"""
self._bounds = bounds
self._distance = distance

super().__init__(
state_space=spaces.Box(low=-np.inf, high=np.inf, shape=(4,)),
action_space=spaces.Discrete(5),
)

def terminal(self, state: np.ndarray, n_actions_taken: int) -> bool:
"""Always returns False."""
return False

def initial_state(self) -> np.ndarray:
"""Samples random agent position and random goal."""
pos = self.rand_state.randint(low=-self._bounds, high=self._bounds, size=(2,))

# Split the total L1 distance between the x and y axes and pick a random
# sign along each axis, so the goal starts exactly `distance` Manhattan
# steps away from the agent.
x_dist = self.rand_state.randint(self._distance)
y_dist = self._distance - x_dist
random_signs = 2 * self.rand_state.randint(2, size=2) - 1
goal = pos + random_signs * (x_dist, y_dist)

return np.concatenate([pos, goal]).astype(self.observation_space.dtype)

def reward(self, state: np.ndarray, action: int, new_state: np.ndarray) -> float:
"""Negative L1 distance to goal."""
return (-1) * np.sum(np.abs(state[2:] - state[:2]))

def transition(self, state: np.ndarray, action: int) -> np.ndarray:
"""Returns next state according to grid."""
pos, goal = state[:2], state[2:]
next_pos = util.grid_transition_fn(pos, action)
return np.concatenate([next_pos, goal])
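
To make the sampling logic in `initial_state` concrete: the goal always starts exactly `distance` Manhattan steps from the agent, so the reward at the initial state is `-distance`. A small sanity-check sketch (the explicit `seed` call is a defensive assumption in case the `ResettableMDP` base class does not seed its `rand_state` during `__init__`):

```python
import numpy as np
from seals.diagnostics.proc_goal import ProcGoalEnv

env = ProcGoalEnv(bounds=100, distance=10)
env.seed(0)  # make sure rand_state exists before sampling directly

state = env.initial_state()
pos, goal = state[:2], state[2:]

# x_dist + y_dist == distance by construction, so the goal is exactly
# 10 Manhattan steps away and the initial reward is -10.
assert np.sum(np.abs(goal - pos)) == 10
assert env.reward(state, action=0, new_state=state) == -10
```
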