Gymnasium Compatibility #735

Merged: 64 commits, Sep 23, 2023
4de8aae
Import gymnasium as gym everywhere and remove gym-specific fixes in t…
ernestum Jun 29, 2023
28c2e3f
Update dependencies
Rocamonde Jul 1, 2023
70e8652
gymnasium compatible reset, step and seed
EdoardoPona Jul 28, 2023
99f8a22
Update dependencies.
ernestum Aug 29, 2023
ae322eb
Fix gym->gymnasium import in SQIL implementation.
ernestum Aug 29, 2023
34b3240
Fix type violations found by mypy.
ernestum Aug 29, 2023
2840407
Downgrade furo for sphinx compatibility.
ernestum Aug 29, 2023
21b642a
Fix more mypy issues by introducing runtime type checks.
ernestum Sep 1, 2023
ce42eb9
Ensure up-to-date pip version that does modern dependency resolution …
ernestum Sep 6, 2023
5927a01
Set minimum version for gymnasium dependency.
ernestum Sep 6, 2023
73f2879
Specify seals minimum version.
ernestum Sep 6, 2023
4abd8eb
Fix bug in make_vec_env
ernestum Sep 6, 2023
6141ebd
Use max_episode_steps parameter of gym.make instead of constructing t…
ernestum Sep 6, 2023
e6759a4
Add missing trailing commas.
ernestum Sep 6, 2023
c0460a9
Add missing raises section in generate_trajectories()
ernestum Sep 6, 2023
83f18f8
Remove unused imports.
ernestum Sep 6, 2023
bb93264
Add seals[atari] to test requirements.
ernestum Sep 6, 2023
748706d
make ATARI_REQUIRE equivalent to seals[atari]
ernestum Sep 6, 2023
331e3d5
Fix signature of VideoWrapper so it properly overrides gymnasium.Wrap…
ernestum Sep 8, 2023
a65ffd0
Fix the way we access the spec.
ernestum Sep 8, 2023
dbc3a76
Use cast instead of assert for mypy
ernestum Sep 8, 2023
e27a41e
Place mypy assert hint in a better place.
ernestum Sep 8, 2023
746fab5
Fix test_replay_buffer_init_errors test.
ernestum Sep 8, 2023
c497afc
Fix implementation of ObsRewHalveWrapper.
ernestum Sep 8, 2023
ce61d69
Set huggingface_sb3 version to the correct one.
ernestum Sep 11, 2023
458d729
Don't install gym[atari] but just gym as a hack.
ernestum Sep 11, 2023
b24934c
Adapt logger test to new "None" representation of sb3 logger.
ernestum Sep 11, 2023
3def9a7
Make sb3 and hf-sb3 version specification less strict.
ernestum Sep 11, 2023
c7cd80d
Fix isort issue.
ernestum Sep 11, 2023
fba9625
Fix mypy issues.
ernestum Sep 11, 2023
82687a9
Import gymnasium instead of gym in interactive policies.
ernestum Sep 12, 2023
cdefe75
Fix trailing whitespace
ernestum Sep 12, 2023
2f297a1
Fix typing issue in interactive.py
ernestum Sep 12, 2023
92f403d
Pull seals from feature branch
ernestum Sep 12, 2023
23c7d77
Fix typing issue in test_interactive.py
ernestum Sep 12, 2023
1f1c2cb
Hacky pipeline fixes to load outdated expert models and disable windo…
ernestum Sep 8, 2023
9ec9469
Install gym in macos unit tests. TODO REMOVE BEFORE MERGING
ernestum Sep 12, 2023
f51b81d
Add missing documentation.
ernestum Sep 18, 2023
9976216
Remove unused import.
ernestum Sep 18, 2023
71e948a
Re-add macos and windows pipelines.
ernestum Sep 18, 2023
090b77b
Fix wrong == None comparison.
ernestum Sep 18, 2023
2cfb17a
Set seals version to upstream again.
ernestum Sep 19, 2023
d65abde
Undo hacky installation of gym in the pipeline as a temporary fix.
ernestum Sep 19, 2023
1f9338b
Update expert model in testdata folder to gymnasium.
ernestum Sep 19, 2023
98b8de8
Update SQIL tutorial to gymnasium.
ernestum Sep 19, 2023
5d5355c
Remove the requirement for specific setuptools and pip versions from …
ernestum Sep 19, 2023
235488a
Remove mujoco from the installation instructions (will be pulled as a…
ernestum Sep 19, 2023
9f8e3a3
Remove reference to gym in reward_networks.rst
ernestum Sep 19, 2023
73982bb
Seed the vecenv using seed and not using reset.
ernestum Sep 19, 2023
eec6018
Adapt the custom environment in the custom env tutorial to the gymnas…
ernestum Sep 19, 2023
a69c78a
Use proper env name in gail tutorial.
ernestum Sep 19, 2023
5c41ab3
Adapt the baselines comparison tutorial to the gymnasium API.
ernestum Sep 19, 2023
b226d0e
Remove some more gym imports from tutorials and examples.
ernestum Sep 19, 2023
87049f3
Reformat buffer.py for better readability.
ernestum Sep 20, 2023
376cc9d
Add actionable hints when generate_trajectories is called with a venv…
ernestum Sep 20, 2023
b5a488f
Ignore info dict when generating rollouts for MCE tests.
ernestum Sep 20, 2023
fd86269
Remove note from MCE tests.
ernestum Sep 20, 2023
add90d7
Black fixes.
ernestum Sep 20, 2023
f903e77
Add missing trailing comma.
ernestum Sep 20, 2023
aa16cf9
Upgrade pre-commit tool versions.
ernestum Sep 20, 2023
ddff463
Flake8 fixes.
ernestum Sep 20, 2023
9dead7b
Black and codespell fixes.
ernestum Sep 20, 2023
efd79b0
Place inline comments in such a way that flake8 and black accept them.
ernestum Sep 21, 2023
d9c1658
Add isort fix and ensure isort detects wandb as thirdparty.
ernestum Sep 21, 2023
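The recurring theme across these commits is the signature change between the gym and gymnasium APIs: `reset()` now returns an `(obs, info)` pair, and `step()` returns a five-tuple that splits the old `done` flag into `terminated` and `truncated`. A minimal pure-Python sketch of an adapter between the two conventions (the class and helper names are illustrative, not taken from this PR):

```python
class OldStyleEnv:
    """Minimal stand-in for a pre-gymnasium environment."""

    def reset(self):
        return 0.0  # old API: just the observation

    def step(self, action):
        obs, reward, done, info = 1.0, 0.5, True, {}
        return obs, reward, done, info  # old API: four values


def to_gymnasium_reset(old_reset_result):
    """Old reset() returned only obs; gymnasium returns (obs, info)."""
    return old_reset_result, {}


def to_gymnasium_step(old_step_result):
    """Map the old 4-tuple step result to gymnasium's 5-tuple.

    A bare `done` flag carries no truncation information, so it is
    conservatively mapped to `terminated`, with `truncated` set False.
    """
    obs, reward, done, info = old_step_result
    return obs, reward, done, False, info
```

In this PR the adaptation mostly goes the other way: call sites across the codebase are updated to consume the new tuples directly rather than wrapping old environments.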
4 changes: 2 additions & 2 deletions .circleci/config.yml
@@ -65,7 +65,7 @@ commands:
# Download and cache dependencies
- restore_cache:
keys:
- v7linux-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }}
- v8linux-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }}

- run:
name: install dependencies
@@ -75,7 +75,7 @@ commands:
- save_cache:
paths:
- /venv
key: v7linux-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }}
key: v8linux-dependencies-{{ checksum "setup.py" }}-{{ checksum "ci/build_and_activate_venv.sh" }}

- run:
name: install imitation
12 changes: 6 additions & 6 deletions .pre-commit-config.yaml
@@ -3,7 +3,7 @@
repos:
# Linting
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
rev: v4.4.0
hooks:
- id: check-ast
- id: trailing-whitespace
@@ -12,7 +12,7 @@ repos:
- id: check-toml
- id: check-added-large-files
- repo: https://github.com/psf/black
rev: 22.6.0
rev: 23.9.1
hooks:
- id: black
- id: black-jupyter
@@ -22,7 +22,7 @@
- id: isort
# Python static analysis
- repo: https://github.com/pycqa/flake8
rev: '5.0.4'
rev: '6.1.0'
hooks:
- id: flake8
additional_dependencies:
@@ -34,7 +34,7 @@
- flake8-docstrings~=1.6.0
# Shell static analysis
- repo: https://github.com/koalaman/shellcheck-precommit
rev: v0.8.0
rev: v0.9.0
hooks:
- id: shellcheck
# precommit invokes shellcheck once per file. shellcheck complains if file
@@ -43,12 +43,12 @@
args: ["-e", "SC1091"]
# Misc
- repo: https://github.com/codespell-project/codespell
rev: v2.2.2
rev: v2.2.4
hooks:
- id: codespell
args: ["--skip=*.pyc,tests/testdata/*,*.ipynb,*.csv","--ignore-words-list=reacher,ith,iff"]
- repo: https://github.com/syntaqx/git-hooks
rev: v0.0.17
rev: v0.0.18
hooks:
- id: circleci-config-validate
# Hooks that run in local environment (not isolated venv) as they need
4 changes: 0 additions & 4 deletions ci/build_and_activate_venv.ps1
@@ -9,8 +9,4 @@ If ($venv -eq $null) {

virtualenv -p python3.8 $venv
& $venv\Scripts\activate
# Note: We need to install these versions of setuptools and wheel to allow installing gym==0.21.0 on Windows.
# See https://github.com/freqtrade/freqtrade/issues/8376
# TODO(GH#707): remove pin once upgraded Gym
python -m pip install --upgrade pip wheel==0.38.4 setuptools==65.5.1
pip install ".[docs,parallel,test]"
5 changes: 3 additions & 2 deletions ci/build_and_activate_venv.sh
@@ -20,8 +20,9 @@ fi
virtualenv -p ${python_version} ${venv}
# shellcheck disable=SC1090,SC1091
source ${venv}/bin/activate
# Note: We need to install setuptools==66.1.1 to allow installing gym==0.21.0.
python -m pip install --upgrade pip setuptools==66.1.1

# Update pip to the latest version.
pip install --upgrade pip

# If platform is linux, install pytorch CPU version.
# This will prevent installing the CUDA version in the pip install ".[docs,parallel,test]" command.
1 change: 0 additions & 1 deletion ci/clean_notebooks.py
@@ -63,7 +63,6 @@ def clean_notebook(file: pathlib.Path, check_only=False) -> None:
print(f"Checking {file}")

for cell in nb.cells:

# Remove empty cells
if cell["cell_type"] == "code" and not cell["source"]:
if check_only:
4 changes: 2 additions & 2 deletions docs/algorithms/airl.rst
@@ -23,7 +23,7 @@ Detailed example notebook: :doc:`../tutorials/4_train_airl`
:skipif: skip_doctests

import numpy as np
import seals # noqa: F401 # needed to load "seals/" environments
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.ppo import MlpPolicy
@@ -39,7 +39,7 @@ Detailed example notebook: :doc:`../tutorials/4_train_airl`
SEED = 42

env = make_vec_env(
"seals/CartPole-v0",
"seals:seals/CartPole-v0",
rng=np.random.default_rng(SEED),
n_envs=8,
post_wrappers=[lambda env, _: RolloutInfoWrapper(env)], # to compute rollouts
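The docs diffs here and below replace the side-effect import `import seals  # noqa: F401` with the namespaced id `"seals:seals/CartPole-v0"`: gymnasium accepts `module:env-id` strings and imports the named module before looking the environment up in its registry, so the explicit import is no longer needed. A small helper showing the string convention (purely illustrative — gymnasium does this parsing itself):

```python
def split_env_id(env_id: str):
    """Split a gymnasium-style "module:namespace/EnvName-vN" id into the
    module that must be imported and the id looked up in the registry."""
    module, sep, registry_id = env_id.partition(":")
    if not sep:  # no colon: nothing to import, the whole string is the id
        return None, env_id
    return module, registry_id
```

For `"seals:seals/CartPole-v0"`, the part before the colon (`seals`) is the package to import, and the remainder (`seals/CartPole-v0`) is the namespaced registry id.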
4 changes: 2 additions & 2 deletions docs/algorithms/bc.rst
@@ -21,7 +21,7 @@ Detailed example notebook: :doc:`../tutorials/1_train_bc`
:skipif: skip_doctests

import numpy as np
import seals # noqa: F401 # needed to load "seals/" environments
import gymnasium as gym
from stable_baselines3.common.evaluation import evaluate_policy

from imitation.algorithms import bc
@@ -32,7 +32,7 @@ Detailed example notebook: :doc:`../tutorials/1_train_bc`

rng = np.random.default_rng(0)
env = make_vec_env(
"seals/CartPole-v0",
"seals:seals/CartPole-v0",
rng=rng,
n_envs=1,
post_wrappers=[lambda env, _: RolloutInfoWrapper(env)], # for computing rollouts
4 changes: 2 additions & 2 deletions docs/algorithms/dagger.rst
@@ -26,7 +26,7 @@ Detailed example notebook: :doc:`../tutorials/2_train_dagger`
import tempfile

import numpy as np
import seals # noqa: F401 # needed to load "seals/" environments
import gymnasium as gym
from stable_baselines3.common.evaluation import evaluate_policy

from imitation.algorithms import bc
@@ -36,7 +36,7 @@ Detailed example notebook: :doc:`../tutorials/2_train_dagger`

rng = np.random.default_rng(0)
env = make_vec_env(
"seals/CartPole-v0",
"seals:seals/CartPole-v0",
rng=rng,
)
expert = load_policy(
4 changes: 2 additions & 2 deletions docs/algorithms/gail.rst
@@ -20,7 +20,7 @@ Detailed example notebook: :doc:`../tutorials/3_train_gail`
:skipif: skip_doctests

import numpy as np
import seals # noqa: F401 # needed to load "seals/" environments
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.ppo import MlpPolicy
@@ -36,7 +36,7 @@ Detailed example notebook: :doc:`../tutorials/3_train_gail`
SEED = 42

env = make_vec_env(
"seals/CartPole-v0",
"seals:seals/CartPole-v0",
rng=np.random.default_rng(SEED),
n_envs=8,
post_wrappers=[lambda env, _: RolloutInfoWrapper(env)], # to compute rollouts
2 changes: 1 addition & 1 deletion docs/algorithms/sqil.rst
@@ -28,7 +28,7 @@ Detailed example notebook: :doc:`../tutorials/8_train_sqil`
:skipif: skip_doctests

import datasets
import gym
import gymnasium as gym
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv

13 changes: 1 addition & 12 deletions docs/getting-started/installation.rst
@@ -6,13 +6,7 @@ Prerequisites
-------------

- Python 3.8+
- Specific versions of pip and setuptools due to \
`a bug with gym <https://github.com/openai/gym/issues/3176>`_:

.. code-block:: bash

pip install -U setuptools==65.5.0 pip==21

- pip (it helps to make sure this is up-to-date: ``pip install -U pip``)
- (on ARM64 Macs) you need to set environment variables due to \
`a bug in grpcio <https://stackoverflow.com/questions/66640705/how-can-i-install-grpcio-on-an-apple-m1-silicon-laptop>`_:

@@ -23,11 +17,6 @@ Prerequisites

- (Optional) OpenGL (to render gym environments)
- (Optional) FFmpeg (to encode videos of renders)
- (Optional) MuJoCo (follow instructions to install `mujoco\_py v1.5 here`_)

.. _mujoco_py v1.5 here:
https://github.com/openai/mujoco-py/tree/498b451a03fb61e5bdfcb6956d8d7c881b1098b5#install-mujoco


Installation from PyPI
----------------------
2 changes: 1 addition & 1 deletion docs/main-concepts/reward_networks.rst
@@ -47,7 +47,7 @@ In order to use a reward network to train a policy, we need to integrate it into

import numpy as np
rng = np.random.default_rng(0)
from gym.spaces import Box
from gymnasium.spaces import Box
obs_space = Box(np.ones(2), np.ones(2))
action_space = Box(np.ones(5), np.ones(5))

37 changes: 17 additions & 20 deletions docs/tutorials/10_train_custom_env.ipynb
@@ -34,32 +34,30 @@
"metadata": {},
"outputs": [],
"source": [
"from typing import Dict, Optional\n",
"from typing import Any\n",
"import numpy as np\n",
"import gym\n",
"import gymnasium as gym\n",
"\n",
"from gym.spaces import Box\n",
"from gym.utils import seeding\n",
"from gymnasium.spaces import Box\n",
"\n",
"\n",
"class ObservationMatchingEnv(gym.Env):\n",
" def __init__(self, num_options: int = 2):\n",
" self.state = None\n",
" self.num_options = num_options\n",
" self.observation_space = Box(0, 1, shape=(num_options,), dtype=np.float32)\n",
" self.action_space = Box(0, 1, shape=(num_options,), dtype=np.float32)\n",
" self.seed()\n",
" self.observation_space = Box(0, 1, shape=(num_options,))\n",
" self.action_space = Box(0, 1, shape=(num_options,))\n",
"\n",
" def seed(self, seed=None):\n",
" self.np_random, seed = seeding.np_random(seed)\n",
" return [seed]\n",
"\n",
" def reset(self):\n",
" self.state = self.np_random.uniform(size=self.num_options)\n",
" return self.state\n",
" def reset(self, seed: int = None, options: Optional[Dict[str, Any]] = None):\n",
" super().reset(seed=seed, options=options)\n",
" self.state = self.observation_space.sample()\n",
" return self.state, {}\n",
"\n",
" def step(self, action):\n",
" reward = -np.abs(self.state - action).mean()\n",
" self.state = self.np_random.uniform(size=self.num_options)\n",
" return self.state, reward, False, {}"
" self.state = self.observation_space.sample()\n",
" return self.state, reward, False, False, {}"
]
},
{
@@ -126,7 +124,7 @@
"metadata": {},
"outputs": [],
"source": [
"from gym.wrappers import TimeLimit\n",
"from gymnasium.wrappers import TimeLimit\n",
"from imitation.data import rollout\n",
"from imitation.data.wrappers import RolloutInfoWrapper\n",
"from imitation.util.util import make_vec_env\n",
@@ -176,7 +174,7 @@
"metadata": {},
"outputs": [],
"source": [
"from gym.wrappers import TimeLimit\n",
"from gymnasium.wrappers import TimeLimit\n",
"from imitation.data import rollout\n",
"from imitation.data.wrappers import RolloutInfoWrapper\n",
"from stable_baselines3.common.vec_env import DummyVecEnv\n",
@@ -236,7 +234,7 @@
"from stable_baselines3 import PPO\n",
"from stable_baselines3.ppo import MlpPolicy\n",
"from stable_baselines3.common.evaluation import evaluate_policy\n",
"from gym.wrappers import TimeLimit\n",
"from gymnasium.wrappers import TimeLimit\n",
"\n",
"expert = PPO(\n",
" policy=MlpPolicy,\n",
@@ -266,8 +264,7 @@
"# n_steps=64,\n",
"# )\n",
"expert.learn(10_000) # Note: set to 100000 to train a proficient expert\n",
"\n",
"reward, _ = evaluate_policy(expert, env, 10)\n",
"reward, _ = evaluate_policy(expert, expert.get_env(), 10)\n",
"print(f\"Expert reward: {reward}\")"
]
},
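The notebook diff above rewrites `ObservationMatchingEnv` for gymnasium: the separate `seed()` method disappears, seeding moves into `reset(seed=...)`, `reset()` returns `(obs, info)`, and `step()` returns five values. A dependency-free sketch that mimics the same shape (using the stdlib `random` module in place of gymnasium's seeding machinery; illustrative only, not the PR's code):

```python
import random
from typing import Any, Dict, Optional


class MatchingEnvSketch:
    """Pure-Python mimic of the updated environment's API shape."""

    def __init__(self, num_options: int = 2):
        self.num_options = num_options
        self.np_random = random.Random()
        self.state = None

    def reset(
        self,
        seed: Optional[int] = None,
        options: Optional[Dict[str, Any]] = None,
    ):
        if seed is not None:
            # gymnasium's super().reset(seed=seed) re-seeds self.np_random
            self.np_random.seed(seed)
        self.state = [self.np_random.random() for _ in range(self.num_options)]
        return self.state, {}  # new API: (obs, info)

    def step(self, action):
        # Reward is the (negated) mean absolute difference to the target state.
        diffs = [abs(s - a) for s, a in zip(self.state, action)]
        reward = -sum(diffs) / self.num_options
        self.state = [self.np_random.random() for _ in range(self.num_options)]
        # new API: (obs, reward, terminated, truncated, info)
        return self.state, reward, False, False, {}
```

Because the seed is now threaded through `reset`, two instances reset with the same seed produce identical observations.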
4 changes: 2 additions & 2 deletions docs/tutorials/1_train_bc.ipynb
@@ -32,13 +32,13 @@
"outputs": [],
"source": [
"import numpy as np\n",
"import seals # noqa: F401 # needed to load \"seals/\" environments\n",
"import gymnasium as gym\n",
"from imitation.policies.serialize import load_policy\n",
"from imitation.util.util import make_vec_env\n",
"from imitation.data.wrappers import RolloutInfoWrapper\n",
"\n",
"env = make_vec_env(\n",
" \"seals/CartPole-v0\",\n",
" \"seals:seals/CartPole-v0\",\n",
" rng=np.random.default_rng(),\n",
" post_wrappers=[\n",
" lambda env, _: RolloutInfoWrapper(env)\n",
4 changes: 2 additions & 2 deletions docs/tutorials/2_train_dagger.ipynb
@@ -27,12 +27,12 @@
"outputs": [],
"source": [
"import numpy as np\n",
"import seals # noqa: F401 # needed to load \"seals/\" environments\n",
"import gymnasium as gym\n",
"from imitation.policies.serialize import load_policy\n",
"from imitation.util.util import make_vec_env\n",
"\n",
"env = make_vec_env(\n",
" \"seals/CartPole-v0\",\n",
" \"seals:seals/CartPole-v0\",\n",
" rng=np.random.default_rng(),\n",
" n_envs=1,\n",
")\n",
5 changes: 2 additions & 3 deletions docs/tutorials/3_train_gail.ipynb
@@ -27,15 +27,14 @@
"outputs": [],
"source": [
"import numpy as np\n",
"import seals # noqa: F401 # needed to load \"seals/\" environments\n",
"from imitation.policies.serialize import load_policy\n",
"from imitation.util.util import make_vec_env\n",
"from imitation.data.wrappers import RolloutInfoWrapper\n",
"\n",
"SEED = 42\n",
"\n",
"env = make_vec_env(\n",
" \"seals/CartPole-v0\",\n",
" \"seals:seals/CartPole-v0\",\n",
" rng=np.random.default_rng(SEED),\n",
" n_envs=8,\n",
" post_wrappers=[\n",
@@ -45,7 +44,7 @@
"expert = load_policy(\n",
" \"ppo-huggingface\",\n",
" organization=\"HumanCompatibleAI\",\n",
" env_name=\"seals-CartPole-v0\",\n",
" env_name=\"seals:seals/CartPole-v0\",\n",
" venv=env,\n",
")"
]
4 changes: 2 additions & 2 deletions docs/tutorials/4_train_airl.ipynb
@@ -24,7 +24,7 @@
"outputs": [],
"source": [
"import numpy as np\n",
"import seals # noqa: F401 # needed to load \"seals/\" environments\n",
"import gymnasium as gym\n",
"from imitation.policies.serialize import load_policy\n",
"from imitation.util.util import make_vec_env\n",
"from imitation.data.wrappers import RolloutInfoWrapper\n",
@@ -127,7 +127,7 @@
" reward_net=reward_net,\n",
")\n",
"\n",
"env.seed(SEED)\n",
"env.reset(seed=SEED)\n",
"learner_rewards_before_training, _ = evaluate_policy(\n",
" learner, env, 100, return_episode_rewards=True\n",
")\n",
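The AIRL notebook diff above replaces `env.seed(SEED)` with `env.reset(seed=SEED)`, matching gymnasium's removal of the standalone `seed()` method. A hedged pure-Python sketch of how a compatibility shim could keep legacy `seed()` call sites working against a new-style environment (all names here are hypothetical, not from the PR):

```python
import random


class NewStyleEnv:
    """Minimal environment that only supports reset(seed=...)."""

    def __init__(self):
        self.np_random = random.Random()

    def reset(self, seed=None, options=None):
        if seed is not None:
            self.np_random.seed(seed)
        obs = self.np_random.random()
        return obs, {}


class LegacySeedShim:
    """Lets old `env.seed(SEED)` call sites drive a new-style env."""

    def __init__(self, env):
        self.env = env
        self._pending_seed = None

    def seed(self, seed):
        # Defer: the seed takes effect on the next reset, as in gymnasium.
        self._pending_seed = seed

    def reset(self, **kwargs):
        seed = kwargs.pop("seed", self._pending_seed)
        self._pending_seed = None
        return self.env.reset(seed=seed, **kwargs)
```

The PR takes the simpler route of updating call sites directly; a shim like this would only matter for third-party code still calling `seed()`.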
2 changes: 2 additions & 0 deletions docs/tutorials/5_train_preference_comparisons.ipynb
@@ -23,11 +23,13 @@
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"from imitation.algorithms import preference_comparisons\n",
"from imitation.rewards.reward_nets import BasicRewardNet\n",
"from imitation.util.networks import RunningNorm\n",
"from imitation.util.util import make_vec_env\n",
"from imitation.policies.base import FeedForward32Policy, NormalizeFeaturesExtractor\n",
"import gymnasium as gym\n",
"from stable_baselines3 import PPO\n",
"import numpy as np\n",
"\n",