[RLlib] New API stack: On by default for BC/MARWIL/CQL. (#48599)
sven1977 authored Nov 10, 2024
1 parent aee0a0e commit 03ea4f6
Showing 20 changed files with 135 additions and 153 deletions.
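This commit moves the `api_stack()` enablement from the individual algorithm configs into the base `AlgorithmConfig.__init__`, which flips BC, MARWIL, and CQL (previously still old-stack by default) over to the new API stack (RLModule/Learner plus EnvRunner/ConnectorV2). Tests that still exercise the old stack now opt out explicitly. As a rough user-facing sketch (not part of this diff; env and settings are illustrative), opting back into the old stack mirrors the pattern the updated tests below use:

from ray.rllib.algorithms.bc import BCConfig

config = (
    BCConfig()
    # Explicitly fall back to the old API stack; the new stack is now the
    # default for BC/MARWIL/CQL as well.
    .api_stack(
        enable_rl_module_and_learner=False,
        enable_env_runner_and_connector_v2=False,
    )
    .environment(env="CartPole-v1")
)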
5 changes: 3 additions & 2 deletions rllib/BUILD
@@ -948,12 +948,13 @@ py_test(
)

# CQL
# @OldAPIStack
py_test(
name = "test_cql",
name = "test_cql_old_api_stack",
tags = ["team:rllib", "algorithms_dir"],
size = "large",
data = ["tests/data/pendulum/small.json"],
srcs = ["algorithms/cql/tests/test_cql.py"]
srcs = ["algorithms/cql/tests/test_cql_old_api_stack.py"]
)

# DQN
61 changes: 24 additions & 37 deletions rllib/algorithms/algorithm_config.py
@@ -82,25 +82,6 @@

Space = gym.Space

"""TODO(jungong, sven): in "offline_data" we can potentially unify all input types
under input and input_config keys. E.g.
input: sample
input_config {
env: CartPole-v1
}
or:
input: json_reader
input_config {
path: /tmp/
}
or:
input: dataset
input_config {
format: parquet
path: /tmp/
}
"""


if TYPE_CHECKING:
from ray.rllib.algorithms.algorithm import Algorithm
Expand Down Expand Up @@ -131,12 +112,13 @@ class AlgorithmConfig(_Config):
from ray.rllib.algorithms.callbacks import MemoryTrackingCallbacks
# Construct a generic config object, specifying values within different
# sub-categories, e.g. "training".
config = (PPOConfig().training(gamma=0.9, lr=0.01)
.environment(env="CartPole-v1")
.resources(num_gpus=0)
.env_runners(num_env_runners=0)
.callbacks(MemoryTrackingCallbacks)
)
config = (
PPOConfig()
.training(gamma=0.9, lr=0.01)
.environment(env="CartPole-v1")
.env_runners(num_env_runners=0)
.callbacks(MemoryTrackingCallbacks)
)
# A config object can be used to construct the respective Algorithm.
rllib_algo = config.build()
Expand Down Expand Up @@ -321,10 +303,6 @@ def __init__(self, algo_class: Optional[type] = None):
# Default setting for skipping `nan` gradient updates.
self.torch_skip_nan_gradients = False

# `self.api_stack()`
self.enable_rl_module_and_learner = False
self.enable_env_runner_and_connector_v2 = False

# `self.environment()`
self.env = None
self.env_config = {}
Expand Down Expand Up @@ -425,7 +403,19 @@ def __init__(self, algo_class: Optional[type] = None):
self.explore = True
# This is not compatible with RLModules, which have a method
# `forward_exploration` to specify custom exploration behavior.
self.exploration_config = {}
if not hasattr(self, "exploration_config"):
# Helper to keep track of the original exploration config when dis-/enabling
# rl modules.
self._prior_exploration_config = None
self.exploration_config = {}

# `self.api_stack()`
self.enable_rl_module_and_learner = True
self.enable_env_runner_and_connector_v2 = True
self.api_stack(
enable_rl_module_and_learner=True,
enable_env_runner_and_connector_v2=True,
)

# `self.multi_agent()`
# TODO (sven): Prepare multi-agent setup for logging each agent's and each
Expand Down Expand Up @@ -549,9 +539,6 @@ def __init__(self, algo_class: Optional[type] = None):
# `self.rl_module()`
self._model_config = {}
self._rl_module_spec = None
# Helper to keep track of the original exploration config when dis-/enabling
# rl modules.
self.__prior_exploration_config = None
# Module ID specific config overrides.
self.algorithm_config_overrides_per_module = {}
# Cached, actual AlgorithmConfig objects derived from
Expand Down Expand Up @@ -1612,13 +1599,13 @@ def api_stack(
self.enable_rl_module_and_learner = enable_rl_module_and_learner

if enable_rl_module_and_learner is True and self.exploration_config:
self.__prior_exploration_config = self.exploration_config
self._prior_exploration_config = self.exploration_config
self.exploration_config = {}

elif enable_rl_module_and_learner is False and not self.exploration_config:
if self.__prior_exploration_config is not None:
self.exploration_config = self.__prior_exploration_config
self.__prior_exploration_config = None
if self._prior_exploration_config is not None:
self.exploration_config = self._prior_exploration_config
self._prior_exploration_config = None
else:
logger.warning(
"config.enable_rl_module_and_learner was set to False, but no "
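The `algorithm_config.py` changes do three things: the stale offline-data TODO docstring is dropped, the new-API-stack flags now default to True and are applied through `self.api_stack(...)` inside `__init__`, and the name-mangled `__prior_exploration_config` helper becomes `_prior_exploration_config`, initialized before the `api_stack()` call so the stash/restore logic also works during construction. A minimal sketch of that round trip, with illustrative asserts based on the `api_stack()` branches shown above:

from ray.rllib.algorithms.ppo import PPOConfig

config = PPOConfig()
# New stack is on by default; the old-stack exploration config was stashed.
assert config.enable_rl_module_and_learner is True
assert config.exploration_config == {}

# Switching back to the old stack restores the stashed config
# (PPO's default {"type": "StochasticSampling"}).
config.api_stack(
    enable_rl_module_and_learner=False,
    enable_env_runner_and_connector_v2=False,
)
assert config.exploration_config.get("type") == "StochasticSampling"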
8 changes: 2 additions & 6 deletions rllib/algorithms/appo/appo.py
@@ -88,8 +88,6 @@ class APPOConfig(IMPALAConfig):

def __init__(self, algo_class=None):
"""Initializes a APPOConfig instance."""
super().__init__(algo_class=algo_class or APPO)

self.exploration_config = {
# The Exploration class to use. In the simplest case, this is the name
# (str) of any class present in the `rllib.utils.exploration` package.
@@ -100,6 +98,8 @@ def __init__(self, algo_class=None):
# Add constructor kwargs here (if any).
}

super().__init__(algo_class=algo_class or APPO)

# fmt: off
# __sphinx_doc_begin__
# APPO specific settings:
@@ -138,10 +138,6 @@ def __init__(self, algo_class=None):
self.vf_loss_coeff = 0.5
self.entropy_coeff = 0.01
self.tau = 1.0
self.api_stack(
enable_rl_module_and_learner=True,
enable_env_runner_and_connector_v2=True,
)
# __sphinx_doc_end__
# fmt: on

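In `appo.py` (and likewise in the DQN, IMPALA, MARWIL, and PPO sections below) the old-stack `exploration_config` defaults must now be assigned before `super().__init__()` runs: the base constructor enables the new API stack and immediately stashes any pre-existing `exploration_config` into `_prior_exploration_config`, so a value assigned after the super call would stay active on the new stack and never be stashed. A schematic subclass showing the ordering (hypothetical config class, not part of RLlib):

from ray.rllib.algorithms.algorithm_config import AlgorithmConfig

class MyAlgoConfig(AlgorithmConfig):
    def __init__(self, algo_class=None):
        # 1) Old-stack exploration defaults go first ...
        self.exploration_config = {"type": "StochasticSampling"}
        # 2) ... then the base constructor runs, switches the new API stack
        #    on, and stashes `exploration_config` away into
        #    `_prior_exploration_config`.
        super().__init__(algo_class=algo_class)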
4 changes: 1 addition & 3 deletions rllib/algorithms/appo/tests/test_appo.py
@@ -4,9 +4,7 @@
import ray.rllib.algorithms.appo as appo
from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
from ray.rllib.utils.metrics import (
LEARNER_RESULTS,
)
from ray.rllib.utils.metrics import LEARNER_RESULTS
from ray.rllib.utils.test_utils import (
check_train_results,
check_train_results_new_api_stack,
4 changes: 4 additions & 0 deletions rllib/algorithms/bc/tests/test_bc_old_api_stack.py
@@ -37,6 +37,10 @@ def test_bc_compilation_and_learning_from_offline_file(self):

config = (
bc.BCConfig()
.api_stack(
enable_env_runner_and_connector_v2=False,
enable_rl_module_and_learner=False,
)
.evaluation(
evaluation_interval=3,
evaluation_num_env_runners=1,
10 changes: 0 additions & 10 deletions rllib/algorithms/cql/cql.py
@@ -108,19 +108,9 @@ def __init__(self, algo_class=None):

# Changes to Algorithm's/SACConfig's default:

# `.api_stack()`
self.api_stack(
enable_rl_module_and_learner=False,
enable_env_runner_and_connector_v2=False,
)
# .reporting()
self.min_sample_timesteps_per_iteration = 0
self.min_train_timesteps_per_iteration = 100
# `.api_stack()`
self.api_stack(
enable_rl_module_and_learner=False,
enable_env_runner_and_connector_v2=False,
)
# fmt: on
# __sphinx_doc_end__

rllib/algorithms/cql/tests/test_cql_old_api_stack.py
@@ -39,6 +39,10 @@ def test_cql_compilation(self):

config = (
cql.CQLConfig()
.api_stack(
enable_rl_module_and_learner=False,
enable_env_runner_and_connector_v2=False,
)
.environment(
env="Pendulum-v1",
)
21 changes: 9 additions & 12 deletions rllib/algorithms/dqn/dqn.py
@@ -134,18 +134,19 @@ class DQNConfig(AlgorithmConfig):

def __init__(self, algo_class=None):
"""Initializes a DQNConfig instance."""
super().__init__(algo_class=algo_class or DQN)

# Overrides of AlgorithmConfig defaults
# `env_runners()`
# Set to `self.n_step`, if 'auto'.
self.rollout_fragment_length: Union[int, str] = "auto"
self.exploration_config = {
"type": "EpsilonGreedy",
"initial_epsilon": 1.0,
"final_epsilon": 0.02,
"epsilon_timesteps": 10000,
}

super().__init__(algo_class=algo_class or DQN)

# Overrides of AlgorithmConfig defaults
# `env_runners()`
# Set to `self.n_step`, if 'auto'.
self.rollout_fragment_length: Union[int, str] = "auto"
# New stack uses `epsilon` as either a constant value or a scheduler
# defined like this.
# TODO (simon): Ensure that users can understand how to provide epsilon.
@@ -174,7 +175,6 @@ def __init__(self, algo_class=None):
self.target_network_update_freq = 500
self.num_steps_sampled_before_learning_starts = 1000
self.store_buffer_in_checkpoints = False
self.lr_schedule = None
self.adam_epsilon = 1e-8

self.tau = 1.0
@@ -203,14 +203,11 @@ def __init__(self, algo_class=None):
# Beta parameter for sampling from prioritized replay buffer.
"beta": 0.4,
}
# `.api_stack()`
self.api_stack(
enable_rl_module_and_learner=True,
enable_env_runner_and_connector_v2=True,
)
# fmt: on
# __sphinx_doc_end__

self.lr_schedule = None # @OldAPIStack

# Deprecated
self.buffer_size = DEPRECATED_VALUE
self.prioritized_replay = DEPRECATED_VALUE
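The DQN diff follows the same reordering and also retires two old-stack leftovers from the documented defaults: `lr_schedule` moves below the `__sphinx_doc_end__` marker and is tagged `@OldAPIStack`, while exploration on the new stack is driven by the `epsilon` setting the TODO above refers to rather than by `exploration_config`. A rough usage sketch, assuming `DQNConfig.training()` accepts an `epsilon` schedule given as `[timestep, value]` pairs (that format is not shown in this diff):

from ray.rllib.algorithms.dqn import DQNConfig

config = (
    DQNConfig()
    .environment("CartPole-v1")
    # Assumed schedule: epsilon anneals from 1.0 to 0.02 over 10k timesteps.
    .training(epsilon=[[0, 1.0], [10000, 0.02]])
)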
8 changes: 2 additions & 6 deletions rllib/algorithms/impala/impala.py
@@ -123,8 +123,6 @@ class IMPALAConfig(AlgorithmConfig):

def __init__(self, algo_class=None):
"""Initializes a IMPALAConfig instance."""
super().__init__(algo_class=algo_class or IMPALA)

self.exploration_config = { # @OldAPIstack
# The Exploration class to use. In the simplest case, this is the name
# (str) of any class present in the `rllib.utils.exploration` package.
@@ -135,6 +133,8 @@ def __init__(self, algo_class=None):
# Add constructor kwargs here (if any).
}

super().__init__(algo_class=algo_class or IMPALA)

# fmt: off
# __sphinx_doc_begin__

@@ -170,10 +170,6 @@ def __init__(self, algo_class=None):
self.num_env_runners = 2
self.lr = 0.0005
self.min_time_s_per_iteration = 10
self.api_stack(
enable_rl_module_and_learner=True,
enable_env_runner_and_connector_v2=True,
)
# __sphinx_doc_end__
# fmt: on

22 changes: 10 additions & 12 deletions rllib/algorithms/marwil/marwil.py
@@ -137,6 +137,16 @@ class MARWILConfig(AlgorithmConfig):

def __init__(self, algo_class=None):
"""Initializes a MARWILConfig instance."""
self.exploration_config = {
# The Exploration class to use. In the simplest case, this is the name
# (str) of any class present in the `rllib.utils.exploration` package.
# You can also provide the python class directly or the full location
# of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy.
# EpsilonGreedy").
"type": "StochasticSampling",
# Add constructor kwargs here (if any).
}

super().__init__(algo_class=algo_class or MARWIL)

# fmt: off
@@ -165,18 +175,6 @@ def __init__(self, algo_class=None):
self.lr = 1e-4
self.lambda_ = 1.0
self.train_batch_size = 2000
# TODO (Artur): MARWIL should not need an exploration config as an offline
# algorithm. However, the current implementation of the CRR algorithm
# requires it. Investigate.
self.exploration_config = {
# The Exploration class to use. In the simplest case, this is the name
# (str) of any class present in the `rllib.utils.exploration` package.
# You can also provide the python class directly or the full location
# of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy.
# EpsilonGreedy").
"type": "StochasticSampling",
# Add constructor kwargs here (if any).
}

# Materialize only the data in raw format, but not the mapped data b/c
# MARWIL uses a connector to calculate values and therefore the module
12 changes: 12 additions & 0 deletions rllib/algorithms/marwil/tests/test_marwil_old_api_stack.py
@@ -49,6 +49,10 @@ def test_marwil_compilation_and_learning_from_offline_file(self):

config = (
marwil.MARWILConfig()
.api_stack(
enable_rl_module_and_learner=False,
enable_env_runner_and_connector_v2=False,
)
.env_runners(num_env_runners=2)
.environment(env="CartPole-v1")
.evaluation(
@@ -111,6 +115,10 @@ def test_marwil_cont_actions_from_offline_file(self):

config = (
marwil.MARWILConfig()
.api_stack(
enable_rl_module_and_learner=False,
enable_env_runner_and_connector_v2=False,
)
.env_runners(num_env_runners=1)
.evaluation(
evaluation_num_env_runners=1,
@@ -148,6 +156,10 @@ def test_marwil_loss_function(self):

config = (
marwil.MARWILConfig()
.api_stack(
enable_rl_module_and_learner=False,
enable_env_runner_and_connector_v2=False,
)
.env_runners(num_env_runners=0)
.offline_data(input_=[data_file])
) # Learn from offline data.
10 changes: 2 additions & 8 deletions rllib/algorithms/ppo/ppo.py
@@ -110,8 +110,6 @@ class PPOConfig(AlgorithmConfig):

def __init__(self, algo_class=None):
"""Initializes a PPOConfig instance."""
super().__init__(algo_class=algo_class or PPO)

self.exploration_config = {
# The Exploration class to use. In the simplest case, this is the name
# (str) of any class present in the `rllib.utils.exploration` package.
@@ -122,6 +120,8 @@ def __init__(self, algo_class=None):
# Add constructor kwargs here (if any).
}

super().__init__(algo_class=algo_class or PPO)

# fmt: off
# __sphinx_doc_begin__
self.lr = 5e-5
@@ -146,12 +146,6 @@ def __init__(self, algo_class=None):

# Override some of AlgorithmConfig's default values with PPO-specific values.
self.num_env_runners = 2

# `.api_stack()`
self.api_stack(
enable_rl_module_and_learner=True,
enable_env_runner_and_connector_v2=True,
)
# __sphinx_doc_end__
# fmt: on
