diff --git a/doc/source/_includes/rllib/new_api_stack.rst b/doc/source/_includes/rllib/new_api_stack.rst index a23c0abbae60..08cdf1b5523b 100644 --- a/doc/source/_includes/rllib/new_api_stack.rst +++ b/doc/source/_includes/rllib/new_api_stack.rst @@ -4,6 +4,4 @@ The Ray Team plans to transition algorithms, example scripts, and documentation to the new code base thereby incrementally replacing the "old API stack" (e.g., ModelV2, Policy, RolloutWorker) throughout the subsequent minor releases leading up to Ray 3.0. - Note, however, that all algorithms continue to run by default with the old APIs. - :doc:`See here ` for more details on how to use the new API stack. \ No newline at end of file diff --git a/doc/source/rllib/doc_code/new_api_stack.py b/doc/source/rllib/doc_code/new_api_stack.py deleted file mode 100644 index 4e332beb1a7e..000000000000 --- a/doc/source/rllib/doc_code/new_api_stack.py +++ /dev/null @@ -1,141 +0,0 @@ -# __enabling-new-api-stack-sa-ppo-begin__ - -from ray.rllib.algorithms.ppo import PPOConfig - - -config = ( - PPOConfig() - .environment("CartPole-v1") - # Switch both the new API stack flags to True (both False by default). - # This enables the use of - # a) RLModule (replaces ModelV2) and Learner (replaces Policy) - # b) and automatically picks the correct EnvRunner (single-agent vs multi-agent) - # and enables ConnectorV2 support. - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) - .resources( - num_cpus_for_main_process=1, - ) - # We are using a simple 1-CPU setup here for learning. However, as the new stack - # supports arbitrary scaling on the learner axis, feel free to set - # `num_learners` to the number of available GPUs for multi-GPU training (and - # `num_gpus_per_learner=1`). - .learners( - num_learners=0, # <- in most cases, set this value to the number of GPUs - num_gpus_per_learner=0, # <- set this to 1, if you have at least 1 GPU - ) - # When using RLlib's default models (RLModules) AND the new EnvRunners, you should - # set this flag in your model config. Having to set this, will no longer be required - # in the near future. It does yield a small performance advantage as value function - # predictions for PPO are no longer required to happen on the sampler side (but are - # now fully located on the learner side, which might have GPUs available). - .training(model={"uses_new_env_runners": True}) -) - -# __enabling-new-api-stack-sa-ppo-end__ - -# Test whether it works. -print(config.build().train()) - - -# __enabling-new-api-stack-ma-ppo-begin__ - -from ray.rllib.algorithms.ppo import PPOConfig # noqa -from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole # noqa - - -# A typical multi-agent setup (otherwise using the exact same parameters as before) -# looks like this. -config = ( - PPOConfig() - .environment(MultiAgentCartPole, env_config={"num_agents": 2}) - # Switch both the new API stack flags to True (both False by default). - # This enables the use of - # a) RLModule (replaces ModelV2) and Learner (replaces Policy) - # b) and automatically picks the correct EnvRunner (single-agent vs multi-agent) - # and enables ConnectorV2 support. - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) - .resources( - num_cpus_for_main_process=1, - ) - # We are using a simple 1-CPU setup here for learning. 
However, as the new stack - # supports arbitrary scaling on the learner axis, feel free to set - # `num_learners` to the number of available GPUs for multi-GPU training (and - # `num_gpus_per_learner=1`). - .learners( - num_learners=0, # <- in most cases, set this value to the number of GPUs - num_gpus_per_learner=0, # <- set this to 1, if you have at least 1 GPU - ) - # When using RLlib's default models (RLModules) AND the new EnvRunners, you should - # set this flag in your model config. Having to set this, will no longer be required - # in the near future. It does yield a small performance advantage as value function - # predictions for PPO are no longer required to happen on the sampler side (but are - # now fully located on the learner side, which might have GPUs available). - .training(model={"uses_new_env_runners": True}) - # Because you are in a multi-agent env, you have to set up the usual multi-agent - # parameters: - .multi_agent( - policies={"p0", "p1"}, - # Map agent 0 to p0 and agent 1 to p1. - policy_mapping_fn=lambda agent_id, episode, **kwargs: f"p{agent_id}", - ) -) - -# __enabling-new-api-stack-ma-ppo-end__ - -# Test whether it works. -print(config.build().train()) - - -# __enabling-new-api-stack-sa-sac-begin__ - -from ray.rllib.algorithms.sac import SACConfig # noqa - - -config = ( - SACConfig() - .environment("Pendulum-v1") - # Switch both the new API stack flags to True (both False by default). - # This enables the use of - # a) RLModule (replaces ModelV2) and Learner (replaces Policy) - # b) and automatically picks the correct EnvRunner (single-agent vs multi-agent) - # and enables ConnectorV2 support. - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) - .resources( - num_cpus_for_main_process=1, - ) - # We are using a simple 1-CPU setup here for learning. However, as the new stack - # supports arbitrary scaling on the learner axis, feel free to set - # `num_learners` to the number of available GPUs for multi-GPU training (and - # `num_gpus_per_learner=1`). - .learners( - num_learners=0, # <- in most cases, set this value to the number of GPUs - num_gpus_per_learner=0, # <- set this to 1, if you have at least 1 GPU - ) - # When using RLlib's default models (RLModules) AND the new EnvRunners, you should - # set this flag in your model config. Having to set this, will no longer be required - # in the near future. It does yield a small performance advantage as value function - # predictions for PPO are no longer required to happen on the sampler side (but are - # now fully located on the learner side, which might have GPUs available). - .training( - model={"uses_new_env_runners": True}, - replay_buffer_config={"type": "EpisodeReplayBuffer"}, - # Note, new API stack SAC uses its own learning rates specific to actor, - # critic, and alpha. `lr` therefore needs to be set to `None`. See `actor_lr`, - # `critic_lr`, and `alpha_lr` for the specific learning rates, respectively. - lr=None, - ) -) -# __enabling-new-api-stack-sa-sac-end__ - - -# Test whether it works. 
-print(config.build().train()) diff --git a/doc/source/rllib/index.rst b/doc/source/rllib/index.rst index 2f229eded32d..16bd8647a2c4 100644 --- a/doc/source/rllib/index.rst +++ b/doc/source/rllib/index.rst @@ -17,6 +17,7 @@ RLlib: Industry-Grade Reinforcement Learning user-guides rllib-examples rllib-new-api-stack + new-api-stack-migration-guide package_ref/index diff --git a/doc/source/rllib/new-api-stack-migration-guide.rst b/doc/source/rllib/new-api-stack-migration-guide.rst new file mode 100644 index 000000000000..525f6a984e83 --- /dev/null +++ b/doc/source/rllib/new-api-stack-migration-guide.rst @@ -0,0 +1,429 @@ +.. include:: /_includes/rllib/we_are_hiring.rst + +.. include:: /_includes/rllib/new_api_stack.rst + + +.. _rllib-new-api-stack-migration-guide: + + +New API stack migration guide +============================= + +This page explains, step by step, how to convert and translate your existing old API stack +RLlib classes and code to RLlib's new API stack. +:ref:`Why you should migrate to the new API stack `. + + +.. note:: + + Even though the new API stack still provides rudimentary support for `TensorFlow `__, + RLlib supports a single deep learning framework, the `PyTorch `__ + framework, dropping TensorFlow support entirely. + Note, though, that the Ray team continues to design RLlib to be framework-agnostic. + + +Change your AlgorithmConfig +--------------------------- + +RLlib turns off the new API stack by default for all RLlib algorithms. To activate it, use the `api_stack()` method +in your `AlgorithmConfig` object like so: + +.. testcode:: + + from ray.rllib.algorithms.ppo import PPOConfig + + config = ( + PPOConfig() + # Switch both the new API stack flags to True (both False by default). + # This action enables the use of + # a) RLModule (replaces ModelV2) and Learner (replaces Policy). + # b) the correct EnvRunner, which replaces RolloutWorker, and + # ConnectorV2 pipelines, which replaces the old stack Connectors. + .api_stack( + enable_rl_module_and_learner=True, + enable_env_runner_and_connector_v2=True, + ) + ) + + +Note that there are a few other differences between configuring an old API stack algorithm +and its new stack counterpart. +Go through the following sections and make sure you're translating the respective +settings. Remove settings that the new stack doesn't support or need. + + +AlgorithmConfig.framework() +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Even though the new API stack still provides rudimentary support for `TensorFlow `__, +RLlib supports a single deep learning framework, the `PyTorch `__ framework. + +The new API stack deprecates the following framework-related settings: + +.. testcode:: + + # Make sure you always set the framework to "torch"... + config.framework("torch") + + # ... and drop all tf-specific settings. + config.framework( + eager_tracing=True, + eager_max_retraces=20, + tf_session_args={}, + local_tf_session_args={}, + ) + + +AlgorithmConfig.resources() +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The `num_gpus` and `_fake_gpus` settings have been deprecated. To place your +RLModule on one or more GPUs on the Learner side, do the following: + +.. testcode:: + + # The following setting is equivalent to the old stack's `config.resources(num_gpus=2)`. + config.learners( + num_learners=2, + num_gpus_per_learner=1, + ) + +.. hint:: + + The `num_learners` setting determines how many remote :py:class:`~ray.rllib.core.learner.learner.Learner` + workers there are in your Algorithm's :py:class:`~ray.rllib.core.learner.learner_group.LearnerGroup`. 
+ If you set this to 0, your LearnerGroup only contains a **local** Learner that runs on the main + process (and shares the compute resources with that process, usually 1 CPU). + For asynchronous algorithms like IMPALA or APPO, this setting should therefore always be >0. + +`See here for an example on how to train with fractional GPUs `__. +Also note that for fractional GPUs, you should always set `num_learners` to 0 or 1. + +If GPUs aren't available, but you want to learn with more than one +:py:class:`~ray.rllib.core.learner.learner.Learner` in a multi-**CPU** fashion, you can do the following: + +.. testcode:: + + config.learners( + num_learners=2, # or >2 + num_cpus_per_learner=1, # <- default + num_gpus_per_learner=0, # <- default + ) + +The setting `num_cpus_for_local_worker` has been renamed to `num_cpus_for_main_process`. + +.. testcode:: + + config.resources(num_cpus_for_main_process=0) # default is 1 + + +AlgorithmConfig.training() +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Train batch size +................ + +Due to the new API stack's :py:class:`~ray.rllib.core.learner.learner.Learner` worker +architecture, training may be distributed over n +:py:class:`~ray.rllib.core.learner.learner.Learner` workers, so RLlib provides the train batch size +per individual :py:class:`~ray.rllib.core.learner.learner.Learner`. +You should no longer use the `train_batch_size` setting: + + +.. testcode:: + + config.training( + train_batch_size_per_learner=512, + ) + +You don't need to change this setting, even when increasing the number of +:py:class:`~ray.rllib.core.learner.learner.Learner`, through `config.learners(num_learners=...)`. + +Note that a good rule of thumb for scaling on the learner axis is to keep the +`train_batch_size_per_learner` value constant with a growing number of Learners and +to increase the learning rate as follows: + +`lr = [original_lr] * ([num_learners] ** 0.5)` + + +Neural network configuration +............................ + +The old stack's `config.training(model=...)` is no longer supported on the new API stack. +Instead, use the new :py:meth:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig.rl_module` +method to configure RLlib's default :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule` +or specify and configure a custom :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule`. + +See :ref:`RLModules API `, a general guide that also explains +the use of the `config.rl_module()` method. + +If you have an old stack `ModelV2` and want to migrate the entire NN logic to the +new stack, see :ref:`ModelV2 to RLModule ` for migration instructions. + + +Learning rate- and coefficient schedules +........................................ + +If you're using schedules for learning rate or other coefficients, for example, the +`entropy_coeff` setting in PPO, provide scheduling information directly in the respective setting. +Scheduling behavior doesn't require a specific, separate setting anymore. + +When defining a schedule, provide a list of 2-tuples, where the first item is the global timestep +(*num_env_steps_sampled_lifetime* in the reported metrics) and the second item is the value that the learning rate should reach at that timestep. +Always start the first 2-tuple with timestep 0. Note that RLlib linearly interpolates values between +two provided timesteps. + +For example, to create a learning rate schedule that starts with a value of 1e-5, then increases over 1M timesteps to 1e-4 and stays constant after that, do the following: + +.. 
testcode::
+
+    config.training(
+        lr=[
+            [0, 1e-5],  # <- initial value at timestep 0
+            [1000000, 1e-4],  # <- final value at 1M timesteps
+        ],
+    )
+
+
+In the preceding example, the value after 500k timesteps is roughly `5.5e-5`, due to linear interpolation.
+
+Another example: to create an entropy coefficient schedule that starts with a value of 0.05, increases
+over 1M timesteps to 0.1, and then suddenly drops to 0.0 right after the 1Mth timestep, do the following:
+
+.. testcode::
+
+    config.training(
+        entropy_coeff=[
+            [0, 0.05],  # <- initial value at timestep 0
+            [1000000, 0.1],  # <- value at 1M timesteps
+            [1000001, 0.0],  # <- sudden drop to 0.0 right after 1M timesteps
+        ]
+    )
+
+If you need to configure more complex learning rate scheduling behavior or chain different schedulers
+into a pipeline, you can use the experimental `_torch_lr_scheduler_classes` config property.
+See `this example script `__ for more details.
+Note that this example only covers learning rate schedules, not any other coefficients.
+
+
+AlgorithmConfig.learners()
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This method isn't used on the old API stack because the old stack doesn't use Learner workers.
+
+It allows you to specify:
+
+1) the number of `Learner` workers through `.learners(num_learners=...)`.
+2) the resources per learner; use `.learners(num_gpus_per_learner=1)` for GPU training and `.learners(num_gpus_per_learner=0)` for CPU training.
+3) the custom Learner class you want to use (`example on how to do this here `__).
+4) a config dict you would like to set for your custom learner: `.learners(learner_config_dict={...})`. Note that every `Learner` has access to the entire `AlgorithmConfig` object through `self.config`, but setting the `learner_config_dict` is a convenient way to avoid having to create an entirely new `AlgorithmConfig` subclass only to support a few extra settings for your custom `Learner` class.
+
+
+AlgorithmConfig.env_runners()
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. testcode::
+
+    # RolloutWorkers have been replaced by EnvRunners. EnvRunners are more efficient and offer
+    # a cleaner separation-of-concerns design and code.
+    config.env_runners(
+        num_env_runners=2,  # use this instead of `num_workers`
+    )
+
+    # The following `env_runners` settings are deprecated and should no longer be explicitly
+    # set on the new stack:
+    config.env_runners(
+        create_env_on_local_worker=False,
+        sample_collector=None,
+        enable_connectors=True,
+        remote_worker_envs=False,
+        remote_env_batch_wait_ms=0,
+        preprocessor_pref="deepmind",
+        enable_tf1_exec_eagerly=False,
+        sampler_perf_stats_ema_coef=None,
+    )
+
+.. hint::
+
+    If you want to IDE-debug what's going on inside your `EnvRunners`, set `num_env_runners=0`
+    and make sure you're running your experiment locally and not through Ray Tune.
+    To do this with any of RLlib's `example `__
+    or `tuned_example `__ scripts,
+    set the command line args `--no-tune --num-env-runners=0`.
+
+If you were using the `observation_filter` setting, perform the following translations:
+
+.. testcode::
+
+    # For `observation_filter="NoFilter"`, don't set anything in particular. This is the default.
+ + # For `observation_filter="MeanStdFilter"`, do the following: + from ray.rllib.connectors.env_to_module import MeanStdFilter + + config.env_runners( + env_to_module_connector=lambda env: MeanStdFilter(multi_agent=False), # <- or True + ) + + +AlgorithmConfig.exploration() +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The new stack only supports the `explore` setting. +It determines whether the :py:meth:`~ray.rllib.core.rl_module.rl_module.RLModule._forward_exploration`, in the case `explore=True`, +or the :py:meth:`~ray.rllib.core.rl_module.rl_module.RLModule._forward_inference`, in the case `explore=False`, is the method +your :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule` calls +inside the :py:class:`~ray.rllib.env.env_runner.EnvRunner`. + +.. testcode:: + + config.exploration(explore=True) # <- or False + + +The `exploration_config` setting is deprecated and no longer used. Instead, determine the exact exploratory +behavior, for example, sample an action from a distribution, inside the overridden +:py:meth:`~ray.rllib.core.rl_module.rl_module.RLModule._forward_exploration` method of your +:py:class:`~ray.rllib.core.rl_module.rl_module.RLModule`. + + +Custom callbacks +---------------- + +If you're using custom callbacks on the old API stack, you're subclassing the :py:class`~ray.rllib.algorithms.callbacks.DefaultCallbacks` class. +You can continue this approach with the new API stack and also pass your custom subclass to your config like the following: + +.. testcode:: + + # config.callbacks(YourCallbacksClass) + +However, if you're overriding those methods that the EnvRunner side triggered, for example,`on_episode_start/stop/step/etc...`, +you might have to do a small amount of translation, because the +EnvRunner may have changed the arguments that RLlib passes to many of these methods. + +The following is a one-to-one translation guide for these types of Callbacks methods: + +.. testcode:: + + from ray.rllib.algorithms.callbacks import DefaultCallbacks + + class YourCallbacksClass(DefaultCallbacks): + + def on_episode_start( + self, + *, + episode, + env_runner, + metrics_logger, + env, + env_index, + rl_module, + + # Old API stack args; don't use or access these inside your method code. + worker=None, + base_env=None, + policies=None, + **kwargs, + ): + # The `SingleAgentEpisode` or `MultiAgentEpisode` that RLlib has just started. + # See https://docs.ray.io/en/latest/rllib/single-agent-episode.html for more details: + print(episode) + + # The `EnvRunner` class that collects the episode in question. + # This class used to be a `RolloutWorker`. On the new stack, this class is either a + # `SingleAgentEnvRunner` or a `MultiAgentEnvRunner` holding the gymnasium Env, + # the RLModule, and the 2 connector pipelines, env-to-module and module-to-env. + print(env_runner) + + # The MetricsLogger object on the EnvRunner (documentation is a WIP). + print(metrics_logger.peek("episode_return_mean", default=0.0)) + + # The gymnasium env that sample collection uses. Note that this env may be a + # gymnasium.vector.VectorEnv. + print(env) + + # The env index, in case of a vector env, that handles the `episode`. + print(env_index) + + # The RL Module that this EnvRunner uses. Note that this module may be a "plain", single-agent + # `RLModule`, or a `MultiRLModule` in the multi-agent case. 
+            print(rl_module)
+
+    # Change similarly:
+    # on_episode_created()
+    # on_episode_step()
+    # on_episode_end()
+
+
+The following callback methods are no longer available on the new API stack:
+
+**`on_sub_environment_created()`**: The new API stack uses `Farama's gymnasium `__ vector Envs, which leave RLlib
+no control to call a callback on each individual env-index's creation.
+
+**`on_create_policy()`**: This method is no longer available on the new API stack because only :py:class:`~ray.rllib.evaluation.rollout_worker.RolloutWorker` calls it.
+
+**`on_postprocess_trajectory()`**: The new API stack no longer triggers this method,
+because :py:class:`~ray.rllib.connectors.connector_v2.ConnectorV2` pipelines handle trajectory processing entirely.
+The documentation for :py:class:`~ray.rllib.connectors.connector_v2.ConnectorV2` is under development.
+
+
+.. _rllib-modelv2-to-rlmodule:
+
+ModelV2 to RLModule
+-------------------
+
+If you're using a custom :py:class:`~ray.rllib.models.modelv2.ModelV2` class and want to translate
+the entire NN architecture and possibly action distribution logic to the new API stack, see
+:ref:`RL Modules ` in addition to this section.
+
+See these example scripts for `how to write a custom CNN-containing RL Module `__
+and `how to write a custom LSTM-containing RL Module `__.
+
+There are various options for translating an existing, custom :py:class:`~ray.rllib.models.modelv2.ModelV2` from the old API stack
+to the new API stack's :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule`:
+
+1) Move your ModelV2 code to a new, custom `RLModule` class. See :ref:`RL Modules ` for details.
+2) Use an Algorithm checkpoint or a Policy checkpoint that you have from an old API stack training run and use this checkpoint with the `new stack RL Module convenience wrapper `__.
+3) Use an existing :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig` object from an old API stack training run, with the `new stack RL Module convenience wrapper `__.
+
+
+Custom loss functions and policies
+----------------------------------
+
+If you're using one or more custom loss functions or custom (PyTorch) optimizers to train your models, instead of doing these
+customizations inside the old stack's Policy class, you need to move the logic into the new API stack's
+:py:class:`~ray.rllib.core.learner.learner.Learner` class.
+
+See :ref:`Learner ` for details on how to write a custom Learner.
+
+The following example scripts show how to write:
+
+- `a simple custom loss function `__
+- `a custom Learner with 2 optimizers and different learning rates for each `__.
+
+Note that the new API stack doesn't support the Policy class. In the old stack, this class holds a
+neural network, which the :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule` replaces in the new API stack,
+an old stack connector, which the :py:class:`~ray.rllib.connectors.connector_v2.ConnectorV2` replaces in the new API stack,
+and one or more optimizers and losses, which the :py:class:`~ray.rllib.core.learner.learner.Learner` class replaces in the new API stack.
+
+The RL Module API is much more flexible than the old stack's Policy API and
+provides a cleaner separation-of-concerns experience: things related to action
+inference run on the EnvRunners, and things related to updating run on the Learner workers.
+It also provides superior scalability, allowing training in a multi-GPU setup in any Ray cluster
+and multi-node with multi-GPU training on the `Anyscale `__ platform.
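+
+To give a rough idea of what this move looks like, the following is a minimal sketch of a custom loss on the new API stack,
+assuming PPO and the PyTorch framework. The `PPOTorchLearnerWithWeightPenalty` class name and the `penalty_coeff`
+key are made up for this example, and the exact `compute_loss_for_module` signature may differ slightly between Ray
+versions, so check the Learner API reference before copying this verbatim:
+
+.. testcode::
+
+    from ray.rllib.algorithms.ppo import PPOConfig
+    from ray.rllib.algorithms.ppo.torch.ppo_torch_learner import PPOTorchLearner
+
+
+    class PPOTorchLearnerWithWeightPenalty(PPOTorchLearner):
+        """Example Learner that adds an L2 weight penalty to PPO's built-in loss."""
+
+        def compute_loss_for_module(self, *, module_id, config, batch, fwd_out):
+            # Compute the standard PPO loss first ...
+            base_loss = super().compute_loss_for_module(
+                module_id=module_id, config=config, batch=batch, fwd_out=fwd_out
+            )
+            # ... then add the (hypothetical) penalty term, scaled by a coefficient
+            # read from the `learner_config_dict`.
+            l2_penalty = sum(
+                param.pow(2.0).sum()
+                for param in self.module[module_id].parameters()
+            )
+            coeff = config.learner_config_dict.get("penalty_coeff", 0.0)
+            return base_loss + coeff * l2_penalty
+
+
+    config = (
+        PPOConfig()
+        .environment("CartPole-v1")
+        .api_stack(
+            enable_rl_module_and_learner=True,
+            enable_env_runner_and_connector_v2=True,
+        )
+        .training(learner_class=PPOTorchLearnerWithWeightPenalty)
+        .learners(learner_config_dict={"penalty_coeff": 1e-4})
+    )
+
+Building and training this config works exactly as before; the only moving part is the Learner class
+plugged in through `config.training(learner_class=...)`.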
+ + +Custom connectors (old-stack) +----------------------------- + +If you're using custom connectors from the old API stack, move your logic into the +new :py:class:`~ray.rllib.connectors.connector_v2.ConnectorV2` API. +Translate your agent connectors into env-to-module ConnectorV2 pieces and your +action connectors into module-to-env ConnectorV2 pieces. + +The :py:class:`~ray.rllib.connectors.connector_v2.ConnectorV2` documentation is under development. + +The following are some examples on how to write ConnectorV2 pieces for the +different pipelines: + +1) `Observation frame-stacking `__. +1) `Add the most recent action and reward to the RL Module's input `__. +1) `Mean-std filtering on all observations `__. +1) `Flatten any complex observation space to a 1D space `__. diff --git a/doc/source/rllib/rllib-learner.rst b/doc/source/rllib/rllib-learner.rst index 38bfea05c079..7ece2c55f2cc 100644 --- a/doc/source/rllib/rllib-learner.rst +++ b/doc/source/rllib/rllib-learner.rst @@ -10,6 +10,7 @@ :class: inline-figure :width: 16 +.. _learner-guide: Learner (Alpha) =============== diff --git a/doc/source/rllib/rllib-new-api-stack.rst b/doc/source/rllib/rllib-new-api-stack.rst index 046c61f7fd7a..37094aba2881 100644 --- a/doc/source/rllib/rllib-new-api-stack.rst +++ b/doc/source/rllib/rllib-new-api-stack.rst @@ -4,61 +4,81 @@ .. _rllib-new-api-stack-guide: -RLlib's New API Stack +RLlib's new API stack ===================== +.. hint:: + + This section describes the new API stack and why you should migrate to it + if you have old API stack custom code. See the :ref:`migration guide ` for details. + + Overview -------- -Starting in Ray 2.10, you can opt-in to the alpha version of a "new API stack", a fundamental overhaul from the ground up with respect to architecture, -design principles, code base, and user facing APIs. The following select algorithms and setups are available. +Starting in Ray 2.10, you can opt-in to the alpha version of the "new API stack", a fundamental overhaul from the ground +up with respect to architecture, design principles, code base, and user facing APIs. +The following select algorithms and setups are available. .. list-table:: :header-rows: 1 - :widths: 40 40 40 + :widths: 25 25 25 25 25 25 * - Feature/Algo (on new API stack) + - **APPO** + - **DQN** + - **IMPALA** - **PPO** - **SAC** - * - Single Agent + * - Single- and Multi-Agent + - Yes + - Yes - Yes - Yes - * - Multi Agent - Yes - - No * - Fully-connected (MLP) - Yes - Yes + - Yes + - Yes + - Yes * - Image inputs (CNN) - Yes - - No + - Yes + - Yes + - Yes + - Yes * - RNN support (LSTM) - Yes - No + - Yes + - Yes + - No * - Complex inputs (flatten) - Yes - Yes + - Yes + - Yes + - Yes -Over the next couple of months, the Ray Team will continue to test, benchmark, bug-fix, and -further polish these new APIs as well as rollout more and more algorithms that you can run in -either stack. -The goal is to reach a state where the new stack can completely replace the old one. +Over the next few months, the RLlib Team continues to document, test, benchmark, bug-fix, and +further polish these new APIs as well as rollout more algorithms +that you can run in the new stack, with a focus on offline RL. -Keep in mind that due to its alpha nature, when using the new stack, you might run into issues and encounter instabilities. -Also, rest assured that you are able to continue using your custom classes and setups -on the old API stack for the foreseeable future (beyond Ray 3.0). 
+You can continue using custom classes and setups +on the old API stack for the foreseeable future, beyond Ray 3.0. However, you should +migrate to the new stack with the :ref:`migration guide ` -What is the New API Stack? +New API stack -------------------------- -The new API stack is the result of re-writing from scratch RLlib's core APIs and reducing -its user-facing classes from more than a dozen critical ones -down to only a handful of classes. During the design of these new interfaces from the ground up, +The new API stack is the result of re-writing RLlib's core APIs from scratch and reducing +its user-facing classes from more than a dozen critical ones down to only a handful +of classes, without any loss of functionaliy. During the design of these new interfaces, the Ray Team strictly applied the following principles: -* Suppose a simple mental-model underlying the new APIs * Classes must be usable outside of RLlib * Separate concerns as much as possible. Try to answer: "**WHAT** should be done **WHEN** and by **WHOM**?" * Offer fine-grained modularity, full interoperability, and frictionless pluggability of classes @@ -78,25 +98,16 @@ The :py:class:`~ray.rllib.algorithm.algorithm_config.AlgorithmConfig` and :py:cl Who should use the new API stack? --------------------------------- -Eventually, all users of RLlib should switch over to running experiments and developing their custom classes -against the new API stack. - -Right now, it's only available for a few algorithms and setups (see table above), however, if you do use -PPO (single- or multi-agent) or SAC (single-agent), you should try it. +Migrate your code from the old to new API stack as soon as possible. +The classes and APIs are sufficiently stable. The Ray team expects very minor changes. -The following section, lists some compelling reasons to migrate to the new stack. +See the :ref:`New API stack migration guide ` for a comprehensive migration guide with step-by-step instructions on translating your code from the +old to new API stack. -Note these indicators against using it at this early stage: +A comparison of the old to new API stack provides additional motivation for migrating to the new stack. -1) You're using a custom :py:class:`~ray.rllib.models.modelv2.ModelV2` class and aren't interested right now in moving it into the new :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule` API. -1) You're using a custom :py:class:`~ray.rllib.policy.policy.Policy` class (e.g., with a custom loss function and aren't interested right now in moving it into the new :py:class:`~ray.rllib.core.learner.learner.Learner` API. -1) You're using custom :py:class:`~ray.rllib.connector.connector.Connector` classes and aren't interested right now in moving them into the new :py:class:`~ray.rllib.connector.connector_v2.ConnectorV2` API. -If any of the above applies to you, don't migrate for now, and continue running with the old API stack. Migrate to the new -stack whenever you're ready to re-write some small part of your code. - - -Comparison to the Old API Stack +Comparison to the old API stack ------------------------------- This table compares features and design choices between the new and old API stack: @@ -108,13 +119,19 @@ This table compares features and design choices between the new and old API stac * - - **New API Stack** - **Old API Stack** + * - Multi-GPU and multi-node/multi-GPU + - Yes + - Yes and No + * - Support for shared (multi-agent) model components (e.g., communication channels, shared value functions, etc.) 
+ - Yes + - No * - Reduced code complexity (for beginners and advanced users) - 5 user-facing classes (`AlgorithmConfig`, `RLModule`, `Learner`, `ConnectorV2`, `Episode`) - 8 user-facing classes (`AlgorithmConfig`, `ModelV2`, `Policy`, `build_policy`, `Connector`, `RolloutWorker`, `BaseEnv`, `ViewRequirement`) * - Classes are usable outside of RLlib - Yes - Partly - * - Separation-of-concerns design (e.g., during sampling, only action must be computed) + * - Strict separation-of-concerns design - Yes - No * - Distributed/scalable sample collection @@ -123,12 +140,6 @@ This table compares features and design choices between the new and old API stac * - Full 360° read/write access to (multi-)agent trajectories - Yes - No - * - Multi-GPU and multi-node/multi-GPU - - Yes - - Yes & No - * - Support for shared (multi-agent) model components (e.g., communication channels, shared value functions, etc.) - - Yes - - No * - Env vectorization with `gym.vector.Env` - Yes - No (RLlib's own solution) @@ -137,31 +148,6 @@ This table compares features and design choices between the new and old API stac How to Use the New API Stack? ----------------------------- -The new API stack is disabled by default for all algorithms. -To activate it for PPO (single- and multi-agent) or SAC (single-agent only), -change the following in your `AlgorithmConfig` object: - -.. tab-set:: - - .. tab-item:: Single Agent **PPO** - - .. literalinclude:: doc_code/new_api_stack.py - :language: python - :start-after: __enabling-new-api-stack-sa-ppo-begin__ - :end-before: __enabling-new-api-stack-sa-ppo-end__ - - - .. tab-item:: Multi Agent **PPO** - - .. literalinclude:: doc_code/new_api_stack.py - :language: python - :start-after: __enabling-new-api-stack-ma-ppo-begin__ - :end-before: __enabling-new-api-stack-ma-ppo-end__ - - - .. tab-item:: Single Agent **SAC** - - .. literalinclude:: doc_code/new_api_stack.py - :language: python - :start-after: __enabling-new-api-stack-sa-sac-begin__ - :end-before: __enabling-new-api-stack-sa-sac-end__ +See :ref:`New API stack migration guide ` for a complete and comprehensive migration guide +with detailed steps and changes to apply to your +custom RLlib classes to migrate from the old to the new stack. 
diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 983374552f93..6ff95f612905 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -344,6 +344,7 @@ def __init__(self, algo_class: Optional[type] = None): self.num_gpus_per_env_runner = 0 self.custom_resources_per_env_runner = {} self.validate_env_runners_after_construction = True + self.max_requests_in_flight_per_env_runner = 2 self.sample_timeout_s = 60.0 self.create_env_on_local_worker = False self._env_to_module_connector = None @@ -411,8 +412,7 @@ def __init__(self, algo_class: Optional[type] = None): self._learner_connector = None self.add_default_connectors_to_learner_pipeline = True self.learner_config_dict = {} - self.optimizer = {} - self.max_requests_in_flight_per_sampler_worker = 2 + self.optimizer = {} # @OldAPIStack self._learner_class = None # `self.callbacks()` @@ -1709,6 +1709,7 @@ def env_runners( custom_resources_per_env_runner: Optional[dict] = NotProvided, validate_env_runners_after_construction: Optional[bool] = NotProvided, sample_timeout_s: Optional[float] = NotProvided, + max_requests_in_flight_per_env_runner: Optional[int] = NotProvided, env_to_module_connector: Optional[ Callable[[EnvType], Union["ConnectorV2", List["ConnectorV2"]]] ] = NotProvided, @@ -1769,9 +1770,20 @@ def env_runners( sample_timeout_s: The timeout in seconds for calling `sample()` on remote EnvRunner workers. Results (episode list) from workers that take longer than this time are discarded. Only used by algorithms that sample - synchronously in turn with their update step (e.g. PPO or DQN). Not + synchronously in turn with their update step (e.g., PPO or DQN). Not relevant for any algos that sample asynchronously, such as APPO or IMPALA. + max_requests_in_flight_per_env_runner: Max number of inflight requests + to each EnvRunner worker. See the FaultTolerantActorManager class for + more details. + Tuning these values is important when running experiments with + large sample batches, where there is the risk that the object store may + fill up, causing spilling of objects to disk. This can cause any + asynchronous requests to become very slow, making your experiment run + slowly as well. You can inspect the object store during your experiment + via a call to Ray memory on your head node, and by using the Ray + dashboard. If you're seeing that the object store is filling up, + turn down the number of remote requests in flight or enable compression. sample_collector: For the old API stack only. The SampleCollector class to be used to collect and retrieve environment-, model-, and sampler data. 
Override the SampleCollector base class to implement your own @@ -1938,6 +1950,10 @@ def env_runners( if sample_timeout_s is not NotProvided: self.sample_timeout_s = sample_timeout_s + if max_requests_in_flight_per_env_runner is not NotProvided: + self.max_requests_in_flight_per_env_runner = ( + max_requests_in_flight_per_env_runner + ) if sample_collector is not NotProvided: self.sample_collector = sample_collector if create_env_on_local_worker is not NotProvided: @@ -2128,7 +2144,6 @@ def training( shuffle_batch_per_epoch: Optional[bool] = NotProvided, model: Optional[dict] = NotProvided, optimizer: Optional[dict] = NotProvided, - max_requests_in_flight_per_sampler_worker: Optional[int] = NotProvided, learner_class: Optional[Type["Learner"]] = NotProvided, learner_connector: Optional[ Callable[["RLModule"], Union["ConnectorV2", List["ConnectorV2"]]] @@ -2137,6 +2152,7 @@ def training( learner_config_dict: Optional[Dict[str, Any]] = NotProvided, # Deprecated args. num_sgd_iter=DEPRECATED_VALUE, + max_requests_in_flight_per_sampler_worker=DEPRECATED_VALUE, ) -> "AlgorithmConfig": """Sets the training related configuration. @@ -2200,18 +2216,6 @@ def training( TODO: Provide ModelConfig objects instead of dicts. optimizer: Arguments to pass to the policy optimizer. This setting is not used when `enable_rl_module_and_learner=True`. - max_requests_in_flight_per_sampler_worker: Max number of inflight requests - to each sampling worker. See the FaultTolerantActorManager class for - more details. - Tuning these values is important when running experimens with - large sample batches, where there is the risk that the object store may - fill up, causing spilling of objects to disk. This can cause any - asynchronous requests to become very slow, making your experiment run - slow as well. You can inspect the object store during your experiment - via a call to ray memory on your headnode, and by using the ray - dashboard. If you're seeing that the object store is filling up, - turn down the number of remote requests in flight, or enable compression - in your experiment of timesteps. learner_class: The `Learner` class to use for (distributed) updating of the RLModule. Only used when `enable_rl_module_and_learner=True`. 
learner_connector: A callable taking an env observation space and an env @@ -2248,6 +2252,19 @@ def training( error=False, ) num_epochs = num_sgd_iter + if max_requests_in_flight_per_sampler_worker != DEPRECATED_VALUE: + deprecation_warning( + old="AlgorithmConfig.training(" + "max_requests_in_flight_per_sampler_worker=...)", + new="AlgorithmConfig.env_runners(" + "max_requests_in_flight_per_env_runner=...)", + error=False, + ) + self.env_runners( + max_requests_in_flight_per_env_runner=( + max_requests_in_flight_per_sampler_worker + ), + ) if gamma is not NotProvided: self.gamma = gamma @@ -2291,10 +2308,6 @@ def training( if optimizer is not NotProvided: self.optimizer = merge_dicts(self.optimizer, optimizer) - if max_requests_in_flight_per_sampler_worker is not NotProvided: - self.max_requests_in_flight_per_sampler_worker = ( - max_requests_in_flight_per_sampler_worker - ) if learner_class is not NotProvided: self._learner_class = learner_class if learner_connector is not NotProvided: @@ -3366,7 +3379,7 @@ def experimental( *, _torch_grad_scaler_class: Optional[Type] = NotProvided, _torch_lr_scheduler_classes: Optional[ - Union[List[Type], Dict[ModuleID, Type]] + Union[List[Type], Dict[ModuleID, List[Type]]] ] = NotProvided, _tf_policy_handles_more_than_one_loss: Optional[bool] = NotProvided, _disable_preprocessor_api: Optional[bool] = NotProvided, @@ -3395,10 +3408,11 @@ def experimental( classes or a dictionary mapping module IDs to such a list of respective scheduler classes. Multiple scheduler classes can be applied in sequence and will be stepped in the same sequence as defined here. Note, most - learning rate schedulers need arguments to be configured, i.e. you need - to partially initialize the schedulers in the list(s). + learning rate schedulers need arguments to be configured, that is, you + might have to partially initialize the schedulers in the list(s) using + `functools.partial`. _tf_policy_handles_more_than_one_loss: Experimental flag. - If True, TFPolicy will handle more than one loss/optimizer. + If True, TFPolicy handles more than one loss or optimizer. Set this to True, if you would like to return more than one loss term from your `loss_fn` and an equal number of optimizers from your `optimizer_fn`. In the future, the default for this will be diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index c52191e90ec7..4d0da0425ad4 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -138,7 +138,7 @@ def __init__(self, algo_class=None): self.replay_buffer_num_slots = 0 # @OldAPIstack self.learner_queue_size = 3 self.learner_queue_timeout = 300 # @OldAPIstack - self.max_requests_in_flight_per_sampler_worker = 2 + self.max_requests_in_flight_per_env_runner = 2 self.max_requests_in_flight_per_aggregator_worker = 2 self.timeout_s_sampler_manager = 0.0 self.timeout_s_aggregator_manager = 0.0 diff --git a/rllib/env/env_runner_group.py b/rllib/env/env_runner_group.py index 47ee3b02c5f9..747ca55774e5 100644 --- a/rllib/env/env_runner_group.py +++ b/rllib/env/env_runner_group.py @@ -179,7 +179,7 @@ def __init__( # Starting remote workers from ID 1 to avoid conflicts. 
self._worker_manager = FaultTolerantActorManager( max_remote_requests_in_flight_per_actor=( - config["max_requests_in_flight_per_sampler_worker"] + config["max_requests_in_flight_per_env_runner"] ), init_id=1, ) diff --git a/rllib/examples/learners/ppo_with_torch_lr_schedulers.py b/rllib/examples/learners/ppo_with_torch_lr_schedulers.py index 92146f075ede..f40b4d0d23b8 100644 --- a/rllib/examples/learners/ppo_with_torch_lr_schedulers.py +++ b/rllib/examples/learners/ppo_with_torch_lr_schedulers.py @@ -5,9 +5,9 @@ optimizer. In this way even more complex learning rate schedules can be assembled. This example shows: - - how to partially initialize multiple learning rate schedulers in PyTorch. - - how to chain these schedulers together and pass the chain into RLlib's - configuration. + - how to configure multiple learning rate schedulers, as a chained pipeline, in + PyTorch using partial initialization with `functools.partial`. + How to run this script ---------------------- @@ -29,29 +29,24 @@ `--wandb-key=[your WandB API key] --wandb-project=[some project name] --wandb-run-name=[optional: WandB run name (within the defined project)]` + Results to expect ----------------- You should expect to observe decent learning behavior from your console output: With `--lr-const-factor=0.1`, `--lr-const-iters=10, and `--lr-exp_decay=0.3`. -+-----------------------------+------------+----------------------+--------+ -| Trial name | status | loc | iter | -| | | | | -|-----------------------------+------------+----------------------+--------+ -| PPO_CartPole-v1_7fc44_00000 | TERMINATED | 192.168.1.178:225070 | 50 | -+-----------------------------+------------+----------------------+--------+ -+------------------+------------------------+------------------------+ -| total time (s) | num_env_steps_sample | num_episodes_lifetim | -| | d_lifetime | e | -+------------------+------------------------+------------------------+ -| 59.6542 | 200000 | 9952 | -+------------------+------------------------+------------------------+ -+------------------------+ -| num_env_steps_traine | -| d_lifetime | -+------------------------| -| 210047 | -+------------------------+ ++-----------------------------+------------+--------+------------------+ +| Trial name | status | iter | total time (s) | +| | | | | +|-----------------------------+------------+--------+------------------+ +| PPO_CartPole-v1_7fc44_00000 | TERMINATED | 50 | 59.6542 | ++-----------------------------+------------+--------+------------------+ ++------------------------+------------------------+------------------------+ +| episode_return_mean | num_episodes_lifetime | num_env_steps_traine | +| | | d_lifetime | ++------------------------+------------------------+------------------------| +| 451.2 | 9952 | 210047 | ++------------------------+------------------------+------------------------+ """ import functools diff --git a/rllib/utils/filter.py b/rllib/utils/filter.py index 040ed426e353..d969abddb119 100644 --- a/rllib/utils/filter.py +++ b/rllib/utils/filter.py @@ -408,7 +408,6 @@ def __repr__(self) -> str: @OldAPIStack def get_filter(filter_config, shape): - # TODO(rliaw): move this into filter manager if filter_config == "MeanStdFilter": return MeanStdFilter(shape, clip=None) elif filter_config == "ConcurrentMeanStdFilter":