From 0ba68feac5cd9d6612a8ae8694f9ae1359e15bb6 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 11 Jun 2024 15:22:45 +0200 Subject: [PATCH 01/10] wip Signed-off-by: sven1977 --- .../checkpoint_by_custom_criteria.py | 20 ++-- .../continue_training_from_checkpoint.py | 18 +-- .../restore_1_of_n_agents_from_checkpoint.py | 16 +-- rllib/examples/connectors/frame_stacking.py | 110 +++++++++++++----- .../examples/connectors/mean_std_filtering.py | 82 +++++++++++++ .../connectors/nested_action_spaces.py | 82 +++++++++++++ .../connectors/nested_observation_spaces.py | 82 +++++++++++++ .../connectors/prev_actions_prev_rewards.py | 82 +++++++++++++ .../policy_inference_after_training.py | 8 +- ...cy_inference_after_training_w_connector.py | 8 +- 10 files changed, 445 insertions(+), 63 deletions(-) diff --git a/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py b/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py index 0419a8ae1512..075cf8ca7e42 100644 --- a/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py +++ b/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py @@ -1,16 +1,16 @@ """Example extracting a checkpoint from n trials using one or more custom criteria. This example: -- runs a CartPole experiment with three different learning rates (three tune -"trials"). During the experiment, for each trial, we create a checkpoint at each -iteration. -- at the end of the experiment, we compare the trials and pick the one that performed -best, based on the criterion: Lowest episode count per single iteration (for CartPole, -a low episode count means the episodes are very long and thus the reward is also very -high). -- from that best trial (with the lowest episode count), we then pick those checkpoints -that a) have the lowest policy loss (good) and b) have the highest value function loss -(bad). + - runs a CartPole experiment with three different learning rates (three tune + "trials"). During the experiment, for each trial, we create a checkpoint at each + iteration. + - at the end of the experiment, we compare the trials and pick the one that performed + best, based on the criterion: Lowest episode count per single iteration (for CartPole, + a low episode count means the episodes are very long and thus the reward is also very + high). + - from that best trial (with the lowest episode count), we then pick those checkpoints + that a) have the lowest policy loss (good) and b) have the highest value function loss + (bad). How to run this script diff --git a/rllib/examples/checkpoints/continue_training_from_checkpoint.py b/rllib/examples/checkpoints/continue_training_from_checkpoint.py index a8400659d960..45e16e3f89fe 100644 --- a/rllib/examples/checkpoints/continue_training_from_checkpoint.py +++ b/rllib/examples/checkpoints/continue_training_from_checkpoint.py @@ -4,15 +4,15 @@ and you would therefore like to make your setup more robust and fault-tolerant. This example: -- runs a single- or multi-agent CartPole experiment (for multi-agent, we use different -learning rates) thereby checkpointing the state of the Algorithm every n iterations. -- stops the experiment due to an expected crash in the algorithm's main process after -a certain number of iterations. -- just for testing purposes, restores the entire algorithm from the latest checkpoint -and checks, whether the state of the restored algo exactly match the state of the -crashed one. -- then continues training with the restored algorithm until the desired final episode -return is reached. 
+ - runs a single- or multi-agent CartPole experiment (for multi-agent, we use different + learning rates) thereby checkpointing the state of the Algorithm every n iterations. + - stops the experiment due to an expected crash in the algorithm's main process after + a certain number of iterations. + - just for testing purposes, restores the entire algorithm from the latest checkpoint + and checks, whether the state of the restored algo exactly match the state of the + crashed one. + - then continues training with the restored algorithm until the desired final episode + return is reached. How to run this script diff --git a/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py b/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py index fb53e2cb876f..8e1b3f0023bf 100644 --- a/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py +++ b/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py @@ -2,14 +2,14 @@ from checkpoint. This example: - - Runs a multi-agent `Pendulum-v1` experiment with >= 2 policies. - - Saves a checkpoint of the `MultiAgentRLModule` used every `--checkpoint-freq` - iterations. - - Stops the experiments after the agents reach a combined return of `-800`. - - Picks the best checkpoint by combined return and restores policy 0 from it. - - Runs a second experiment with the restored `RLModule` for policy 0 and - a fresh `RLModule` for the other policies. - - Stops the second experiment after the agents reach a combined return of `-800`. + - Runs a multi-agent `Pendulum-v1` experiment with >= 2 policies. + - Saves a checkpoint of the `MultiAgentRLModule` used every `--checkpoint-freq` + iterations. + - Stops the experiments after the agents reach a combined return of `-800`. + - Picks the best checkpoint by combined return and restores policy 0 from it. + - Runs a second experiment with the restored `RLModule` for policy 0 and + a fresh `RLModule` for the other policies. + - Stops the second experiment after the agents reach a combined return of `-800`. How to run this script ---------------------- diff --git a/rllib/examples/connectors/frame_stacking.py b/rllib/examples/connectors/frame_stacking.py index 6abce5582b0b..dbc18f726559 100644 --- a/rllib/examples/connectors/frame_stacking.py +++ b/rllib/examples/connectors/frame_stacking.py @@ -1,15 +1,80 @@ -""" Example using connectors (V2) for frame-stacking in Atari environments. +"""Example using 2 ConnectorV2 for observation frame-stacking in Atari environments. + +An RLlib Algorithm has 3 distinct connector pipelines: +- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing +a batch for an RLModule to compute actions (`forward_inference()` or +`forward_exploration()`). +- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting +it into an action readable by the environment. +- A learner connector pipeline on a Learner taking a list of episodes and producing +a batch for an RLModule to perform the training forward pass (`forward_train()`). + +Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib +adds/prepends to these pipelines in order to perform the most basic functionalities. +For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any +env-to-module pipeline to make sure the batch for computing actions contains - at the +minimum - the most recent observation. 
+ +On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 +pieces (or use the ones available already in RLlib) and add them to one of the 3 +different pipelines described above, as required. + +This example: + - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the + env-to-module pipeline. + - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the + learner connector pipeline. + - demonstrates that using these two pieces (rather than performing framestacking + already inside the environment using a gymnasium wrapper) increases overall + performance by about 5%. + How to run this script ---------------------- -`python [script file name].py --enable-new-api-stack` +`python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5` + +Use the `--num-frames` option to define the number of observations to framestack. +If you don't want to use Connectors to perform the framestacking, set the +`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a +gymnasium observation wrapper. In this case though, be aware that the tensors being +sent through the network are `--num-frames` x larger than if you use the Connector +setup. + For debugging, use the following additional command line options `--no-tune --num-env-runners=0` which should allow you to set breakpoints anywhere in the RLlib code and have the execution stop there for inspection and debugging. + For logging to your WandB account, use: `--wandb-key=[your WandB API key] --wandb-project=[some project name] --wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- + +With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module +and learner connector pipelines), you should see something like: ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | ++---------------------------+------------+--------+------------------+... + +Note that the time to run these 10 iterations is about .% faster than when +performing framestacking already inside the environment (using a +`gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic +needed (sending back 4x[obs] batches instead of 1x[obs] to the learners). + +Thus, with the `--use-gym-wrapper-framestacking` option, the output looks +like this: ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | ++---------------------------+------------+--------+------------------+... """ import gymnasium as gym @@ -27,12 +92,8 @@ parser = add_rllib_example_script_args( default_timesteps=5000000, default_reward=20.0, default_iters=200 ) -parser.add_argument( - "--atari-env", - type=str, - default="ALE/Pong-v5", - help="The name of the Atari env to run, e.g. `ALE/Breakout-v5`.", -) +# Use Pong by default. +parser.set_defaults(env="ALE/Pong-v5") parser.add_argument( "--num-frames", type=int, @@ -52,12 +113,16 @@ args = parser.parse_args() + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + # Define our custom connector pipelines. 
def _make_env_to_module_connector(env): # Create the env-to-module connector. We return an individual connector piece - # here, which RLlib will then automatically integrate into a pipeline (and + # here, which RLlib automatically integrates into a pipeline (and # add its default connector piece to the end of that pipeline). - # This pipeline also automatically fixes the input- and output spaces of the + # The default pipeline automatically fixes the input- and output spaces of the # individual connector pieces in it. # Note that since the frame stacking connector does NOT write information # back to the episode (in order to save memory and network traffic), we @@ -79,29 +144,29 @@ def _make_learner_connector(input_observation_space, input_action_space): # We would like our frame stacking connector to do this job. def _env_creator(cfg): return wrap_atari_for_new_api_stack( - gym.make(args.atari_env, **cfg, **{"render_mode": "rgb_array"}), + gym.make(args.env, **cfg, **{"render_mode": "rgb_array"}), # Perform framestacking either through ConnectorV2 or right here through # the observation wrapper. framestack=( - args.num_framestack if args.use_gym_wrapper_framestacking else None + args.num_frames if args.use_gym_wrapper_framestacking else None ), ) if args.num_agents > 0: tune.register_env( - "env", + "atari-env", lambda cfg: make_multi_agent(_env_creator)( dict(cfg, **{"num_agents": args.num_agents}) ), ) else: - tune.register_env("env", _env_creator) + tune.register_env("atari-env", _env_creator) base_config = ( get_trainable_cls(args.algo) .get_default_config() .environment( - "env", + "atari-env", env_config={ # Make analogous to old v4 + NoFrameskip. "frameskip": 1, @@ -135,9 +200,7 @@ def _env_creator(cfg): grad_clip=100.0, grad_clip_by="global_norm", ) - ) - if args.enable_new_api_stack: - base_config.rl_module( + .rl_module( model_config_dict=dict( { "vf_share_layers": True, @@ -148,16 +211,7 @@ def _env_creator(cfg): }, ) ) - else: - base_config.training( - model={ - "vf_share_layers": True, - "conv_filters": [[16, 4, 2], [32, 4, 2], [64, 4, 2], [128, 4, 2]], - "conv_activation": "relu", - "post_fcnet_hiddens": [256], - "uses_new_env_runners": False, - } - ) + ) # Add a simple multi-agent setup. if args.num_agents > 0: diff --git a/rllib/examples/connectors/mean_std_filtering.py b/rllib/examples/connectors/mean_std_filtering.py index a30d6e399c00..1603512bab9a 100644 --- a/rllib/examples/connectors/mean_std_filtering.py +++ b/rllib/examples/connectors/mean_std_filtering.py @@ -1,3 +1,81 @@ +"""Example using a ConnectorV2 for processing observations with a mean/std filter. + +An RLlib Algorithm has 3 distinct connector pipelines: +- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing +a batch for an RLModule to compute actions (`forward_inference()` or +`forward_exploration()`). +- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting +it into an action readable by the environment. +- A learner connector pipeline on a Learner taking a list of episodes and producing +a batch for an RLModule to perform the training forward pass (`forward_train()`). + +Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib +adds/prepends to these pipelines in order to perform the most basic functionalities. 
+For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any +env-to-module pipeline to make sure the batch for computing actions contains - at the +minimum - the most recent observation. + +On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 +pieces (or use the ones available already in RLlib) and add them to one of the 3 +different pipelines described above, as required. + +This example: + - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the + env-to-module pipeline. + - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the + learner connector pipeline. + - demonstrates that using these two pieces (rather than performing framestacking + already inside the environment using a gymnasium wrapper) increases overall + performance by about 5%. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5` + +Use the `--num-frames` option to define the number of observations to framestack. +If you don't want to use Connectors to perform the framestacking, set the +`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a +gymnasium observation wrapper. In this case though, be aware that the tensors being +sent through the network are `--num-frames` x larger than if you use the Connector +setup. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- + +With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module +and learner connector pipelines), you should see something like: ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | ++---------------------------+------------+--------+------------------+... + +Note that the time to run these 10 iterations is about .% faster than when +performing framestacking already inside the environment (using a +`gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic +needed (sending back 4x[obs] batches instead of 1x[obs] to the learners). + +Thus, with the `--use-gym-wrapper-framestacking` option, the output looks +like this: ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | ++---------------------------+------------+--------+------------------+... +""" from ray.air.constants import TRAINING_ITERATION from ray.rllib.connectors.env_to_module.mean_std_filter import MeanStdFilter from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum @@ -26,6 +104,10 @@ if __name__ == "__main__": args = parser.parse_args() + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + # Register our environment with tune. 
if args.num_agents > 0: register_env( diff --git a/rllib/examples/connectors/nested_action_spaces.py b/rllib/examples/connectors/nested_action_spaces.py index 830b87fb25fb..86df316c7916 100644 --- a/rllib/examples/connectors/nested_action_spaces.py +++ b/rllib/examples/connectors/nested_action_spaces.py @@ -1,3 +1,81 @@ +"""Example using connectors (V2) for observation frame-stacking in Atari environments. + +An RLlib Algorithm has 3 distinct connector pipelines: +- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing +a batch for an RLModule to compute actions (`forward_inference()` or +`forward_exploration()`). +- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting +it into an action readable by the environment. +- A learner connector pipeline on a Learner taking a list of episodes and producing +a batch for an RLModule to perform the training forward pass (`forward_train()`). + +Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib +adds/prepends to these pipelines in order to perform the most basic functionalities. +For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any +env-to-module pipeline to make sure the batch for computing actions contains - at the +minimum - the most recent observation. + +On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 +pieces (or use the ones available already in RLlib) and add them to one of the 3 +different pipelines described above, as required. + +This example: + - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the + env-to-module pipeline. + - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the + learner connector pipeline. + - demonstrates that using these two pieces (rather than performing framestacking + already inside the environment using a gymnasium wrapper) increases overall + performance by about 5%. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5` + +Use the `--num-frames` option to define the number of observations to framestack. +If you don't want to use Connectors to perform the framestacking, set the +`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a +gymnasium observation wrapper. In this case though, be aware that the tensors being +sent through the network are `--num-frames` x larger than if you use the Connector +setup. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- + +With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module +and learner connector pipelines), you should see something like: ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | ++---------------------------+------------+--------+------------------+... 
+ +Note that the time to run these 10 iterations is about .% faster than when +performing framestacking already inside the environment (using a +`gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic +needed (sending back 4x[obs] batches instead of 1x[obs] to the learners). + +Thus, with the `--use-gym-wrapper-framestacking` option, the output looks +like this: ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | ++---------------------------+------------+--------+------------------+... +""" from gymnasium.spaces import Dict, Tuple, Box, Discrete, MultiDiscrete from ray.tune.registry import register_env @@ -26,6 +104,10 @@ if __name__ == "__main__": args = parser.parse_args() + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + # Define env-to-module-connector pipeline for the new stack. def _env_to_module_pipeline(env): return [ diff --git a/rllib/examples/connectors/nested_observation_spaces.py b/rllib/examples/connectors/nested_observation_spaces.py index 39a4bac1c585..cf98909502ee 100644 --- a/rllib/examples/connectors/nested_observation_spaces.py +++ b/rllib/examples/connectors/nested_observation_spaces.py @@ -1,3 +1,81 @@ +"""Example using connectors (V2) for observation frame-stacking in Atari environments. + +An RLlib Algorithm has 3 distinct connector pipelines: +- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing +a batch for an RLModule to compute actions (`forward_inference()` or +`forward_exploration()`). +- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting +it into an action readable by the environment. +- A learner connector pipeline on a Learner taking a list of episodes and producing +a batch for an RLModule to perform the training forward pass (`forward_train()`). + +Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib +adds/prepends to these pipelines in order to perform the most basic functionalities. +For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any +env-to-module pipeline to make sure the batch for computing actions contains - at the +minimum - the most recent observation. + +On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 +pieces (or use the ones available already in RLlib) and add them to one of the 3 +different pipelines described above, as required. + +This example: + - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the + env-to-module pipeline. + - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the + learner connector pipeline. + - demonstrates that using these two pieces (rather than performing framestacking + already inside the environment using a gymnasium wrapper) increases overall + performance by about 5%. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5` + +Use the `--num-frames` option to define the number of observations to framestack. +If you don't want to use Connectors to perform the framestacking, set the +`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a +gymnasium observation wrapper. 
In this case though, be aware that the tensors being +sent through the network are `--num-frames` x larger than if you use the Connector +setup. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- + +With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module +and learner connector pipelines), you should see something like: ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | ++---------------------------+------------+--------+------------------+... + +Note that the time to run these 10 iterations is about .% faster than when +performing framestacking already inside the environment (using a +`gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic +needed (sending back 4x[obs] batches instead of 1x[obs] to the learners). + +Thus, with the `--use-gym-wrapper-framestacking` option, the output looks +like this: ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | ++---------------------------+------------+--------+------------------+... +""" from ray.tune.registry import register_env from ray.rllib.connectors.env_to_module import ( AddObservationsFromEpisodesToBatch, @@ -24,6 +102,10 @@ if __name__ == "__main__": args = parser.parse_args() + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + # Define env-to-module-connector pipeline for the new stack. def _env_to_module_pipeline(env): return [ diff --git a/rllib/examples/connectors/prev_actions_prev_rewards.py b/rllib/examples/connectors/prev_actions_prev_rewards.py index 0c3a2693cca2..2b62fcdd0ef8 100644 --- a/rllib/examples/connectors/prev_actions_prev_rewards.py +++ b/rllib/examples/connectors/prev_actions_prev_rewards.py @@ -1,3 +1,81 @@ +"""Example using connectors (V2) for observation frame-stacking in Atari environments. + +An RLlib Algorithm has 3 distinct connector pipelines: +- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing +a batch for an RLModule to compute actions (`forward_inference()` or +`forward_exploration()`). +- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting +it into an action readable by the environment. +- A learner connector pipeline on a Learner taking a list of episodes and producing +a batch for an RLModule to perform the training forward pass (`forward_train()`). + +Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib +adds/prepends to these pipelines in order to perform the most basic functionalities. 
+For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any +env-to-module pipeline to make sure the batch for computing actions contains - at the +minimum - the most recent observation. + +On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 +pieces (or use the ones available already in RLlib) and add them to one of the 3 +different pipelines described above, as required. + +This example: + - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the + env-to-module pipeline. + - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the + learner connector pipeline. + - demonstrates that using these two pieces (rather than performing framestacking + already inside the environment using a gymnasium wrapper) increases overall + performance by about 5%. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5` + +Use the `--num-frames` option to define the number of observations to framestack. +If you don't want to use Connectors to perform the framestacking, set the +`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a +gymnasium observation wrapper. In this case though, be aware that the tensors being +sent through the network are `--num-frames` x larger than if you use the Connector +setup. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- + +With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module +and learner connector pipelines), you should see something like: ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | ++---------------------------+------------+--------+------------------+... + +Note that the time to run these 10 iterations is about .% faster than when +performing framestacking already inside the environment (using a +`gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic +needed (sending back 4x[obs] batches instead of 1x[obs] to the learners). + +Thus, with the `--use-gym-wrapper-framestacking` option, the output looks +like this: ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | ++---------------------------+------------+--------+------------------+... +""" import functools from ray.rllib.algorithms.ppo import PPOConfig @@ -29,6 +107,10 @@ if __name__ == "__main__": args = parser.parse_args() + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + # Define our custom connector pipelines. def _env_to_module(env): # Create the env-to-module connector pipeline. 
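A note on the wiring pattern these connector example scripts share: each one passes a callable to `AlgorithmConfig.env_runners(env_to_module_connector=...)` that returns one or more ConnectorV2 pieces, and RLlib completes the env-to-module pipeline around them with its default pieces. A minimal, hedged sketch of that pattern for the prev-actions/prev-rewards case (the pipeline order follows the connector class docstrings in this series; the `n_prev_actions`/`n_prev_rewards` argument names and the exact body of the script's own `_env_to_module()` are assumptions):

# Sketch only, not the script's exact code. Assumes the new API stack is
# switched on (the example scripts do this via `--enable-new-api-stack`).
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.connectors.env_to_module import (
    AddObservationsFromEpisodesToBatch,
    FlattenObservations,
    PrevActionsPrevRewardsConnector,  # renamed to `PrevActionsPrevRewards` later in this series
    WriteObservationsToEpisodes,
)


def _env_to_module(env):
    # Returning a list of pieces is enough; RLlib appends its remaining
    # default pieces to complete the env-to-module pipeline.
    return [
        # Put the most recent observation into the batch first (the
        # prev-actions/prev-rewards piece requires `Columns.OBS` to already
        # be part of the batch).
        AddObservationsFromEpisodesToBatch(),
        # Concatenate the previous action/reward onto the observation
        # (argument names are assumptions).
        PrevActionsPrevRewardsConnector(n_prev_actions=1, n_prev_rewards=1),
        # Flatten the now-nested observation into a 1D array ...
        FlattenObservations(),
        # ... and write it back into the episodes for later use.
        WriteObservationsToEpisodes(),
    ]


config = (
    PPOConfig()
    .environment("CartPole-v1")
    .env_runners(env_to_module_connector=_env_to_module)
)
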
diff --git a/rllib/examples/inference/policy_inference_after_training.py b/rllib/examples/inference/policy_inference_after_training.py index 0f61f4519cd7..5cdb6090f758 100644 --- a/rllib/examples/inference/policy_inference_after_training.py +++ b/rllib/examples/inference/policy_inference_after_training.py @@ -4,12 +4,12 @@ from a checkpoint and a manual env-loop (CartPole-v1). No ConnectorV2s or EnvRunners are used in this example. -This example shows .. - - .. how to use an already existing checkpoint to extract a single-agent RLModule +This example: + - shows how to use an already existing checkpoint to extract a single-agent RLModule from (our policy network). - - .. how to setup this recovered policy net for action computations (with or without + - shows how to setup this recovered policy net for action computations (with or without using exploration). - - .. have the policy run through a very simple gymnasium based env-loop, w/o using + - shows have the policy run through a very simple gymnasium based env-loop, w/o using RLlib's ConnectorV2s or EnvRunners. diff --git a/rllib/examples/inference/policy_inference_after_training_w_connector.py b/rllib/examples/inference/policy_inference_after_training_w_connector.py index 6d97ef61f865..304c5ba76ed2 100644 --- a/rllib/examples/inference/policy_inference_after_training_w_connector.py +++ b/rllib/examples/inference/policy_inference_after_training_w_connector.py @@ -6,12 +6,12 @@ The RLModule contains an LSTM that requires its own previous STATE_OUT as new input at every episode step to compute a new action. -This example shows .. - - .. how to use an already existing checkpoint to extract a single-agent RLModule +This example: + - shows how to use an already existing checkpoint to extract a single-agent RLModule from (our policy network). - - .. how to setup this recovered policy net for action computations (with or without + - shows how to setup this recovered policy net for action computations (with or without using exploration). - - .. how to create a more complex env-loop in which the action-computing RLModule + - shows how to create a more complex env-loop in which the action-computing RLModule requires its own previous state outputs as new input and how to use RLlib's Episode APIs to achieve this. 
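The frame_stacking.py changes in this patch boil down to a two-sided wiring: one piece on the EnvRunner side so actions are computed on stacked observations, and one piece on the Learner side so the same stacks are rebuilt from the unstacked episodes that were shipped over the network. A hedged sketch of that setup; the `FrameStackingEnvToModule`/`FrameStackingLearner` names come from the docstring above, while their import paths, the `num_frames` argument, and the `learner_connector` hook living under `.training()` are assumptions:

# Sketch only; the real example additionally wraps the Atari env
# (`wrap_atari_for_new_api_stack`) and tunes PPO hyperparameters.
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.connectors.env_to_module.frame_stacking import FrameStackingEnvToModule
from ray.rllib.connectors.learner.frame_stacking import FrameStackingLearner

NUM_FRAMES = 4  # what the script exposes as `--num-frames`

config = (
    PPOConfig()
    .environment("ALE/Pong-v5")
    .env_runners(
        # Stack the last NUM_FRAMES observations only for the action-computing
        # forward pass; nothing extra is written back into the episodes, so
        # only 1x observations travel from the EnvRunners to the Learners.
        env_to_module_connector=lambda env: FrameStackingEnvToModule(
            num_frames=NUM_FRAMES,
        ),
    )
    .training(
        # Rebuild the same NUM_FRAMES stacks on the Learner side before the
        # training forward pass.
        learner_connector=lambda obs_space, act_space: FrameStackingLearner(
            num_frames=NUM_FRAMES,
        ),
    )
)
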
From 1ed0eec100e2209f18e3bb7f242a32f0189d918a Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 11 Jun 2024 16:25:25 +0200 Subject: [PATCH 02/10] wip Signed-off-by: sven1977 --- .../env_to_module/mean_std_filter.py | 2 +- .../checkpoint_by_custom_criteria.py | 14 +- .../continue_training_from_checkpoint.py | 19 +- .../restore_1_of_n_agents_from_checkpoint.py | 4 +- rllib/examples/connectors/frame_stacking.py | 13 +- .../examples/connectors/mean_std_filtering.py | 167 +++++++++--------- .../policy_inference_after_training.py | 12 +- ...cy_inference_after_training_w_connector.py | 14 +- 8 files changed, 120 insertions(+), 125 deletions(-) diff --git a/rllib/connectors/env_to_module/mean_std_filter.py b/rllib/connectors/env_to_module/mean_std_filter.py index e4709aff5b44..187fc9130826 100644 --- a/rllib/connectors/env_to_module/mean_std_filter.py +++ b/rllib/connectors/env_to_module/mean_std_filter.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List, Optional -from gymnasium.spaces import Discrete, MultiDiscrete import gymnasium as gym +from gymnasium.spaces import Discrete, MultiDiscrete import numpy as np import tree diff --git a/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py b/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py index 075cf8ca7e42..33204e52d5e9 100644 --- a/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py +++ b/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py @@ -4,13 +4,13 @@ - runs a CartPole experiment with three different learning rates (three tune "trials"). During the experiment, for each trial, we create a checkpoint at each iteration. - - at the end of the experiment, we compare the trials and pick the one that performed - best, based on the criterion: Lowest episode count per single iteration (for CartPole, - a low episode count means the episodes are very long and thus the reward is also very - high). - - from that best trial (with the lowest episode count), we then pick those checkpoints - that a) have the lowest policy loss (good) and b) have the highest value function loss - (bad). + - at the end of the experiment, we compare the trials and pick the one that + performed best, based on the criterion: Lowest episode count per single iteration + (for CartPole, a low episode count means the episodes are very long and thus the + reward is also very high). + - from that best trial (with the lowest episode count), we then pick those + checkpoints that a) have the lowest policy loss (good) and b) have the highest value + function loss (bad). How to run this script diff --git a/rllib/examples/checkpoints/continue_training_from_checkpoint.py b/rllib/examples/checkpoints/continue_training_from_checkpoint.py index 45e16e3f89fe..c52a7868b4e8 100644 --- a/rllib/examples/checkpoints/continue_training_from_checkpoint.py +++ b/rllib/examples/checkpoints/continue_training_from_checkpoint.py @@ -4,15 +4,16 @@ and you would therefore like to make your setup more robust and fault-tolerant. This example: - - runs a single- or multi-agent CartPole experiment (for multi-agent, we use different - learning rates) thereby checkpointing the state of the Algorithm every n iterations. - - stops the experiment due to an expected crash in the algorithm's main process after - a certain number of iterations. - - just for testing purposes, restores the entire algorithm from the latest checkpoint - and checks, whether the state of the restored algo exactly match the state of the - crashed one. 
- - then continues training with the restored algorithm until the desired final episode - return is reached. + - runs a single- or multi-agent CartPole experiment (for multi-agent, we use + different learning rates) thereby checkpointing the state of the Algorithm every n + iterations. + - stops the experiment due to an expected crash in the algorithm's main process + after a certain number of iterations. + - just for testing purposes, restores the entire algorithm from the latest + checkpoint and checks, whether the state of the restored algo exactly match the + state of the crashed one. + - then continues training with the restored algorithm until the desired final + episode return is reached. How to run this script diff --git a/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py b/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py index 8e1b3f0023bf..f3c83777e8e5 100644 --- a/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py +++ b/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py @@ -5,11 +5,11 @@ - Runs a multi-agent `Pendulum-v1` experiment with >= 2 policies. - Saves a checkpoint of the `MultiAgentRLModule` used every `--checkpoint-freq` iterations. - - Stops the experiments after the agents reach a combined return of `-800`. + - Stops the experiments after the agents reach a combined return of -800. - Picks the best checkpoint by combined return and restores policy 0 from it. - Runs a second experiment with the restored `RLModule` for policy 0 and a fresh `RLModule` for the other policies. - - Stops the second experiment after the agents reach a combined return of `-800`. + - Stops the second experiment after the agents reach a combined return of -800. How to run this script ---------------------- diff --git a/rllib/examples/connectors/frame_stacking.py b/rllib/examples/connectors/frame_stacking.py index dbc18f726559..e26918796ff4 100644 --- a/rllib/examples/connectors/frame_stacking.py +++ b/rllib/examples/connectors/frame_stacking.py @@ -54,26 +54,27 @@ ----------------- With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module -and learner connector pipelines), you should see something like: +and learner connector pipelines), you should see something like this using: +`--env ALE/Pong-v5 --num-gpus=4 --num-env-runners=95` +---------------------------+------------+--------+------------------+... | Trial name | status | iter | total time (s) | | | | | | |---------------------------+------------+--------+------------------+... -| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | +| PPO_atari-env_2fc4a_00000 | TERMINATED | 200 | 335.837 | +---------------------------+------------+--------+------------------+... -Note that the time to run these 10 iterations is about .% faster than when +Note that the time to run these 200 iterations is about ~5% faster than when performing framestacking already inside the environment (using a `gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic needed (sending back 4x[obs] batches instead of 1x[obs] to the learners). -Thus, with the `--use-gym-wrapper-framestacking` option, the output looks -like this: +Thus, with the `--use-gym-wrapper-framestacking` option (all other options being equal), +the output looks like this: +---------------------------+------------+--------+------------------+... | Trial name | status | iter | total time (s) | | | | | | |---------------------------+------------+--------+------------------+... 
-| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | +| PPO_atari-env_2fc4a_00000 | TERMINATED | 200 | 351.505 | +---------------------------+------------+--------+------------------+... """ import gymnasium as gym diff --git a/rllib/examples/connectors/mean_std_filtering.py b/rllib/examples/connectors/mean_std_filtering.py index 1603512bab9a..470812585138 100644 --- a/rllib/examples/connectors/mean_std_filtering.py +++ b/rllib/examples/connectors/mean_std_filtering.py @@ -20,25 +20,16 @@ different pipelines described above, as required. This example: - - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the - env-to-module pipeline. - - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the - learner connector pipeline. - - demonstrates that using these two pieces (rather than performing framestacking - already inside the environment using a gymnasium wrapper) increases overall - performance by about 5%. + - shows how the `MeanStdFilter` ConnectorV2 piece can be added to the env-to-module + pipeline. + - demonstrates that using such a filter enhances learning behavior (or even makes + if possible to learn overall) in some environments, especially those with lopsided + observation spaces, for example `Box(-3000, -1000, ...)`. How to run this script ---------------------- -`python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5` - -Use the `--num-frames` option to define the number of observations to framestack. -If you don't want to use Connectors to perform the framestacking, set the -`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a -gymnasium observation wrapper. In this case though, be aware that the tensors being -sent through the network are `--num-frames` x larger than if you use the Connector -setup. +`python [script file name].py --enable-new-api-stack` For debugging, use the following additional command line options `--no-tune --num-env-runners=0` @@ -52,40 +43,39 @@ Results to expect ----------------- - -With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module -and learner connector pipelines), you should see something like: -+---------------------------+------------+--------+------------------+... -| Trial name | status | iter | total time (s) | -| | | | | -|---------------------------+------------+--------+------------------+... -| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | -+---------------------------+------------+--------+------------------+... - -Note that the time to run these 10 iterations is about .% faster than when -performing framestacking already inside the environment (using a -`gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic -needed (sending back 4x[obs] batches instead of 1x[obs] to the learners). - -Thus, with the `--use-gym-wrapper-framestacking` option, the output looks -like this: -+---------------------------+------------+--------+------------------+... -| Trial name | status | iter | total time (s) | -| | | | | -|---------------------------+------------+--------+------------------+... -| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | -+---------------------------+------------+--------+------------------+... 
+Running this example with the mean-std filter results in the normally expected Pendulum +learning behavior: ++-------------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +| | | | | +|-------------------------------+------------+-----------------+--------+ +| PPO_lopsided-pend_f9c96_00000 | TERMINATED | 127.0.0.1:43612 | 77 | ++-------------------------------+------------+-----------------+--------+ ++------------------+------------------------+-----------------------+ +| total time (s) | num_env_steps_sample | episode_return_mean | +| | d_lifetime | | +|------------------+------------------------+-----------------------| +| 30.7466 | 40040 | -276.3 | ++------------------+------------------------+-----------------------+ + +If you try using the `--disable-mean-std-filter` (all other things being equal), you +will either see no learning progress at all (or a very slow one), but more likely some +numerical instability related error will be thrown: + +ValueError: Expected parameter loc (Tensor of shape (64, 1)) of distribution + Normal(loc: torch.Size([64, 1]), scale: torch.Size([64, 1])) to satisfy the + constraint Real(), but found invalid values: +tensor([[nan], + [nan], + [nan], + ... """ -from ray.air.constants import TRAINING_ITERATION +import gymnasium as gym +import numpy as np + from ray.rllib.connectors.env_to_module.mean_std_filter import MeanStdFilter from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.metrics import ( - ENV_RUNNER_RESULTS, - EPISODE_RETURN_MEAN, - EVALUATION_RESULTS, - NUM_ENV_STEPS_SAMPLED_LIFETIME, -) from ray.rllib.utils.test_utils import ( add_rllib_example_script_args, run_rllib_example_script_experiment, @@ -99,6 +89,21 @@ default_timesteps=500000, default_reward=-300.0, ) +parser.add_argument( + "--disable-mean-std-filter", + action="store_true", + help="Run w/o a mean/std env-to-module connector piece (filter).", +) + + +class LopsidedObs(gym.ObservationWrapper): + def __init__(self, env): + super().__init__(env) + self.observation_space = gym.spaces.Box(-4000.0, -1456.0, (3,), np.float32) + + def observation(self, observation): + # Lopside [-1.0, 1.0] Pendulum observations + return ((observation + 1.0) / 2.0) * (4000.0 - 1456.0) - 4000.0 if __name__ == "__main__": @@ -111,14 +116,16 @@ # Register our environment with tune. if args.num_agents > 0: register_env( - "env", + "lopsided-pend", lambda _: MultiAgentPendulum(config={"num_agents": args.num_agents}), ) + else: + register_env("lopsided-pend", lambda _: LopsidedObs(gym.make("Pendulum-v1"))) config = ( get_trainable_cls(args.algo) .get_default_config() - .environment("env" if args.num_agents > 0 else "Pendulum-v1") + .environment("lopsided-pend") .env_runners( # TODO (sven): MAEnvRunner does not support vectorized envs yet # due to gym's env checkers and non-compatability with RLlib's @@ -130,7 +137,9 @@ # included in an automatically generated EnvToModulePipeline or return a # EnvToModulePipeline directly. 
env_to_module_connector=( - lambda env: MeanStdFilter(multi_agent=args.num_agents > 0) + None + if args.disable_mean_std_filter + else lambda env: MeanStdFilter(multi_agent=args.num_agents > 0) ), ) .training( @@ -143,25 +152,7 @@ vf_clip_param=10.0, vf_loss_coeff=0.01, ) - .evaluation( - evaluation_num_env_runners=1, - evaluation_parallel_to_training=True, - evaluation_interval=1, - evaluation_duration=10, - evaluation_duration_unit="episodes", - evaluation_config={ - "explore": False, - # Do NOT use the eval EnvRunners' ConnectorV2 states. Instead, before - # each round of evaluation, broadcast the latest training - # EnvRunnerGroup's ConnectorV2 states (merged from all training remote - # EnvRunners) to the eval EnvRunnerGroup (and discard the eval - # EnvRunners' stats). - "use_worker_filter_stats": False, - }, - ) - ) - if args.enable_new_api_stack: - config = config.rl_module( + .rl_module( model_config_dict={ "fcnet_activation": "relu", "fcnet_weights_initializer": torch.nn.init.xavier_uniform_, @@ -170,17 +161,27 @@ "uses_new_env_runners": True, } ) - else: - config = config.training( - model=dict( - { - "fcnet_activation": "relu", - "fcnet_weights_initializer": torch.nn.init.xavier_uniform_, - "fcnet_bias_initializer": torch.nn.init.constant_, - "fcnet_bias_initializer_config": {"val": 0.0}, - } - ) - ) + # In case you would like to run with a evaluation EnvRunners, make sure your + # `evaluation_config` key contains the `use_worker_filter_stats=False` setting + # (see below). This setting makes sure that the mean/std stats collected by the + # evaluation EnvRunners are NOT used for the training EnvRunners (unless you + # really want to mix these stats). It's normally a good idea to keep the stats + # collected during evaluation completely out of the training data (already for + # better reproducibility alone). + # .evaluation( + # evaluation_num_env_runners=1, + # evaluation_interval=1, + # evaluation_config={ + # "explore": False, + # # Do NOT use the eval EnvRunners' ConnectorV2 states. Instead, before + # # each round of evaluation, broadcast the latest training + # # EnvRunnerGroup's ConnectorV2 states (merged from all training remote + # # EnvRunners) to the eval EnvRunnerGroup (and discard the eval + # # EnvRunners' stats). + # "use_worker_filter_stats": False, + # }, + # ) + ) # Add a simple multi-agent setup. if args.num_agents > 0: @@ -189,12 +190,4 @@ policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", ) - stop = { - TRAINING_ITERATION: args.stop_iters, - f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": ( - args.stop_reward - ), - NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, - } - - run_rllib_example_script_experiment(config, args, stop=stop) + run_rllib_example_script_experiment(config, args) diff --git a/rllib/examples/inference/policy_inference_after_training.py b/rllib/examples/inference/policy_inference_after_training.py index 5cdb6090f758..2525d5ca2935 100644 --- a/rllib/examples/inference/policy_inference_after_training.py +++ b/rllib/examples/inference/policy_inference_after_training.py @@ -5,12 +5,12 @@ used in this example. This example: - - shows how to use an already existing checkpoint to extract a single-agent RLModule - from (our policy network). - - shows how to setup this recovered policy net for action computations (with or without - using exploration). - - shows have the policy run through a very simple gymnasium based env-loop, w/o using - RLlib's ConnectorV2s or EnvRunners. 
+ - shows how to use an already existing checkpoint to extract a single-agent RLModule + from (our policy network). + - shows how to setup this recovered policy net for action computations (with or + without using exploration). + - shows have the policy run through a very simple gymnasium based env-loop, w/o + using RLlib's ConnectorV2s or EnvRunners. How to run this script diff --git a/rllib/examples/inference/policy_inference_after_training_w_connector.py b/rllib/examples/inference/policy_inference_after_training_w_connector.py index 304c5ba76ed2..e4a66ec33266 100644 --- a/rllib/examples/inference/policy_inference_after_training_w_connector.py +++ b/rllib/examples/inference/policy_inference_after_training_w_connector.py @@ -7,13 +7,13 @@ at every episode step to compute a new action. This example: - - shows how to use an already existing checkpoint to extract a single-agent RLModule - from (our policy network). - - shows how to setup this recovered policy net for action computations (with or without - using exploration). - - shows how to create a more complex env-loop in which the action-computing RLModule - requires its own previous state outputs as new input and how to use RLlib's Episode - APIs to achieve this. + - shows how to use an already existing checkpoint to extract a single-agent RLModule + from (our policy network). + - shows how to setup this recovered policy net for action computations (with or + without using exploration). + - shows how to create a more complex env-loop in which the action-computing RLModule + requires its own previous state outputs as new input and how to use RLlib's Episode + APIs to achieve this. How to run this script From db94e46ffb13235894dae52a4c47d47643115b1b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 11 Jun 2024 17:44:04 +0200 Subject: [PATCH 03/10] wip Signed-off-by: sven1977 --- .../restore_1_of_n_agents_from_checkpoint.py | 18 ++++++----- .../connectors/prev_actions_prev_rewards.py | 31 +++++-------------- 2 files changed, 18 insertions(+), 31 deletions(-) diff --git a/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py b/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py index f3c83777e8e5..bf6889113fed 100644 --- a/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py +++ b/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py @@ -2,14 +2,15 @@ from checkpoint. This example: - - Runs a multi-agent `Pendulum-v1` experiment with >= 2 policies. - - Saves a checkpoint of the `MultiAgentRLModule` used every `--checkpoint-freq` - iterations. - - Stops the experiments after the agents reach a combined return of -800. - - Picks the best checkpoint by combined return and restores policy 0 from it. - - Runs a second experiment with the restored `RLModule` for policy 0 and - a fresh `RLModule` for the other policies. - - Stops the second experiment after the agents reach a combined return of -800. + - Runs a multi-agent `Pendulum-v1` experiment with >= 2 policies. + - Saves a checkpoint of the `MultiAgentRLModule` used every `--checkpoint-freq` + iterations. + - Stops the experiments after the agents reach a combined return of -800. + - Picks the best checkpoint by combined return and restores policy 0 from it. + - Runs a second experiment with the restored `RLModule` for policy 0 and + a fresh `RLModule` for the other policies. + - Stops the second experiment after the agents reach a combined return of -800. 
+ How to run this script ---------------------- @@ -34,6 +35,7 @@ `--wandb-key=[your WandB API key] --wandb-project=[some project name] --wandb-run-name=[optional: WandB run name (within the defined project)]` + Results to expect ----------------- You should expect a reward of -400.0 eventually being achieved by a simple diff --git a/rllib/examples/connectors/prev_actions_prev_rewards.py b/rllib/examples/connectors/prev_actions_prev_rewards.py index 2b62fcdd0ef8..f24b96078aeb 100644 --- a/rllib/examples/connectors/prev_actions_prev_rewards.py +++ b/rllib/examples/connectors/prev_actions_prev_rewards.py @@ -1,4 +1,4 @@ -"""Example using connectors (V2) for observation frame-stacking in Atari environments. +"""Example using a ConnectorV2 to add previous rewards/actions to an RLModule's input. An RLlib Algorithm has 3 distinct connector pipelines: - An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing @@ -20,10 +20,11 @@ different pipelines described above, as required. This example: - - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the - env-to-module pipeline. - - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the - learner connector pipeline. + - shows how the `PrevActionsPrevRewardsConnector` piece can be added to the + env-to-module pipeline to extract previous rewards and/or actions from the ongoing + episodes. + - shows how this connector creates and wraps this information together to the RLModule's + original observation and . - demonstrates that using these two pieces (rather than performing framestacking already inside the environment using a gymnasium wrapper) increases overall performance by about 5%. @@ -146,10 +147,7 @@ def _env_to_module(env): train_batch_size=4000, vf_loss_coeff=0.01, ) - ) - - if args.enable_new_api_stack: - config = config.rl_module( + .rl_module( model_config_dict={ "use_lstm": True, "max_seq_len": 50, @@ -161,20 +159,7 @@ def _env_to_module(env): "uses_new_env_runners": True, } ) - else: - config = config.training( - model=dict( - { - "use_lstm": True, - "max_seq_len": 50, - "fcnet_hiddens": [32], - "fcnet_activation": "linear", - "vf_share_layers": True, - "fcnet_weights_initializer": nn.init.xavier_uniform_, - "fcnet_bias_initializer": functools.partial(nn.init.constant_, 0.0), - } - ) - ) + ) # Add a simple multi-agent setup. if args.num_agents > 0: From cc8277c8b66ae451479bd6656e73f829a12a21ae Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 12 Jun 2024 11:51:32 +0200 Subject: [PATCH 04/10] wip Signed-off-by: sven1977 --- rllib/connectors/env_to_module/flatten_observations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/connectors/env_to_module/flatten_observations.py b/rllib/connectors/env_to_module/flatten_observations.py index 1958f9e871d1..c3443b3dff7f 100644 --- a/rllib/connectors/env_to_module/flatten_observations.py +++ b/rllib/connectors/env_to_module/flatten_observations.py @@ -199,7 +199,7 @@ def __call__( if not agent_id else self._input_obs_base_struct[agent_id] ), - # Our items are bare observations (no batch axis present). + # Our items are individual observations (no batch axis present). 
batch_axis=False, ) ) From f1d05fd674d21ae74fb618cf7a81bb53e2ec3fac Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 12 Jun 2024 13:16:48 +0200 Subject: [PATCH 05/10] wip Signed-off-by: sven1977 --- rllib/connectors/env_to_module/__init__.py | 4 +- .../env_to_module/flatten_observations.py | 68 ++++++++++--------- .../env_to_module/mean_std_filter.py | 13 ++-- .../prev_actions_prev_rewards.py | 38 ++++++----- .../connectors/prev_actions_prev_rewards.py | 18 ++--- .../rock_paper_scissors_learned_vs_learned.py | 12 +--- 6 files changed, 76 insertions(+), 77 deletions(-) diff --git a/rllib/connectors/env_to_module/__init__.py b/rllib/connectors/env_to_module/__init__.py index 8f2750c9a807..98b73bd9962b 100644 --- a/rllib/connectors/env_to_module/__init__.py +++ b/rllib/connectors/env_to_module/__init__.py @@ -14,7 +14,7 @@ FlattenObservations, ) from ray.rllib.connectors.env_to_module.prev_actions_prev_rewards import ( - PrevActionsPrevRewardsConnector, + PrevActionsPrevRewards, ) from ray.rllib.connectors.env_to_module.write_observations_to_episodes import ( WriteObservationsToEpisodes, @@ -29,6 +29,6 @@ "EnvToModulePipeline", "FlattenObservations", "NumpyToTensor", - "PrevActionsPrevRewardsConnector", + "PrevActionsPrevRewards", "WriteObservationsToEpisodes", ] diff --git a/rllib/connectors/env_to_module/flatten_observations.py b/rllib/connectors/env_to_module/flatten_observations.py index c3443b3dff7f..57ec233a4a53 100644 --- a/rllib/connectors/env_to_module/flatten_observations.py +++ b/rllib/connectors/env_to_module/flatten_observations.py @@ -19,18 +19,12 @@ class FlattenObservations(ConnectorV2): """A connector piece that flattens all observation components into a 1D array. - - Only works on data that has already been added to the batch. - - This connector makes the assumption that under the Columns.OBS key in batch, - there is either a list of individual env observations to be flattened (single-agent - case) or a dict mapping agent- and module IDs to lists of data items to be - flattened (multi-agent case). - - Does NOT work in a Learner pipeline as it operates on individual observation - items (as opposed to batched/time-ranked data). - - Therefore, assumes that the altered (flattened) observations will be written - back into the episode by a later connector piece in the env-to-module pipeline - (which this piece is part of as well). - - Does NOT read any information from the given list of Episode objects. - - Does NOT write any observations (or other data) to the given Episode objects. + - Works directly on the incoming episodes list and changes the last observation + in-place (write the flattened observation back into the episode). + - This connector does NOT alter the incoming batch (`data`) when called. + - This connector does NOT work in a `LearnerConnectorPipeline` because it requires + the incoming episodes to still be ongoing (in progress) as it only alters the + latest observation, not all observations in an episode. .. testcode:: @@ -177,32 +171,40 @@ def __call__( f"for this connector to work!" ) - # Process each item under the Columns.OBS key individually and flatten - # it. We are using the `ConnectorV2.foreach_batch_item_change_in_place` API, - # allowing us to not worry about multi- or single-agent setups and returning - # the new version of each item we are iterating over. - self.foreach_batch_item_change_in_place( - batch=data, - column=Columns.OBS, - func=( - lambda item, eps_id, agent_id, module_id: ( - # Multi-agent AND skip this AgentID. 
- item - if self._agent_ids and agent_id not in self._agent_ids - # Single-agent or flatten this AgentIDs observation. + for sa_episode in self.single_agent_episode_iterator( + episodes, agents_that_stepped_only=True + ): + # Episode is not finalized yet and thus still operates on lists of items. + assert not sa_episode.is_finalized + + last_obs = sa_episode.get_observations(-1) + + if self._multi_agent: + flattened_obs = { + agent_obs if aid not in self._agent_ids else flatten_inputs_to_1d_tensor( - item, + inputs=agent_obs, # In the multi-agent case, we need to use the specific agent's # space struct, not the multi-agent observation space dict. - ( - self._input_obs_base_struct - if not agent_id - else self._input_obs_base_struct[agent_id] - ), + spaces_struct=self._input_obs_base_struct[agent_id], # Our items are individual observations (no batch axis present). batch_axis=False, ) + for aid, agent_obs in last_obs.items() + } + else: + flattened_obs = flatten_inputs_to_1d_tensor( + inputs=last_obs, + spaces_struct=self._input_obs_base_struct, + # Our items are individual observations (no batch axis present). + batch_axis=False, ) - ), - ) + + # Write new observation directly back into the episode. + sa_episode.set_observations(at_indices=-1, new_data=flattened_obs) + # We set the Episode's observation space to ours so that we can safely + # set the last obs to the new value (without causing a space mismatch + # error). + sa_episode.observation_space = self.observation_space + return data diff --git a/rllib/connectors/env_to_module/mean_std_filter.py b/rllib/connectors/env_to_module/mean_std_filter.py index 187fc9130826..09b5e4f0fbcf 100644 --- a/rllib/connectors/env_to_module/mean_std_filter.py +++ b/rllib/connectors/env_to_module/mean_std_filter.py @@ -19,6 +19,8 @@ class MeanStdFilter(ConnectorV2): """A connector used to mean-std-filter observations. + + Incoming observations are filtered such that the output of this filter is on average 0.0 and has a standard deviation of 1.0. If the observation space is a (possibly nested) dict, this filtering is applied separately per element of @@ -121,13 +123,10 @@ def __call__( sa_obs, update=self._update_stats ) sa_episode.set_observations(at_indices=-1, new_data=normalized_sa_obs) - - if len(sa_episode) == 0: - # TODO (sven): This is kind of a hack. - # We set the Episode's observation space to ours so that we can safely - # set the last obs to the new value (without causing a space mismatch - # error). - sa_episode.observation_space = self.observation_space + # We set the Episode's observation space to ours so that we can safely + # set the last obs to the new value (without causing a space mismatch + # error). + sa_episode.observation_space = self.observation_space # Leave `data` as is. RLlib's default connector will automatically # populate the OBS column therein from the episodes' now transformed diff --git a/rllib/connectors/env_to_module/prev_actions_prev_rewards.py b/rllib/connectors/env_to_module/prev_actions_prev_rewards.py index 5a0222fceb0c..f48c5e459a39 100644 --- a/rllib/connectors/env_to_module/prev_actions_prev_rewards.py +++ b/rllib/connectors/env_to_module/prev_actions_prev_rewards.py @@ -12,7 +12,7 @@ from ray.rllib.utils.typing import EpisodeType -class PrevActionsPrevRewardsConnector(ConnectorV2): +class PrevActionsPrevRewards(ConnectorV2): """A connector piece that adds previous rewards and actions to the input obs. - Requires Columns.OBS to be already a part of the batch. 
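
(To make concrete what this piece produces, here is an illustrative sample of the
augmented observation it builds, assuming a `Box(4,)` observation space, `Discrete(2)`
actions, and `n_prev_actions=1` / `n_prev_rewards=1`; the exact key strings are taken
from the class's `ORIG_OBS_KEY`, `PREV_ACTIONS_KEY`, and `PREV_REWARDS_KEY` attributes
and may differ between versions of this connector.)

    import numpy as np

    # Roughly what one augmented observation looks like after this piece has run:
    augmented_obs = {
        "_orig_obs": np.array([0.01, -0.02, 0.03, 0.04], dtype=np.float32),  # env obs
        "prev_actions": np.array([1]),    # flattened action(s) of the previous step(s)
        "prev_rewards": np.array([0.5]),  # reward(s) of the previous step(s)
    }

A `FlattenObservations` piece placed right after this one (as the example script does)
then squashes this dict into a single 1D tensor before it reaches the RLModule.
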
@@ -64,7 +64,7 @@ def __init__( n_prev_rewards: int = 1, **kwargs, ): - """Initializes a PrevActionsPrevRewardsConnector instance. + """Initializes a PrevActionsPrevRewards instance. Args: multi_agent: Whether this is a connector operating on a multi-agent @@ -116,7 +116,6 @@ def __call__( f"for this connector to work!" ) - new_obs = [] for sa_episode, orig_obs in self.single_agent_episode_iterator( episodes, zip_with_batch_column=observations ): @@ -142,21 +141,26 @@ def __call__( ) ) - new_obs.append( - { - self.ORIG_OBS_KEY: orig_obs, - self.PREV_ACTIONS_KEY: prev_n_actions, - self.PREV_REWARDS_KEY: prev_n_rewards, - } - ) + augmented_obs = { + self.ORIG_OBS_KEY: orig_obs, + self.PREV_ACTIONS_KEY: prev_n_actions, + self.PREV_REWARDS_KEY: prev_n_rewards, + } - # Convert the observations in the batch into a dict with the keys: - # "_obs", "_prev_rewards", and "_prev_actions". - self.foreach_batch_item_change_in_place( - batch=data, - column=Columns.OBS, - func=lambda orig_obs, eps_id, agent_id, module_id: new_obs.pop(0), - ) + # Write new observation directly back into the episode. + sa_episode.set_observations(at_indices=-1, new_data=augmented_obs) + # We set the Episode's observation space to ours so that we can safely + # set the last obs to the new value (without causing a space mismatch + # error). + sa_episode.observation_space = self.observation_space + + ## Convert the observations in the batch into a dict with the keys: + ## "_obs", "_prev_rewards", and "_prev_actions". + #self.foreach_batch_item_change_in_place( + # batch=data, + # column=Columns.OBS, + # func=lambda orig_obs, eps_id, agent_id, module_id: new_obs.pop(0), + #) return data diff --git a/rllib/examples/connectors/prev_actions_prev_rewards.py b/rllib/examples/connectors/prev_actions_prev_rewards.py index f24b96078aeb..82003157c19c 100644 --- a/rllib/examples/connectors/prev_actions_prev_rewards.py +++ b/rllib/examples/connectors/prev_actions_prev_rewards.py @@ -20,12 +20,14 @@ different pipelines described above, as required. This example: - - shows how the `PrevActionsPrevRewardsConnector` piece can be added to the + - shows how the `PrevActionsPrevRewards` ConnectorV2 piece can be added to the env-to-module pipeline to extract previous rewards and/or actions from the ongoing episodes. - - shows how this connector creates and wraps this information together to the RLModule's - original observation and . - - demonstrates that using these two pieces (rather than performing framestacking + - shows how this connector creates and wraps this new information (rewards and + actions) together with the original observations into the RLModule's input dict + as a new gym.spaces.Dict structure. + - demonstrates how to plug in RLlib's in-house observation flattening + connector after the that using these two pieces (rather than performing framestacking already inside the environment using a gymnasium wrapper) increases overall performance by about 5%. @@ -83,7 +85,7 @@ from ray.rllib.connectors.env_to_module import ( AddObservationsFromEpisodesToBatch, FlattenObservations, - PrevActionsPrevRewardsConnector, + PrevActionsPrevRewards, WriteObservationsToEpisodes, ) from ray.rllib.examples.envs.classes.stateless_cartpole import StatelessCartPole @@ -116,14 +118,14 @@ def _env_to_module(env): # Create the env-to-module connector pipeline. 
return [ - AddObservationsFromEpisodesToBatch(), - PrevActionsPrevRewardsConnector( + #AddObservationsFromEpisodesToBatch(), + PrevActionsPrevRewards( multi_agent=args.num_agents > 0, n_prev_rewards=args.n_prev_rewards, n_prev_actions=args.n_prev_actions, ), FlattenObservations(multi_agent=args.num_agents > 0), - WriteObservationsToEpisodes(), + #WriteObservationsToEpisodes(), ] # Register our environment with tune. diff --git a/rllib/examples/multi_agent/rock_paper_scissors_learned_vs_learned.py b/rllib/examples/multi_agent/rock_paper_scissors_learned_vs_learned.py index 507c018babc8..e3e75c990692 100644 --- a/rllib/examples/multi_agent/rock_paper_scissors_learned_vs_learned.py +++ b/rllib/examples/multi_agent/rock_paper_scissors_learned_vs_learned.py @@ -15,11 +15,7 @@ from pettingzoo.classic import rps_v2 -from ray.rllib.connectors.env_to_module import ( - AddObservationsFromEpisodesToBatch, - FlattenObservations, - WriteObservationsToEpisodes, -) +from ray.rllib.connectors.env_to_module import FlattenObservations from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv @@ -62,11 +58,7 @@ .get_default_config() .environment("RockPaperScissors") .env_runners( - env_to_module_connector=lambda env: ( - AddObservationsFromEpisodesToBatch(), - FlattenObservations(multi_agent=True), - WriteObservationsToEpisodes(), - ), + env_to_module_connector=lambda env: FlattenObservations(multi_agent=True), ) .multi_agent( policies={"p0", "p1"}, From e278e67eb67ab64c7f35554dac42b159020da4aa Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 12 Jun 2024 14:30:49 +0200 Subject: [PATCH 06/10] wip Signed-off-by: sven1977 --- rllib/BUILD | 52 ++--- .../env_to_module/flatten_observations.py | 16 +- .../prev_actions_prev_rewards.py | 35 +--- rllib/examples/actions/__init__.py | 0 .../examples/actions/nested_action_spaces.py | 92 +++++++++ ....py => flatten_observations_dict_space.py} | 81 ++++---- .../connectors/nested_action_spaces.py | 178 ------------------ .../connectors/prev_actions_prev_rewards.py | 17 +- .../curriculum/curriculum_learning.py | 12 +- ...ock_paper_scissors_heuristic_vs_learned.py | 10 +- .../two_step_game_with_grouped_agents.py | 12 +- 11 files changed, 177 insertions(+), 328 deletions(-) create mode 100644 rllib/examples/actions/__init__.py create mode 100644 rllib/examples/actions/nested_action_spaces.py rename rllib/examples/connectors/{nested_observation_spaces.py => flatten_observations_dict_space.py} (60%) delete mode 100644 rllib/examples/connectors/nested_action_spaces.py diff --git a/rllib/BUILD b/rllib/BUILD index 3ff7a8275461..c2d310d337e8 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2086,6 +2086,27 @@ py_test( # tagged by @OldAPIStack and/or @HybridAPIStack # ---------------------- +# subdirectory: actions/ + +# Nested action spaces (flattening obs and learning w/ multi-action distribution). 
+py_test( + name = "examples/actions/nested_action_spaces_ppo", + main = "examples/actions/nested_action_spaces.py", + tags = ["team:rllib", "exclusive", "examples"], + size = "large", + srcs = ["examples/actions/nested_action_spaces.py"], + args = ["--enable-new-api-stack", "--as-test", "--framework=torch", "--stop-reward=-500.0", "--algo=PPO"] +) + +py_test( + name = "examples/actions/nested_action_spaces_multi_agent_ppo", + main = "examples/actions/nested_action_spaces.py", + tags = ["team:rllib", "exclusive", "examples"], + size = "large", + srcs = ["examples/actions/nested_action_spaces.py"], + args = ["--enable-new-api-stack", "--as-test", "--num-agents=2", "--framework=torch", "--stop-reward=-1000.0", "--algo=PPO"] +) + # subdirectory: algorithms/ #@OldAPIStack @@ -2213,41 +2234,22 @@ py_test( args = ["--enable-new-api-stack", "--num-agents=2", "--stop-iter=2", "--framework=torch", "--algo=PPO", "--num-env-runners=4", "--num-cpus=6"] ) -# Nested action spaces (flattening obs and learning w/ multi-action distribution). -py_test( - name = "examples/connectors/nested_action_spaces_ppo", - main = "examples/connectors/nested_action_spaces.py", - tags = ["team:rllib", "exclusive", "examples"], - size = "large", - srcs = ["examples/connectors/nested_action_spaces.py"], - args = ["--enable-new-api-stack", "--as-test", "--framework=torch", "--stop-reward=-500.0", "--algo=PPO"] -) - -py_test( - name = "examples/connectors/nested_action_spaces_multi_agent_ppo", - main = "examples/connectors/nested_action_spaces.py", - tags = ["team:rllib", "exclusive", "examples"], - size = "large", - srcs = ["examples/connectors/nested_action_spaces.py"], - args = ["--enable-new-api-stack", "--as-test", "--num-agents=2", "--framework=torch", "--stop-reward=-1000.0", "--algo=PPO"] -) - # Nested observation spaces (flattening). 
py_test( - name = "examples/connectors/nested_observation_spaces_ppo", - main = "examples/connectors/nested_observation_spaces.py", + name = "examples/connectors/flatten_observations_dict_space_ppo", + main = "examples/connectors/flatten_observations_dict_space.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/connectors/nested_observation_spaces.py"], + srcs = ["examples/connectors/flatten_observations_dict_space.py"], args = ["--enable-new-api-stack", "--as-test", "--stop-reward=400.0", "--framework=torch", "--algo=PPO"] ) py_test( - name = "examples/connectors/nested_observation_spaces_multi_agent_ppo", - main = "examples/connectors/nested_observation_spaces.py", + name = "examples/connectors/flatten_observations_dict_space_multi_agent_ppo", + main = "examples/connectors/flatten_observations_dict_space.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/connectors/nested_observation_spaces.py"], + srcs = ["examples/connectors/flatten_observations_dict_space.py"], args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=800.0", "--framework=torch", "--algo=PPO"] ) diff --git a/rllib/connectors/env_to_module/flatten_observations.py b/rllib/connectors/env_to_module/flatten_observations.py index 57ec233a4a53..8ac27ac8b817 100644 --- a/rllib/connectors/env_to_module/flatten_observations.py +++ b/rllib/connectors/env_to_module/flatten_observations.py @@ -6,7 +6,6 @@ import tree # pip install dm_tree from ray.rllib.connectors.connector_v2 import ConnectorV2 -from ray.rllib.core.columns import Columns from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.utils.annotations import override from ray.rllib.utils.numpy import flatten_inputs_to_1d_tensor @@ -163,16 +162,8 @@ def __call__( shared_data: Optional[dict] = None, **kwargs, ) -> Any: - observations = data.get(Columns.OBS) - - if observations is None: - raise ValueError( - f"`batch` must already have a column named {Columns.OBS} in it " - f"for this connector to work!" - ) - for sa_episode in self.single_agent_episode_iterator( - episodes, agents_that_stepped_only=True + episodes, agents_that_stepped_only=True ): # Episode is not finalized yet and thus still operates on lists of items. assert not sa_episode.is_finalized @@ -181,7 +172,8 @@ def __call__( if self._multi_agent: flattened_obs = { - agent_obs if aid not in self._agent_ids + agent_obs + if agent_id not in self._agent_ids else flatten_inputs_to_1d_tensor( inputs=agent_obs, # In the multi-agent case, we need to use the specific agent's @@ -190,7 +182,7 @@ def __call__( # Our items are individual observations (no batch axis present). 
batch_axis=False, ) - for aid, agent_obs in last_obs.items() + for agent_id, agent_obs in last_obs.items() } else: flattened_obs = flatten_inputs_to_1d_tensor( diff --git a/rllib/connectors/env_to_module/prev_actions_prev_rewards.py b/rllib/connectors/env_to_module/prev_actions_prev_rewards.py index f48c5e459a39..89aa51f69e2b 100644 --- a/rllib/connectors/env_to_module/prev_actions_prev_rewards.py +++ b/rllib/connectors/env_to_module/prev_actions_prev_rewards.py @@ -5,7 +5,6 @@ import numpy as np from ray.rllib.connectors.connector_v2 import ConnectorV2 -from ray.rllib.core.columns import Columns from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.utils.annotations import override from ray.rllib.utils.spaces.space_utils import batch, flatten_to_single_ndarray @@ -36,8 +35,6 @@ class PrevActionsPrevRewards(ConnectorV2): """ ORIG_OBS_KEY = "_orig_obs" - PREV_ACTIONS_KEY = "prev_actions" - PREV_REWARDS_KEY = "prev_rewards" @override(ConnectorV2) def recompute_observation_space_from_input_spaces(self): @@ -108,22 +105,16 @@ def __call__( shared_data: Optional[dict] = None, **kwargs, ) -> Any: - observations = data.get(Columns.OBS) - - if observations is None: - raise ValueError( - f"`batch` must already have a column named {Columns.OBS} in it " - f"for this connector to work!" - ) - - for sa_episode, orig_obs in self.single_agent_episode_iterator( - episodes, zip_with_batch_column=observations + for sa_episode in self.single_agent_episode_iterator( + episodes, agents_that_stepped_only=True ): # Episode is not finalized yet and thus still operates on lists of items. assert not sa_episode.is_finalized + augmented_obs = {self.ORIG_OBS_KEY: sa_episode.get_observations(-1)} + if self.n_prev_actions: - prev_n_actions = flatten_to_single_ndarray( + augmented_obs[self.PREV_ACTIONS_KEY] = flatten_to_single_ndarray( batch( sa_episode.get_actions( indices=slice(-self.n_prev_actions, None), @@ -134,19 +125,13 @@ def __call__( ) if self.n_prev_rewards: - prev_n_rewards = np.array( + augmented_obs[self.PREV_REWARDS_KEY] = np.array( sa_episode.get_rewards( indices=slice(-self.n_prev_rewards, None), fill=0.0, ) ) - augmented_obs = { - self.ORIG_OBS_KEY: orig_obs, - self.PREV_ACTIONS_KEY: prev_n_actions, - self.PREV_REWARDS_KEY: prev_n_rewards, - } - # Write new observation directly back into the episode. sa_episode.set_observations(at_indices=-1, new_data=augmented_obs) # We set the Episode's observation space to ours so that we can safely @@ -154,14 +139,6 @@ def __call__( # error). sa_episode.observation_space = self.observation_space - ## Convert the observations in the batch into a dict with the keys: - ## "_obs", "_prev_rewards", and "_prev_actions". 
- #self.foreach_batch_item_change_in_place( - # batch=data, - # column=Columns.OBS, - # func=lambda orig_obs, eps_id, agent_id, module_id: new_obs.pop(0), - #) - return data def _convert_individual_space(self, obs_space, act_space): diff --git a/rllib/examples/actions/__init__.py b/rllib/examples/actions/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/examples/actions/nested_action_spaces.py b/rllib/examples/actions/nested_action_spaces.py new file mode 100644 index 000000000000..db7ad434c674 --- /dev/null +++ b/rllib/examples/actions/nested_action_spaces.py @@ -0,0 +1,92 @@ +from gymnasium.spaces import Dict, Tuple, Box, Discrete, MultiDiscrete + +from ray.tune.registry import register_env +from ray.rllib.connectors.env_to_module import FlattenObservations +from ray.rllib.examples.envs.classes.multi_agent import ( + MultiAgentNestedSpaceRepeatAfterMeEnv, +) +from ray.rllib.examples.envs.classes.nested_space_repeat_after_me_env import ( + NestedSpaceRepeatAfterMeEnv, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + + +# Read in common example script command line arguments. +parser = add_rllib_example_script_args(default_timesteps=200000, default_reward=-500.0) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + + # Define env-to-module-connector pipeline for the new stack. + def _env_to_module_pipeline(env): + return FlattenObservations(multi_agent=args.num_agents > 0) + + # Register our environment with tune. + if args.num_agents > 0: + register_env( + "env", + lambda c: MultiAgentNestedSpaceRepeatAfterMeEnv( + config=dict(c, **{"num_agents": args.num_agents}) + ), + ) + else: + register_env("env", lambda c: NestedSpaceRepeatAfterMeEnv(c)) + + # Define the AlgorithmConfig used. + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment( + "env", + env_config={ + "space": Dict( + { + "a": Tuple( + [Dict({"d": Box(-15.0, 3.0, ()), "e": Discrete(3)})] + ), + "b": Box(-10.0, 10.0, (2,)), + "c": MultiDiscrete([3, 3]), + "d": Discrete(2), + } + ), + "episode_len": 100, + }, + ) + .env_runners(env_to_module_connector=_env_to_module_pipeline) + # No history in Env (bandit problem). + .training( + gamma=0.0, + lr=0.0005, + model=( + {} if not args.enable_new_api_stack else {"uses_new_env_runners": True} + ), + ) + ) + + # Add a simple multi-agent setup. + if args.num_agents > 0: + base_config.multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + + # Fix some PPO-specific settings. + if args.algo == "PPO": + base_config.training( + # We don't want high entropy in this Env. + entropy_coeff=0.00005, + num_sgd_iter=4, + vf_loss_coeff=0.01, + ) + + # Run everything as configured. 
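+    # (`run_rllib_example_script_experiment` is RLlib's shared example/test helper: it
+    # starts Ray, wraps `base_config` in a Tune experiment, and derives stopping
+    # criteria and test assertions from the common CLI args, for example
+    # `--stop-reward` and `--as-test`.)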
+ run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/examples/connectors/nested_observation_spaces.py b/rllib/examples/connectors/flatten_observations_dict_space.py similarity index 60% rename from rllib/examples/connectors/nested_observation_spaces.py rename to rllib/examples/connectors/flatten_observations_dict_space.py index cf98909502ee..bed31ce5ac28 100644 --- a/rllib/examples/connectors/nested_observation_spaces.py +++ b/rllib/examples/connectors/flatten_observations_dict_space.py @@ -1,4 +1,4 @@ -"""Example using connectors (V2) for observation frame-stacking in Atari environments. +"""Example using a ConnectorV2 to flatten arbitrarily nested dict or tuple observations. An RLlib Algorithm has 3 distinct connector pipelines: - An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing @@ -20,25 +20,30 @@ different pipelines described above, as required. This example: - - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the + - shows how the `FlattenObservation` ConnectorV2 piece can be added to the env-to-module pipeline. - - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the - learner connector pipeline. - - demonstrates that using these two pieces (rather than performing framestacking - already inside the environment using a gymnasium wrapper) increases overall - performance by about 5%. + - demonstrates that by using this connector, any arbitrarily nested dict or tuple + observations is properly flattened into a simple 1D tensor, for easier RLModule + processing. + - shows how - in a multi-agent setup - individual agents can be specified, whose + observations should be flattened (while other agents' observations will always + be left as-is). + - uses a variant of the CartPole-v1 environment, in which the 4 observation items + (x-pos, x-veloc, angle, and angle-veloc) are taken apart and put into a nested dict + with the structure: + { + "x-pos": [x-pos], + "angular-pos": { + "value": [angle], + "some_random_stuff": [random Discrete(3)], # <- should be ignored by algo + }, + "velocs": Tuple([x-veloc], [angle-veloc]), + } How to run this script ---------------------- -`python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5` - -Use the `--num-frames` option to define the number of observations to framestack. -If you don't want to use Connectors to perform the framestacking, set the -`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a -gymnasium observation wrapper. In this case though, be aware that the tensors being -sent through the network are `--num-frames` x larger than if you use the Connector -setup. +`python [script file name].py --enable-new-api-stack` For debugging, use the following additional command line options `--no-tune --num-env-runners=0` @@ -53,35 +58,21 @@ Results to expect ----------------- -With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module -and learner connector pipelines), you should see something like: -+---------------------------+------------+--------+------------------+... -| Trial name | status | iter | total time (s) | -| | | | | -|---------------------------+------------+--------+------------------+... -| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | -+---------------------------+------------+--------+------------------+... 
- -Note that the time to run these 10 iterations is about .% faster than when -performing framestacking already inside the environment (using a -`gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic -needed (sending back 4x[obs] batches instead of 1x[obs] to the learners). - -Thus, with the `--use-gym-wrapper-framestacking` option, the output looks -like this: -+---------------------------+------------+--------+------------------+... -| Trial name | status | iter | total time (s) | -| | | | | -|---------------------------+------------+--------+------------------+... -| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | -+---------------------------+------------+--------+------------------+... ++---------------------+------------+----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +| | | | | | +|---------------------+------------+----------------+--------+------------------+ +| PPO_env_a2fd6_00000 | TERMINATED | 127.0.0.1:7409 | 25 | 24.1426 | ++---------------------+------------+----------------+--------+------------------+ +------------------------+------------------------+------------------------+ + num_env_steps_sample | num_env_steps_traine | episode_return_mean | + d_lifetime | d_lifetime | | +------------------------+------------------------+------------------------| + 100000 | 100000 | 421.42 | +------------------------+------------------------+------------------------+ """ from ray.tune.registry import register_env -from ray.rllib.connectors.env_to_module import ( - AddObservationsFromEpisodesToBatch, - FlattenObservations, - WriteObservationsToEpisodes, -) +from ray.rllib.connectors.env_to_module import FlattenObservations from ray.rllib.examples.envs.classes.cartpole_with_dict_observation_space import ( CartPoleWithDictObservationSpace, ) @@ -108,11 +99,7 @@ # Define env-to-module-connector pipeline for the new stack. def _env_to_module_pipeline(env): - return [ - AddObservationsFromEpisodesToBatch(), - FlattenObservations(multi_agent=args.num_agents > 0), - WriteObservationsToEpisodes(), - ] + return FlattenObservations(multi_agent=args.num_agents > 0) # Register our environment with tune. if args.num_agents > 0: diff --git a/rllib/examples/connectors/nested_action_spaces.py b/rllib/examples/connectors/nested_action_spaces.py deleted file mode 100644 index 86df316c7916..000000000000 --- a/rllib/examples/connectors/nested_action_spaces.py +++ /dev/null @@ -1,178 +0,0 @@ -"""Example using connectors (V2) for observation frame-stacking in Atari environments. - -An RLlib Algorithm has 3 distinct connector pipelines: -- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing -a batch for an RLModule to compute actions (`forward_inference()` or -`forward_exploration()`). -- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting -it into an action readable by the environment. -- A learner connector pipeline on a Learner taking a list of episodes and producing -a batch for an RLModule to perform the training forward pass (`forward_train()`). - -Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib -adds/prepends to these pipelines in order to perform the most basic functionalities. -For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any -env-to-module pipeline to make sure the batch for computing actions contains - at the -minimum - the most recent observation. 
- -On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 -pieces (or use the ones available already in RLlib) and add them to one of the 3 -different pipelines described above, as required. - -This example: - - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the - env-to-module pipeline. - - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the - learner connector pipeline. - - demonstrates that using these two pieces (rather than performing framestacking - already inside the environment using a gymnasium wrapper) increases overall - performance by about 5%. - - -How to run this script ----------------------- -`python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5` - -Use the `--num-frames` option to define the number of observations to framestack. -If you don't want to use Connectors to perform the framestacking, set the -`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a -gymnasium observation wrapper. In this case though, be aware that the tensors being -sent through the network are `--num-frames` x larger than if you use the Connector -setup. - -For debugging, use the following additional command line options -`--no-tune --num-env-runners=0` -which should allow you to set breakpoints anywhere in the RLlib code and -have the execution stop there for inspection and debugging. - -For logging to your WandB account, use: -`--wandb-key=[your WandB API key] --wandb-project=[some project name] ---wandb-run-name=[optional: WandB run name (within the defined project)]` - - -Results to expect ------------------ - -With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module -and learner connector pipelines), you should see something like: -+---------------------------+------------+--------+------------------+... -| Trial name | status | iter | total time (s) | -| | | | | -|---------------------------+------------+--------+------------------+... -| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | -+---------------------------+------------+--------+------------------+... - -Note that the time to run these 10 iterations is about .% faster than when -performing framestacking already inside the environment (using a -`gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic -needed (sending back 4x[obs] batches instead of 1x[obs] to the learners). - -Thus, with the `--use-gym-wrapper-framestacking` option, the output looks -like this: -+---------------------------+------------+--------+------------------+... -| Trial name | status | iter | total time (s) | -| | | | | -|---------------------------+------------+--------+------------------+... -| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | -+---------------------------+------------+--------+------------------+... 
-""" -from gymnasium.spaces import Dict, Tuple, Box, Discrete, MultiDiscrete - -from ray.tune.registry import register_env -from ray.rllib.connectors.env_to_module import ( - AddObservationsFromEpisodesToBatch, - FlattenObservations, - WriteObservationsToEpisodes, -) -from ray.rllib.examples.envs.classes.multi_agent import ( - MultiAgentNestedSpaceRepeatAfterMeEnv, -) -from ray.rllib.examples.envs.classes.nested_space_repeat_after_me_env import ( - NestedSpaceRepeatAfterMeEnv, -) -from ray.rllib.utils.test_utils import ( - add_rllib_example_script_args, - run_rllib_example_script_experiment, -) -from ray.tune.registry import get_trainable_cls - - -# Read in common example script command line arguments. -parser = add_rllib_example_script_args(default_timesteps=200000, default_reward=-500.0) - - -if __name__ == "__main__": - args = parser.parse_args() - - assert ( - args.enable_new_api_stack - ), "Must set --enable-new-api-stack when running this script!" - - # Define env-to-module-connector pipeline for the new stack. - def _env_to_module_pipeline(env): - return [ - AddObservationsFromEpisodesToBatch(), - FlattenObservations(multi_agent=args.num_agents > 0), - WriteObservationsToEpisodes(), - ] - - # Register our environment with tune. - if args.num_agents > 0: - register_env( - "env", - lambda c: MultiAgentNestedSpaceRepeatAfterMeEnv( - config=dict(c, **{"num_agents": args.num_agents}) - ), - ) - else: - register_env("env", lambda c: NestedSpaceRepeatAfterMeEnv(c)) - - # Define the AlgorithmConfig used. - base_config = ( - get_trainable_cls(args.algo) - .get_default_config() - .environment( - "env", - env_config={ - "space": Dict( - { - "a": Tuple( - [Dict({"d": Box(-15.0, 3.0, ()), "e": Discrete(3)})] - ), - "b": Box(-10.0, 10.0, (2,)), - "c": MultiDiscrete([3, 3]), - "d": Discrete(2), - } - ), - "episode_len": 100, - }, - ) - .env_runners(env_to_module_connector=_env_to_module_pipeline) - # No history in Env (bandit problem). - .training( - gamma=0.0, - lr=0.0005, - model=( - {} if not args.enable_new_api_stack else {"uses_new_env_runners": True} - ), - ) - ) - - # Add a simple multi-agent setup. - if args.num_agents > 0: - base_config.multi_agent( - policies={f"p{i}" for i in range(args.num_agents)}, - policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", - ) - - # Fix some PPO-specific settings. - if args.algo == "PPO": - base_config.training( - # We don't want high entropy in this Env. - entropy_coeff=0.00005, - num_sgd_iter=4, - vf_loss_coeff=0.01, - ) - - # Run everything as configured. - run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/examples/connectors/prev_actions_prev_rewards.py b/rllib/examples/connectors/prev_actions_prev_rewards.py index 82003157c19c..1f341c5aeaaa 100644 --- a/rllib/examples/connectors/prev_actions_prev_rewards.py +++ b/rllib/examples/connectors/prev_actions_prev_rewards.py @@ -25,11 +25,12 @@ episodes. - shows how this connector creates and wraps this new information (rewards and actions) together with the original observations into the RLModule's input dict - as a new gym.spaces.Dict structure. - - demonstrates how to plug in RLlib's in-house observation flattening - connector after the that using these two pieces (rather than performing framestacking - already inside the environment using a gymnasium wrapper) increases overall - performance by about 5%. 
+ under a new `gym.spaces.Dict` structure (for example, if your observation space + is `O=Box(shape=(3,))` and you add the most recent 1 reward, the new observation + space will be `Dict({"_original_obs": O, "prev_n_rewards": Box(shape=())})`. + - demonstrates how to use RLlib's `FlattenObservations` right after the + `PrevActionsPrevRewards` to flatten that new dict observation structure again into + a single 1D tensor. How to run this script @@ -83,10 +84,8 @@ from ray.rllib.algorithms.ppo import PPOConfig from ray.rllib.connectors.env_to_module import ( - AddObservationsFromEpisodesToBatch, FlattenObservations, PrevActionsPrevRewards, - WriteObservationsToEpisodes, ) from ray.rllib.examples.envs.classes.stateless_cartpole import StatelessCartPole from ray.rllib.examples.envs.classes.multi_agent import MultiAgentStatelessCartPole @@ -118,14 +117,14 @@ def _env_to_module(env): # Create the env-to-module connector pipeline. return [ - #AddObservationsFromEpisodesToBatch(), + # AddObservationsFromEpisodesToBatch(), PrevActionsPrevRewards( multi_agent=args.num_agents > 0, n_prev_rewards=args.n_prev_rewards, n_prev_actions=args.n_prev_actions, ), FlattenObservations(multi_agent=args.num_agents > 0), - #WriteObservationsToEpisodes(), + # WriteObservationsToEpisodes(), ] # Register our environment with tune. diff --git a/rllib/examples/curriculum/curriculum_learning.py b/rllib/examples/curriculum/curriculum_learning.py index 02916ed459df..b215e4ed0b6b 100644 --- a/rllib/examples/curriculum/curriculum_learning.py +++ b/rllib/examples/curriculum/curriculum_learning.py @@ -59,11 +59,7 @@ from ray.air.constants import TRAINING_ITERATION from ray.rllib.algorithms.algorithm import Algorithm from ray.rllib.algorithms.callbacks import DefaultCallbacks -from ray.rllib.connectors.env_to_module import ( - AddObservationsFromEpisodesToBatch, - FlattenObservations, - WriteObservationsToEpisodes, -) +from ray.rllib.connectors.env_to_module import FlattenObservations from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN, @@ -221,11 +217,7 @@ def on_train_result( ) .env_runners( num_envs_per_env_runner=5, - env_to_module_connector=lambda env: [ - AddObservationsFromEpisodesToBatch(), - FlattenObservations(), - WriteObservationsToEpisodes(), - ], + env_to_module_connector=lambda env: FlattenObservations(), ) ) diff --git a/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py b/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py index d503e7f23ad3..1f7ad8dc238c 100644 --- a/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py +++ b/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py @@ -33,11 +33,7 @@ from pettingzoo.classic import rps_v2 from ray.air.constants import TRAINING_ITERATION -from ray.rllib.connectors.env_to_module import ( - AddObservationsFromEpisodesToBatch, - FlattenObservations, - WriteObservationsToEpisodes, -) +from ray.rllib.connectors.env_to_module import FlattenObservations from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv @@ -89,10 +85,8 @@ .environment("RockPaperScissors") .env_runners( env_to_module_connector=lambda env: ( - AddObservationsFromEpisodesToBatch(), - # Only flatten obs for the learning RLModul + # `agent_ids=...`: Only flatten obs for the learning RLModule. 
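+                # (Observations of agents not listed in `agent_ids`, here `player_1`,
+                # pass through this piece unchanged.)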
FlattenObservations(multi_agent=True, agent_ids={"player_0"}), - WriteObservationsToEpisodes(), ), ) .multi_agent( diff --git a/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py b/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py index afabd3fe9003..2c9435822290 100644 --- a/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py +++ b/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py @@ -40,11 +40,7 @@ +------------------+-------+-------------------+-------------+ """ -from ray.rllib.connectors.env_to_module import ( - AddObservationsFromEpisodesToBatch, - FlattenObservations, - WriteObservationsToEpisodes, -) +from ray.rllib.connectors.env_to_module import FlattenObservations from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.examples.envs.classes.two_step_game import TwoStepGameWithGroupedAgents @@ -76,11 +72,7 @@ .get_default_config() .environment("grouped_twostep") .env_runners( - env_to_module_connector=lambda env: ( - AddObservationsFromEpisodesToBatch(), - FlattenObservations(multi_agent=True), - WriteObservationsToEpisodes(), - ), + env_to_module_connector=lambda env: FlattenObservations(multi_agent=True), ) .multi_agent( policies={"p0"}, From 70c939ba6d603ebf788123544a7e20704ccb1677 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 12 Jun 2024 14:53:26 +0200 Subject: [PATCH 07/10] wip Signed-off-by: sven1977 --- rllib/connectors/connector_pipeline_v2.py | 2 +- rllib/connectors/connector_v2.py | 3 +- .../prev_actions_prev_rewards.py | 4 +- .../connectors/prev_actions_prev_rewards.py | 42 +++++++++---------- 4 files changed, 25 insertions(+), 26 deletions(-) diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py index f04376dcd993..3156c66a7a69 100644 --- a/rllib/connectors/connector_pipeline_v2.py +++ b/rllib/connectors/connector_pipeline_v2.py @@ -312,7 +312,7 @@ def _fix_spaces(self): obs_space = self.input_observation_space act_space = self.input_action_space for con in self.connectors: - con.input_observation_space = obs_space con.input_action_space = act_space + con.input_observation_space = obs_space obs_space = con.observation_space act_space = con.action_space diff --git a/rllib/connectors/connector_v2.py b/rllib/connectors/connector_v2.py index ad7bd9eed4bb..e43f7515faea 100644 --- a/rllib/connectors/connector_v2.py +++ b/rllib/connectors/connector_v2.py @@ -84,8 +84,9 @@ def __init__( self._action_space = None self._input_observation_space = None self._input_action_space = None - self.input_observation_space = input_observation_space + self.input_action_space = input_action_space + self.input_observation_space = input_observation_space @OverrideToImplementCustomLogic def recompute_observation_space_from_input_spaces(self) -> gym.Space: diff --git a/rllib/connectors/env_to_module/prev_actions_prev_rewards.py b/rllib/connectors/env_to_module/prev_actions_prev_rewards.py index 89aa51f69e2b..5b26cd1f8b87 100644 --- a/rllib/connectors/env_to_module/prev_actions_prev_rewards.py +++ b/rllib/connectors/env_to_module/prev_actions_prev_rewards.py @@ -35,11 +35,11 @@ class PrevActionsPrevRewards(ConnectorV2): """ ORIG_OBS_KEY = "_orig_obs" + PREV_ACTIONS_KEY = "prev_n_actions" + PREV_REWARDS_KEY = "prev_n_rewards" @override(ConnectorV2) def recompute_observation_space_from_input_spaces(self): - if self.input_action_space is None: - return None if self._multi_agent: ret = {} for 
agent_id, obs_space in self.input_observation_space.spaces.items(): diff --git a/rllib/examples/connectors/prev_actions_prev_rewards.py b/rllib/examples/connectors/prev_actions_prev_rewards.py index 1f341c5aeaaa..dcee6ac5689e 100644 --- a/rllib/examples/connectors/prev_actions_prev_rewards.py +++ b/rllib/examples/connectors/prev_actions_prev_rewards.py @@ -31,6 +31,11 @@ - demonstrates how to use RLlib's `FlattenObservations` right after the `PrevActionsPrevRewards` to flatten that new dict observation structure again into a single 1D tensor. + - uses the StatelessCartPole environment, a CartPole-v1 derivative that's missing + both x-veloc and angle-veloc observation components and is therefore non-Markovian + (only partially observable). An LSTM default model is used for training. Adding + the additional context to the observations (for example, prev. actions) helps the + LSTM to more quickly learn in this environment. How to run this script @@ -57,28 +62,21 @@ Results to expect ----------------- -With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module -and learner connector pipelines), you should see something like: -+---------------------------+------------+--------+------------------+... -| Trial name | status | iter | total time (s) | -| | | | | -|---------------------------+------------+--------+------------------+... -| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | -+---------------------------+------------+--------+------------------+... - -Note that the time to run these 10 iterations is about .% faster than when -performing framestacking already inside the environment (using a -`gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic -needed (sending back 4x[obs] batches instead of 1x[obs] to the learners). - -Thus, with the `--use-gym-wrapper-framestacking` option, the output looks -like this: -+---------------------------+------------+--------+------------------+... -| Trial name | status | iter | total time (s) | -| | | | | -|---------------------------+------------+--------+------------------+... -| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | -+---------------------------+------------+--------+------------------+... 
+You should see something similar to this in your terminal output when running +ths script as described above: + ++---------------------+------------+-----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +| | | | | | +|---------------------+------------+-----------------+--------+------------------+ +| PPO_env_0edd2_00000 | TERMINATED | 127.0.0.1:12632 | 17 | 42.6898 | ++---------------------+------------+-----------------+--------+------------------+ ++------------------------+------------------------+------------------------+ +| num_env_steps_sample | num_env_steps_traine | episode_return_mean | +| d_lifetime | d_lifetime | | +|------------------------+------------------------+------------------------| +| 68000 | 68000 | 205.22 | ++------------------------+------------------------+------------------------+ """ import functools From 3bc04bc09d9c91220fe3b6f341efd94fe218cbbf Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 12 Jun 2024 21:03:55 +0200 Subject: [PATCH 08/10] wip Signed-off-by: sven1977 --- .../env_to_module/flatten_observations.py | 31 ++++++++++--------- .../env_to_module/mean_std_filter.py | 2 -- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/rllib/connectors/env_to_module/flatten_observations.py b/rllib/connectors/env_to_module/flatten_observations.py index 8ac27ac8b817..e5f393b84614 100644 --- a/rllib/connectors/env_to_module/flatten_observations.py +++ b/rllib/connectors/env_to_module/flatten_observations.py @@ -31,6 +31,7 @@ class FlattenObservations(ConnectorV2): import numpy as np from ray.rllib.connectors.env_to_module import FlattenObservations + from ray.rllib.env.single_agent_episode import SingleAgentEpisode from ray.rllib.utils.test_utils import check # Some arbitrarily nested, complex observation space. @@ -44,24 +45,26 @@ class FlattenObservations(ConnectorV2): }) act_space = gym.spaces.Discrete(2) - # A batch of two example items, both coming from the above defined observation - # space. - batch = { - "obs": [ - # 1st example item. + # Two example episodes, both with initial (reset) observations coming from the + # above defined observation space. + episode_1 = SingleAgentEpisode( + observations=[ { "a": np.array(-10.0, np.float32), "b": (1, np.array([[-1.0], [-1.0]], np.float32)), "c": np.array([0, 2]), }, - # 2nd example item. + ], + ) + episode_2 = SingleAgentEpisode( + observations=[ { "a": np.array(10.0, np.float32), "b": (0, np.array([[1.0], [1.0]], np.float32)), "c": np.array([1, 1]), }, ], - } + ) # Construct our connector piece. connector = FlattenObservations(obs_space, act_space) @@ -69,23 +72,23 @@ class FlattenObservations(ConnectorV2): # Call our connector piece with the example data. output_data = connector( rl_module=None, # This connector works without an RLModule. - data=batch, - episodes=[], # This connector does not need the `episodes` input. + data={}, # This connector does not alter any data. + episodes=[episode_1, episode_2], explore=True, shared_data={}, ) - # The connector does not change the number of items in the data (still 2 items). - check(len(output_data["obs"]), 2) + # The connector does not alter the data and acts as pure pass-through. + check(output_data, {}) - # The connector has flattened each item in the data to a 1D tensor. + # The connector has flattened each item in the episodes to a 1D tensor. check( - output_data["obs"][0], + episode_1.get_observations(0), # box() disc(2). box(2, 1). multidisc(2, 3)........ 
np.array([-10.0, 0.0, 1.0, -1.0, -1.0, 1.0, 0.0, 0.0, 0.0, 1.0]), ) check( - output_data["obs"][1], + episode_2.get_observations(0), # box() disc(2). box(2, 1). multidisc(2, 3)........ np.array([10.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0]), ) diff --git a/rllib/connectors/env_to_module/mean_std_filter.py b/rllib/connectors/env_to_module/mean_std_filter.py index 09b5e4f0fbcf..c0bdf8bc6544 100644 --- a/rllib/connectors/env_to_module/mean_std_filter.py +++ b/rllib/connectors/env_to_module/mean_std_filter.py @@ -19,8 +19,6 @@ class MeanStdFilter(ConnectorV2): """A connector used to mean-std-filter observations. - - Incoming observations are filtered such that the output of this filter is on average 0.0 and has a standard deviation of 1.0. If the observation space is a (possibly nested) dict, this filtering is applied separately per element of From d14ec9fc3874d5ae0e44ab7210b4c55fb5c35f7f Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 12 Jun 2024 21:14:04 +0200 Subject: [PATCH 09/10] wip Signed-off-by: sven1977 --- .../env_to_module/flatten_observations.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/rllib/connectors/env_to_module/flatten_observations.py b/rllib/connectors/env_to_module/flatten_observations.py index e5f393b84614..9df1d1c61f84 100644 --- a/rllib/connectors/env_to_module/flatten_observations.py +++ b/rllib/connectors/env_to_module/flatten_observations.py @@ -174,19 +174,17 @@ def __call__( last_obs = sa_episode.get_observations(-1) if self._multi_agent: - flattened_obs = { - agent_obs - if agent_id not in self._agent_ids - else flatten_inputs_to_1d_tensor( - inputs=agent_obs, + if self._agent_ids is not None and agent_id not in self._agent_ids: + flattened_obs = last_obs + else: + flattened_obs = flatten_inputs_to_1d_tensor( + inputs=last_obs, # In the multi-agent case, we need to use the specific agent's # space struct, not the multi-agent observation space dict. - spaces_struct=self._input_obs_base_struct[agent_id], + spaces_struct=self._input_obs_base_struct[sa_episode.agent_id], # Our items are individual observations (no batch axis present). batch_axis=False, ) - for agent_id, agent_obs in last_obs.items() - } else: flattened_obs = flatten_inputs_to_1d_tensor( inputs=last_obs, From e0593300ea1b4f15e6f029c88c3bc0106d1f9e20 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 12 Jun 2024 21:33:10 +0200 Subject: [PATCH 10/10] LINT Signed-off-by: sven1977 --- rllib/connectors/env_to_module/flatten_observations.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/rllib/connectors/env_to_module/flatten_observations.py b/rllib/connectors/env_to_module/flatten_observations.py index 9df1d1c61f84..6a2e60173b65 100644 --- a/rllib/connectors/env_to_module/flatten_observations.py +++ b/rllib/connectors/env_to_module/flatten_observations.py @@ -174,7 +174,10 @@ def __call__( last_obs = sa_episode.get_observations(-1) if self._multi_agent: - if self._agent_ids is not None and agent_id not in self._agent_ids: + if ( + self._agent_ids is not None + and sa_episode.agent_id not in self._agent_ids + ): flattened_obs = last_obs else: flattened_obs = flatten_inputs_to_1d_tensor(