From 0ba68feac5cd9d6612a8ae8694f9ae1359e15bb6 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 11 Jun 2024 15:22:45 +0200 Subject: [PATCH 01/10] wip Signed-off-by: sven1977 --- .../checkpoint_by_custom_criteria.py | 20 ++-- .../continue_training_from_checkpoint.py | 18 +-- .../restore_1_of_n_agents_from_checkpoint.py | 16 +-- rllib/examples/connectors/frame_stacking.py | 110 +++++++++++++----- .../examples/connectors/mean_std_filtering.py | 82 +++++++++++++ .../connectors/nested_action_spaces.py | 82 +++++++++++++ .../connectors/nested_observation_spaces.py | 82 +++++++++++++ .../connectors/prev_actions_prev_rewards.py | 82 +++++++++++++ .../policy_inference_after_training.py | 8 +- ...cy_inference_after_training_w_connector.py | 8 +- 10 files changed, 445 insertions(+), 63 deletions(-) diff --git a/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py b/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py index 0419a8ae1512..075cf8ca7e42 100644 --- a/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py +++ b/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py @@ -1,16 +1,16 @@ """Example extracting a checkpoint from n trials using one or more custom criteria. This example: -- runs a CartPole experiment with three different learning rates (three tune -"trials"). During the experiment, for each trial, we create a checkpoint at each -iteration. -- at the end of the experiment, we compare the trials and pick the one that performed -best, based on the criterion: Lowest episode count per single iteration (for CartPole, -a low episode count means the episodes are very long and thus the reward is also very -high). -- from that best trial (with the lowest episode count), we then pick those checkpoints -that a) have the lowest policy loss (good) and b) have the highest value function loss -(bad). + - runs a CartPole experiment with three different learning rates (three tune + "trials"). During the experiment, for each trial, we create a checkpoint at each + iteration. + - at the end of the experiment, we compare the trials and pick the one that performed + best, based on the criterion: Lowest episode count per single iteration (for CartPole, + a low episode count means the episodes are very long and thus the reward is also very + high). + - from that best trial (with the lowest episode count), we then pick those checkpoints + that a) have the lowest policy loss (good) and b) have the highest value function loss + (bad). How to run this script diff --git a/rllib/examples/checkpoints/continue_training_from_checkpoint.py b/rllib/examples/checkpoints/continue_training_from_checkpoint.py index a8400659d960..45e16e3f89fe 100644 --- a/rllib/examples/checkpoints/continue_training_from_checkpoint.py +++ b/rllib/examples/checkpoints/continue_training_from_checkpoint.py @@ -4,15 +4,15 @@ and you would therefore like to make your setup more robust and fault-tolerant. This example: -- runs a single- or multi-agent CartPole experiment (for multi-agent, we use different -learning rates) thereby checkpointing the state of the Algorithm every n iterations. -- stops the experiment due to an expected crash in the algorithm's main process after -a certain number of iterations. -- just for testing purposes, restores the entire algorithm from the latest checkpoint -and checks, whether the state of the restored algo exactly match the state of the -crashed one. -- then continues training with the restored algorithm until the desired final episode -return is reached. 
+ - runs a single- or multi-agent CartPole experiment (for multi-agent, we use different + learning rates) thereby checkpointing the state of the Algorithm every n iterations. + - stops the experiment due to an expected crash in the algorithm's main process after + a certain number of iterations. + - just for testing purposes, restores the entire algorithm from the latest checkpoint + and checks, whether the state of the restored algo exactly match the state of the + crashed one. + - then continues training with the restored algorithm until the desired final episode + return is reached. How to run this script diff --git a/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py b/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py index fb53e2cb876f..8e1b3f0023bf 100644 --- a/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py +++ b/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py @@ -2,14 +2,14 @@ from checkpoint. This example: - - Runs a multi-agent `Pendulum-v1` experiment with >= 2 policies. - - Saves a checkpoint of the `MultiAgentRLModule` used every `--checkpoint-freq` - iterations. - - Stops the experiments after the agents reach a combined return of `-800`. - - Picks the best checkpoint by combined return and restores policy 0 from it. - - Runs a second experiment with the restored `RLModule` for policy 0 and - a fresh `RLModule` for the other policies. - - Stops the second experiment after the agents reach a combined return of `-800`. + - Runs a multi-agent `Pendulum-v1` experiment with >= 2 policies. + - Saves a checkpoint of the `MultiAgentRLModule` used every `--checkpoint-freq` + iterations. + - Stops the experiments after the agents reach a combined return of `-800`. + - Picks the best checkpoint by combined return and restores policy 0 from it. + - Runs a second experiment with the restored `RLModule` for policy 0 and + a fresh `RLModule` for the other policies. + - Stops the second experiment after the agents reach a combined return of `-800`. How to run this script ---------------------- diff --git a/rllib/examples/connectors/frame_stacking.py b/rllib/examples/connectors/frame_stacking.py index 6abce5582b0b..dbc18f726559 100644 --- a/rllib/examples/connectors/frame_stacking.py +++ b/rllib/examples/connectors/frame_stacking.py @@ -1,15 +1,80 @@ -""" Example using connectors (V2) for frame-stacking in Atari environments. +"""Example using 2 ConnectorV2 for observation frame-stacking in Atari environments. + +An RLlib Algorithm has 3 distinct connector pipelines: +- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing +a batch for an RLModule to compute actions (`forward_inference()` or +`forward_exploration()`). +- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting +it into an action readable by the environment. +- A learner connector pipeline on a Learner taking a list of episodes and producing +a batch for an RLModule to perform the training forward pass (`forward_train()`). + +Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib +adds/prepends to these pipelines in order to perform the most basic functionalities. +For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any +env-to-module pipeline to make sure the batch for computing actions contains - at the +minimum - the most recent observation. 
+ +On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 +pieces (or use the ones available already in RLlib) and add them to one of the 3 +different pipelines described above, as required. + +This example: + - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the + env-to-module pipeline. + - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the + learner connector pipeline. + - demonstrates that using these two pieces (rather than performing framestacking + already inside the environment using a gymnasium wrapper) increases overall + performance by about 5%. + How to run this script ---------------------- -`python [script file name].py --enable-new-api-stack` +`python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5` + +Use the `--num-frames` option to define the number of observations to framestack. +If you don't want to use Connectors to perform the framestacking, set the +`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a +gymnasium observation wrapper. In this case though, be aware that the tensors being +sent through the network are `--num-frames` x larger than if you use the Connector +setup. + For debugging, use the following additional command line options `--no-tune --num-env-runners=0` which should allow you to set breakpoints anywhere in the RLlib code and have the execution stop there for inspection and debugging. + For logging to your WandB account, use: `--wandb-key=[your WandB API key] --wandb-project=[some project name] --wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- + +With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module +and learner connector pipelines), you should see something like: ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | ++---------------------------+------------+--------+------------------+... + +Note that the time to run these 10 iterations is about .% faster than when +performing framestacking already inside the environment (using a +`gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic +needed (sending back 4x[obs] batches instead of 1x[obs] to the learners). + +Thus, with the `--use-gym-wrapper-framestacking` option, the output looks +like this: ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | ++---------------------------+------------+--------+------------------+... """ import gymnasium as gym @@ -27,12 +92,8 @@ parser = add_rllib_example_script_args( default_timesteps=5000000, default_reward=20.0, default_iters=200 ) -parser.add_argument( - "--atari-env", - type=str, - default="ALE/Pong-v5", - help="The name of the Atari env to run, e.g. `ALE/Breakout-v5`.", -) +# Use Pong by default. +parser.set_defaults(env="ALE/Pong-v5") parser.add_argument( "--num-frames", type=int, @@ -52,12 +113,16 @@ args = parser.parse_args() + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + # Define our custom connector pipelines. 
def _make_env_to_module_connector(env): # Create the env-to-module connector. We return an individual connector piece - # here, which RLlib will then automatically integrate into a pipeline (and + # here, which RLlib automatically integrates into a pipeline (and # add its default connector piece to the end of that pipeline). - # This pipeline also automatically fixes the input- and output spaces of the + # The default pipeline automatically fixes the input- and output spaces of the # individual connector pieces in it. # Note that since the frame stacking connector does NOT write information # back to the episode (in order to save memory and network traffic), we @@ -79,29 +144,29 @@ def _make_learner_connector(input_observation_space, input_action_space): # We would like our frame stacking connector to do this job. def _env_creator(cfg): return wrap_atari_for_new_api_stack( - gym.make(args.atari_env, **cfg, **{"render_mode": "rgb_array"}), + gym.make(args.env, **cfg, **{"render_mode": "rgb_array"}), # Perform framestacking either through ConnectorV2 or right here through # the observation wrapper. framestack=( - args.num_framestack if args.use_gym_wrapper_framestacking else None + args.num_frames if args.use_gym_wrapper_framestacking else None ), ) if args.num_agents > 0: tune.register_env( - "env", + "atari-env", lambda cfg: make_multi_agent(_env_creator)( dict(cfg, **{"num_agents": args.num_agents}) ), ) else: - tune.register_env("env", _env_creator) + tune.register_env("atari-env", _env_creator) base_config = ( get_trainable_cls(args.algo) .get_default_config() .environment( - "env", + "atari-env", env_config={ # Make analogous to old v4 + NoFrameskip. "frameskip": 1, @@ -135,9 +200,7 @@ def _env_creator(cfg): grad_clip=100.0, grad_clip_by="global_norm", ) - ) - if args.enable_new_api_stack: - base_config.rl_module( + .rl_module( model_config_dict=dict( { "vf_share_layers": True, @@ -148,16 +211,7 @@ def _env_creator(cfg): }, ) ) - else: - base_config.training( - model={ - "vf_share_layers": True, - "conv_filters": [[16, 4, 2], [32, 4, 2], [64, 4, 2], [128, 4, 2]], - "conv_activation": "relu", - "post_fcnet_hiddens": [256], - "uses_new_env_runners": False, - } - ) + ) # Add a simple multi-agent setup. if args.num_agents > 0: diff --git a/rllib/examples/connectors/mean_std_filtering.py b/rllib/examples/connectors/mean_std_filtering.py index a30d6e399c00..1603512bab9a 100644 --- a/rllib/examples/connectors/mean_std_filtering.py +++ b/rllib/examples/connectors/mean_std_filtering.py @@ -1,3 +1,81 @@ +"""Example using a ConnectorV2 for processing observations with a mean/std filter. + +An RLlib Algorithm has 3 distinct connector pipelines: +- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing +a batch for an RLModule to compute actions (`forward_inference()` or +`forward_exploration()`). +- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting +it into an action readable by the environment. +- A learner connector pipeline on a Learner taking a list of episodes and producing +a batch for an RLModule to perform the training forward pass (`forward_train()`). + +Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib +adds/prepends to these pipelines in order to perform the most basic functionalities. 
+For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any +env-to-module pipeline to make sure the batch for computing actions contains - at the +minimum - the most recent observation. + +On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 +pieces (or use the ones available already in RLlib) and add them to one of the 3 +different pipelines described above, as required. + +This example: + - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the + env-to-module pipeline. + - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the + learner connector pipeline. + - demonstrates that using these two pieces (rather than performing framestacking + already inside the environment using a gymnasium wrapper) increases overall + performance by about 5%. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5` + +Use the `--num-frames` option to define the number of observations to framestack. +If you don't want to use Connectors to perform the framestacking, set the +`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a +gymnasium observation wrapper. In this case though, be aware that the tensors being +sent through the network are `--num-frames` x larger than if you use the Connector +setup. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- + +With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module +and learner connector pipelines), you should see something like: ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | ++---------------------------+------------+--------+------------------+... + +Note that the time to run these 10 iterations is about .% faster than when +performing framestacking already inside the environment (using a +`gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic +needed (sending back 4x[obs] batches instead of 1x[obs] to the learners). + +Thus, with the `--use-gym-wrapper-framestacking` option, the output looks +like this: ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | ++---------------------------+------------+--------+------------------+... +""" from ray.air.constants import TRAINING_ITERATION from ray.rllib.connectors.env_to_module.mean_std_filter import MeanStdFilter from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum @@ -26,6 +104,10 @@ if __name__ == "__main__": args = parser.parse_args() + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + # Register our environment with tune. 
if args.num_agents > 0: register_env( diff --git a/rllib/examples/connectors/nested_action_spaces.py b/rllib/examples/connectors/nested_action_spaces.py index 830b87fb25fb..86df316c7916 100644 --- a/rllib/examples/connectors/nested_action_spaces.py +++ b/rllib/examples/connectors/nested_action_spaces.py @@ -1,3 +1,81 @@ +"""Example using connectors (V2) for observation frame-stacking in Atari environments. + +An RLlib Algorithm has 3 distinct connector pipelines: +- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing +a batch for an RLModule to compute actions (`forward_inference()` or +`forward_exploration()`). +- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting +it into an action readable by the environment. +- A learner connector pipeline on a Learner taking a list of episodes and producing +a batch for an RLModule to perform the training forward pass (`forward_train()`). + +Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib +adds/prepends to these pipelines in order to perform the most basic functionalities. +For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any +env-to-module pipeline to make sure the batch for computing actions contains - at the +minimum - the most recent observation. + +On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 +pieces (or use the ones available already in RLlib) and add them to one of the 3 +different pipelines described above, as required. + +This example: + - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the + env-to-module pipeline. + - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the + learner connector pipeline. + - demonstrates that using these two pieces (rather than performing framestacking + already inside the environment using a gymnasium wrapper) increases overall + performance by about 5%. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5` + +Use the `--num-frames` option to define the number of observations to framestack. +If you don't want to use Connectors to perform the framestacking, set the +`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a +gymnasium observation wrapper. In this case though, be aware that the tensors being +sent through the network are `--num-frames` x larger than if you use the Connector +setup. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- + +With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module +and learner connector pipelines), you should see something like: ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | ++---------------------------+------------+--------+------------------+... 
+ +Note that the time to run these 10 iterations is about .% faster than when +performing framestacking already inside the environment (using a +`gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic +needed (sending back 4x[obs] batches instead of 1x[obs] to the learners). + +Thus, with the `--use-gym-wrapper-framestacking` option, the output looks +like this: ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | ++---------------------------+------------+--------+------------------+... +""" from gymnasium.spaces import Dict, Tuple, Box, Discrete, MultiDiscrete from ray.tune.registry import register_env @@ -26,6 +104,10 @@ if __name__ == "__main__": args = parser.parse_args() + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + # Define env-to-module-connector pipeline for the new stack. def _env_to_module_pipeline(env): return [ diff --git a/rllib/examples/connectors/nested_observation_spaces.py b/rllib/examples/connectors/nested_observation_spaces.py index 39a4bac1c585..cf98909502ee 100644 --- a/rllib/examples/connectors/nested_observation_spaces.py +++ b/rllib/examples/connectors/nested_observation_spaces.py @@ -1,3 +1,81 @@ +"""Example using connectors (V2) for observation frame-stacking in Atari environments. + +An RLlib Algorithm has 3 distinct connector pipelines: +- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing +a batch for an RLModule to compute actions (`forward_inference()` or +`forward_exploration()`). +- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting +it into an action readable by the environment. +- A learner connector pipeline on a Learner taking a list of episodes and producing +a batch for an RLModule to perform the training forward pass (`forward_train()`). + +Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib +adds/prepends to these pipelines in order to perform the most basic functionalities. +For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any +env-to-module pipeline to make sure the batch for computing actions contains - at the +minimum - the most recent observation. + +On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 +pieces (or use the ones available already in RLlib) and add them to one of the 3 +different pipelines described above, as required. + +This example: + - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the + env-to-module pipeline. + - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the + learner connector pipeline. + - demonstrates that using these two pieces (rather than performing framestacking + already inside the environment using a gymnasium wrapper) increases overall + performance by about 5%. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5` + +Use the `--num-frames` option to define the number of observations to framestack. +If you don't want to use Connectors to perform the framestacking, set the +`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a +gymnasium observation wrapper. 
In this case though, be aware that the tensors being +sent through the network are `--num-frames` x larger than if you use the Connector +setup. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- + +With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module +and learner connector pipelines), you should see something like: ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | ++---------------------------+------------+--------+------------------+... + +Note that the time to run these 10 iterations is about .% faster than when +performing framestacking already inside the environment (using a +`gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic +needed (sending back 4x[obs] batches instead of 1x[obs] to the learners). + +Thus, with the `--use-gym-wrapper-framestacking` option, the output looks +like this: ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | ++---------------------------+------------+--------+------------------+... +""" from ray.tune.registry import register_env from ray.rllib.connectors.env_to_module import ( AddObservationsFromEpisodesToBatch, @@ -24,6 +102,10 @@ if __name__ == "__main__": args = parser.parse_args() + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + # Define env-to-module-connector pipeline for the new stack. def _env_to_module_pipeline(env): return [ diff --git a/rllib/examples/connectors/prev_actions_prev_rewards.py b/rllib/examples/connectors/prev_actions_prev_rewards.py index 0c3a2693cca2..2b62fcdd0ef8 100644 --- a/rllib/examples/connectors/prev_actions_prev_rewards.py +++ b/rllib/examples/connectors/prev_actions_prev_rewards.py @@ -1,3 +1,81 @@ +"""Example using connectors (V2) for observation frame-stacking in Atari environments. + +An RLlib Algorithm has 3 distinct connector pipelines: +- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing +a batch for an RLModule to compute actions (`forward_inference()` or +`forward_exploration()`). +- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting +it into an action readable by the environment. +- A learner connector pipeline on a Learner taking a list of episodes and producing +a batch for an RLModule to perform the training forward pass (`forward_train()`). + +Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib +adds/prepends to these pipelines in order to perform the most basic functionalities. 
+For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any +env-to-module pipeline to make sure the batch for computing actions contains - at the +minimum - the most recent observation. + +On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 +pieces (or use the ones available already in RLlib) and add them to one of the 3 +different pipelines described above, as required. + +This example: + - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the + env-to-module pipeline. + - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the + learner connector pipeline. + - demonstrates that using these two pieces (rather than performing framestacking + already inside the environment using a gymnasium wrapper) increases overall + performance by about 5%. + + +How to run this script +---------------------- +`python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5` + +Use the `--num-frames` option to define the number of observations to framestack. +If you don't want to use Connectors to perform the framestacking, set the +`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a +gymnasium observation wrapper. In this case though, be aware that the tensors being +sent through the network are `--num-frames` x larger than if you use the Connector +setup. + +For debugging, use the following additional command line options +`--no-tune --num-env-runners=0` +which should allow you to set breakpoints anywhere in the RLlib code and +have the execution stop there for inspection and debugging. + +For logging to your WandB account, use: +`--wandb-key=[your WandB API key] --wandb-project=[some project name] +--wandb-run-name=[optional: WandB run name (within the defined project)]` + + +Results to expect +----------------- + +With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module +and learner connector pipelines), you should see something like: ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | ++---------------------------+------------+--------+------------------+... + +Note that the time to run these 10 iterations is about .% faster than when +performing framestacking already inside the environment (using a +`gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic +needed (sending back 4x[obs] batches instead of 1x[obs] to the learners). + +Thus, with the `--use-gym-wrapper-framestacking` option, the output looks +like this: ++---------------------------+------------+--------+------------------+... +| Trial name | status | iter | total time (s) | +| | | | | +|---------------------------+------------+--------+------------------+... +| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | ++---------------------------+------------+--------+------------------+... +""" import functools from ray.rllib.algorithms.ppo import PPOConfig @@ -29,6 +107,10 @@ if __name__ == "__main__": args = parser.parse_args() + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + # Define our custom connector pipelines. def _env_to_module(env): # Create the env-to-module connector pipeline. 
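A note on the wiring pattern these connector example scripts share: each one passes a callable to `AlgorithmConfig.env_runners(env_to_module_connector=...)` that returns one or more ConnectorV2 pieces, and RLlib completes the env-to-module pipeline around them with its default pieces. A minimal, hedged sketch of that pattern for the prev-actions/prev-rewards case (the pipeline order follows the connector class docstrings in this series; the `n_prev_actions`/`n_prev_rewards` argument names and the exact body of the script's own `_env_to_module()` are assumptions):

# Sketch only, not the script's exact code. Assumes the new API stack is
# switched on (the example scripts do this via `--enable-new-api-stack`).
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.connectors.env_to_module import (
    AddObservationsFromEpisodesToBatch,
    FlattenObservations,
    PrevActionsPrevRewardsConnector,  # renamed to `PrevActionsPrevRewards` later in this series
    WriteObservationsToEpisodes,
)


def _env_to_module(env):
    # Returning a list of pieces is enough; RLlib appends its remaining
    # default pieces to complete the env-to-module pipeline.
    return [
        # Put the most recent observation into the batch first (the
        # prev-actions/prev-rewards piece requires `Columns.OBS` to already
        # be part of the batch).
        AddObservationsFromEpisodesToBatch(),
        # Concatenate the previous action/reward onto the observation
        # (argument names are assumptions).
        PrevActionsPrevRewardsConnector(n_prev_actions=1, n_prev_rewards=1),
        # Flatten the now-nested observation into a 1D array ...
        FlattenObservations(),
        # ... and write it back into the episodes for later use.
        WriteObservationsToEpisodes(),
    ]


config = (
    PPOConfig()
    .environment("CartPole-v1")
    .env_runners(env_to_module_connector=_env_to_module)
)
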
diff --git a/rllib/examples/inference/policy_inference_after_training.py b/rllib/examples/inference/policy_inference_after_training.py index 0f61f4519cd7..5cdb6090f758 100644 --- a/rllib/examples/inference/policy_inference_after_training.py +++ b/rllib/examples/inference/policy_inference_after_training.py @@ -4,12 +4,12 @@ from a checkpoint and a manual env-loop (CartPole-v1). No ConnectorV2s or EnvRunners are used in this example. -This example shows .. - - .. how to use an already existing checkpoint to extract a single-agent RLModule +This example: + - shows how to use an already existing checkpoint to extract a single-agent RLModule from (our policy network). - - .. how to setup this recovered policy net for action computations (with or without + - shows how to setup this recovered policy net for action computations (with or without using exploration). - - .. have the policy run through a very simple gymnasium based env-loop, w/o using + - shows have the policy run through a very simple gymnasium based env-loop, w/o using RLlib's ConnectorV2s or EnvRunners. diff --git a/rllib/examples/inference/policy_inference_after_training_w_connector.py b/rllib/examples/inference/policy_inference_after_training_w_connector.py index 6d97ef61f865..304c5ba76ed2 100644 --- a/rllib/examples/inference/policy_inference_after_training_w_connector.py +++ b/rllib/examples/inference/policy_inference_after_training_w_connector.py @@ -6,12 +6,12 @@ The RLModule contains an LSTM that requires its own previous STATE_OUT as new input at every episode step to compute a new action. -This example shows .. - - .. how to use an already existing checkpoint to extract a single-agent RLModule +This example: + - shows how to use an already existing checkpoint to extract a single-agent RLModule from (our policy network). - - .. how to setup this recovered policy net for action computations (with or without + - shows how to setup this recovered policy net for action computations (with or without using exploration). - - .. how to create a more complex env-loop in which the action-computing RLModule + - shows how to create a more complex env-loop in which the action-computing RLModule requires its own previous state outputs as new input and how to use RLlib's Episode APIs to achieve this. 
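The frame_stacking.py changes in this patch boil down to a two-sided wiring: one piece on the EnvRunner side so actions are computed on stacked observations, and one piece on the Learner side so the same stacks are rebuilt from the unstacked episodes that were shipped over the network. A hedged sketch of that setup; the `FrameStackingEnvToModule`/`FrameStackingLearner` names come from the docstring above, while their import paths, the `num_frames` argument, and the `learner_connector` hook living under `.training()` are assumptions:

# Sketch only; the real example additionally wraps the Atari env
# (`wrap_atari_for_new_api_stack`) and tunes PPO hyperparameters.
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.connectors.env_to_module.frame_stacking import FrameStackingEnvToModule
from ray.rllib.connectors.learner.frame_stacking import FrameStackingLearner

NUM_FRAMES = 4  # what the script exposes as `--num-frames`

config = (
    PPOConfig()
    .environment("ALE/Pong-v5")
    .env_runners(
        # Stack the last NUM_FRAMES observations only for the action-computing
        # forward pass; nothing extra is written back into the episodes, so
        # only 1x observations travel from the EnvRunners to the Learners.
        env_to_module_connector=lambda env: FrameStackingEnvToModule(
            num_frames=NUM_FRAMES,
        ),
    )
    .training(
        # Rebuild the same NUM_FRAMES stacks on the Learner side before the
        # training forward pass.
        learner_connector=lambda obs_space, act_space: FrameStackingLearner(
            num_frames=NUM_FRAMES,
        ),
    )
)
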
From 1ed0eec100e2209f18e3bb7f242a32f0189d918a Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 11 Jun 2024 16:25:25 +0200 Subject: [PATCH 02/10] wip Signed-off-by: sven1977 --- .../env_to_module/mean_std_filter.py | 2 +- .../checkpoint_by_custom_criteria.py | 14 +- .../continue_training_from_checkpoint.py | 19 +- .../restore_1_of_n_agents_from_checkpoint.py | 4 +- rllib/examples/connectors/frame_stacking.py | 13 +- .../examples/connectors/mean_std_filtering.py | 167 +++++++++--------- .../policy_inference_after_training.py | 12 +- ...cy_inference_after_training_w_connector.py | 14 +- 8 files changed, 120 insertions(+), 125 deletions(-) diff --git a/rllib/connectors/env_to_module/mean_std_filter.py b/rllib/connectors/env_to_module/mean_std_filter.py index e4709aff5b44..187fc9130826 100644 --- a/rllib/connectors/env_to_module/mean_std_filter.py +++ b/rllib/connectors/env_to_module/mean_std_filter.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List, Optional -from gymnasium.spaces import Discrete, MultiDiscrete import gymnasium as gym +from gymnasium.spaces import Discrete, MultiDiscrete import numpy as np import tree diff --git a/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py b/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py index 075cf8ca7e42..33204e52d5e9 100644 --- a/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py +++ b/rllib/examples/checkpoints/checkpoint_by_custom_criteria.py @@ -4,13 +4,13 @@ - runs a CartPole experiment with three different learning rates (three tune "trials"). During the experiment, for each trial, we create a checkpoint at each iteration. - - at the end of the experiment, we compare the trials and pick the one that performed - best, based on the criterion: Lowest episode count per single iteration (for CartPole, - a low episode count means the episodes are very long and thus the reward is also very - high). - - from that best trial (with the lowest episode count), we then pick those checkpoints - that a) have the lowest policy loss (good) and b) have the highest value function loss - (bad). + - at the end of the experiment, we compare the trials and pick the one that + performed best, based on the criterion: Lowest episode count per single iteration + (for CartPole, a low episode count means the episodes are very long and thus the + reward is also very high). + - from that best trial (with the lowest episode count), we then pick those + checkpoints that a) have the lowest policy loss (good) and b) have the highest value + function loss (bad). How to run this script diff --git a/rllib/examples/checkpoints/continue_training_from_checkpoint.py b/rllib/examples/checkpoints/continue_training_from_checkpoint.py index 45e16e3f89fe..c52a7868b4e8 100644 --- a/rllib/examples/checkpoints/continue_training_from_checkpoint.py +++ b/rllib/examples/checkpoints/continue_training_from_checkpoint.py @@ -4,15 +4,16 @@ and you would therefore like to make your setup more robust and fault-tolerant. This example: - - runs a single- or multi-agent CartPole experiment (for multi-agent, we use different - learning rates) thereby checkpointing the state of the Algorithm every n iterations. - - stops the experiment due to an expected crash in the algorithm's main process after - a certain number of iterations. - - just for testing purposes, restores the entire algorithm from the latest checkpoint - and checks, whether the state of the restored algo exactly match the state of the - crashed one. 
- - then continues training with the restored algorithm until the desired final episode - return is reached. + - runs a single- or multi-agent CartPole experiment (for multi-agent, we use + different learning rates) thereby checkpointing the state of the Algorithm every n + iterations. + - stops the experiment due to an expected crash in the algorithm's main process + after a certain number of iterations. + - just for testing purposes, restores the entire algorithm from the latest + checkpoint and checks, whether the state of the restored algo exactly match the + state of the crashed one. + - then continues training with the restored algorithm until the desired final + episode return is reached. How to run this script diff --git a/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py b/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py index 8e1b3f0023bf..f3c83777e8e5 100644 --- a/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py +++ b/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py @@ -5,11 +5,11 @@ - Runs a multi-agent `Pendulum-v1` experiment with >= 2 policies. - Saves a checkpoint of the `MultiAgentRLModule` used every `--checkpoint-freq` iterations. - - Stops the experiments after the agents reach a combined return of `-800`. + - Stops the experiments after the agents reach a combined return of -800. - Picks the best checkpoint by combined return and restores policy 0 from it. - Runs a second experiment with the restored `RLModule` for policy 0 and a fresh `RLModule` for the other policies. - - Stops the second experiment after the agents reach a combined return of `-800`. + - Stops the second experiment after the agents reach a combined return of -800. How to run this script ---------------------- diff --git a/rllib/examples/connectors/frame_stacking.py b/rllib/examples/connectors/frame_stacking.py index dbc18f726559..e26918796ff4 100644 --- a/rllib/examples/connectors/frame_stacking.py +++ b/rllib/examples/connectors/frame_stacking.py @@ -54,26 +54,27 @@ ----------------- With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module -and learner connector pipelines), you should see something like: +and learner connector pipelines), you should see something like this using: +`--env ALE/Pong-v5 --num-gpus=4 --num-env-runners=95` +---------------------------+------------+--------+------------------+... | Trial name | status | iter | total time (s) | | | | | | |---------------------------+------------+--------+------------------+... -| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | +| PPO_atari-env_2fc4a_00000 | TERMINATED | 200 | 335.837 | +---------------------------+------------+--------+------------------+... -Note that the time to run these 10 iterations is about .% faster than when +Note that the time to run these 200 iterations is about ~5% faster than when performing framestacking already inside the environment (using a `gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic needed (sending back 4x[obs] batches instead of 1x[obs] to the learners). -Thus, with the `--use-gym-wrapper-framestacking` option, the output looks -like this: +Thus, with the `--use-gym-wrapper-framestacking` option (all other options being equal), +the output looks like this: +---------------------------+------------+--------+------------------+... | Trial name | status | iter | total time (s) | | | | | | |---------------------------+------------+--------+------------------+... 
-| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | +| PPO_atari-env_2fc4a_00000 | TERMINATED | 200 | 351.505 | +---------------------------+------------+--------+------------------+... """ import gymnasium as gym diff --git a/rllib/examples/connectors/mean_std_filtering.py b/rllib/examples/connectors/mean_std_filtering.py index 1603512bab9a..470812585138 100644 --- a/rllib/examples/connectors/mean_std_filtering.py +++ b/rllib/examples/connectors/mean_std_filtering.py @@ -20,25 +20,16 @@ different pipelines described above, as required. This example: - - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the - env-to-module pipeline. - - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the - learner connector pipeline. - - demonstrates that using these two pieces (rather than performing framestacking - already inside the environment using a gymnasium wrapper) increases overall - performance by about 5%. + - shows how the `MeanStdFilter` ConnectorV2 piece can be added to the env-to-module + pipeline. + - demonstrates that using such a filter enhances learning behavior (or even makes + if possible to learn overall) in some environments, especially those with lopsided + observation spaces, for example `Box(-3000, -1000, ...)`. How to run this script ---------------------- -`python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5` - -Use the `--num-frames` option to define the number of observations to framestack. -If you don't want to use Connectors to perform the framestacking, set the -`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a -gymnasium observation wrapper. In this case though, be aware that the tensors being -sent through the network are `--num-frames` x larger than if you use the Connector -setup. +`python [script file name].py --enable-new-api-stack` For debugging, use the following additional command line options `--no-tune --num-env-runners=0` @@ -52,40 +43,39 @@ Results to expect ----------------- - -With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module -and learner connector pipelines), you should see something like: -+---------------------------+------------+--------+------------------+... -| Trial name | status | iter | total time (s) | -| | | | | -|---------------------------+------------+--------+------------------+... -| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | -+---------------------------+------------+--------+------------------+... - -Note that the time to run these 10 iterations is about .% faster than when -performing framestacking already inside the environment (using a -`gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic -needed (sending back 4x[obs] batches instead of 1x[obs] to the learners). - -Thus, with the `--use-gym-wrapper-framestacking` option, the output looks -like this: -+---------------------------+------------+--------+------------------+... -| Trial name | status | iter | total time (s) | -| | | | | -|---------------------------+------------+--------+------------------+... -| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | -+---------------------------+------------+--------+------------------+... 
+Running this example with the mean-std filter results in the normally expected Pendulum +learning behavior: ++-------------------------------+------------+-----------------+--------+ +| Trial name | status | loc | iter | +| | | | | +|-------------------------------+------------+-----------------+--------+ +| PPO_lopsided-pend_f9c96_00000 | TERMINATED | 127.0.0.1:43612 | 77 | ++-------------------------------+------------+-----------------+--------+ ++------------------+------------------------+-----------------------+ +| total time (s) | num_env_steps_sample | episode_return_mean | +| | d_lifetime | | +|------------------+------------------------+-----------------------| +| 30.7466 | 40040 | -276.3 | ++------------------+------------------------+-----------------------+ + +If you try using the `--disable-mean-std-filter` (all other things being equal), you +will either see no learning progress at all (or a very slow one), but more likely some +numerical instability related error will be thrown: + +ValueError: Expected parameter loc (Tensor of shape (64, 1)) of distribution + Normal(loc: torch.Size([64, 1]), scale: torch.Size([64, 1])) to satisfy the + constraint Real(), but found invalid values: +tensor([[nan], + [nan], + [nan], + ... """ -from ray.air.constants import TRAINING_ITERATION +import gymnasium as gym +import numpy as np + from ray.rllib.connectors.env_to_module.mean_std_filter import MeanStdFilter from ray.rllib.examples.envs.classes.multi_agent import MultiAgentPendulum from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.metrics import ( - ENV_RUNNER_RESULTS, - EPISODE_RETURN_MEAN, - EVALUATION_RESULTS, - NUM_ENV_STEPS_SAMPLED_LIFETIME, -) from ray.rllib.utils.test_utils import ( add_rllib_example_script_args, run_rllib_example_script_experiment, @@ -99,6 +89,21 @@ default_timesteps=500000, default_reward=-300.0, ) +parser.add_argument( + "--disable-mean-std-filter", + action="store_true", + help="Run w/o a mean/std env-to-module connector piece (filter).", +) + + +class LopsidedObs(gym.ObservationWrapper): + def __init__(self, env): + super().__init__(env) + self.observation_space = gym.spaces.Box(-4000.0, -1456.0, (3,), np.float32) + + def observation(self, observation): + # Lopside [-1.0, 1.0] Pendulum observations + return ((observation + 1.0) / 2.0) * (4000.0 - 1456.0) - 4000.0 if __name__ == "__main__": @@ -111,14 +116,16 @@ # Register our environment with tune. if args.num_agents > 0: register_env( - "env", + "lopsided-pend", lambda _: MultiAgentPendulum(config={"num_agents": args.num_agents}), ) + else: + register_env("lopsided-pend", lambda _: LopsidedObs(gym.make("Pendulum-v1"))) config = ( get_trainable_cls(args.algo) .get_default_config() - .environment("env" if args.num_agents > 0 else "Pendulum-v1") + .environment("lopsided-pend") .env_runners( # TODO (sven): MAEnvRunner does not support vectorized envs yet # due to gym's env checkers and non-compatability with RLlib's @@ -130,7 +137,9 @@ # included in an automatically generated EnvToModulePipeline or return a # EnvToModulePipeline directly. 
env_to_module_connector=( - lambda env: MeanStdFilter(multi_agent=args.num_agents > 0) + None + if args.disable_mean_std_filter + else lambda env: MeanStdFilter(multi_agent=args.num_agents > 0) ), ) .training( @@ -143,25 +152,7 @@ vf_clip_param=10.0, vf_loss_coeff=0.01, ) - .evaluation( - evaluation_num_env_runners=1, - evaluation_parallel_to_training=True, - evaluation_interval=1, - evaluation_duration=10, - evaluation_duration_unit="episodes", - evaluation_config={ - "explore": False, - # Do NOT use the eval EnvRunners' ConnectorV2 states. Instead, before - # each round of evaluation, broadcast the latest training - # EnvRunnerGroup's ConnectorV2 states (merged from all training remote - # EnvRunners) to the eval EnvRunnerGroup (and discard the eval - # EnvRunners' stats). - "use_worker_filter_stats": False, - }, - ) - ) - if args.enable_new_api_stack: - config = config.rl_module( + .rl_module( model_config_dict={ "fcnet_activation": "relu", "fcnet_weights_initializer": torch.nn.init.xavier_uniform_, @@ -170,17 +161,27 @@ "uses_new_env_runners": True, } ) - else: - config = config.training( - model=dict( - { - "fcnet_activation": "relu", - "fcnet_weights_initializer": torch.nn.init.xavier_uniform_, - "fcnet_bias_initializer": torch.nn.init.constant_, - "fcnet_bias_initializer_config": {"val": 0.0}, - } - ) - ) + # In case you would like to run with a evaluation EnvRunners, make sure your + # `evaluation_config` key contains the `use_worker_filter_stats=False` setting + # (see below). This setting makes sure that the mean/std stats collected by the + # evaluation EnvRunners are NOT used for the training EnvRunners (unless you + # really want to mix these stats). It's normally a good idea to keep the stats + # collected during evaluation completely out of the training data (already for + # better reproducibility alone). + # .evaluation( + # evaluation_num_env_runners=1, + # evaluation_interval=1, + # evaluation_config={ + # "explore": False, + # # Do NOT use the eval EnvRunners' ConnectorV2 states. Instead, before + # # each round of evaluation, broadcast the latest training + # # EnvRunnerGroup's ConnectorV2 states (merged from all training remote + # # EnvRunners) to the eval EnvRunnerGroup (and discard the eval + # # EnvRunners' stats). + # "use_worker_filter_stats": False, + # }, + # ) + ) # Add a simple multi-agent setup. if args.num_agents > 0: @@ -189,12 +190,4 @@ policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", ) - stop = { - TRAINING_ITERATION: args.stop_iters, - f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": ( - args.stop_reward - ), - NUM_ENV_STEPS_SAMPLED_LIFETIME: args.stop_timesteps, - } - - run_rllib_example_script_experiment(config, args, stop=stop) + run_rllib_example_script_experiment(config, args) diff --git a/rllib/examples/inference/policy_inference_after_training.py b/rllib/examples/inference/policy_inference_after_training.py index 5cdb6090f758..2525d5ca2935 100644 --- a/rllib/examples/inference/policy_inference_after_training.py +++ b/rllib/examples/inference/policy_inference_after_training.py @@ -5,12 +5,12 @@ used in this example. This example: - - shows how to use an already existing checkpoint to extract a single-agent RLModule - from (our policy network). - - shows how to setup this recovered policy net for action computations (with or without - using exploration). - - shows have the policy run through a very simple gymnasium based env-loop, w/o using - RLlib's ConnectorV2s or EnvRunners. 
+ - shows how to use an already existing checkpoint to extract a single-agent RLModule + from (our policy network). + - shows how to setup this recovered policy net for action computations (with or + without using exploration). + - shows have the policy run through a very simple gymnasium based env-loop, w/o + using RLlib's ConnectorV2s or EnvRunners. How to run this script diff --git a/rllib/examples/inference/policy_inference_after_training_w_connector.py b/rllib/examples/inference/policy_inference_after_training_w_connector.py index 304c5ba76ed2..e4a66ec33266 100644 --- a/rllib/examples/inference/policy_inference_after_training_w_connector.py +++ b/rllib/examples/inference/policy_inference_after_training_w_connector.py @@ -7,13 +7,13 @@ at every episode step to compute a new action. This example: - - shows how to use an already existing checkpoint to extract a single-agent RLModule - from (our policy network). - - shows how to setup this recovered policy net for action computations (with or without - using exploration). - - shows how to create a more complex env-loop in which the action-computing RLModule - requires its own previous state outputs as new input and how to use RLlib's Episode - APIs to achieve this. + - shows how to use an already existing checkpoint to extract a single-agent RLModule + from (our policy network). + - shows how to setup this recovered policy net for action computations (with or + without using exploration). + - shows how to create a more complex env-loop in which the action-computing RLModule + requires its own previous state outputs as new input and how to use RLlib's Episode + APIs to achieve this. How to run this script From db94e46ffb13235894dae52a4c47d47643115b1b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 11 Jun 2024 17:44:04 +0200 Subject: [PATCH 03/10] wip Signed-off-by: sven1977 --- .../restore_1_of_n_agents_from_checkpoint.py | 18 ++++++----- .../connectors/prev_actions_prev_rewards.py | 31 +++++-------------- 2 files changed, 18 insertions(+), 31 deletions(-) diff --git a/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py b/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py index f3c83777e8e5..bf6889113fed 100644 --- a/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py +++ b/rllib/examples/checkpoints/restore_1_of_n_agents_from_checkpoint.py @@ -2,14 +2,15 @@ from checkpoint. This example: - - Runs a multi-agent `Pendulum-v1` experiment with >= 2 policies. - - Saves a checkpoint of the `MultiAgentRLModule` used every `--checkpoint-freq` - iterations. - - Stops the experiments after the agents reach a combined return of -800. - - Picks the best checkpoint by combined return and restores policy 0 from it. - - Runs a second experiment with the restored `RLModule` for policy 0 and - a fresh `RLModule` for the other policies. - - Stops the second experiment after the agents reach a combined return of -800. + - Runs a multi-agent `Pendulum-v1` experiment with >= 2 policies. + - Saves a checkpoint of the `MultiAgentRLModule` used every `--checkpoint-freq` + iterations. + - Stops the experiments after the agents reach a combined return of -800. + - Picks the best checkpoint by combined return and restores policy 0 from it. + - Runs a second experiment with the restored `RLModule` for policy 0 and + a fresh `RLModule` for the other policies. + - Stops the second experiment after the agents reach a combined return of -800. 
+ How to run this script ---------------------- @@ -34,6 +35,7 @@ `--wandb-key=[your WandB API key] --wandb-project=[some project name] --wandb-run-name=[optional: WandB run name (within the defined project)]` + Results to expect ----------------- You should expect a reward of -400.0 eventually being achieved by a simple diff --git a/rllib/examples/connectors/prev_actions_prev_rewards.py b/rllib/examples/connectors/prev_actions_prev_rewards.py index 2b62fcdd0ef8..f24b96078aeb 100644 --- a/rllib/examples/connectors/prev_actions_prev_rewards.py +++ b/rllib/examples/connectors/prev_actions_prev_rewards.py @@ -1,4 +1,4 @@ -"""Example using connectors (V2) for observation frame-stacking in Atari environments. +"""Example using a ConnectorV2 to add previous rewards/actions to an RLModule's input. An RLlib Algorithm has 3 distinct connector pipelines: - An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing @@ -20,10 +20,11 @@ different pipelines described above, as required. This example: - - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the - env-to-module pipeline. - - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the - learner connector pipeline. + - shows how the `PrevActionsPrevRewardsConnector` piece can be added to the + env-to-module pipeline to extract previous rewards and/or actions from the ongoing + episodes. + - shows how this connector creates and wraps this information together to the RLModule's + original observation and . - demonstrates that using these two pieces (rather than performing framestacking already inside the environment using a gymnasium wrapper) increases overall performance by about 5%. @@ -146,10 +147,7 @@ def _env_to_module(env): train_batch_size=4000, vf_loss_coeff=0.01, ) - ) - - if args.enable_new_api_stack: - config = config.rl_module( + .rl_module( model_config_dict={ "use_lstm": True, "max_seq_len": 50, @@ -161,20 +159,7 @@ def _env_to_module(env): "uses_new_env_runners": True, } ) - else: - config = config.training( - model=dict( - { - "use_lstm": True, - "max_seq_len": 50, - "fcnet_hiddens": [32], - "fcnet_activation": "linear", - "vf_share_layers": True, - "fcnet_weights_initializer": nn.init.xavier_uniform_, - "fcnet_bias_initializer": functools.partial(nn.init.constant_, 0.0), - } - ) - ) + ) # Add a simple multi-agent setup. if args.num_agents > 0: From cc8277c8b66ae451479bd6656e73f829a12a21ae Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 12 Jun 2024 11:51:32 +0200 Subject: [PATCH 04/10] wip Signed-off-by: sven1977 --- rllib/connectors/env_to_module/flatten_observations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/connectors/env_to_module/flatten_observations.py b/rllib/connectors/env_to_module/flatten_observations.py index 1958f9e871d1..c3443b3dff7f 100644 --- a/rllib/connectors/env_to_module/flatten_observations.py +++ b/rllib/connectors/env_to_module/flatten_observations.py @@ -199,7 +199,7 @@ def __call__( if not agent_id else self._input_obs_base_struct[agent_id] ), - # Our items are bare observations (no batch axis present). + # Our items are individual observations (no batch axis present). 
batch_axis=False, ) ) From f1d05fd674d21ae74fb618cf7a81bb53e2ec3fac Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 12 Jun 2024 13:16:48 +0200 Subject: [PATCH 05/10] wip Signed-off-by: sven1977 --- rllib/connectors/env_to_module/__init__.py | 4 +- .../env_to_module/flatten_observations.py | 68 ++++++++++--------- .../env_to_module/mean_std_filter.py | 13 ++-- .../prev_actions_prev_rewards.py | 38 ++++++----- .../connectors/prev_actions_prev_rewards.py | 18 ++--- .../rock_paper_scissors_learned_vs_learned.py | 12 +--- 6 files changed, 76 insertions(+), 77 deletions(-) diff --git a/rllib/connectors/env_to_module/__init__.py b/rllib/connectors/env_to_module/__init__.py index 8f2750c9a807..98b73bd9962b 100644 --- a/rllib/connectors/env_to_module/__init__.py +++ b/rllib/connectors/env_to_module/__init__.py @@ -14,7 +14,7 @@ FlattenObservations, ) from ray.rllib.connectors.env_to_module.prev_actions_prev_rewards import ( - PrevActionsPrevRewardsConnector, + PrevActionsPrevRewards, ) from ray.rllib.connectors.env_to_module.write_observations_to_episodes import ( WriteObservationsToEpisodes, @@ -29,6 +29,6 @@ "EnvToModulePipeline", "FlattenObservations", "NumpyToTensor", - "PrevActionsPrevRewardsConnector", + "PrevActionsPrevRewards", "WriteObservationsToEpisodes", ] diff --git a/rllib/connectors/env_to_module/flatten_observations.py b/rllib/connectors/env_to_module/flatten_observations.py index c3443b3dff7f..57ec233a4a53 100644 --- a/rllib/connectors/env_to_module/flatten_observations.py +++ b/rllib/connectors/env_to_module/flatten_observations.py @@ -19,18 +19,12 @@ class FlattenObservations(ConnectorV2): """A connector piece that flattens all observation components into a 1D array. - - Only works on data that has already been added to the batch. - - This connector makes the assumption that under the Columns.OBS key in batch, - there is either a list of individual env observations to be flattened (single-agent - case) or a dict mapping agent- and module IDs to lists of data items to be - flattened (multi-agent case). - - Does NOT work in a Learner pipeline as it operates on individual observation - items (as opposed to batched/time-ranked data). - - Therefore, assumes that the altered (flattened) observations will be written - back into the episode by a later connector piece in the env-to-module pipeline - (which this piece is part of as well). - - Does NOT read any information from the given list of Episode objects. - - Does NOT write any observations (or other data) to the given Episode objects. + - Works directly on the incoming episodes list and changes the last observation + in-place (write the flattened observation back into the episode). + - This connector does NOT alter the incoming batch (`data`) when called. + - This connector does NOT work in a `LearnerConnectorPipeline` because it requires + the incoming episodes to still be ongoing (in progress) as it only alters the + latest observation, not all observations in an episode. .. testcode:: @@ -177,32 +171,40 @@ def __call__( f"for this connector to work!" ) - # Process each item under the Columns.OBS key individually and flatten - # it. We are using the `ConnectorV2.foreach_batch_item_change_in_place` API, - # allowing us to not worry about multi- or single-agent setups and returning - # the new version of each item we are iterating over. - self.foreach_batch_item_change_in_place( - batch=data, - column=Columns.OBS, - func=( - lambda item, eps_id, agent_id, module_id: ( - # Multi-agent AND skip this AgentID. 
- item - if self._agent_ids and agent_id not in self._agent_ids - # Single-agent or flatten this AgentIDs observation. + for sa_episode in self.single_agent_episode_iterator( + episodes, agents_that_stepped_only=True + ): + # Episode is not finalized yet and thus still operates on lists of items. + assert not sa_episode.is_finalized + + last_obs = sa_episode.get_observations(-1) + + if self._multi_agent: + flattened_obs = { + agent_obs if aid not in self._agent_ids else flatten_inputs_to_1d_tensor( - item, + inputs=agent_obs, # In the multi-agent case, we need to use the specific agent's # space struct, not the multi-agent observation space dict. - ( - self._input_obs_base_struct - if not agent_id - else self._input_obs_base_struct[agent_id] - ), + spaces_struct=self._input_obs_base_struct[agent_id], # Our items are individual observations (no batch axis present). batch_axis=False, ) + for aid, agent_obs in last_obs.items() + } + else: + flattened_obs = flatten_inputs_to_1d_tensor( + inputs=last_obs, + spaces_struct=self._input_obs_base_struct, + # Our items are individual observations (no batch axis present). + batch_axis=False, ) - ), - ) + + # Write new observation directly back into the episode. + sa_episode.set_observations(at_indices=-1, new_data=flattened_obs) + # We set the Episode's observation space to ours so that we can safely + # set the last obs to the new value (without causing a space mismatch + # error). + sa_episode.observation_space = self.observation_space + return data diff --git a/rllib/connectors/env_to_module/mean_std_filter.py b/rllib/connectors/env_to_module/mean_std_filter.py index 187fc9130826..09b5e4f0fbcf 100644 --- a/rllib/connectors/env_to_module/mean_std_filter.py +++ b/rllib/connectors/env_to_module/mean_std_filter.py @@ -19,6 +19,8 @@ class MeanStdFilter(ConnectorV2): """A connector used to mean-std-filter observations. + + Incoming observations are filtered such that the output of this filter is on average 0.0 and has a standard deviation of 1.0. If the observation space is a (possibly nested) dict, this filtering is applied separately per element of @@ -121,13 +123,10 @@ def __call__( sa_obs, update=self._update_stats ) sa_episode.set_observations(at_indices=-1, new_data=normalized_sa_obs) - - if len(sa_episode) == 0: - # TODO (sven): This is kind of a hack. - # We set the Episode's observation space to ours so that we can safely - # set the last obs to the new value (without causing a space mismatch - # error). - sa_episode.observation_space = self.observation_space + # We set the Episode's observation space to ours so that we can safely + # set the last obs to the new value (without causing a space mismatch + # error). + sa_episode.observation_space = self.observation_space # Leave `data` as is. RLlib's default connector will automatically # populate the OBS column therein from the episodes' now transformed diff --git a/rllib/connectors/env_to_module/prev_actions_prev_rewards.py b/rllib/connectors/env_to_module/prev_actions_prev_rewards.py index 5a0222fceb0c..f48c5e459a39 100644 --- a/rllib/connectors/env_to_module/prev_actions_prev_rewards.py +++ b/rllib/connectors/env_to_module/prev_actions_prev_rewards.py @@ -12,7 +12,7 @@ from ray.rllib.utils.typing import EpisodeType -class PrevActionsPrevRewardsConnector(ConnectorV2): +class PrevActionsPrevRewards(ConnectorV2): """A connector piece that adds previous rewards and actions to the input obs. - Requires Columns.OBS to be already a part of the batch. 
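
(To make concrete what this piece produces, here is an illustrative sample of the
augmented observation it builds, assuming a `Box(4,)` observation space, `Discrete(2)`
actions, and `n_prev_actions=1` / `n_prev_rewards=1`; the exact key strings are taken
from the class's `ORIG_OBS_KEY`, `PREV_ACTIONS_KEY`, and `PREV_REWARDS_KEY` attributes
and may differ between versions of this connector.)

    import numpy as np

    # Roughly what one augmented observation looks like after this piece has run:
    augmented_obs = {
        "_orig_obs": np.array([0.01, -0.02, 0.03, 0.04], dtype=np.float32),  # env obs
        "prev_actions": np.array([1]),    # flattened action(s) of the previous step(s)
        "prev_rewards": np.array([0.5]),  # reward(s) of the previous step(s)
    }

A `FlattenObservations` piece placed right after this one (as the example script does)
then squashes this dict into a single 1D tensor before it reaches the RLModule.
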
@@ -64,7 +64,7 @@ def __init__( n_prev_rewards: int = 1, **kwargs, ): - """Initializes a PrevActionsPrevRewardsConnector instance. + """Initializes a PrevActionsPrevRewards instance. Args: multi_agent: Whether this is a connector operating on a multi-agent @@ -116,7 +116,6 @@ def __call__( f"for this connector to work!" ) - new_obs = [] for sa_episode, orig_obs in self.single_agent_episode_iterator( episodes, zip_with_batch_column=observations ): @@ -142,21 +141,26 @@ def __call__( ) ) - new_obs.append( - { - self.ORIG_OBS_KEY: orig_obs, - self.PREV_ACTIONS_KEY: prev_n_actions, - self.PREV_REWARDS_KEY: prev_n_rewards, - } - ) + augmented_obs = { + self.ORIG_OBS_KEY: orig_obs, + self.PREV_ACTIONS_KEY: prev_n_actions, + self.PREV_REWARDS_KEY: prev_n_rewards, + } - # Convert the observations in the batch into a dict with the keys: - # "_obs", "_prev_rewards", and "_prev_actions". - self.foreach_batch_item_change_in_place( - batch=data, - column=Columns.OBS, - func=lambda orig_obs, eps_id, agent_id, module_id: new_obs.pop(0), - ) + # Write new observation directly back into the episode. + sa_episode.set_observations(at_indices=-1, new_data=augmented_obs) + # We set the Episode's observation space to ours so that we can safely + # set the last obs to the new value (without causing a space mismatch + # error). + sa_episode.observation_space = self.observation_space + + ## Convert the observations in the batch into a dict with the keys: + ## "_obs", "_prev_rewards", and "_prev_actions". + #self.foreach_batch_item_change_in_place( + # batch=data, + # column=Columns.OBS, + # func=lambda orig_obs, eps_id, agent_id, module_id: new_obs.pop(0), + #) return data diff --git a/rllib/examples/connectors/prev_actions_prev_rewards.py b/rllib/examples/connectors/prev_actions_prev_rewards.py index f24b96078aeb..82003157c19c 100644 --- a/rllib/examples/connectors/prev_actions_prev_rewards.py +++ b/rllib/examples/connectors/prev_actions_prev_rewards.py @@ -20,12 +20,14 @@ different pipelines described above, as required. This example: - - shows how the `PrevActionsPrevRewardsConnector` piece can be added to the + - shows how the `PrevActionsPrevRewards` ConnectorV2 piece can be added to the env-to-module pipeline to extract previous rewards and/or actions from the ongoing episodes. - - shows how this connector creates and wraps this information together to the RLModule's - original observation and . - - demonstrates that using these two pieces (rather than performing framestacking + - shows how this connector creates and wraps this new information (rewards and + actions) together with the original observations into the RLModule's input dict + as a new gym.spaces.Dict structure. + - demonstrates how to plug in RLlib's in-house observation flattening + connector after the that using these two pieces (rather than performing framestacking already inside the environment using a gymnasium wrapper) increases overall performance by about 5%. @@ -83,7 +85,7 @@ from ray.rllib.connectors.env_to_module import ( AddObservationsFromEpisodesToBatch, FlattenObservations, - PrevActionsPrevRewardsConnector, + PrevActionsPrevRewards, WriteObservationsToEpisodes, ) from ray.rllib.examples.envs.classes.stateless_cartpole import StatelessCartPole @@ -116,14 +118,14 @@ def _env_to_module(env): # Create the env-to-module connector pipeline. 
return [ - AddObservationsFromEpisodesToBatch(), - PrevActionsPrevRewardsConnector( + #AddObservationsFromEpisodesToBatch(), + PrevActionsPrevRewards( multi_agent=args.num_agents > 0, n_prev_rewards=args.n_prev_rewards, n_prev_actions=args.n_prev_actions, ), FlattenObservations(multi_agent=args.num_agents > 0), - WriteObservationsToEpisodes(), + #WriteObservationsToEpisodes(), ] # Register our environment with tune. diff --git a/rllib/examples/multi_agent/rock_paper_scissors_learned_vs_learned.py b/rllib/examples/multi_agent/rock_paper_scissors_learned_vs_learned.py index 507c018babc8..e3e75c990692 100644 --- a/rllib/examples/multi_agent/rock_paper_scissors_learned_vs_learned.py +++ b/rllib/examples/multi_agent/rock_paper_scissors_learned_vs_learned.py @@ -15,11 +15,7 @@ from pettingzoo.classic import rps_v2 -from ray.rllib.connectors.env_to_module import ( - AddObservationsFromEpisodesToBatch, - FlattenObservations, - WriteObservationsToEpisodes, -) +from ray.rllib.connectors.env_to_module import FlattenObservations from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv @@ -62,11 +58,7 @@ .get_default_config() .environment("RockPaperScissors") .env_runners( - env_to_module_connector=lambda env: ( - AddObservationsFromEpisodesToBatch(), - FlattenObservations(multi_agent=True), - WriteObservationsToEpisodes(), - ), + env_to_module_connector=lambda env: FlattenObservations(multi_agent=True), ) .multi_agent( policies={"p0", "p1"}, From e278e67eb67ab64c7f35554dac42b159020da4aa Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 12 Jun 2024 14:30:49 +0200 Subject: [PATCH 06/10] wip Signed-off-by: sven1977 --- rllib/BUILD | 52 ++--- .../env_to_module/flatten_observations.py | 16 +- .../prev_actions_prev_rewards.py | 35 +--- rllib/examples/actions/__init__.py | 0 .../examples/actions/nested_action_spaces.py | 92 +++++++++ ....py => flatten_observations_dict_space.py} | 81 ++++---- .../connectors/nested_action_spaces.py | 178 ------------------ .../connectors/prev_actions_prev_rewards.py | 17 +- .../curriculum/curriculum_learning.py | 12 +- ...ock_paper_scissors_heuristic_vs_learned.py | 10 +- .../two_step_game_with_grouped_agents.py | 12 +- 11 files changed, 177 insertions(+), 328 deletions(-) create mode 100644 rllib/examples/actions/__init__.py create mode 100644 rllib/examples/actions/nested_action_spaces.py rename rllib/examples/connectors/{nested_observation_spaces.py => flatten_observations_dict_space.py} (60%) delete mode 100644 rllib/examples/connectors/nested_action_spaces.py diff --git a/rllib/BUILD b/rllib/BUILD index 3ff7a8275461..c2d310d337e8 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -2086,6 +2086,27 @@ py_test( # tagged by @OldAPIStack and/or @HybridAPIStack # ---------------------- +# subdirectory: actions/ + +# Nested action spaces (flattening obs and learning w/ multi-action distribution). 
+py_test( + name = "examples/actions/nested_action_spaces_ppo", + main = "examples/actions/nested_action_spaces.py", + tags = ["team:rllib", "exclusive", "examples"], + size = "large", + srcs = ["examples/actions/nested_action_spaces.py"], + args = ["--enable-new-api-stack", "--as-test", "--framework=torch", "--stop-reward=-500.0", "--algo=PPO"] +) + +py_test( + name = "examples/actions/nested_action_spaces_multi_agent_ppo", + main = "examples/actions/nested_action_spaces.py", + tags = ["team:rllib", "exclusive", "examples"], + size = "large", + srcs = ["examples/actions/nested_action_spaces.py"], + args = ["--enable-new-api-stack", "--as-test", "--num-agents=2", "--framework=torch", "--stop-reward=-1000.0", "--algo=PPO"] +) + # subdirectory: algorithms/ #@OldAPIStack @@ -2213,41 +2234,22 @@ py_test( args = ["--enable-new-api-stack", "--num-agents=2", "--stop-iter=2", "--framework=torch", "--algo=PPO", "--num-env-runners=4", "--num-cpus=6"] ) -# Nested action spaces (flattening obs and learning w/ multi-action distribution). -py_test( - name = "examples/connectors/nested_action_spaces_ppo", - main = "examples/connectors/nested_action_spaces.py", - tags = ["team:rllib", "exclusive", "examples"], - size = "large", - srcs = ["examples/connectors/nested_action_spaces.py"], - args = ["--enable-new-api-stack", "--as-test", "--framework=torch", "--stop-reward=-500.0", "--algo=PPO"] -) - -py_test( - name = "examples/connectors/nested_action_spaces_multi_agent_ppo", - main = "examples/connectors/nested_action_spaces.py", - tags = ["team:rllib", "exclusive", "examples"], - size = "large", - srcs = ["examples/connectors/nested_action_spaces.py"], - args = ["--enable-new-api-stack", "--as-test", "--num-agents=2", "--framework=torch", "--stop-reward=-1000.0", "--algo=PPO"] -) - # Nested observation spaces (flattening). 
py_test( - name = "examples/connectors/nested_observation_spaces_ppo", - main = "examples/connectors/nested_observation_spaces.py", + name = "examples/connectors/flatten_observations_dict_space_ppo", + main = "examples/connectors/flatten_observations_dict_space.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/connectors/nested_observation_spaces.py"], + srcs = ["examples/connectors/flatten_observations_dict_space.py"], args = ["--enable-new-api-stack", "--as-test", "--stop-reward=400.0", "--framework=torch", "--algo=PPO"] ) py_test( - name = "examples/connectors/nested_observation_spaces_multi_agent_ppo", - main = "examples/connectors/nested_observation_spaces.py", + name = "examples/connectors/flatten_observations_dict_space_multi_agent_ppo", + main = "examples/connectors/flatten_observations_dict_space.py", tags = ["team:rllib", "exclusive", "examples"], size = "medium", - srcs = ["examples/connectors/nested_observation_spaces.py"], + srcs = ["examples/connectors/flatten_observations_dict_space.py"], args = ["--enable-new-api-stack", "--num-agents=2", "--as-test", "--stop-reward=800.0", "--framework=torch", "--algo=PPO"] ) diff --git a/rllib/connectors/env_to_module/flatten_observations.py b/rllib/connectors/env_to_module/flatten_observations.py index 57ec233a4a53..8ac27ac8b817 100644 --- a/rllib/connectors/env_to_module/flatten_observations.py +++ b/rllib/connectors/env_to_module/flatten_observations.py @@ -6,7 +6,6 @@ import tree # pip install dm_tree from ray.rllib.connectors.connector_v2 import ConnectorV2 -from ray.rllib.core.columns import Columns from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.utils.annotations import override from ray.rllib.utils.numpy import flatten_inputs_to_1d_tensor @@ -163,16 +162,8 @@ def __call__( shared_data: Optional[dict] = None, **kwargs, ) -> Any: - observations = data.get(Columns.OBS) - - if observations is None: - raise ValueError( - f"`batch` must already have a column named {Columns.OBS} in it " - f"for this connector to work!" - ) - for sa_episode in self.single_agent_episode_iterator( - episodes, agents_that_stepped_only=True + episodes, agents_that_stepped_only=True ): # Episode is not finalized yet and thus still operates on lists of items. assert not sa_episode.is_finalized @@ -181,7 +172,8 @@ def __call__( if self._multi_agent: flattened_obs = { - agent_obs if aid not in self._agent_ids + agent_obs + if agent_id not in self._agent_ids else flatten_inputs_to_1d_tensor( inputs=agent_obs, # In the multi-agent case, we need to use the specific agent's @@ -190,7 +182,7 @@ def __call__( # Our items are individual observations (no batch axis present). 
batch_axis=False, ) - for aid, agent_obs in last_obs.items() + for agent_id, agent_obs in last_obs.items() } else: flattened_obs = flatten_inputs_to_1d_tensor( diff --git a/rllib/connectors/env_to_module/prev_actions_prev_rewards.py b/rllib/connectors/env_to_module/prev_actions_prev_rewards.py index f48c5e459a39..89aa51f69e2b 100644 --- a/rllib/connectors/env_to_module/prev_actions_prev_rewards.py +++ b/rllib/connectors/env_to_module/prev_actions_prev_rewards.py @@ -5,7 +5,6 @@ import numpy as np from ray.rllib.connectors.connector_v2 import ConnectorV2 -from ray.rllib.core.columns import Columns from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.utils.annotations import override from ray.rllib.utils.spaces.space_utils import batch, flatten_to_single_ndarray @@ -36,8 +35,6 @@ class PrevActionsPrevRewards(ConnectorV2): """ ORIG_OBS_KEY = "_orig_obs" - PREV_ACTIONS_KEY = "prev_actions" - PREV_REWARDS_KEY = "prev_rewards" @override(ConnectorV2) def recompute_observation_space_from_input_spaces(self): @@ -108,22 +105,16 @@ def __call__( shared_data: Optional[dict] = None, **kwargs, ) -> Any: - observations = data.get(Columns.OBS) - - if observations is None: - raise ValueError( - f"`batch` must already have a column named {Columns.OBS} in it " - f"for this connector to work!" - ) - - for sa_episode, orig_obs in self.single_agent_episode_iterator( - episodes, zip_with_batch_column=observations + for sa_episode in self.single_agent_episode_iterator( + episodes, agents_that_stepped_only=True ): # Episode is not finalized yet and thus still operates on lists of items. assert not sa_episode.is_finalized + augmented_obs = {self.ORIG_OBS_KEY: sa_episode.get_observations(-1)} + if self.n_prev_actions: - prev_n_actions = flatten_to_single_ndarray( + augmented_obs[self.PREV_ACTIONS_KEY] = flatten_to_single_ndarray( batch( sa_episode.get_actions( indices=slice(-self.n_prev_actions, None), @@ -134,19 +125,13 @@ def __call__( ) if self.n_prev_rewards: - prev_n_rewards = np.array( + augmented_obs[self.PREV_REWARDS_KEY] = np.array( sa_episode.get_rewards( indices=slice(-self.n_prev_rewards, None), fill=0.0, ) ) - augmented_obs = { - self.ORIG_OBS_KEY: orig_obs, - self.PREV_ACTIONS_KEY: prev_n_actions, - self.PREV_REWARDS_KEY: prev_n_rewards, - } - # Write new observation directly back into the episode. sa_episode.set_observations(at_indices=-1, new_data=augmented_obs) # We set the Episode's observation space to ours so that we can safely @@ -154,14 +139,6 @@ def __call__( # error). sa_episode.observation_space = self.observation_space - ## Convert the observations in the batch into a dict with the keys: - ## "_obs", "_prev_rewards", and "_prev_actions". 
- #self.foreach_batch_item_change_in_place( - # batch=data, - # column=Columns.OBS, - # func=lambda orig_obs, eps_id, agent_id, module_id: new_obs.pop(0), - #) - return data def _convert_individual_space(self, obs_space, act_space): diff --git a/rllib/examples/actions/__init__.py b/rllib/examples/actions/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/examples/actions/nested_action_spaces.py b/rllib/examples/actions/nested_action_spaces.py new file mode 100644 index 000000000000..db7ad434c674 --- /dev/null +++ b/rllib/examples/actions/nested_action_spaces.py @@ -0,0 +1,92 @@ +from gymnasium.spaces import Dict, Tuple, Box, Discrete, MultiDiscrete + +from ray.tune.registry import register_env +from ray.rllib.connectors.env_to_module import FlattenObservations +from ray.rllib.examples.envs.classes.multi_agent import ( + MultiAgentNestedSpaceRepeatAfterMeEnv, +) +from ray.rllib.examples.envs.classes.nested_space_repeat_after_me_env import ( + NestedSpaceRepeatAfterMeEnv, +) +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray.tune.registry import get_trainable_cls + + +# Read in common example script command line arguments. +parser = add_rllib_example_script_args(default_timesteps=200000, default_reward=-500.0) + + +if __name__ == "__main__": + args = parser.parse_args() + + assert ( + args.enable_new_api_stack + ), "Must set --enable-new-api-stack when running this script!" + + # Define env-to-module-connector pipeline for the new stack. + def _env_to_module_pipeline(env): + return FlattenObservations(multi_agent=args.num_agents > 0) + + # Register our environment with tune. + if args.num_agents > 0: + register_env( + "env", + lambda c: MultiAgentNestedSpaceRepeatAfterMeEnv( + config=dict(c, **{"num_agents": args.num_agents}) + ), + ) + else: + register_env("env", lambda c: NestedSpaceRepeatAfterMeEnv(c)) + + # Define the AlgorithmConfig used. + base_config = ( + get_trainable_cls(args.algo) + .get_default_config() + .environment( + "env", + env_config={ + "space": Dict( + { + "a": Tuple( + [Dict({"d": Box(-15.0, 3.0, ()), "e": Discrete(3)})] + ), + "b": Box(-10.0, 10.0, (2,)), + "c": MultiDiscrete([3, 3]), + "d": Discrete(2), + } + ), + "episode_len": 100, + }, + ) + .env_runners(env_to_module_connector=_env_to_module_pipeline) + # No history in Env (bandit problem). + .training( + gamma=0.0, + lr=0.0005, + model=( + {} if not args.enable_new_api_stack else {"uses_new_env_runners": True} + ), + ) + ) + + # Add a simple multi-agent setup. + if args.num_agents > 0: + base_config.multi_agent( + policies={f"p{i}" for i in range(args.num_agents)}, + policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", + ) + + # Fix some PPO-specific settings. + if args.algo == "PPO": + base_config.training( + # We don't want high entropy in this Env. + entropy_coeff=0.00005, + num_sgd_iter=4, + vf_loss_coeff=0.01, + ) + + # Run everything as configured. 
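+    # (`run_rllib_example_script_experiment` is RLlib's shared example/test helper: it
+    # starts Ray, wraps `base_config` in a Tune experiment, and derives stopping
+    # criteria and test assertions from the common CLI args, for example
+    # `--stop-reward` and `--as-test`.)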
+ run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/examples/connectors/nested_observation_spaces.py b/rllib/examples/connectors/flatten_observations_dict_space.py similarity index 60% rename from rllib/examples/connectors/nested_observation_spaces.py rename to rllib/examples/connectors/flatten_observations_dict_space.py index cf98909502ee..bed31ce5ac28 100644 --- a/rllib/examples/connectors/nested_observation_spaces.py +++ b/rllib/examples/connectors/flatten_observations_dict_space.py @@ -1,4 +1,4 @@ -"""Example using connectors (V2) for observation frame-stacking in Atari environments. +"""Example using a ConnectorV2 to flatten arbitrarily nested dict or tuple observations. An RLlib Algorithm has 3 distinct connector pipelines: - An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing @@ -20,25 +20,30 @@ different pipelines described above, as required. This example: - - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the + - shows how the `FlattenObservation` ConnectorV2 piece can be added to the env-to-module pipeline. - - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the - learner connector pipeline. - - demonstrates that using these two pieces (rather than performing framestacking - already inside the environment using a gymnasium wrapper) increases overall - performance by about 5%. + - demonstrates that by using this connector, any arbitrarily nested dict or tuple + observations is properly flattened into a simple 1D tensor, for easier RLModule + processing. + - shows how - in a multi-agent setup - individual agents can be specified, whose + observations should be flattened (while other agents' observations will always + be left as-is). + - uses a variant of the CartPole-v1 environment, in which the 4 observation items + (x-pos, x-veloc, angle, and angle-veloc) are taken apart and put into a nested dict + with the structure: + { + "x-pos": [x-pos], + "angular-pos": { + "value": [angle], + "some_random_stuff": [random Discrete(3)], # <- should be ignored by algo + }, + "velocs": Tuple([x-veloc], [angle-veloc]), + } How to run this script ---------------------- -`python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5` - -Use the `--num-frames` option to define the number of observations to framestack. -If you don't want to use Connectors to perform the framestacking, set the -`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a -gymnasium observation wrapper. In this case though, be aware that the tensors being -sent through the network are `--num-frames` x larger than if you use the Connector -setup. +`python [script file name].py --enable-new-api-stack` For debugging, use the following additional command line options `--no-tune --num-env-runners=0` @@ -53,35 +58,21 @@ Results to expect ----------------- -With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module -and learner connector pipelines), you should see something like: -+---------------------------+------------+--------+------------------+... -| Trial name | status | iter | total time (s) | -| | | | | -|---------------------------+------------+--------+------------------+... -| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | -+---------------------------+------------+--------+------------------+... 
- -Note that the time to run these 10 iterations is about .% faster than when -performing framestacking already inside the environment (using a -`gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic -needed (sending back 4x[obs] batches instead of 1x[obs] to the learners). - -Thus, with the `--use-gym-wrapper-framestacking` option, the output looks -like this: -+---------------------------+------------+--------+------------------+... -| Trial name | status | iter | total time (s) | -| | | | | -|---------------------------+------------+--------+------------------+... -| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | -+---------------------------+------------+--------+------------------+... ++---------------------+------------+----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +| | | | | | +|---------------------+------------+----------------+--------+------------------+ +| PPO_env_a2fd6_00000 | TERMINATED | 127.0.0.1:7409 | 25 | 24.1426 | ++---------------------+------------+----------------+--------+------------------+ +------------------------+------------------------+------------------------+ + num_env_steps_sample | num_env_steps_traine | episode_return_mean | + d_lifetime | d_lifetime | | +------------------------+------------------------+------------------------| + 100000 | 100000 | 421.42 | +------------------------+------------------------+------------------------+ """ from ray.tune.registry import register_env -from ray.rllib.connectors.env_to_module import ( - AddObservationsFromEpisodesToBatch, - FlattenObservations, - WriteObservationsToEpisodes, -) +from ray.rllib.connectors.env_to_module import FlattenObservations from ray.rllib.examples.envs.classes.cartpole_with_dict_observation_space import ( CartPoleWithDictObservationSpace, ) @@ -108,11 +99,7 @@ # Define env-to-module-connector pipeline for the new stack. def _env_to_module_pipeline(env): - return [ - AddObservationsFromEpisodesToBatch(), - FlattenObservations(multi_agent=args.num_agents > 0), - WriteObservationsToEpisodes(), - ] + return FlattenObservations(multi_agent=args.num_agents > 0) # Register our environment with tune. if args.num_agents > 0: diff --git a/rllib/examples/connectors/nested_action_spaces.py b/rllib/examples/connectors/nested_action_spaces.py deleted file mode 100644 index 86df316c7916..000000000000 --- a/rllib/examples/connectors/nested_action_spaces.py +++ /dev/null @@ -1,178 +0,0 @@ -"""Example using connectors (V2) for observation frame-stacking in Atari environments. - -An RLlib Algorithm has 3 distinct connector pipelines: -- An env-to-module pipeline in an EnvRunner accepting a list of episodes and producing -a batch for an RLModule to compute actions (`forward_inference()` or -`forward_exploration()`). -- A module-to-env pipeline in an EnvRunner taking the RLModule's output and converting -it into an action readable by the environment. -- A learner connector pipeline on a Learner taking a list of episodes and producing -a batch for an RLModule to perform the training forward pass (`forward_train()`). - -Each of these pipelines has a fixed set of default ConnectorV2 pieces that RLlib -adds/prepends to these pipelines in order to perform the most basic functionalities. -For example, RLlib adds the `AddObservationsFromEpisodesToBatch` ConnectorV2 into any -env-to-module pipeline to make sure the batch for computing actions contains - at the -minimum - the most recent observation. 
- -On top of these default ConnectorV2 pieces, users can define their own ConnectorV2 -pieces (or use the ones available already in RLlib) and add them to one of the 3 -different pipelines described above, as required. - -This example: - - shows how the `FrameStackingEnvToModule` ConnectorV2 piece can be added to the - env-to-module pipeline. - - shows how the `FrameStackingLearner` ConnectorV2 piece can be added to the - learner connector pipeline. - - demonstrates that using these two pieces (rather than performing framestacking - already inside the environment using a gymnasium wrapper) increases overall - performance by about 5%. - - -How to run this script ----------------------- -`python [script file name].py --enable-new-api-stack --num-frames=4 --env=ALE/Pong-v5` - -Use the `--num-frames` option to define the number of observations to framestack. -If you don't want to use Connectors to perform the framestacking, set the -`--use-gym-wrapper-framestacking` flag to perform framestacking already inside a -gymnasium observation wrapper. In this case though, be aware that the tensors being -sent through the network are `--num-frames` x larger than if you use the Connector -setup. - -For debugging, use the following additional command line options -`--no-tune --num-env-runners=0` -which should allow you to set breakpoints anywhere in the RLlib code and -have the execution stop there for inspection and debugging. - -For logging to your WandB account, use: -`--wandb-key=[your WandB API key] --wandb-project=[some project name] ---wandb-run-name=[optional: WandB run name (within the defined project)]` - - -Results to expect ------------------ - -With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module -and learner connector pipelines), you should see something like: -+---------------------------+------------+--------+------------------+... -| Trial name | status | iter | total time (s) | -| | | | | -|---------------------------+------------+--------+------------------+... -| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | -+---------------------------+------------+--------+------------------+... - -Note that the time to run these 10 iterations is about .% faster than when -performing framestacking already inside the environment (using a -`gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic -needed (sending back 4x[obs] batches instead of 1x[obs] to the learners). - -Thus, with the `--use-gym-wrapper-framestacking` option, the output looks -like this: -+---------------------------+------------+--------+------------------+... -| Trial name | status | iter | total time (s) | -| | | | | -|---------------------------+------------+--------+------------------+... -| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | -+---------------------------+------------+--------+------------------+... 
-""" -from gymnasium.spaces import Dict, Tuple, Box, Discrete, MultiDiscrete - -from ray.tune.registry import register_env -from ray.rllib.connectors.env_to_module import ( - AddObservationsFromEpisodesToBatch, - FlattenObservations, - WriteObservationsToEpisodes, -) -from ray.rllib.examples.envs.classes.multi_agent import ( - MultiAgentNestedSpaceRepeatAfterMeEnv, -) -from ray.rllib.examples.envs.classes.nested_space_repeat_after_me_env import ( - NestedSpaceRepeatAfterMeEnv, -) -from ray.rllib.utils.test_utils import ( - add_rllib_example_script_args, - run_rllib_example_script_experiment, -) -from ray.tune.registry import get_trainable_cls - - -# Read in common example script command line arguments. -parser = add_rllib_example_script_args(default_timesteps=200000, default_reward=-500.0) - - -if __name__ == "__main__": - args = parser.parse_args() - - assert ( - args.enable_new_api_stack - ), "Must set --enable-new-api-stack when running this script!" - - # Define env-to-module-connector pipeline for the new stack. - def _env_to_module_pipeline(env): - return [ - AddObservationsFromEpisodesToBatch(), - FlattenObservations(multi_agent=args.num_agents > 0), - WriteObservationsToEpisodes(), - ] - - # Register our environment with tune. - if args.num_agents > 0: - register_env( - "env", - lambda c: MultiAgentNestedSpaceRepeatAfterMeEnv( - config=dict(c, **{"num_agents": args.num_agents}) - ), - ) - else: - register_env("env", lambda c: NestedSpaceRepeatAfterMeEnv(c)) - - # Define the AlgorithmConfig used. - base_config = ( - get_trainable_cls(args.algo) - .get_default_config() - .environment( - "env", - env_config={ - "space": Dict( - { - "a": Tuple( - [Dict({"d": Box(-15.0, 3.0, ()), "e": Discrete(3)})] - ), - "b": Box(-10.0, 10.0, (2,)), - "c": MultiDiscrete([3, 3]), - "d": Discrete(2), - } - ), - "episode_len": 100, - }, - ) - .env_runners(env_to_module_connector=_env_to_module_pipeline) - # No history in Env (bandit problem). - .training( - gamma=0.0, - lr=0.0005, - model=( - {} if not args.enable_new_api_stack else {"uses_new_env_runners": True} - ), - ) - ) - - # Add a simple multi-agent setup. - if args.num_agents > 0: - base_config.multi_agent( - policies={f"p{i}" for i in range(args.num_agents)}, - policy_mapping_fn=lambda aid, *a, **kw: f"p{aid}", - ) - - # Fix some PPO-specific settings. - if args.algo == "PPO": - base_config.training( - # We don't want high entropy in this Env. - entropy_coeff=0.00005, - num_sgd_iter=4, - vf_loss_coeff=0.01, - ) - - # Run everything as configured. - run_rllib_example_script_experiment(base_config, args) diff --git a/rllib/examples/connectors/prev_actions_prev_rewards.py b/rllib/examples/connectors/prev_actions_prev_rewards.py index 82003157c19c..1f341c5aeaaa 100644 --- a/rllib/examples/connectors/prev_actions_prev_rewards.py +++ b/rllib/examples/connectors/prev_actions_prev_rewards.py @@ -25,11 +25,12 @@ episodes. - shows how this connector creates and wraps this new information (rewards and actions) together with the original observations into the RLModule's input dict - as a new gym.spaces.Dict structure. - - demonstrates how to plug in RLlib's in-house observation flattening - connector after the that using these two pieces (rather than performing framestacking - already inside the environment using a gymnasium wrapper) increases overall - performance by about 5%. 
+ under a new `gym.spaces.Dict` structure (for example, if your observation space + is `O=Box(shape=(3,))` and you add the most recent 1 reward, the new observation + space will be `Dict({"_original_obs": O, "prev_n_rewards": Box(shape=())})`. + - demonstrates how to use RLlib's `FlattenObservations` right after the + `PrevActionsPrevRewards` to flatten that new dict observation structure again into + a single 1D tensor. How to run this script @@ -83,10 +84,8 @@ from ray.rllib.algorithms.ppo import PPOConfig from ray.rllib.connectors.env_to_module import ( - AddObservationsFromEpisodesToBatch, FlattenObservations, PrevActionsPrevRewards, - WriteObservationsToEpisodes, ) from ray.rllib.examples.envs.classes.stateless_cartpole import StatelessCartPole from ray.rllib.examples.envs.classes.multi_agent import MultiAgentStatelessCartPole @@ -118,14 +117,14 @@ def _env_to_module(env): # Create the env-to-module connector pipeline. return [ - #AddObservationsFromEpisodesToBatch(), + # AddObservationsFromEpisodesToBatch(), PrevActionsPrevRewards( multi_agent=args.num_agents > 0, n_prev_rewards=args.n_prev_rewards, n_prev_actions=args.n_prev_actions, ), FlattenObservations(multi_agent=args.num_agents > 0), - #WriteObservationsToEpisodes(), + # WriteObservationsToEpisodes(), ] # Register our environment with tune. diff --git a/rllib/examples/curriculum/curriculum_learning.py b/rllib/examples/curriculum/curriculum_learning.py index 02916ed459df..b215e4ed0b6b 100644 --- a/rllib/examples/curriculum/curriculum_learning.py +++ b/rllib/examples/curriculum/curriculum_learning.py @@ -59,11 +59,7 @@ from ray.air.constants import TRAINING_ITERATION from ray.rllib.algorithms.algorithm import Algorithm from ray.rllib.algorithms.callbacks import DefaultCallbacks -from ray.rllib.connectors.env_to_module import ( - AddObservationsFromEpisodesToBatch, - FlattenObservations, - WriteObservationsToEpisodes, -) +from ray.rllib.connectors.env_to_module import FlattenObservations from ray.rllib.utils.metrics import ( ENV_RUNNER_RESULTS, EPISODE_RETURN_MEAN, @@ -221,11 +217,7 @@ def on_train_result( ) .env_runners( num_envs_per_env_runner=5, - env_to_module_connector=lambda env: [ - AddObservationsFromEpisodesToBatch(), - FlattenObservations(), - WriteObservationsToEpisodes(), - ], + env_to_module_connector=lambda env: FlattenObservations(), ) ) diff --git a/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py b/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py index d503e7f23ad3..1f7ad8dc238c 100644 --- a/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py +++ b/rllib/examples/multi_agent/rock_paper_scissors_heuristic_vs_learned.py @@ -33,11 +33,7 @@ from pettingzoo.classic import rps_v2 from ray.air.constants import TRAINING_ITERATION -from ray.rllib.connectors.env_to_module import ( - AddObservationsFromEpisodesToBatch, - FlattenObservations, - WriteObservationsToEpisodes, -) +from ray.rllib.connectors.env_to_module import FlattenObservations from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv @@ -89,10 +85,8 @@ .environment("RockPaperScissors") .env_runners( env_to_module_connector=lambda env: ( - AddObservationsFromEpisodesToBatch(), - # Only flatten obs for the learning RLModul + # `agent_ids=...`: Only flatten obs for the learning RLModule. 
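+                # (Observations of agents not listed in `agent_ids`, here `player_1`,
+                # pass through this piece unchanged.)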
FlattenObservations(multi_agent=True, agent_ids={"player_0"}), - WriteObservationsToEpisodes(), ), ) .multi_agent( diff --git a/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py b/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py index afabd3fe9003..2c9435822290 100644 --- a/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py +++ b/rllib/examples/multi_agent/two_step_game_with_grouped_agents.py @@ -40,11 +40,7 @@ +------------------+-------+-------------------+-------------+ """ -from ray.rllib.connectors.env_to_module import ( - AddObservationsFromEpisodesToBatch, - FlattenObservations, - WriteObservationsToEpisodes, -) +from ray.rllib.connectors.env_to_module import FlattenObservations from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.examples.envs.classes.two_step_game import TwoStepGameWithGroupedAgents @@ -76,11 +72,7 @@ .get_default_config() .environment("grouped_twostep") .env_runners( - env_to_module_connector=lambda env: ( - AddObservationsFromEpisodesToBatch(), - FlattenObservations(multi_agent=True), - WriteObservationsToEpisodes(), - ), + env_to_module_connector=lambda env: FlattenObservations(multi_agent=True), ) .multi_agent( policies={"p0"}, From 70c939ba6d603ebf788123544a7e20704ccb1677 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 12 Jun 2024 14:53:26 +0200 Subject: [PATCH 07/10] wip Signed-off-by: sven1977 --- rllib/connectors/connector_pipeline_v2.py | 2 +- rllib/connectors/connector_v2.py | 3 +- .../prev_actions_prev_rewards.py | 4 +- .../connectors/prev_actions_prev_rewards.py | 42 +++++++++---------- 4 files changed, 25 insertions(+), 26 deletions(-) diff --git a/rllib/connectors/connector_pipeline_v2.py b/rllib/connectors/connector_pipeline_v2.py index f04376dcd993..3156c66a7a69 100644 --- a/rllib/connectors/connector_pipeline_v2.py +++ b/rllib/connectors/connector_pipeline_v2.py @@ -312,7 +312,7 @@ def _fix_spaces(self): obs_space = self.input_observation_space act_space = self.input_action_space for con in self.connectors: - con.input_observation_space = obs_space con.input_action_space = act_space + con.input_observation_space = obs_space obs_space = con.observation_space act_space = con.action_space diff --git a/rllib/connectors/connector_v2.py b/rllib/connectors/connector_v2.py index ad7bd9eed4bb..e43f7515faea 100644 --- a/rllib/connectors/connector_v2.py +++ b/rllib/connectors/connector_v2.py @@ -84,8 +84,9 @@ def __init__( self._action_space = None self._input_observation_space = None self._input_action_space = None - self.input_observation_space = input_observation_space + self.input_action_space = input_action_space + self.input_observation_space = input_observation_space @OverrideToImplementCustomLogic def recompute_observation_space_from_input_spaces(self) -> gym.Space: diff --git a/rllib/connectors/env_to_module/prev_actions_prev_rewards.py b/rllib/connectors/env_to_module/prev_actions_prev_rewards.py index 89aa51f69e2b..5b26cd1f8b87 100644 --- a/rllib/connectors/env_to_module/prev_actions_prev_rewards.py +++ b/rllib/connectors/env_to_module/prev_actions_prev_rewards.py @@ -35,11 +35,11 @@ class PrevActionsPrevRewards(ConnectorV2): """ ORIG_OBS_KEY = "_orig_obs" + PREV_ACTIONS_KEY = "prev_n_actions" + PREV_REWARDS_KEY = "prev_n_rewards" @override(ConnectorV2) def recompute_observation_space_from_input_spaces(self): - if self.input_action_space is None: - return None if self._multi_agent: ret = {} for 
agent_id, obs_space in self.input_observation_space.spaces.items(): diff --git a/rllib/examples/connectors/prev_actions_prev_rewards.py b/rllib/examples/connectors/prev_actions_prev_rewards.py index 1f341c5aeaaa..dcee6ac5689e 100644 --- a/rllib/examples/connectors/prev_actions_prev_rewards.py +++ b/rllib/examples/connectors/prev_actions_prev_rewards.py @@ -31,6 +31,11 @@ - demonstrates how to use RLlib's `FlattenObservations` right after the `PrevActionsPrevRewards` to flatten that new dict observation structure again into a single 1D tensor. + - uses the StatelessCartPole environment, a CartPole-v1 derivative that's missing + both x-veloc and angle-veloc observation components and is therefore non-Markovian + (only partially observable). An LSTM default model is used for training. Adding + the additional context to the observations (for example, prev. actions) helps the + LSTM to more quickly learn in this environment. How to run this script @@ -57,28 +62,21 @@ Results to expect ----------------- -With `--num-frames=4` and using the two extra ConnectorV2 pieces (in the env-to-module -and learner connector pipelines), you should see something like: -+---------------------------+------------+--------+------------------+... -| Trial name | status | iter | total time (s) | -| | | | | -|---------------------------+------------+--------+------------------+... -| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | -+---------------------------+------------+--------+------------------+... - -Note that the time to run these 10 iterations is about .% faster than when -performing framestacking already inside the environment (using a -`gymnasium.wrappers.ObservationWrapper`), due to the additional network traffic -needed (sending back 4x[obs] batches instead of 1x[obs] to the learners). - -Thus, with the `--use-gym-wrapper-framestacking` option, the output looks -like this: -+---------------------------+------------+--------+------------------+... -| Trial name | status | iter | total time (s) | -| | | | | -|---------------------------+------------+--------+------------------+... -| PPO_atari-env_2fc4a_00000 | TERMINATED | 10 | 557.257 | -+---------------------------+------------+--------+------------------+... 
+You should see something similar to this in your terminal output when running +ths script as described above: + ++---------------------+------------+-----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +| | | | | | +|---------------------+------------+-----------------+--------+------------------+ +| PPO_env_0edd2_00000 | TERMINATED | 127.0.0.1:12632 | 17 | 42.6898 | ++---------------------+------------+-----------------+--------+------------------+ ++------------------------+------------------------+------------------------+ +| num_env_steps_sample | num_env_steps_traine | episode_return_mean | +| d_lifetime | d_lifetime | | +|------------------------+------------------------+------------------------| +| 68000 | 68000 | 205.22 | ++------------------------+------------------------+------------------------+ """ import functools From 3bc04bc09d9c91220fe3b6f341efd94fe218cbbf Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 12 Jun 2024 21:03:55 +0200 Subject: [PATCH 08/10] wip Signed-off-by: sven1977 --- .../env_to_module/flatten_observations.py | 31 ++++++++++--------- .../env_to_module/mean_std_filter.py | 2 -- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/rllib/connectors/env_to_module/flatten_observations.py b/rllib/connectors/env_to_module/flatten_observations.py index 8ac27ac8b817..e5f393b84614 100644 --- a/rllib/connectors/env_to_module/flatten_observations.py +++ b/rllib/connectors/env_to_module/flatten_observations.py @@ -31,6 +31,7 @@ class FlattenObservations(ConnectorV2): import numpy as np from ray.rllib.connectors.env_to_module import FlattenObservations + from ray.rllib.env.single_agent_episode import SingleAgentEpisode from ray.rllib.utils.test_utils import check # Some arbitrarily nested, complex observation space. @@ -44,24 +45,26 @@ class FlattenObservations(ConnectorV2): }) act_space = gym.spaces.Discrete(2) - # A batch of two example items, both coming from the above defined observation - # space. - batch = { - "obs": [ - # 1st example item. + # Two example episodes, both with initial (reset) observations coming from the + # above defined observation space. + episode_1 = SingleAgentEpisode( + observations=[ { "a": np.array(-10.0, np.float32), "b": (1, np.array([[-1.0], [-1.0]], np.float32)), "c": np.array([0, 2]), }, - # 2nd example item. + ], + ) + episode_2 = SingleAgentEpisode( + observations=[ { "a": np.array(10.0, np.float32), "b": (0, np.array([[1.0], [1.0]], np.float32)), "c": np.array([1, 1]), }, ], - } + ) # Construct our connector piece. connector = FlattenObservations(obs_space, act_space) @@ -69,23 +72,23 @@ class FlattenObservations(ConnectorV2): # Call our connector piece with the example data. output_data = connector( rl_module=None, # This connector works without an RLModule. - data=batch, - episodes=[], # This connector does not need the `episodes` input. + data={}, # This connector does not alter any data. + episodes=[episode_1, episode_2], explore=True, shared_data={}, ) - # The connector does not change the number of items in the data (still 2 items). - check(len(output_data["obs"]), 2) + # The connector does not alter the data and acts as pure pass-through. + check(output_data, {}) - # The connector has flattened each item in the data to a 1D tensor. + # The connector has flattened each item in the episodes to a 1D tensor. check( - output_data["obs"][0], + episode_1.get_observations(0), # box() disc(2). box(2, 1). multidisc(2, 3)........ 
np.array([-10.0, 0.0, 1.0, -1.0, -1.0, 1.0, 0.0, 0.0, 0.0, 1.0]), ) check( - output_data["obs"][1], + episode_2.get_observations(0), # box() disc(2). box(2, 1). multidisc(2, 3)........ np.array([10.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0]), ) diff --git a/rllib/connectors/env_to_module/mean_std_filter.py b/rllib/connectors/env_to_module/mean_std_filter.py index 09b5e4f0fbcf..c0bdf8bc6544 100644 --- a/rllib/connectors/env_to_module/mean_std_filter.py +++ b/rllib/connectors/env_to_module/mean_std_filter.py @@ -19,8 +19,6 @@ class MeanStdFilter(ConnectorV2): """A connector used to mean-std-filter observations. - - Incoming observations are filtered such that the output of this filter is on average 0.0 and has a standard deviation of 1.0. If the observation space is a (possibly nested) dict, this filtering is applied separately per element of From d14ec9fc3874d5ae0e44ab7210b4c55fb5c35f7f Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 12 Jun 2024 21:14:04 +0200 Subject: [PATCH 09/10] wip Signed-off-by: sven1977 --- .../env_to_module/flatten_observations.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/rllib/connectors/env_to_module/flatten_observations.py b/rllib/connectors/env_to_module/flatten_observations.py index e5f393b84614..9df1d1c61f84 100644 --- a/rllib/connectors/env_to_module/flatten_observations.py +++ b/rllib/connectors/env_to_module/flatten_observations.py @@ -174,19 +174,17 @@ def __call__( last_obs = sa_episode.get_observations(-1) if self._multi_agent: - flattened_obs = { - agent_obs - if agent_id not in self._agent_ids - else flatten_inputs_to_1d_tensor( - inputs=agent_obs, + if self._agent_ids is not None and agent_id not in self._agent_ids: + flattened_obs = last_obs + else: + flattened_obs = flatten_inputs_to_1d_tensor( + inputs=last_obs, # In the multi-agent case, we need to use the specific agent's # space struct, not the multi-agent observation space dict. - spaces_struct=self._input_obs_base_struct[agent_id], + spaces_struct=self._input_obs_base_struct[sa_episode.agent_id], # Our items are individual observations (no batch axis present). batch_axis=False, ) - for agent_id, agent_obs in last_obs.items() - } else: flattened_obs = flatten_inputs_to_1d_tensor( inputs=last_obs, From e0593300ea1b4f15e6f029c88c3bc0106d1f9e20 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 12 Jun 2024 21:33:10 +0200 Subject: [PATCH 10/10] LINT Signed-off-by: sven1977 --- rllib/connectors/env_to_module/flatten_observations.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/rllib/connectors/env_to_module/flatten_observations.py b/rllib/connectors/env_to_module/flatten_observations.py index 9df1d1c61f84..6a2e60173b65 100644 --- a/rllib/connectors/env_to_module/flatten_observations.py +++ b/rllib/connectors/env_to_module/flatten_observations.py @@ -174,7 +174,10 @@ def __call__( last_obs = sa_episode.get_observations(-1) if self._multi_agent: - if self._agent_ids is not None and agent_id not in self._agent_ids: + if ( + self._agent_ids is not None + and sa_episode.agent_id not in self._agent_ids + ): flattened_obs = last_obs else: flattened_obs = flatten_inputs_to_1d_tensor(