From dd18dd02fab3e241233933970a26518c54294d81 Mon Sep 17 00:00:00 2001
From: simonsays1980
Date: Mon, 9 Sep 2024 20:06:58 +0200
Subject: [PATCH] [RLlib; Offline RL] - Replace GAE in `MARWILOfflinePreLearner`
 with `GeneralAdvantageEstimation` connector in learner pipeline. (#47532)

Signed-off-by: ujjawal-khare
---
 rllib/algorithms/marwil/tests/test_marwil.py | 12 ++++------
 rllib/tuned_examples/cql/pendulum_cql.py     | 23 +++++++------------
 .../tuned_examples/marwil/cartpole_marwil.py | 19 ++++-----------
 3 files changed, 17 insertions(+), 37 deletions(-)

diff --git a/rllib/algorithms/marwil/tests/test_marwil.py b/rllib/algorithms/marwil/tests/test_marwil.py
index 5c2584d2ed821..f31241c709125 100644
--- a/rllib/algorithms/marwil/tests/test_marwil.py
+++ b/rllib/algorithms/marwil/tests/test_marwil.py
@@ -167,14 +167,12 @@ def possibly_masked_mean(data_):
 
         # Calculate our own expected values (to then compare against the
         # agent's loss output).
-        module = algo.learner_group._learner.module[DEFAULT_MODULE_ID].unwrapped()
-        fwd_out = module.forward_train(
-            {k: v for k, v in batch[DEFAULT_MODULE_ID].items()}
-        )
-        advantages = (
-            batch[DEFAULT_MODULE_ID][Columns.VALUE_TARGETS].detach().cpu().numpy()
-            - module.compute_values(batch[DEFAULT_MODULE_ID]).detach().cpu().numpy()
+        fwd_out = (
+            algo.learner_group._learner.module[DEFAULT_MODULE_ID]
+            .unwrapped()
+            .forward_train({k: v for k, v in batch[DEFAULT_MODULE_ID].items()})
         )
+        advantages = batch[DEFAULT_MODULE_ID][Columns.ADVANTAGES].detach().cpu().numpy()
         advantages_squared = possibly_masked_mean(np.square(advantages))
         c_2 = 100.0 + 1e-8 * (advantages_squared - 100.0)
         c = np.sqrt(c_2)
diff --git a/rllib/tuned_examples/cql/pendulum_cql.py b/rllib/tuned_examples/cql/pendulum_cql.py
index 24e74f0781a7b..e2727aba5febe 100644
--- a/rllib/tuned_examples/cql/pendulum_cql.py
+++ b/rllib/tuned_examples/cql/pendulum_cql.py
@@ -39,21 +39,14 @@
     )
     .offline_data(
         input_=[data_path.as_posix()],
-        # The `kwargs` for the `input_read_method`. We override the
-        # the number of blocks to pull at once b/c our dataset is
-        # small.
-        input_read_method_kwargs={"override_num_blocks": max(args.num_gpus * 2, 2)},
-        # The `kwargs` for the `map_batches` method in which our
-        # `OfflinePreLearner` is run. 2 data workers should be run
-        # concurrently.
-        map_batches_kwargs={"concurrency": 2, "num_cpus": 2},
-        # The `kwargs` for the `iter_batches` method. Due to the small
-        # dataset we choose only a single batch to prefetch.
-        iter_batches_kwargs={"prefetch_batches": 1},
-        # The number of iterations to be run per learner when in multi-learner
-        # mode in a single RLlib training iteration. Leave this to `None` to
-        # run an entire epoch on the dataset during a single RLlib training
-        # iteration. For single-learner mode 1 is the only option.
+        # Define the number of read blocks; this should be larger than 1
+        # and aligned with the size of the dataset.
+        input_read_method_kwargs={"override_num_blocks": max(args.num_gpus, 2)},
+        # Concurrency defines the number of processes that run the
+        # `map_batches` transformations. This should be aligned with the
+        # `prefetch_batches` argument in `iter_batches_kwargs`.
+        map_batches_kwargs={"concurrency": max(2, args.num_gpus * 2)},
+        actions_in_input_normalized=True,
         dataset_num_iters_per_learner=1 if args.num_gpus == 0 else None,
         # TODO (sven): Has this any influence in the connectors?
         actions_in_input_normalized=True,
diff --git a/rllib/tuned_examples/marwil/cartpole_marwil.py b/rllib/tuned_examples/marwil/cartpole_marwil.py
index e33a23d62c69a..d40d389de39e2 100644
--- a/rllib/tuned_examples/marwil/cartpole_marwil.py
+++ b/rllib/tuned_examples/marwil/cartpole_marwil.py
@@ -49,21 +49,10 @@
     # as remote learners.
     .offline_data(
         input_=[data_path.as_posix()],
-        # The `kwargs` for the `input_read_method`. We override the
-        # the number of blocks to pull at once b/c our dataset is
-        # small.
-        input_read_method_kwargs={"override_num_blocks": max(args.num_gpus * 2, 2)},
-        # The `kwargs` for the `map_batches` method in which our
-        # `OfflinePreLearner` is run. 2 data workers should be run
-        # concurrently.
-        map_batches_kwargs={"concurrency": 2, "num_cpus": 2},
-        # The `kwargs` for the `iter_batches` method. Due to the small
-        # dataset we choose only a single batch to prefetch.
-        iter_batches_kwargs={"prefetch_batches": 1},
-        # The number of iterations to be run per learner when in multi-learner
-        # mode in a single RLlib training iteration. Leave this to `None` to
-        # run an entire epoch on the dataset during a single RLlib training
-        # iteration. For single-learner mode 1 is the only option.
+        # Note, we need at least 2 data blocks to read from, such that
+        # concurrency in `map_batches` works.
+        input_read_method_kwargs={"override_num_blocks": max(args.num_gpus, 2)},
+        prelearner_module_synch_period=20,
         dataset_num_iters_per_learner=1 if args.num_gpus == 0 else None,
     )
     .training(
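
Note on the mechanism (illustration only, not part of the patch): after this change,
advantages are no longer computed by a GAE pass inside the `MARWILOfflinePreLearner`;
instead, a `GeneralAdvantageEstimation` connector in the learner pipeline writes
`Columns.ADVANTAGES` onto the train batch, which is why the test above reads the
advantages from the batch directly. A minimal sketch of wiring such a connector into a
config by hand follows. It assumes Ray's new API stack; the `gamma`/`lambda_` values
are placeholders, and MARWIL's default learner pipeline already appends this connector
after the patch, so user code does not need to do this itself.

    # Sketch: attach a GAE learner connector to an algorithm config.
    from ray.rllib.algorithms.marwil import MARWILConfig
    from ray.rllib.connectors.learner import GeneralAdvantageEstimation

    config = (
        MARWILConfig()
        .environment("CartPole-v1")
        .training(
            # The connector runs in the learner pipeline and writes
            # Columns.ADVANTAGES into the train batch before the loss
            # is computed. Placeholder gamma/lambda_ values.
            learner_connector=lambda obs_space, act_space: (
                GeneralAdvantageEstimation(gamma=0.99, lambda_=1.0)
            ),
        )
    )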