
Commit dd18dd0
[RLlib; Offline RL] - Replace GAE in MARWILOfflinePreLearner with `GeneralAdvantageEstimation` connector in learner pipeline. (ray-project#47532)

Signed-off-by: ujjawal-khare <ujjawal.khare@dream11.com>
simonsays1980 authored and ujjawal-khare committed Oct 15, 2024
1 parent d28b712 commit dd18dd0
Showing 3 changed files with 17 additions and 37 deletions.
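
The commit title describes moving GAE out of the MARWILOfflinePreLearner and into a `GeneralAdvantageEstimation` connector that runs in the learner connector pipeline, which, per the title, is where MARWIL now computes advantages. As rough orientation only (not code from this commit), appending such a connector on the new API stack typically looks like the sketch below; the subclass name, the `_learner_connector` hook, and the gamma/lambda_ values are assumptions.

from ray.rllib.algorithms.marwil.torch.marwil_torch_learner import MARWILTorchLearner
from ray.rllib.connectors.learner.general_advantage_estimation import (
    GeneralAdvantageEstimation,
)


class GAEAppendingMARWILLearner(MARWILTorchLearner):  # hypothetical subclass
    def build(self):
        super().build()
        # Compute advantages and value targets once per training batch on the
        # learner side, instead of inside the OfflinePreLearner.
        self._learner_connector.append(
            GeneralAdvantageEstimation(
                gamma=self.config.gamma, lambda_=self.config.lambda_
            )
        )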
rllib/algorithms/marwil/tests/test_marwil.py (12 changes: 5 additions & 7 deletions)

@@ -167,14 +167,12 @@ def possibly_masked_mean(data_):

         # Calculate our own expected values (to then compare against the
         # agent's loss output).
-        module = algo.learner_group._learner.module[DEFAULT_MODULE_ID].unwrapped()
-        fwd_out = module.forward_train(
-            {k: v for k, v in batch[DEFAULT_MODULE_ID].items()}
-        )
-        advantages = (
-            batch[DEFAULT_MODULE_ID][Columns.VALUE_TARGETS].detach().cpu().numpy()
-            - module.compute_values(batch[DEFAULT_MODULE_ID]).detach().cpu().numpy()
+        fwd_out = (
+            algo.learner_group._learner.module[DEFAULT_MODULE_ID]
+            .unwrapped()
+            .forward_train({k: v for k, v in batch[DEFAULT_MODULE_ID].items()})
         )
+        advantages = batch[DEFAULT_MODULE_ID][Columns.ADVANTAGES].detach().cpu().numpy()
         advantages_squared = possibly_masked_mean(np.square(advantages))
         c_2 = 100.0 + 1e-8 * (advantages_squared - 100.0)
         c = np.sqrt(c_2)
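
The three unchanged lines at the end of the hunk recompute MARWIL's moving-average advantage scale c from the (now connector-provided) advantages. As a standalone illustration of that arithmetic, ignoring the optional masking and using only the constants visible above (function and argument names are made up):

import numpy as np


def expected_advantage_scale(advantages, prev_c2=100.0, update_rate=1e-8):
    # Moving-average update of the squared-advantage estimate, then its square
    # root, mirroring: c_2 = 100.0 + 1e-8 * (mean(adv^2) - 100.0); c = sqrt(c_2).
    mean_sq_adv = float(np.mean(np.square(advantages)))
    c2 = prev_c2 + update_rate * (mean_sq_adv - prev_c2)
    return float(np.sqrt(c2))


print(expected_advantage_scale(np.array([0.5, -1.2, 2.0])))  # ~10.0 for small batches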
rllib/tuned_examples/cql/pendulum_cql.py (23 changes: 8 additions & 15 deletions)

@@ -39,21 +39,14 @@
     )
     .offline_data(
         input_=[data_path.as_posix()],
-        # The `kwargs` for the `input_read_method`. We override the
-        # the number of blocks to pull at once b/c our dataset is
-        # small.
-        input_read_method_kwargs={"override_num_blocks": max(args.num_gpus * 2, 2)},
-        # The `kwargs` for the `map_batches` method in which our
-        # `OfflinePreLearner` is run. 2 data workers should be run
-        # concurrently.
-        map_batches_kwargs={"concurrency": 2, "num_cpus": 2},
-        # The `kwargs` for the `iter_batches` method. Due to the small
-        # dataset we choose only a single batch to prefetch.
-        iter_batches_kwargs={"prefetch_batches": 1},
-        # The number of iterations to be run per learner when in multi-learner
-        # mode in a single RLlib training iteration. Leave this to `None` to
-        # run an entire epoch on the dataset during a single RLlib training
-        # iteration. For single-learner mode 1 is the only option.
+        # Define the number of reading blocks, these should be larger than 1
+        # and aligned with the data size.
+        input_read_method_kwargs={"override_num_blocks": max(args.num_gpus, 2)},
+        # Concurrency defines the number of processes that run the
+        # `map_batches` transformations. This should be aligned with the
+        # 'prefetch_batches' argument in 'iter_batches_kwargs'.
+        map_batches_kwargs={"concurrency": max(2, args.num_gpus * 2)},
+        actions_in_input_normalized=True,
         dataset_num_iters_per_learner=1 if args.num_gpus == 0 else None,
         # TODO (sven): Has this any influence in the connectors?
         actions_in_input_normalized=True,
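
Assembled outside of the diff view, the CQL example's new offline-data block amounts to roughly the following standalone sketch; the environment name, data path, and GPU count are placeholders rather than values taken from this commit.

from ray.rllib.algorithms.cql import CQLConfig

num_gpus = 0  # stands in for args.num_gpus in the example script
config = (
    CQLConfig()
    .environment("Pendulum-v1")
    .offline_data(
        input_=["/tmp/pendulum-offline-data"],  # placeholder path
        # Use at least 2 read blocks so `map_batches` concurrency has blocks to split.
        input_read_method_kwargs={"override_num_blocks": max(num_gpus, 2)},
        # Number of concurrent processes running the `OfflinePreLearner` transformations.
        map_batches_kwargs={"concurrency": max(2, num_gpus * 2)},
        actions_in_input_normalized=True,
        # One pass per learner update in single-learner mode; a full epoch otherwise.
        dataset_num_iters_per_learner=1 if num_gpus == 0 else None,
    )
)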
rllib/tuned_examples/marwil/cartpole_marwil.py (19 changes: 4 additions & 15 deletions)

@@ -49,21 +49,10 @@
     # as remote learners.
     .offline_data(
         input_=[data_path.as_posix()],
-        # The `kwargs` for the `input_read_method`. We override the
-        # the number of blocks to pull at once b/c our dataset is
-        # small.
-        input_read_method_kwargs={"override_num_blocks": max(args.num_gpus * 2, 2)},
-        # The `kwargs` for the `map_batches` method in which our
-        # `OfflinePreLearner` is run. 2 data workers should be run
-        # concurrently.
-        map_batches_kwargs={"concurrency": 2, "num_cpus": 2},
-        # The `kwargs` for the `iter_batches` method. Due to the small
-        # dataset we choose only a single batch to prefetch.
-        iter_batches_kwargs={"prefetch_batches": 1},
-        # The number of iterations to be run per learner when in multi-learner
-        # mode in a single RLlib training iteration. Leave this to `None` to
-        # run an entire epoch on the dataset during a single RLlib training
-        # iteration. For single-learner mode 1 is the only option.
+        # Note, we want to have at leat 2 data blocks to read from such that
+        # concurrency in `map_batches` works.
+        input_read_method_kwargs={"override_num_blocks": max(args.num_gpus, 2)},
+        prelearner_module_synch_period=20,
         dataset_num_iters_per_learner=1 if args.num_gpus == 0 else None,
     )
     .training(
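
The MARWIL CartPole example follows the same pattern, adding a module-weight synch period for the OfflinePreLearner. Condensed into a standalone sketch; the environment name, data path, GPU count, and beta value are placeholders, not values from this commit.

from ray.rllib.algorithms.marwil import MARWILConfig

num_gpus = 0  # stands in for args.num_gpus in the example script
config = (
    MARWILConfig()
    .environment("CartPole-v1")
    .offline_data(
        input_=["/tmp/cartpole-offline-data"],  # placeholder path
        # Keep at least 2 data blocks so `map_batches` concurrency can be used.
        input_read_method_kwargs={"override_num_blocks": max(num_gpus, 2)},
        # Period after which the OfflinePreLearner re-syncs its RLModule weights
        # from the learner.
        prelearner_module_synch_period=20,
        dataset_num_iters_per_learner=1 if num_gpus == 0 else None,
    )
    # beta > 0.0 weights the behavior-cloning loss by exponentiated advantages (MARWIL).
    .training(beta=1.0)
)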
