[RLlib] New ConnectorV2 API #04: Changes to Learner/LearnerGroup API to allow updating from Episodes. #41235

Merged
Changes from 10 commits
8 changes: 4 additions & 4 deletions doc/source/rllib/rllib-learner.rst
@@ -230,16 +230,16 @@ Updates
.. testcode::

# This is a blocking update
results = learner_group.update(DUMMY_BATCH)
results = learner_group.update(batch=DUMMY_BATCH)

# This is a non-blocking update. The results are returned in a future
# call to `async_update`
_ = learner_group.async_update(DUMMY_BATCH)
_ = learner_group.async_update(batch=DUMMY_BATCH)

# Artificially wait for async request to be done to get the results
# in the next call to `LearnerGroup.async_update()`.
time.sleep(5)
results = learner_group.async_update(DUMMY_BATCH)
results = learner_group.async_update(batch=DUMMY_BATCH)
# `results` is a list of result dicts. The items in the list represent the different
# remote results from the different calls to `async_update()`.
assert len(results) > 0
@@ -257,7 +257,7 @@ Updates
.. testcode::

# This is a blocking update.
result = learner.update(DUMMY_BATCH)
result = learner.update(batch=DUMMY_BATCH)

# This is an additional non-gradient based update.
learner_group.additional_update(**ADDITIONAL_UPDATE_KWARGS)
4 changes: 2 additions & 2 deletions rllib/algorithms/algorithm.py
@@ -5,12 +5,12 @@
import functools
import gymnasium as gym
import importlib
import importlib.metadata
import json
import logging
import numpy as np
import os
from packaging import version
import importlib.metadata
import re
import tempfile
import time
@@ -1613,7 +1613,7 @@ def training_step(self) -> ResultDict:
# TODO: (sven) rename MultiGPUOptimizer into something more
# meaningful.
if self.config._enable_new_api_stack:
train_results = self.learner_group.update(train_batch)
train_results = self.learner_group.update(batch=train_batch)
elif self.config.get("simple_optimizer") is True:
train_results = train_one_step(self, train_batch)
else:
2 changes: 1 addition & 1 deletion rllib/algorithms/bc/bc.py
@@ -171,7 +171,7 @@ def training_step(self) -> ResultDict:
self._counters[NUM_ENV_STEPS_SAMPLED] += train_batch.env_steps()

# Updating the policy.
train_results = self.learner_group.update(train_batch)
train_results = self.learner_group.update(batch=train_batch)

# Synchronize weights.
# As the results contain for each policy the loss and in addition the
2 changes: 1 addition & 1 deletion rllib/algorithms/dreamerv3/dreamerv3.py
@@ -607,7 +607,7 @@ def training_step(self) -> ResultDict:

# Perform the actual update via our learner group.
train_results = self.learner_group.update(
SampleBatch(sample).as_multi_agent(),
batch=SampleBatch(sample).as_multi_agent(),
reduce_fn=self._reduce_results,
)
self._counters[NUM_AGENT_STEPS_TRAINED] += replayed_steps
2 changes: 1 addition & 1 deletion rllib/algorithms/impala/impala.py
@@ -951,7 +951,7 @@ def learn_on_processed_samples(self) -> ResultDict:
for batch in batches:
if blocking:
result = self.learner_group.update(
batch,
batch=batch,
reduce_fn=_reduce_impala_results,
num_iters=self.config.num_sgd_iter,
minibatch_size=self.config.minibatch_size,
2 changes: 1 addition & 1 deletion rllib/algorithms/impala/tests/test_impala_learner.py
@@ -94,7 +94,7 @@ def test_impala_loss(self):
env=algo.workers.local_worker().env
)
learner_group.set_weights(algo.get_weights())
learner_group.update(train_batch.as_multi_agent())
learner_group.update(batch=train_batch.as_multi_agent())

algo.stop()

2 changes: 1 addition & 1 deletion rllib/algorithms/ppo/ppo.py
@@ -425,7 +425,7 @@ def training_step(self) -> ResultDict:
# TODO (Kourosh) Clearly define what train_batch_size
# vs. sgd_minibatch_size and num_sgd_iter is in the config.
train_results = self.learner_group.update(
train_batch,
batch=train_batch,
minibatch_size=self.config.sgd_minibatch_size,
num_iters=self.config.num_sgd_iter,
)
2 changes: 1 addition & 1 deletion rllib/algorithms/ppo/tests/test_ppo_learner.py
@@ -101,7 +101,7 @@ def test_loss(self):

# Load the algo weights onto the learner_group.
learner_group.set_weights(algo.get_weights())
learner_group.update(train_batch.as_multi_agent())
learner_group.update(batch=train_batch.as_multi_agent())

algo.stop()

2 changes: 1 addition & 1 deletion rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py
@@ -225,7 +225,7 @@ def get_value():
assert init_std == 0.0, init_std
batch = compute_gae_for_sample_batch(policy, PENDULUM_FAKE_BATCH.copy())
batch = policy._lazy_tensor_dict(batch)
algo.learner_group.update(batch.as_multi_agent())
algo.learner_group.update(batch=batch.as_multi_agent())

# Check the variable is updated.
post_std = get_value()
81 changes: 68 additions & 13 deletions rllib/core/learner/learner.py
@@ -48,6 +48,7 @@
from ray.rllib.utils.schedules.scheduler import Scheduler
from ray.rllib.utils.serialization import serialize_type
from ray.rllib.utils.typing import (
EpisodeType,
LearningRateOrSchedule,
ModuleID,
Optimizer,
@@ -1099,13 +1100,18 @@ def additional_update_for_module(

def update(
self,
batch: MultiAgentBatch,
*,
minibatch_size: Optional[int] = None,
num_iters: int = 1,
# TODO (sven): We should allow passing in a single agent batch here
# as well for simplicity.
batch: Optional[MultiAgentBatch] = None,
Contributor Author:

Happy to discuss the alternative to provide two different (mutually exclusive?) methods that the user/algo can decide to call: update_from_batch (for algos that do NOT require episode processing, such as DQN) or update_from_episodes (for algos that require a view on the sampled episodes, e.g. for vf-bootstrapping, v-trace, etc.).

Contributor:

I like it if the two methods are separated. I don't think there would be a case where a specific algorithm's learner would have both methods implemented, i.e. DQN would only implement update_from_batch, and PPO would only implement update_from_episodes. This is much, much cleaner than mixing both into one function. The user will have to deal with less cognitive load if they are separated.

Contributor Author:

I separated them in the LearnerGroup and Learner APIs:

  • update_from_batch(async=False|True)
  • update_from_episodes(async=False|True)

Contributor Author:

Also, I think it's nicer to have the async_update bool option as an extra arg (instead of a separate method) for better consistency and less code bloat.
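To make the alternative discussed in this thread concrete, the following is a minimal, purely illustrative sketch: the method names `update_from_batch` / `update_from_episodes` and the `async_update` flag come from the comments above, while the exact signatures are assumptions rather than the merged API.

from typing import Any, Dict, List, Optional


class LearnerGroupSketch:
    """Illustrative stub only; not RLlib's actual LearnerGroup."""

    def update_from_batch(
        self,
        batch: "MultiAgentBatch",
        *,
        async_update: bool = False,
        minibatch_size: Optional[int] = None,
        num_iters: int = 1,
    ) -> Dict[str, Any]:
        # For algos that do NOT require episode processing (e.g. DQN).
        # With async_update=True, this returns the results of previously
        # scheduled async requests instead of blocking on this call.
        return {}

    def update_from_episodes(
        self,
        episodes: List["EpisodeType"],
        *,
        async_update: bool = False,
    ) -> Dict[str, Any]:
        # For algos that need a view on the sampled episodes
        # (e.g. for vf-bootstrapping or v-trace).
        return {}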

episodes: Optional[List[EpisodeType]] = None,
reduce_fn: Callable[[List[Dict[str, Any]]], ResultDict] = (
_reduce_mean_results
),
# TODO (sven): Deprecate these in favor of config attributes for only those
# algos that actually need (and know how) to do minibatching.
minibatch_size: Optional[int] = None,
num_iters: int = 1,
) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
"""Do `num_iters` minibatch updates given the original batch.

@@ -1114,34 +1120,50 @@
will be used for all module ids in MultiAgentRLModule.

Args:
batch: A batch of data.
minibatch_size: The size of the minibatch to use for each update.
num_iters: The number of complete passes over all the sub-batches
in the input multi-agent batch.
batch: An optional batch of training data. If None, the `episodes` arg
must be provided.
episodes: An optional list of episode objects. If None, the `batch` arg
must be provided.
reduce_fn: A function to reduce the results from a list of
minibatch updates. This can be any arbitrary function that takes a
list of dictionaries and returns a single dictionary. For example, you
can either take an average (the default) or concatenate the results
(e.g. for metrics), or be more selective about what you want to report
back to the algorithm's training_step. If None is passed, the results
will not get reduced.
minibatch_size: The size of the minibatch to use for each update.
num_iters: The number of complete passes over all the sub-batches
in the input multi-agent batch.

Returns:
A dictionary of results, in numpy format or a list of such dictionaries in
case `reduce_fn` is None and we have more than one minibatch pass.
"""
self._check_is_built()

missing_module_ids = set(batch.policy_batches.keys()) - set(self.module.keys())
if len(missing_module_ids) > 0:
raise ValueError(
"Batch contains module ids that are not in the learner: "
f"{missing_module_ids}"
# If a (multi-agent) batch is provided, check whether our RLModule
# contains all ModuleIDs found in this batch. If not, throw an error.
if batch is not None:
Contributor Author:

In the alternative design (two update methods), we could then avoid these rather ugly if-blocks.

unknown_module_ids = set(batch.policy_batches.keys()) - set(
self.module.keys()
)
if len(unknown_module_ids) > 0:
raise ValueError(
"Batch contains module ids that are not in the learner: "
f"{unknown_module_ids}"
)

if num_iters < 1:
# We must do at least one pass on the batch for training.
raise ValueError("`num_iters` must be >= 1")

# Call the train data preprocessor.
batch, episodes = self._preprocess_train_data(batch=batch, episodes=episodes)

# TODO (sven): Insert a call to the Learner ConnectorV2 pipeline here, providing
# it both `batch` and `episode` for further custom processing before the
# actual `Learner._update()` call.

if minibatch_size:
batch_iter = MiniBatchCyclicIterator
elif num_iters > 1:
@@ -1180,7 +1202,7 @@ def update(
metrics_per_module=defaultdict(dict, **metrics_per_module),
)
self._check_result(result)
# TODO (sven): Figure out whether `compile_metrics` should be forced
# TODO (sven): Figure out whether `compile_results` should be forced
Contributor Author:

Typo fix: `compile_metrics` → `compile_results`.

# to return all numpy/python data, then we can skip this conversion
# step here.
results.append(convert_to_numpy(result))
@@ -1201,6 +1223,39 @@
# dict.
return reduce_fn(results)
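As a usage sketch of the updated `update()` signature above: `learner`, `train_batch`, and `sampled_episodes` below are placeholder names (a built Learner, a MultiAgentBatch, and a list of episodes), and the custom `reduce_fn` is just one example of a reducer other than the default mean.

from typing import Any, Dict, List

def keep_last(results: List[Dict[str, Any]]) -> Dict[str, Any]:
    # Custom reduce_fn: report only the stats of the final minibatch pass
    # instead of averaging over all passes.
    return results[-1]

# Update from a (multi-agent) batch, running 4 passes of 256-sized minibatches.
results = learner.update(
    batch=train_batch,
    reduce_fn=keep_last,
    minibatch_size=256,
    num_iters=4,
)

# Alternatively, update directly from a list of sampled episodes.
results = learner.update(episodes=sampled_episodes)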

@OverrideToImplementCustomLogic
Contributor:

If there is any neural network inference, does it happen here or in the connector?

Contributor Author:

Good question! The answer is: sometimes both.

For example: if you have preprocessing needs for your training data (no matter whether episodes or batches), you might want to transform this data first (e.g. clip rewards, or extend episodes by one artificial timestep for v-trace or GAE) and then perform a pre-forward pass through your network (e.g. to get the value estimates). For that pre-forward pass, you'll need to call your connector first to make sure the batch has all custom-required data formats (e.g. LSTM zero-padding). Only after all these preprocessing steps can you continue with the regular forward_train + loss + ... procedure.

def _preprocess_train_data(
self,
*,
batch: Optional[MultiAgentBatch] = None,
episodes: Optional[List[EpisodeType]] = None,
) -> Tuple[Optional[MultiAgentBatch], Optional[List[EpisodeType]]]:
"""Allows custom preprocessing of batch/episode data before the actual update.

The high-level order in which this method is called from within
`Learner.update(batch, episodes)` is:
* batch, episodes = self._preprocess_train_data(batch, episodes)
* batch = self._learner_connector(batch, episodes)
* results = self._update(batch)

The default implementation does not do any processing and is a mere
pass-through. However, specific algorithms should override this method to
implement their specific training-data preprocessing needs. It is possible to
perform preliminary RLModule forward passes (besides the main "forward_train()"
call during `self._update`) in this method, and custom algorithms might also
want to use this Learner's `self._learner_connector` to prepare the data
(batch/episodes) for such extra forward calls.

Args:
batch: An optional batch of training data to preprocess.
episodes: An optional list of episode objects to preprocess.

Returns:
A tuple consisting of the processed `batch` and the processed list of
`episodes`.
"""
return batch, episodes
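To illustrate the comment thread above, here is a hypothetical override along those lines (reward clipping on the batch path); the class name and the clipping range are illustrative assumptions, and the episode-extension / value-bootstrapping variant mentioned above would hook in at the same place.

import numpy as np

from ray.rllib.core.learner.learner import Learner
from ray.rllib.policy.sample_batch import SampleBatch


class RewardClippingLearner(Learner):
    # Illustrative only; in practice this would be your algorithm's
    # framework-specific Learner subclass.
    def _preprocess_train_data(self, *, batch=None, episodes=None):
        # Batch path: clip rewards to [-1.0, 1.0] before the Learner connector
        # and `_update()` see the data.
        if batch is not None:
            for module_batch in batch.policy_batches.values():
                module_batch[SampleBatch.REWARDS] = np.clip(
                    module_batch[SampleBatch.REWARDS], -1.0, 1.0
                )
        # Episode path: passed through untouched here. An algorithm needing
        # value bootstrapping (GAE, v-trace) could instead extend each episode
        # by one artificial timestep and run a value-function forward pass.
        return batch, episodes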

@OverrideToImplementCustomLogic
@abc.abstractmethod
def _update(