
Commit

[RLlib; fault-tolerance] Fix spot node preemption problem (RLlib does not catch correct `ObjectLostError`). (ray-project#47940)

Signed-off-by: ujjawal-khare <ujjawal.khare@dream11.com>
sven1977 authored and ujjawal-khare committed Oct 15, 2024
1 parent 8920d00 commit 0170ec7
Showing 2 changed files with 14 additions and 21 deletions.
7 changes: 0 additions & 7 deletions rllib/BUILD
@@ -944,13 +944,6 @@ py_test(
srcs = ["algorithms/tests/test_callbacks_old_api_stack.py"]
)

py_test(
name = "test_node_failure",
tags = ["team:rllib", "tests_dir", "exclusive"],
size = "medium",
srcs = ["tests/test_node_failure.py"],
)

py_test(
name = "test_registry",
tags = ["team:rllib", "algorithms_dir", "algorithms_dir_generic"],
28 changes: 14 additions & 14 deletions rllib/algorithms/algorithm_config.py
@@ -2240,7 +2240,7 @@ def training(
minibatch_size: The size of minibatches to use to further split the train
batch into.
shuffle_batch_per_epoch: Whether to shuffle the train batch once per epoch.
If the train batch has a time rank (axis=1), shuffling will only take
If the train batch has a time rank (axis=1), shuffling only takes
place along the batch axis to not disturb any intact (episode)
trajectories.
model: Arguments passed into the policy model. See models/catalog.py for a
@@ -2603,7 +2603,7 @@ def offline_data(
files. See https://docs.ray.io/en/latest/data/api/input_output.html for
more info about available read methods in `ray.data`.
input_read_method_kwargs: Keyword args for `input_read_method`. These
will be passed into the read method without checking. If no arguments
are passed into the read method without checking. If no arguments
are passed in, the default argument
`{'override_num_blocks': max(num_learners * 2, 2)}` is used. Use these
keyword args together with `map_batches_kwargs` and
@@ -2647,8 +2647,8 @@ def offline_data(
ABS filesystem arguments.
input_compress_columns: What input columns are compressed with LZ4 in the
input data. If data is stored in RLlib's `SingleAgentEpisode` (
`MultiAgentEpisode` not supported, yet). Note,
`rllib.core.columns.Columns.OBS` will also try to decompress
`MultiAgentEpisode` not supported, yet). Note that providing
`rllib.core.columns.Columns.OBS` also tries to decompress
`rllib.core.columns.Columns.NEXT_OBS`.
materialize_data: Whether the raw data should be materialized in memory.
This boosts performance, but requires enough memory to avoid an OOM, so
@@ -2675,14 +2675,14 @@ def offline_data(
memory and your Learner connector pipeline requires an RLModule or is
stateful, set both `materialize_data` and `materialize_mapped_data` to
`False`.
map_batches_kwargs: Keyword args for the `map_batches` method. These will be
map_batches_kwargs: Keyword args for the `map_batches` method. These are
passed into the `ray.data.Dataset.map_batches` method when sampling
without checking. If no arguments are passed in, the default arguments
`{'concurrency': max(2, num_learners), 'zero_copy_batch': True}` are
used. Use these keyword args together with `input_read_method_kwargs`
and `iter_batches_kwargs` to tune the performance of the data pipeline.
iter_batches_kwargs: Keyword args for the `iter_batches` method. These will
be passed into the `ray.data.Dataset.iter_batches` method when sampling
iter_batches_kwargs: Keyword args for the `iter_batches` method. These are
passed into the `ray.data.Dataset.iter_batches` method when sampling
without checking. If no arguments are passed in, the default argument
`{'prefetch_batches': 2, 'local_buffer_shuffle_size':
train_batch_size_per_learner x 4}` is used. Use these keyword args
@@ -2708,9 +2708,9 @@ def offline_data(
complete epoch over its data block (the dataset is partitioned into
at least as many blocks as there are learners). The default is `None`.
input_config: Arguments that describe the settings for reading the input.
If input is "sample", this will be environment configuration, e.g.
If input is "sample", this is the environment configuration, e.g.
`env_name` and `env_config`, etc. See `EnvContext` for more info.
If the input is "dataset", this will be e.g. `format`, `path`.
If the input is "dataset", this contains e.g. `format`, `path`.
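For illustration, the two `input_config` shapes described above can be sketched as plain dictionaries. The keys follow the docstring (`env_name`, `env_config`, `format`, `path`); the concrete values here are hypothetical, not taken from the commit:

```python
# Hedged sketch of the two `input_config` shapes described in the docstring.
# Values below are placeholders for illustration only.

# When `input` is "sample": environment configuration.
sample_input_config = {
    "env_name": "CartPole-v1",
    "env_config": {"max_episode_steps": 200},
}

# When `input` is "dataset": dataset read settings.
dataset_input_config = {
    "format": "json",
    "path": "/tmp/rllib_offline_data",
}
```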
actions_in_input_normalized: True, if the actions in a given offline "input"
are already normalized (between -1.0 and 1.0). This is usually the case
when the offline file has been generated by another RLlib algorithm
@@ -3362,9 +3362,9 @@ def rl_module(
"""Sets the config's RLModule settings.
Args:
model_config: The DefaultModelConfig object (or a config dictionary) passed
as `model_config` arg into each RLModule's constructor. This is used
for all RLModules, if not otherwise specified through `rl_module_spec`.
model_config_dict: The default model config dictionary for `RLModule`s. This
is used for any `RLModule` if not otherwise specified in the
`rl_module_spec`.
rl_module_spec: The RLModule spec to use for this config. It can be either
an RLModuleSpec or a MultiRLModuleSpec. If the
observation_space, action_space, catalog_class, or the model config is
@@ -3443,7 +3443,7 @@ def experimental(
https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate)
classes or a dictionary mapping module IDs to such a list of respective
scheduler classes. Multiple scheduler classes can be applied in sequence
and will be stepped in the same sequence as defined here. Note, most
and are stepped in the same sequence as defined here. Note, most
learning rate schedulers need arguments to be configured, that is, you
might have to partially initialize the schedulers in the list(s) using
`functools.partial`.
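The `functools.partial` pattern mentioned above can be sketched as follows. `FakeStepLR` is a hypothetical stand-in for a real scheduler class such as `torch.optim.lr_scheduler.StepLR`; only the partial-initialization pattern matters here:

```python
from functools import partial

# Hypothetical stand-in for a scheduler class like
# torch.optim.lr_scheduler.StepLR (torch is deliberately not imported).
class FakeStepLR:
    def __init__(self, optimizer, step_size, gamma=0.1):
        self.optimizer = optimizer
        self.step_size = step_size
        self.gamma = gamma

# Pre-configure the scheduler's arguments with functools.partial, so the
# framework can later finish construction by passing just the optimizer.
scheduler_cls = partial(FakeStepLR, step_size=100, gamma=0.5)
scheduler = scheduler_cls(optimizer=None)
print(scheduler.step_size, scheduler.gamma)  # -> 100 0.5
```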
@@ -4179,7 +4179,7 @@ def model_config(self):
This method combines the auto configuration `self._model_config_auto_includes`
defined by an algorithm with the user-defined configuration in
`self._model_config`. This configuration dictionary is used to
`self._model_config_dict`. This configuration dictionary is used to
configure the `RLModule` in the new stack and the `ModelV2` in the old
stack.
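Pulling together the default keyword-argument dictionaries quoted in the `offline_data` docstring above, a small illustrative helper (this function is a sketch assuming `num_learners` and `train_batch_size_per_learner` as inputs; it is not RLlib API):

```python
def offline_data_defaults(num_learners: int, train_batch_size_per_learner: int):
    """Return the default kwarg dicts quoted in the offline_data docstring."""
    # Default for `input_read_method_kwargs`: at least 2 read blocks,
    # otherwise two blocks per Learner.
    input_read_method_kwargs = {
        "override_num_blocks": max(num_learners * 2, 2),
    }
    # Default for `map_batches_kwargs`.
    map_batches_kwargs = {
        "concurrency": max(2, num_learners),
        "zero_copy_batch": True,
    }
    # Default for `iter_batches_kwargs`: shuffle buffer is four times the
    # per-Learner train batch size.
    iter_batches_kwargs = {
        "prefetch_batches": 2,
        "local_buffer_shuffle_size": train_batch_size_per_learner * 4,
    }
    return input_read_method_kwargs, map_batches_kwargs, iter_batches_kwargs
```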
