[AIR/Train] Add Trainer.restore API for train experiment-level fault tolerance
#31920
Changes from 55 commits
@@ -2,11 +2,21 @@
import copy
import inspect
import logging
import os
from pathlib import Path
import tempfile
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Type, Union

import ray
import ray.cloudpickle as pickle
from ray.air._internal.config import ensure_only_allowed_dataclass_keys_updated
from ray.air._internal.remote_storage import (
    download_from_uri,
    is_non_local_path_uri,
    list_at_uri,
)
from ray.air.checkpoint import Checkpoint
from ray.air import session
from ray.air.config import RunConfig, ScalingConfig
from ray.air.result import Result
from ray.train.constants import TRAIN_DATASET_KEY
@@ -20,6 +30,8 @@
    from ray.tune import Trainable

_TRAINER_PKL = "trainer.pkl"

# A type representing either a ray.data.Dataset or a function that returns a
# ray.data.Dataset and accepts no arguments.
GenDataset = Union["Dataset", Callable[[], "Dataset"]]
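As context for the GenDataset alias above, a minimal sketch (the dataset contents are made up) of the two accepted forms: a materialized ray.data.Dataset, or a zero-argument callable that builds one.

    import ray

    # Form 1: a materialized ray.data.Dataset.
    train_ds = ray.data.from_items([{"a": i} for i in range(10)])

    # Form 2: a zero-argument callable that returns a ray.data.Dataset,
    # deferring dataset construction until it is actually needed.
    def make_train_ds():
        return ray.data.from_items([{"a": i} for i in range(10)])

    # Both values satisfy GenDataset = Union["Dataset", Callable[[], "Dataset"]].
    datasets_a = {"train": train_ds}
    datasets_b = {"train": make_train_ds}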
@@ -164,8 +176,169 @@ def __init__(
        self.preprocessor = preprocessor
        self.resume_from_checkpoint = resume_from_checkpoint

        # This path should only be set through restore
        self._restore_path = None

        self._validate_attributes()
    @classmethod
    def restore(
        cls: Type["BaseTrainer"],
        path: str,
        datasets: Optional[Dict[str, GenDataset]] = None,
        preprocessor: Optional["Preprocessor"] = None,
        scaling_config: Optional[ScalingConfig] = None,
        **kwargs,
    ) -> "BaseTrainer":
        """Restores a Train experiment from a previously interrupted/failed run.

        Restore should be used for experiment-level fault tolerance in the event
        that the head node crashes (e.g., OOM or some other runtime error) or the
        entire cluster goes down (e.g., a network error affecting all nodes).

        The following example can be paired with implementing job retry using
        :ref:`Ray Jobs <jobs-overview>` to produce a Train experiment that will
        attempt to resume on both experiment-level and trial-level failures:

        .. code-block:: python

            import os

            import ray
            from ray import air, tune
            from ray.train.torch import TorchTrainer

            experiment_name = "unique_experiment_name"
            upload_dir = "s3://bucket"
            experiment_dir = os.path.join(upload_dir, experiment_name)

            # Pretend this is a large object that's been loaded
            large_data = {}
            # Use an object reference to share this object across the cluster
            large_data_ref = ray.put(large_data)

            datasets = {"train": ray.data.from_items([{"a": i} for i in range(10)])}

            def train_loop_per_worker(config):
                pass

            train_loop_config = {"obj_ref": large_data_ref}

            if TorchTrainer.can_restore(experiment_dir):
                trainer = TorchTrainer.restore(
                    experiment_dir,
                    train_loop_per_worker=train_loop_per_worker,
                    train_loop_config=train_loop_config,
                    datasets=datasets,
                )
            else:
                trainer = TorchTrainer(
                    train_loop_per_worker=train_loop_per_worker,
                    train_loop_config=train_loop_config,
                    datasets=datasets,
                    run_config=air.RunConfig(
                        name=experiment_name,
                        sync_config=tune.SyncConfig(upload_dir=upload_dir),
                        # Tip: Add trial-level fault tolerance on top.
                        failure_config=air.FailureConfig(max_failures=3),
                    ),
                )

            result = trainer.fit()
        Args:
            path: The path to the experiment directory of the training run to restore.
                This can be a local path or a remote URI if the experiment was
                uploaded to the cloud.
            datasets: Re-specified datasets used in the original training run.
                This must include all the datasets that were passed to the
                original trainer constructor.
            preprocessor: Optionally re-specified preprocessor that was passed to
                the original trainer constructor. This should be used to re-supply
                the preprocessor if it is not restorable in a new Ray cluster.
                This preprocessor will be fit at the start, before resuming training.
                If no preprocessor is passed in on restore, then the old preprocessor
                will be loaded from the latest checkpoint and will not be re-fit.
            scaling_config: Optionally re-specified scaling config. This can be
                modified to be different from the original spec.
            **kwargs: Other optionally re-specified arguments, passed in by subclasses.

        Raises:
            ValueError: If not all of the original datasets are re-supplied on restore.

        Returns:
            BaseTrainer: A restored instance of the class that is calling this method.
        """
        assert cls.can_restore(path), (
            f"Invalid restore path: {path}. Make sure that this path exists and "
            "is the experiment directory that results from a call to `trainer.fit()`."
        )
        trainer_state_path = cls._maybe_sync_down_trainer_state(path)
        assert (
            trainer_state_path.exists()
        ), f"Did not find trainer state at {str(trainer_state_path)}."

        with open(trainer_state_path, "rb") as fp:
            original_trainer = pickle.load(fp)
        assert type(original_trainer) == cls, (
            f"Invalid trainer type. Cannot restore a trainer of type "
            f"{type(original_trainer)} with `{cls.__name__}.restore`. "
            f"Use `{type(original_trainer).__name__}.restore` instead."
        )

        # Get the param dict used to initialize the original trainer
        param_dict = original_trainer._param_dict

        original_datasets = original_trainer.datasets or {}
        if original_datasets and not datasets:
            raise ValueError(
                "The following datasets need to be provided again on restore: "
                f"{list(original_datasets.keys())}\n"
                f"Use {cls.__name__}.restore(..., datasets=datasets) "
                "with the datasets that were provided to the original trainer."
            )
        datasets = datasets or {}
        assert set(original_datasets.keys()) == set(datasets.keys()), (
            "The provided datasets don't match the original dataset keys.\n"
            f"  Expected datasets for the keys: {list(original_datasets.keys())}\n"
            f"  Actual datasets provided: {list(datasets.keys())}"
        )
        param_dict["datasets"] = datasets

        # If no preprocessor is re-specified, then it will be set to None
        # here and loaded from the latest checkpoint
        param_dict["preprocessor"] = preprocessor

        if scaling_config:
            param_dict["scaling_config"] = scaling_config

        for param_name, val in kwargs.items():
            # Overwrite the old value if something is passed into restore
            if val is not None:
                param_dict[param_name] = val

        trainer = cls(**param_dict)
        trainer._restore_path = path
        return trainer
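For illustration, a minimal sketch (the experiment URI and worker counts are hypothetical, and depending on the trainer other arguments such as the training loop may also need to be re-specified) of restoring a run while overriding the scaling config, which the docstring above notes may differ from the original spec:

    import ray
    from ray.air import ScalingConfig
    from ray.train.torch import TorchTrainer

    # Datasets must always be re-supplied, matching the original dataset keys.
    datasets = {"train": ray.data.from_items([{"a": i} for i in range(10)])}

    # The restored run can use a different ScalingConfig than the original,
    # e.g. when resuming on a cluster with more (or fewer) resources.
    trainer = TorchTrainer.restore(
        "s3://bucket/unique_experiment_name",
        datasets=datasets,
        scaling_config=ScalingConfig(num_workers=8, use_gpu=True),
    )
    result = trainer.fit()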
    @classmethod
    def can_restore(cls: Type["BaseTrainer"], path: Union[str, Path]) -> bool:
        """Checks whether a given directory contains a restorable Train experiment.

        Args:
            path: The path to the experiment directory of the Train experiment.
                This can be either a local directory (e.g. ~/ray_results/exp_name)
                or a remote URI (e.g. s3://bucket/exp_name).

        Returns:
            bool: Whether or not this path exists and contains the pickled Trainer.
        """
        path = str(path)
        if is_non_local_path_uri(path):
            dir_contents = list_at_uri(path)
        else:
            dir_contents = [] if not os.path.exists(path) else os.listdir(path)
        return bool(dir_contents) and _TRAINER_PKL in dir_contents
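The restore docstring suggests pairing this with job-level retry via Ray Jobs. One hedged way to do that (the dashboard address, the train_script.py entrypoint name, and the retry loop are assumptions, not part of this PR) is to re-submit the training driver until it succeeds, with the driver itself using can_restore()/restore() as in the docstring example:

    import time

    from ray.job_submission import JobStatus, JobSubmissionClient

    client = JobSubmissionClient("http://127.0.0.1:8265")  # assumed dashboard address

    # Re-submit the training driver until it succeeds. `train_script.py` is a
    # hypothetical driver that calls TorchTrainer.can_restore()/restore().
    for attempt in range(3):
        job_id = client.submit_job(
            entrypoint="python train_script.py",
            runtime_env={"working_dir": "."},
        )
        while client.get_job_status(job_id) not in (
            JobStatus.SUCCEEDED,
            JobStatus.FAILED,
            JobStatus.STOPPED,
        ):
            time.sleep(10)
        if client.get_job_status(job_id) == JobStatus.SUCCEEDED:
            break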
    def __repr__(self):
        # A dictionary that maps parameters to their default values.
        default_values: Dict[str, Any] = {
@@ -265,6 +438,22 @@ def _validate_scaling_config(cls, scaling_config: ScalingConfig) -> ScalingConfig
        )
        return scaling_config

    @classmethod
    def _maybe_sync_down_trainer_state(cls, restore_path: str) -> Path:
        """Syncs down trainer state from remote storage, if needed.

        Returns:
            Local path to the synced trainer state file (trainer.pkl).
        """
        if not is_non_local_path_uri(restore_path):
            return Path(os.path.expanduser(restore_path)) / _TRAINER_PKL

        tempdir = Path(tempfile.mkdtemp("tmp_experiment_dir"))

        path = Path(restore_path)
        download_from_uri(str(path / _TRAINER_PKL), str(tempdir / _TRAINER_PKL))
        return tempdir / _TRAINER_PKL
    def setup(self) -> None:
        """Called during fit() to perform initial setup on the Trainer.
@@ -300,7 +489,10 @@ def preprocess_datasets(self) -> None:

         if self.preprocessor:
             train_dataset = self.datasets.get(TRAIN_DATASET_KEY, None)
-            if train_dataset:
+            if train_dataset and self.preprocessor.fit_status in (
+                ray.data.Preprocessor.FitStatus.NOT_FITTED,
+                ray.data.Preprocessor.FitStatus.PARTIALLY_FITTED,
+            ):
                 self.preprocessor.fit(train_dataset)

         # Execute dataset transformations serially for now.

Review comment: for my education, can you share a bit why we have this logic both here and in dataset_spec.py?
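To make the new guard concrete, a minimal self-contained sketch (using a stub preprocessor and a stand-in FitStatus enum rather than Ray's actual classes) of the intended behavior: a preprocessor recovered from a checkpoint is already fitted and is not fit again, while a freshly supplied one still gets fit.

    from enum import Enum


    class FitStatus(Enum):
        NOT_FITTED = 1
        PARTIALLY_FITTED = 2
        FITTED = 3


    class StubPreprocessor:
        def __init__(self, fit_status=FitStatus.NOT_FITTED):
            self.fit_status = fit_status
            self.fit_calls = 0

        def fit(self, dataset):
            self.fit_calls += 1
            self.fit_status = FitStatus.FITTED


    def maybe_fit(preprocessor, train_dataset):
        # Mirrors the guard in preprocess_datasets(): only (partially) unfitted
        # preprocessors are fit before training.
        if train_dataset and preprocessor.fit_status in (
            FitStatus.NOT_FITTED,
            FitStatus.PARTIALLY_FITTED,
        ):
            preprocessor.fit(train_dataset)


    fresh = StubPreprocessor()                      # e.g. passed to a new trainer
    restored = StubPreprocessor(FitStatus.FITTED)   # e.g. loaded from a checkpoint

    maybe_fit(fresh, train_dataset=[1, 2, 3])
    maybe_fit(restored, train_dataset=[1, 2, 3])

    assert fresh.fit_calls == 1       # fit once before training
    assert restored.fit_calls == 0    # not re-fit on restore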
@@ -351,15 +543,34 @@ def fit(self) -> Result:
             TrainingFailedError: If any failures during the execution of
                 ``self.as_trainable()``.
         """
-        from ray.tune.tuner import Tuner
-        from ray.tune.error import TuneError
+        from ray.tune.tuner import Tuner, TunerInternal
+        from ray.tune import TuneError

         trainable = self.as_trainable()
         param_space = self._extract_fields_for_tuner_param_space()

-        tuner = Tuner(
-            trainable=trainable, param_space=param_space, run_config=self.run_config
-        )
+        if self._restore_path:
+            # TODO(justinvyu): Pass in the new trainable + param_space after Jun's PR
+            # This is because some params get propagated to the Tuner and will
+            # overwrite new ones from Trainer.restore.
+            tuner = Tuner.restore(
+                self._restore_path,
+                overwrite_trainable=trainable,
+                resume_unfinished=True,
+                resume_errored=True,
+            )
+        else:
+            tuner = Tuner(
+                trainable=trainable, param_space=param_space, run_config=self.run_config
+            )
+
+        experiment_path = Path(
+            TunerInternal.setup_create_experiment_checkpoint_dir(
+                trainable, self.run_config
+            )
+        )
+        self._save(experiment_path)

         result_grid = tuner.fit()
         assert len(result_grid) == 1
         try:
@@ -370,6 +581,11 @@ def fit(self) -> Result:
            raise TrainingFailedError from e
        return result

    def _save(self, experiment_path: Union[str, Path]):
        experiment_path = Path(experiment_path)
        with open(experiment_path / _TRAINER_PKL, "wb") as fp:
            pickle.dump(self, fp)
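To illustrate the persistence mechanism in isolation: _save() pickles the whole trainer to trainer.pkl in the experiment directory, and restore() later unpickles it and rebuilds the trainer from its _param_dict. A minimal round-trip sketch (DummyTrainer is a stand-in class, not Ray's):

    import os
    import tempfile

    import ray.cloudpickle as pickle


    class DummyTrainer:  # stand-in for a BaseTrainer subclass
        def __init__(self, param_dict):
            self._param_dict = param_dict


    experiment_path = tempfile.mkdtemp()

    # What _save() does, conceptually: pickle the whole trainer object.
    trainer = DummyTrainer({"train_loop_config": {"lr": 1e-3}})
    with open(os.path.join(experiment_path, "trainer.pkl"), "wb") as fp:
        pickle.dump(trainer, fp)

    # What restore() does, conceptually: unpickle it and rebuild from _param_dict.
    with open(os.path.join(experiment_path, "trainer.pkl"), "rb") as fp:
        original_trainer = pickle.load(fp)
    assert original_trainer._param_dict["train_loop_config"]["lr"] == 1e-3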
    def _extract_fields_for_tuner_param_space(self) -> Dict:
        """Extracts fields to be included in `Tuner.param_space`.
@@ -399,16 +615,24 @@ def _generate_trainable_cls(self) -> Type["Trainable"]:

         trainer_cls = self.__class__
         scaling_config = self.scaling_config
+        restored = bool(self._restore_path)

-        def train_func(config, checkpoint_dir=None):
+        def train_func(config):
             # config already contains merged values.
             # Instantiate new Trainer in Trainable.
             trainer = trainer_cls(**config)

-            if checkpoint_dir:
-                trainer.resume_from_checkpoint = Checkpoint.from_directory(
-                    checkpoint_dir
-                )
+            # Get the checkpoint from the Tune session, and use it to initialize
+            # the restored trainer.
+            # This handles recovery from both trial-level and experiment-level
+            # failures.
+            checkpoint = session.get_checkpoint()
+            if checkpoint:
+                trainer.resume_from_checkpoint = checkpoint
+                # Always load the preprocessor from the checkpoint, unless we are
+                # restoring the experiment and have passed in a new preprocessor.
+                if not (restored and trainer.preprocessor):
+                    trainer.preprocessor = checkpoint.get_preprocessor()

             trainer.setup()
             trainer.preprocess_datasets()
Review comment: Is this a controversial change? A loaded preprocessor that's been fitted already shouldn't fit again. This way, I don't need to pass a fit_preprocessor=False flag in.

Reply: Seems reasonable. Can you make this an actual comment for this logic?
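Following up on that request, one possible wording for such a comment (a suggestion only, not part of the PR) on the guard in preprocess_datasets():

    if self.preprocessor:
        train_dataset = self.datasets.get(TRAIN_DATASET_KEY, None)
        # Only fit preprocessors that still need fitting. A preprocessor
        # recovered from a checkpoint (e.g. on Trainer.restore) is already
        # fitted and should not be fit a second time on the train dataset.
        if train_dataset and self.preprocessor.fit_status in (
            ray.data.Preprocessor.FitStatus.NOT_FITTED,
            ray.data.Preprocessor.FitStatus.PARTIALLY_FITTED,
        ):
            self.preprocessor.fit(train_dataset)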