[AIR] <Part 2> Support metric logging and checkpointing for LightningTrainer #33183

Merged

Commits (42)
4c8edc1  init Lightning Trainer with ci tests (woshiyyya, Mar 9, 2023)
e0fd5d9  add test for trainer with categorical ray dataset (woshiyyya, Mar 9, 2023)
ebea347  fix lightiningEnvironment import error (woshiyyya, Mar 9, 2023)
01f9461  fix test_lightning_trainer not found error (woshiyyya, Mar 9, 2023)
22725b6  format code (woshiyyya, Mar 9, 2023)
b3ab873  check linter (woshiyyya, Mar 9, 2023)
6ba0577  init LightningCheckpoint (woshiyyya, Mar 9, 2023)
a9d8ab6  fix linting issues (woshiyyya, Mar 10, 2023)
dde861d  fix linting issues (woshiyyya, Mar 10, 2023)
55b7916  fix lint again (woshiyyya, Mar 10, 2023)
a75fec6  Change lightning_config to dict type to aligh with Ray Tune (woshiyyya, Mar 10, 2023)
e1b9d81  Change lightning_config to dict type to aligh with Ray Tune (woshiyyya, Mar 10, 2023)
2e6500f  fix non_monitored_checkpoint saving error (woshiyyya, Mar 10, 2023)
895ab43  Apply suggestions from code review (woshiyyya, Mar 13, 2023)
f76f48d  replace LightningConfigBuilder setter with semantic meanings (woshiyyya, Mar 13, 2023)
c1448db  add fixture and builder_tests (woshiyyya, Mar 13, 2023)
66ab606  add example code for config builder, add api index (woshiyyya, Mar 13, 2023)
2842b14  fix lint, change test tag to large (woshiyyya, Mar 13, 2023)
38cd101  remove resume from ckpt for next PR (woshiyyya, Mar 13, 2023)
5210cb7  rename trainer.rst to avoid cross reference (woshiyyya, Mar 13, 2023)
300ba71  Merge remote-tracking branch 'upstream/master' into air/lightning_bas… (woshiyyya, Mar 13, 2023)
1935473  address comments (woshiyyya, Mar 14, 2023)
55a7b32  fix document issue (woshiyyya, Mar 14, 2023)
6598894  Merge remote-tracking branch 'upstream/master' into air/lightning_bas… (woshiyyya, Mar 14, 2023)
171a5e8  Merge remote-tracking branch 'origin/air/lightning_base_trainer' into… (woshiyyya, Mar 14, 2023)
247ba48  align report and checkpointing frequency (woshiyyya, Mar 15, 2023)
71b24fa  Merge remote-tracking branch 'upstream/master' into air/lightning_log… (woshiyyya, Mar 15, 2023)
6eea366  convert air ckpt config to lightning config (woshiyyya, Mar 15, 2023)
8fe5b05  add checks for checkpoint config (woshiyyya, Mar 15, 2023)
cc8d3e8  check tuning target metric (woshiyyya, Mar 15, 2023)
e606682  fix tuning metric logic (woshiyyya, Mar 15, 2023)
5cc5a42  Merge remote-tracking branch 'upstream/master' into air/lightning_log… (woshiyyya, Mar 15, 2023)
7bac4e7  finalize logging logic (woshiyyya, Mar 16, 2023)
b009e0f  remove tuning metric setup (woshiyyya, Mar 16, 2023)
ff89639  disable metric checking by default (woshiyyya, Mar 16, 2023)
40cc5dd  Merge remote-tracking branch 'upstream/master' into air/lightning_log… (woshiyyya, Mar 16, 2023)
9403bf5  address the review comments (woshiyyya, Mar 17, 2023)
a88e57a  Merge remote-tracking branch 'upstream/master' into air/lightning_log… (woshiyyya, Mar 17, 2023)
c57c310  add doc API reference (woshiyyya, Mar 17, 2023)
55918b9  fixing CI (woshiyyya, Mar 17, 2023)
d4be393  add comments on strict check env var (woshiyyya, Mar 20, 2023)
9bb4136  Merge remote-tracking branch 'upstream/master' into air/lightning_log… (woshiyyya, Mar 20, 2023)
2 changes: 1 addition & 1 deletion doc/source/train/api/api.rst
@@ -93,7 +93,7 @@ PyTorch Lightning

~train.lightning.LightningTrainer
~train.lightning.LightningConfigBuilder

~train.lightning.LightningCheckpoint

Tensorflow/Keras
~~~~~~~~~~~~~~~~
8 changes: 8 additions & 0 deletions python/ray/train/BUILD
@@ -400,6 +400,14 @@ py_test(
deps = [":train_lib"]
)

py_test(
name = "test_lightning_checkpoint",
size = "medium",
srcs = ["tests/test_lightning_checkpoint.py"],
tags = ["team:ml", "exclusive", "ray_air", "gpu"],
deps = [":train_lib"]
)

py_test(
name = "test_lightning_trainer",
size = "large",
4 changes: 2 additions & 2 deletions python/ray/train/lightning/__init__.py
@@ -8,10 +8,10 @@
)
# isort: on

from ray.train.lightning.lightning_checkpoint import LightningCheckpoint
from ray.train.lightning.lightning_trainer import (
LightningTrainer,
LightningConfigBuilder,
)


__all__ = ["LightningTrainer", "LightningConfigBuilder"]
__all__ = ["LightningTrainer", "LightningConfigBuilder", "LightningCheckpoint"]
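For reference, a downstream import then looks like this (a minimal sketch using only the names exported above):

from ray.train.lightning import (
    LightningTrainer,
    LightningConfigBuilder,
    LightningCheckpoint,
)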
81 changes: 76 additions & 5 deletions python/ray/train/lightning/_lightning_utils.py
@@ -1,14 +1,18 @@
import logging
import shutil
import torch
from typing import Any, Dict, Optional

import tempfile
import pytorch_lightning as pl

from typing import Any, Dict, Optional
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.strategies import DDPStrategy
from pytorch_lightning.plugins.environments import LightningEnvironment

import ray
from ray.air import session

from ray.air.constants import MODEL_KEY
from ray.train.lightning.lightning_checkpoint import LightningCheckpoint
from torch.utils.data import IterableDataset, DataLoader
from ray.data.dataset import DatasetIterator

@@ -46,11 +50,9 @@ def node_rank(self) -> int:
return session.get_node_rank()

def set_world_size(self, size: int) -> None:
logger.warning("world_size setter is disabled in AIR LightningTrainer.")
pass

def set_global_rank(self, rank: int) -> None:
logger.warning("global_rank setter is disabled in AIR LightningTrainer.")
pass

def teardown(self):
@@ -94,3 +96,72 @@ def _val_dataloader() -> DataLoader:
# setting, we only override this method when `val_dataset` is not `None`.
if val_dataset:
self.val_dataloader = _val_dataloader


class RayModelCheckpoint(ModelCheckpoint):
"""
AIR customized ModelCheckpoint callback.

A subclass of ``pytorch_lightning.callbacks.ModelCheckpoint``.
This callback function reports the latest metrics to the AIR session and
creates an AIR checkpoint whenever a lightning checkpoint is saved.
"""

def setup(self, *args, **kwargs) -> None:
super().setup(*args, **kwargs)
self.is_checkpoint_step = False

def _session_report(self, trainer: "pl.Trainer", stage: str):
Member:
why is metrics reporting part of the checkpoint class?
what if I want to report data / iteration, but don't want to create checkpoints?

Member Author:
The context here is that checkpointing and logging are separate concerns in Lightning. A checkpoint callback can access both the metrics and the checkpoint, but a Logger can only access the metrics. To report the checkpoint and metrics together, we implement reporting in the checkpoint class.

For logging, we recommend that users keep using Lightning's native loggers (e.g. the wandb, mlflow, and tensorboard loggers). They can control the logging frequency themselves and retrieve logs as usual, which is less intrusive and aligns better with user habits.
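As a rough sketch of that recommendation (not part of this PR's diff; TensorBoardLogger is standard Lightning API, and the rest is a hypothetical user setup), logging can stay entirely on the Lightning side:

import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

# Hypothetical user-side configuration: metrics go through Lightning's native
# TensorBoardLogger at a user-chosen frequency, while the AIR checkpoint
# callback only reports metrics and checkpoints back to the AIR session.
tb_logger = TensorBoardLogger(save_dir="logs/", name="my_run")
trainer = pl.Trainer(max_epochs=2, logger=tb_logger, log_every_n_steps=10)
# trainer.fit(model, train_dataloaders=train_loader)

With LightningTrainer, the same logger object would be supplied through the Lightning trainer configuration rather than by constructing pl.Trainer directly.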

"""Report latest metrics dict and checkpoint to AIR training session.

This method is called whenever a new checkpoint is created. It creates
a `LightningCheckpoint` and reports it to the AIR session along with
the latest metrics.
"""

# Align the frequency of checkpointing and logging
if not self.is_checkpoint_step:
return

# Report latest logged metrics
metrics = {"report_on": stage}
for k, v in self._monitor_candidates(trainer).items():
if k == "report_on":
Member:
don't feel particularly safe about this. why do we have this keyword, and it's not even __ prefixed ...
also we may be logging this warning msg every time this is called.
is there a list of such keywords defined somewhere?

Member Author:
I agree. I looked but didn't find relevant keys in ray.air.constants. I removed the warning message and changed the key name to __report_on now.

logger.warning(
"'report_on' is a reserved key in AIR report metrics. "
"Original values are overwritten!"
)
continue
if isinstance(v, torch.Tensor):
metrics[k] = v.item()

# Report latest saved checkpoint
# Note that AIR only takes the checkpoint of rank 0.
# Save a dummy checkpoint on the other workers to avoid blocking.
with tempfile.TemporaryDirectory() as tmpdir:
if trainer.global_rank == 0:
shutil.copy(self.last_model_path, f"{tmpdir}/{MODEL_KEY}")
checkpoint = LightningCheckpoint.from_directory(path=tmpdir)
else:
checkpoint = LightningCheckpoint.from_dict(
{"rank": session.get_world_rank()}
)
session.report(metrics=metrics, checkpoint=checkpoint)

self.is_checkpoint_step = False

def _save_last_checkpoint(self, *args, **kwargs) -> None:
super()._save_last_checkpoint(*args, **kwargs)
self.is_checkpoint_step = True
Contributor:
I don't think we actually need this. Let's just return early in the cases where no checkpoint is found. Better not to mess with this private method.

Member Author:
I agree that overriding this method is not elegant, but it seems to be the only way. The name of the last checkpoint is always "last.ckpt", so we can't tell whether a new checkpoint was written based on the file name alone.

To give you more context, the logic of ModelCheckpoint.on_train_batch_end() is like:

def on_train_batch_end():
    #####################################################################
    # Code block that determines whether this is a checkpoint step.
    # return if it's not.
    #####################################################################
    self._save_top_k_checkpoint()
    self._save_last_checkpoint()

Since Lightning does not factor that code block out into a self.should_checkpoint() method, the only way to detect a checkpoint step is to track whether _save_top_k_checkpoint() and _save_last_checkpoint() are actually called.


def on_train_batch_end(self, trainer: "pl.Trainer", *args, **kwargs) -> None:
super().on_train_batch_end(trainer, *args, **kwargs)
self._session_report(trainer=trainer, stage="train_batch_end")

def on_train_epoch_end(self, trainer: "pl.Trainer", *args, **kwargs) -> None:
super().on_train_epoch_end(trainer, *args, **kwargs)
self._session_report(trainer=trainer, stage="train_epoch_end")

def on_validation_end(self, trainer: "pl.Trainer", *args, **kwargs) -> None:
super().on_validation_end(trainer, *args, **kwargs)
self._session_report(trainer=trainer, stage="validation_end")
100 changes: 100 additions & 0 deletions python/ray/train/lightning/lightning_checkpoint.py
@@ -0,0 +1,100 @@
import os
import logging
import pytorch_lightning as pl
import tempfile
import shutil

from inspect import isclass
from typing import Optional, Type

from ray.air.constants import MODEL_KEY
from ray.air._internal.checkpointing import save_preprocessor_to_dir
from ray.data import Preprocessor
from ray.train.torch import TorchCheckpoint
from ray.util.annotations import PublicAPI

logger = logging.getLogger(__name__)


@PublicAPI(stability="alpha")
class LightningCheckpoint(TorchCheckpoint):
"""A :class:`~ray.air.checkpoint.Checkpoint` with Lightning-specific functionality.

LightningCheckpoint only supports file-based checkpoint loading.
Create one by calling ``LightningCheckpoint.from_directory(ckpt_dir)``,
``LightningCheckpoint.from_uri(uri)``, or ``LightningCheckpoint.from_path(path)``.

LightningCheckpoint loads the file named ``model`` under the specified directory.
"""

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._cache_dir = None

@classmethod
def from_path(
cls,
path: str,
*,
preprocessor: Optional["Preprocessor"] = None,
) -> "LightningCheckpoint":
"""Create a ``ray.air.lightning.LightningCheckpoint`` from a checkpoint path.

Args:
path: The file path to the PyTorch Lightning checkpoint.
preprocessor: A fitted preprocessor to be applied before inference.

Returns:
A :py:class:`LightningCheckpoint` containing the model.

Examples:
>>> from ray.train.lightning import LightningCheckpoint
>>>
>>> checkpoint = LightningCheckpoint.from_path("/path/to/checkpoint.ckpt")
"""

assert os.path.exists(path), f"Lightning checkpoint {path} doesn't exist!"

cache_dir = tempfile.mkdtemp()
new_checkpoint_path = os.path.join(cache_dir, MODEL_KEY)
shutil.copy(path, new_checkpoint_path)
if preprocessor:
save_preprocessor_to_dir(preprocessor, cache_dir)
checkpoint = cls.from_directory(cache_dir)
checkpoint._cache_dir = cache_dir
return checkpoint

def get_model(
self, model_class: Type[pl.LightningModule], **load_from_checkpoint_kwargs
) -> pl.LightningModule:
"""Retrieve the model stored in this checkpoint.

Args:
model_class: A subclass of ``pytorch_lightning.LightningModule`` that
defines your model and training logic.
load_from_checkpoint_kwargs: Arguments to pass into
``pl.LightningModule.load_from_checkpoint``

Returns:
pl.LightningModule: An instance of the loaded model.
"""
if not isclass(model_class):
raise ValueError(
"'model_class' must be a class, not an instantiated Lightning trainer."
)

with self.as_directory() as checkpoint_dir:
ckpt_path = os.path.join(checkpoint_dir, MODEL_KEY)
if not os.path.exists(ckpt_path):
raise RuntimeError(
f"File {ckpt_path} not found under the checkpoint directory."
)

model = model_class.load_from_checkpoint(
ckpt_path, **load_from_checkpoint_kwargs
)
return model

def __del__(self):
if self._cache_dir and os.path.exists(self._cache_dir):
shutil.rmtree(self._cache_dir)
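To round this out, a rough end-to-end usage sketch (MyLightningModule and the checkpoint path are hypothetical placeholders; from_path and get_model are the methods defined above):

import pytorch_lightning as pl
from ray.train.lightning import LightningCheckpoint

# Hypothetical user-defined model; any pl.LightningModule subclass works here.
class MyLightningModule(pl.LightningModule):
    ...

# Wrap an existing Lightning .ckpt file into an AIR checkpoint, then rebuild
# the model from it. The checkpoint path below is a placeholder.
checkpoint = LightningCheckpoint.from_path("/path/to/checkpoint.ckpt")
model = checkpoint.get_model(model_class=MyLightningModule)
model.eval()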