[NeMo-UX] checkpointing improvements #10241

Merged · Sep 6, 2024

Changes from 5 commits

Commits (41)
d98db0a
save model weights and artifacts to separate directories
ashors1 Aug 22, 2024
ca04f47
add save_artifacts_on_train_end
ashors1 Aug 24, 2024
9264eee
Apply isort and black reformatting
ashors1 Aug 24, 2024
99cff3a
Merge branch 'main' of github.com:NVIDIA/NeMo into ashors/nemo-ux-ckp…
ashors1 Aug 24, 2024
8e7f3f9
Merge branch 'ashors/nemo-ux-ckpt-dir' of github.com:NVIDIA/NeMo into…
ashors1 Aug 24, 2024
1a5a457
do not save optimizer states in final checkpoint
ashors1 Aug 28, 2024
c9302fa
WIP support for saving only last k optimizer states
ashors1 Aug 28, 2024
6da2371
Apply isort and black reformatting
ashors1 Aug 28, 2024
9b1f93c
minor cleanup
ashors1 Aug 28, 2024
402471d
Revert support for saving last k optimizer states. This will be addre…
ashors1 Aug 29, 2024
0a55953
use storage_options to determine when to skip saving optimizer states
ashors1 Aug 30, 2024
aa67b8a
Apply isort and black reformatting
ashors1 Aug 30, 2024
11001a0
fix variable names, make checkpoint load work when optimizer states d…
ashors1 Aug 30, 2024
483c942
Merge branch 'ashors/nemo-ux-ckpt-dir' of github.com:NVIDIA/NeMo into…
ashors1 Aug 30, 2024
3b3b779
Apply isort and black reformatting
ashors1 Aug 30, 2024
db7e0de
Merge branch 'main' of github.com:NVIDIA/NeMo into ashors/nemo-ux-ckp…
ashors1 Aug 30, 2024
1b9696f
FSDP updates, provide option to save optimizer states on_train_end
ashors1 Aug 30, 2024
21fcb40
Apply isort and black reformatting
ashors1 Aug 30, 2024
b9a0e9e
simplify implementation, remove save_best_model option
ashors1 Aug 30, 2024
a740bf9
Merge branch 'ashors/nemo-ux-ckpt-dir' of github.com:NVIDIA/NeMo into…
ashors1 Aug 30, 2024
3c74851
update default value of ckpt_include_optimizer for fsdp
ashors1 Aug 30, 2024
3fd76ad
remove unused imports
ashors1 Sep 3, 2024
2817d56
remove unused import
ashors1 Sep 3, 2024
8f17991
cleanup
ashors1 Sep 3, 2024
a4e6954
make storage_options optional again
ashors1 Sep 4, 2024
10e4f88
fix failing tests
ashors1 Sep 4, 2024
a78717d
address some comments
ashors1 Sep 4, 2024
5916ee3
use save_weights_only to determine whether to save optimizer states
ashors1 Sep 4, 2024
5745105
Apply isort and black reformatting
ashors1 Sep 4, 2024
b9be6e9
add some comments
ashors1 Sep 4, 2024
adf418e
Merge branch 'ashors/nemo-ux-ckpt-dir' of github.com:NVIDIA/NeMo into…
ashors1 Sep 4, 2024
6befd8a
fix tests
ashors1 Sep 5, 2024
65ebadb
Apply isort and black reformatting
ashors1 Sep 5, 2024
a95edf3
Merge branch 'main' of github.com:NVIDIA/NeMo into ashors/nemo-ux-ckp…
ashors1 Sep 5, 2024
9643ef5
Merge branch 'ashors/nemo-ux-ckpt-dir' of github.com:NVIDIA/NeMo into…
ashors1 Sep 5, 2024
09a38a5
Apply isort and black reformatting
ashors1 Sep 5, 2024
987e4f6
fixes
ashors1 Sep 5, 2024
6c5a1c6
Merge branch 'ashors/nemo-ux-ckpt-dir' of github.com:NVIDIA/NeMo into…
ashors1 Sep 5, 2024
54591a6
Apply isort and black reformatting
ashors1 Sep 5, 2024
1974e97
remove unnecessary line
ashors1 Sep 5, 2024
10d619e
Merge branch 'main' of github.com:NVIDIA/NeMo into ashors/nemo-ux-ckp…
ashors1 Sep 5, 2024
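
Taken together, these commits split each checkpoint into separate directories for the distributed model weights and the reproducibility artifacts, skip optimizer states where only weights are needed, and dump the artifacts once at the end of training. A minimal sketch of the resulting on-disk layout; the checkpoint directory name is illustrative, and only the "model_weights" and "artifacts" subdirectory names come from the diffs below:

from pathlib import Path

# Illustrative checkpoint directory name; an assumption, not taken from the PR.
ckpt_dir = Path("results/checkpoints/epoch=0-step=1000-last")
weights_dir = ckpt_dir / "model_weights"   # model (and, when enabled, optimizer) states
artifacts_dir = ckpt_dir / "artifacts"     # TrainerContext.io_dump output used to re-create
                                           # the model, trainer, and dataloaders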
4 changes: 2 additions & 2 deletions nemo/lightning/io/mixin.py
@@ -140,7 +140,7 @@ def io_dump(self, output: Path):
             will be stored.
         """
         output_path = Path(output)
-        local_artifacts_dir = "artifacts"
+        local_artifacts_dir = "."
         artifacts_dir = output_path / local_artifacts_dir
         artifacts_dir.mkdir(parents=True, exist_ok=True)

@@ -523,7 +523,7 @@ def _io_path_elements_fn(x):
     return x.__io__.__path_elements__()


-def _artifact_transform_save(cfg: fdl.Config, output_path: Path, relative_dir: Path = "artifacts"):
+def _artifact_transform_save(cfg: fdl.Config, output_path: Path, relative_dir: Path = "."):
     for artifact in getattr(cfg.__fn_or_cls__, "__io_artifacts__", []):
         current_val = getattr(cfg, artifact.attr)
         if current_val is None:
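
With local_artifacts_dir changed from "artifacts" to ".", io_dump now writes the artifacts directly into whatever directory the caller passes, instead of nesting an extra artifacts/ folder; the checkpoint callback below passes an explicit .../artifacts path. A hedged sketch of that call pattern (the helper function and the import location are assumptions, not part of the PR):

from pathlib import Path

from nemo.lightning.io.pl import TrainerContext  # assumed import location


def dump_artifacts(trainer, checkpoint_dir: Path) -> None:
    # Dump the config/artifacts next to the model weights, mirroring the
    # ModelCheckpoint callback change in this PR.
    TrainerContext.from_trainer(trainer).io_dump(checkpoint_dir / "artifacts")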
47 changes: 28 additions & 19 deletions nemo/lightning/pytorch/callbacks/model_checkpoint.py
@@ -48,13 +48,16 @@ class ModelCheckpoint(PTLModelCheckpoint):
             ``every_n_epochs`` or ``every_n_train_steps``.
         save_best_model: When ``True``, reloads and saves the best checkpoint.
         save_on_train_epoch_end: Whether to run checkpointing at the end of the training epoch
-        enable_nemo_ckpt_io: Whether to dump the current model model state, including the
-            config file, to allow for reproducibility of experiments.
+        always_save_artifacts: Whether to dump the artifacts needed to reinintialize the current
+            model, trainer, and dataloader to allow for reproducibility of experiments.
+        save_artifacts_on_train_end: Whether to dump the artifacts on_train_end regardless of whether
+            ``always_save_artifacts`` is ``True``.
         async_save: Whether to enable asynchronous checkpointing.
         try_restore_best_ckpt: Whether to restore the best model path.
     """

     UNFINISHED_CHECKPOINT_SUFFIX = "-unfinished"
+    MODEL_WEIGHTS_PATH = "model_weights"

     def __init__(
         self,
@@ -69,13 +72,16 @@ def __init__(
         train_time_interval: Optional[timedelta] = None,
         save_best_model: bool = False,
         save_on_train_epoch_end: Optional[bool] = False,  # Save after training, not after validation
-        enable_nemo_ckpt_io: bool = True,
+        always_save_artifacts: bool = False,
+        save_artifacts_on_train_end: bool = True,
         try_restore_best_ckpt: bool = True,
         **kwargs,
     ):
         self.save_best_model = save_best_model
         self.previous_best_path = ""
-        self.enable_nemo_ckpt_io = enable_nemo_ckpt_io
+        self.always_save_artifacts = always_save_artifacts
+        self.save_artifacts_on_train_end = save_artifacts_on_train_end

         # Checkpoints which removal is deferred until async save is done.
         # Each element of `deferred_ckpts_to_remove` is a growing list
         # that `self._remove_checkpoint` adds to. Once `self._save_checkpoint`
@@ -251,11 +257,9 @@ def setup(self, trainer, *args, **kwargs) -> None:
         self.async_save = getattr(trainer.strategy, "async_save", False)
         super().setup(trainer, *args, **kwargs)

-    def on_save_checkpoint(self, trainer, pl_module, checkpoint):
-        output = super().on_save_checkpoint(trainer, pl_module, checkpoint)
-        return output
-
     def on_train_end(self, trainer, pl_module):
+        from nemo.utils.get_rank import is_global_rank_zero
+
         if trainer.fast_dev_run:
             return None

@@ -272,6 +276,8 @@ def on_train_end(self, trainer, pl_module):
                 logging.debug(f'Last checkpoint {self.last_model_path} already saved')
             else:
                 super()._save_last_checkpoint(trainer, monitor_candidates)
+                if self.save_artifacts_on_train_end and not self.always_save_artifacts and is_global_rank_zero():
+                    TrainerContext.from_trainer(trainer).io_dump(ckpt_to_dir(self.last_model_path) / "artifacts")
         # Call parent on_train_end() to save the -last checkpoint
         super().on_train_end(trainer, pl_module)

@@ -287,7 +293,9 @@ def on_train_end(self, trainer, pl_module):

             else:
                 if os.path.isdir(self.best_model_path.split('.ckpt')[0]):
-                    self.best_model_path = self.best_model_path.split('.ckpt')[0]
+                    self.best_model_path = (
+                        Path(self.best_model_path.split('.ckpt')[0]) / ModelCheckpoint.MODEL_WEIGHTS_PATH
+                    )
                 if self.try_restore_best_ckpt:
                     self.best_model_path = trainer.strategy.broadcast(self.best_model_path)
                     trainer._checkpoint_connector.restore(self.best_model_path)
@@ -409,8 +417,11 @@ def _monitor_candidates(self, trainer: "pl.Trainer") -> Dict[str, torch.Tensor]:
         return monitor_candidates

     def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str) -> None:
+        from nemo.utils.get_rank import is_global_rank_zero
+
         # barrier_after=True, so all ranks continue after the unfinished checkpoint marker is placed.
         # if anything goes wrong during checkpointing, we should be able to detect that data is incomplete.
+        ckpt_filepath = ckpt_to_dir(filepath) / ModelCheckpoint.MODEL_WEIGHTS_PATH
         self.set_checkpoint_unfinished_marker(filepath, barrier_after=True)
         ema_callback = self._ema_callback(trainer)

@@ -420,15 +431,15 @@ def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str)
             if self.async_save:
                 raise ValueError('async_save with EMA not supported')
             with ema_callback.save_original_optimizer_state(trainer):
-                super()._save_checkpoint(trainer, filepath)
+                super()._save_checkpoint(trainer, ckpt_filepath)

             # save EMA copy of the model as well.
             with ema_callback.save_ema_model(trainer):
-                rank_zero_info(f"Saving EMA weights to separate checkpoint {filepath}")
-                filepath = self._ema_format_filepath(filepath)
+                rank_zero_info(f"Saving EMA weights to separate checkpoint {ckpt_filepath}")
+                ckpt_filepath = self._ema_format_filepath(ckpt_filepath)
                 if self.verbose:
-                    rank_zero_info(f"Saving EMA weights to separate checkpoint {filepath}")
-                super()._save_checkpoint(trainer, filepath)
+                    rank_zero_info(f"Saving EMA weights to separate checkpoint {ckpt_filepath}")
+                super()._save_checkpoint(trainer, ckpt_filepath)
             self.remove_checkpoint_unfinished_marker(filepath, barrier_before=True)
         else:
             # Async save passes the finalization function to checkpoint_io,
@@ -445,13 +456,11 @@ def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str)
                 self.deferred_ckpts_to_remove.append([])
             else:
                 storage_options = None
-            trainer.save_checkpoint(filepath, self.save_weights_only, storage_options=storage_options)
+            trainer.save_checkpoint(ckpt_filepath, self.save_weights_only, storage_options=storage_options)

             ## NOTE: saving context happens synchronously always
-            from nemo.utils.get_rank import is_global_rank_zero
+            if self.always_save_artifacts and is_global_rank_zero():
+                TrainerContext.from_trainer(trainer).io_dump(ckpt_to_dir(filepath) / "artifacts")

-            if self.enable_nemo_ckpt_io and is_global_rank_zero():
-                TrainerContext.from_trainer(trainer).io_dump(ckpt_to_dir(filepath))
             if self.async_save:
                 logging.info(f'Scheduled async checkpoint save for {filepath}')
             else:
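
The new options compose as follows: always_save_artifacts controls whether the TrainerContext artifacts are dumped alongside every checkpoint, save_artifacts_on_train_end guarantees one dump when training finishes, and save_weights_only (inherited from the Lightning ModelCheckpoint and, per the later commits, used to decide whether optimizer states are written) keeps periodic checkpoints lean. A hedged configuration sketch; the argument values below are illustrative assumptions, not defaults mandated by the PR:

from nemo.lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint

# Sketch only: monitored metric and top-k count are assumptions.
checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",                # illustrative monitored metric
    save_top_k=2,                      # illustrative
    save_weights_only=True,            # skip optimizer states in periodic checkpoints
    always_save_artifacts=False,       # do not dump artifacts with every checkpoint
    save_artifacts_on_train_end=True,  # but dump them once when training finishes
    try_restore_best_ckpt=True,        # reload best_model_path at the end of training
)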
6 changes: 6 additions & 0 deletions nemo/lightning/resume.py
@@ -37,6 +37,8 @@ class AutoResume(Resume, io.IOMixin):
     checkpoints in NeMo.
     """

+    MAYBE_MODEL_WEIGHTS_PATH = "model_weights"
+
     def __init__(
         self,
         path: Optional[str] = None,  ## old resume_from_checkpoint
@@ -151,6 +153,10 @@ def nemo_path(self, model=None) -> Optional[Path]:
         if checkpoint:
             if self.adapter_path:
                 return AdapterPath(checkpoint, adapter_path=Path(self.adapter_path))
+
+            model_weights_path = Path(checkpoint) / AutoResume.MAYBE_MODEL_WEIGHTS_PATH
+            if os.path.isdir(model_weights_path):
+                return model_weights_path
             return Path(checkpoint)

         return None
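
On the resume side the change is backward compatible: if the checkpoint directory contains a model_weights subdirectory (the new layout), that path is returned; otherwise the old flat checkpoint directory is used unchanged. A standalone sketch of that resolution logic (the function name is mine, not part of the PR):

import os
from pathlib import Path


def resolve_resume_path(checkpoint: Path) -> Path:
    # Prefer the new <checkpoint>/model_weights layout introduced by this PR,
    # falling back to the flat layout of checkpoints written before it.
    model_weights_path = Path(checkpoint) / "model_weights"
    if os.path.isdir(model_weights_path):
        return model_weights_path
    return Path(checkpoint)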