[NeMo-UX] checkpointing improvements #10241

Merged on Sep 6, 2024 (41 commits)
Changes from 29 commits

Commits
d98db0a  save model weights and artifacts to separate directories (ashors1, Aug 22, 2024)
ca04f47  add save_artifacts_on_train_end (ashors1, Aug 24, 2024)
9264eee  Apply isort and black reformatting (ashors1, Aug 24, 2024)
99cff3a  Merge branch 'main' of github.com:NVIDIA/NeMo into ashors/nemo-ux-ckp… (ashors1, Aug 24, 2024)
8e7f3f9  Merge branch 'ashors/nemo-ux-ckpt-dir' of github.com:NVIDIA/NeMo into… (ashors1, Aug 24, 2024)
1a5a457  do not save optimizer states in final checkpoint (ashors1, Aug 28, 2024)
c9302fa  WIP support for saving only last k optimizer states (ashors1, Aug 28, 2024)
6da2371  Apply isort and black reformatting (ashors1, Aug 28, 2024)
9b1f93c  minor cleanup (ashors1, Aug 28, 2024)
402471d  Revert support for saving last k optimizer states. This will be addre… (ashors1, Aug 29, 2024)
0a55953  use storage_options to determine when to skip saving optimizer states (ashors1, Aug 30, 2024)
aa67b8a  Apply isort and black reformatting (ashors1, Aug 30, 2024)
11001a0  fix variable names, make checkpoint load work when optimizer states d… (ashors1, Aug 30, 2024)
483c942  Merge branch 'ashors/nemo-ux-ckpt-dir' of github.com:NVIDIA/NeMo into… (ashors1, Aug 30, 2024)
3b3b779  Apply isort and black reformatting (ashors1, Aug 30, 2024)
db7e0de  Merge branch 'main' of github.com:NVIDIA/NeMo into ashors/nemo-ux-ckp… (ashors1, Aug 30, 2024)
1b9696f  FSDP updates, provide option to save optimizer states on_train_end (ashors1, Aug 30, 2024)
21fcb40  Apply isort and black reformatting (ashors1, Aug 30, 2024)
b9a0e9e  simplify implementation, remove save_best_model option (ashors1, Aug 30, 2024)
a740bf9  Merge branch 'ashors/nemo-ux-ckpt-dir' of github.com:NVIDIA/NeMo into… (ashors1, Aug 30, 2024)
3c74851  update default value of ckpt_include_optimizer for fsdp (ashors1, Aug 30, 2024)
3fd76ad  remove unused imports (ashors1, Sep 3, 2024)
2817d56  remove unused import (ashors1, Sep 3, 2024)
8f17991  cleanup (ashors1, Sep 3, 2024)
a4e6954  make storage_options optional again (ashors1, Sep 4, 2024)
10e4f88  fix failing tests (ashors1, Sep 4, 2024)
a78717d  address some comments (ashors1, Sep 4, 2024)
5916ee3  use save_weights_only to determine whether to save optimizer states (ashors1, Sep 4, 2024)
5745105  Apply isort and black reformatting (ashors1, Sep 4, 2024)
b9be6e9  add some comments (ashors1, Sep 4, 2024)
adf418e  Merge branch 'ashors/nemo-ux-ckpt-dir' of github.com:NVIDIA/NeMo into… (ashors1, Sep 4, 2024)
6befd8a  fix tests (ashors1, Sep 5, 2024)
65ebadb  Apply isort and black reformatting (ashors1, Sep 5, 2024)
a95edf3  Merge branch 'main' of github.com:NVIDIA/NeMo into ashors/nemo-ux-ckp… (ashors1, Sep 5, 2024)
9643ef5  Merge branch 'ashors/nemo-ux-ckpt-dir' of github.com:NVIDIA/NeMo into… (ashors1, Sep 5, 2024)
09a38a5  Apply isort and black reformatting (ashors1, Sep 5, 2024)
987e4f6  fixes (ashors1, Sep 5, 2024)
6c5a1c6  Merge branch 'ashors/nemo-ux-ckpt-dir' of github.com:NVIDIA/NeMo into… (ashors1, Sep 5, 2024)
54591a6  Apply isort and black reformatting (ashors1, Sep 5, 2024)
1974e97  remove unnecessary line (ashors1, Sep 5, 2024)
10d619e  Merge branch 'main' of github.com:NVIDIA/NeMo into ashors/nemo-ux-ckp… (ashors1, Sep 5, 2024)
1 change: 0 additions & 1 deletion examples/llm/megatron_gpt_pretraining.py
@@ -71,7 +71,6 @@ def get_args():
     strategy = nl.MegatronStrategy()
     checkpoint_callback = ModelCheckpoint(
         every_n_train_steps=5000,
-        enable_nemo_ckpt_io=False,
     )
     callbacks = [checkpoint_callback]
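
Note that enable_nemo_ckpt_io no longer exists on ModelCheckpoint; context dumping is instead controlled by the always_save_context and save_context_on_train_end options introduced in model_checkpoint.py below. A minimal sketch of an equivalent setup under the new options (values shown are the new defaults; the import path follows the file layout in this PR):

from nemo.lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint

checkpoint_callback = ModelCheckpoint(
    every_n_train_steps=5000,
    always_save_context=False,       # do not dump the trainer context with every checkpoint
    save_context_on_train_end=True,  # dump it once, when training ends
)
callbacks = [checkpoint_callback]
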
4 changes: 2 additions & 2 deletions nemo/lightning/io/mixin.py
@@ -140,7 +140,7 @@ def io_dump(self, output: Path):
             will be stored.
         """
         output_path = Path(output)
-        local_artifacts_dir = "artifacts"
+        local_artifacts_dir = "."
         artifacts_dir = output_path / local_artifacts_dir
         artifacts_dir.mkdir(parents=True, exist_ok=True)

@@ -518,7 +518,7 @@ def _io_path_elements_fn(x):
     return x.__io__.__path_elements__()


-def _artifact_transform_save(cfg: fdl.Config, output_path: Path, relative_dir: Path = "artifacts"):
+def _artifact_transform_save(cfg: fdl.Config, output_path: Path, relative_dir: Path = "."):
     for artifact in getattr(cfg.__fn_or_cls__, "__io_artifacts__", []):
         current_val = getattr(cfg, artifact.attr)
         if current_val is None:
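
With local_artifacts_dir set to ".", io_dump() now materializes artifacts directly inside its output directory instead of nesting them under an artifacts/ subdirectory. A standalone illustration of why "." collapses the extra level (hypothetical paths, not NeMo code):

from pathlib import Path

output_path = Path("checkpoint/context")  # hypothetical io_dump() target
artifacts_dir = output_path / "."         # pathlib collapses the single dot
assert artifacts_dir == output_path       # artifacts now sit next to the dumped config
# previously: artifacts_dir == Path("checkpoint/context/artifacts")
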
76 changes: 34 additions & 42 deletions nemo/lightning/pytorch/callbacks/model_checkpoint.py
@@ -46,15 +46,18 @@ class ModelCheckpoint(PTLModelCheckpoint):
every_n_train_steps: Number of train steps between checkpoints.
train_time_interval: After each interval, monitor checkpoints. Not to be used with
``every_n_epochs`` or ``every_n_train_steps``.
save_best_model: When ``True``, reloads and saves the best checkpoint.
save_on_train_epoch_end: Whether to run checkpointing at the end of the training epoch
enable_nemo_ckpt_io: Whether to dump the current model model state, including the
config file, to allow for reproducibility of experiments.
save_optim_on_train_end: Whether to include the optimizer states in the final checkpoint
at the end of training. Only applicable when save_weights_only is ``True``.
always_save_context: Whether to dump the artifacts needed to reinitialize the current
model, trainer, and dataloader to allow for reproducibility of experiments.
save_context_on_train_end: Whether to dump the artifacts on_train_end regardless of whether
``always_save_context`` is ``True``.
async_save: Whether to enable asynchronous checkpointing.
try_restore_best_ckpt: Whether to restore the best model path.
"""

UNFINISHED_CHECKPOINT_SUFFIX = "-unfinished"
WEIGHTS_PATH = "weights"

def __init__(
self,
@@ -67,21 +70,21 @@ def __init__(
every_n_epochs: int = None,
every_n_train_steps: Optional[int] = None,
train_time_interval: Optional[timedelta] = None,
save_best_model: bool = False,
save_on_train_epoch_end: Optional[bool] = False, # Save after training, not after validation
enable_nemo_ckpt_io: bool = True,
try_restore_best_ckpt: bool = True,
save_optim_on_train_end: Optional[bool] = False,
always_save_context: bool = False,
save_context_on_train_end: bool = True,
**kwargs,
):
self.save_best_model = save_best_model
self.previous_best_path = ""
self.enable_nemo_ckpt_io = enable_nemo_ckpt_io
self.always_save_context = always_save_context
self.save_context_on_train_end = save_context_on_train_end
self.save_optim_on_train_end = save_optim_on_train_end

# Checkpoints which removal is deferred until async save is done.
# Each element of `deferred_ckpts_to_remove` is a growing list
# that `self._remove_checkpoint` adds to. Once `self._save_checkpoint`
# is called, the last element is frozen and a new element is added.
self.deferred_ckpts_to_remove: List[List[str]] = []
self.try_restore_best_ckpt = try_restore_best_ckpt

# Call the parent class constructor with the remaining kwargs.
super().__init__(
@@ -251,11 +254,9 @@ def setup(self, trainer, *args, **kwargs) -> None:
self.async_save = getattr(trainer.strategy, "async_save", False)
super().setup(trainer, *args, **kwargs)

def on_save_checkpoint(self, trainer, pl_module, checkpoint):
output = super().on_save_checkpoint(trainer, pl_module, checkpoint)
return output

def on_train_end(self, trainer, pl_module):
from nemo.utils.get_rank import is_global_rank_zero

if trainer.fast_dev_run:
return None

@@ -272,26 +273,11 @@ def on_train_end(self, trainer, pl_module):
logging.debug(f'Last checkpoint {self.last_model_path} already saved')
else:
super()._save_last_checkpoint(trainer, monitor_candidates)
if self.save_context_on_train_end and not self.always_save_context and is_global_rank_zero():
TrainerContext.from_trainer(trainer).io_dump(ckpt_to_dir(self.last_model_path) / "context")
# Call parent on_train_end() to save the -last checkpoint
super().on_train_end(trainer, pl_module)

# Load the best model and then re-save it
if self.save_best_model:
# wait for all processes
trainer.strategy.barrier("SaveBestCheckpointConnector.resume_end")
if self.best_model_path == "":
logging.warning(
f"{self} was told to save the best checkpoint at the end of training, but no saved checkpoints "
"were found. Saving latest model instead."
)

else:
if os.path.isdir(self.best_model_path.split('.ckpt')[0]):
self.best_model_path = self.best_model_path.split('.ckpt')[0]
if self.try_restore_best_ckpt:
self.best_model_path = trainer.strategy.broadcast(self.best_model_path)
trainer._checkpoint_connector.restore(self.best_model_path)

def _del_model_without_trainer(self, filepath: str) -> None:
from nemo.utils.get_rank import is_global_rank_zero

@@ -409,8 +395,11 @@ def _monitor_candidates(self, trainer: "pl.Trainer") -> Dict[str, torch.Tensor]:
return monitor_candidates

def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str) -> None:
from nemo.utils.get_rank import is_global_rank_zero

# barrier_after=True, so all ranks continue after the unfinished checkpoint marker is placed.
# if anything goes wrong during checkpointing, we should be able to detect that data is incomplete.
ckpt_filepath = ckpt_to_dir(filepath) / ModelCheckpoint.WEIGHTS_PATH
self.set_checkpoint_unfinished_marker(filepath, barrier_after=True)
ema_callback = self._ema_callback(trainer)

@@ -420,17 +409,22 @@ def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str)
if self.async_save:
raise ValueError('async_save with EMA not supported')
with ema_callback.save_original_optimizer_state(trainer):
super()._save_checkpoint(trainer, filepath)
super()._save_checkpoint(trainer, ckpt_filepath)

# save EMA copy of the model as well.
with ema_callback.save_ema_model(trainer):
rank_zero_info(f"Saving EMA weights to separate checkpoint {filepath}")
filepath = self._ema_format_filepath(filepath)
rank_zero_info(f"Saving EMA weights to separate checkpoint {ckpt_filepath}")
ckpt_filepath = self._ema_format_filepath(ckpt_filepath)
if self.verbose:
rank_zero_info(f"Saving EMA weights to separate checkpoint {filepath}")
super()._save_checkpoint(trainer, filepath)
rank_zero_info(f"Saving EMA weights to separate checkpoint {ckpt_filepath}")
super()._save_checkpoint(trainer, ckpt_filepath)
self.remove_checkpoint_unfinished_marker(filepath, barrier_before=True)
else:
## Whether to include optimizer states
save_weights_only = self.save_weights_only or (
not self.save_optim_on_train_end and trainer.global_step == trainer.max_steps
)

# Async save passes the finalization function to checkpoint_io,
# sync save calls the finalization function immediately after save.
finalize_fn = self._get_finalize_save_checkpoint_callback(trainer, filepath, trainer.global_step)
@@ -445,13 +439,11 @@ def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str)
self.deferred_ckpts_to_remove.append([])
else:
storage_options = None
trainer.save_checkpoint(filepath, self.save_weights_only, storage_options=storage_options)
trainer.save_checkpoint(ckpt_filepath, save_weights_only, storage_options=storage_options)

## NOTE: saving context happens synchronously always
from nemo.utils.get_rank import is_global_rank_zero
if self.always_save_context and is_global_rank_zero():
TrainerContext.from_trainer(trainer).io_dump(ckpt_to_dir(filepath) / "context")

if self.enable_nemo_ckpt_io and is_global_rank_zero():
TrainerContext.from_trainer(trainer).io_dump(ckpt_to_dir(filepath))
if self.async_save:
logging.info(f'Scheduled async checkpoint save for {filepath}')
else:
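
Taken together, the callback now writes model weights under a weights/ subdirectory of each checkpoint (ModelCheckpoint.WEIGHTS_PATH), dumps the serialized trainer context into a context/ subdirectory on global rank 0, and drops optimizer states from the checkpoint written at trainer.max_steps unless save_optim_on_train_end=True. A rough sketch of the resulting per-checkpoint layout (directory names come from the code above; surrounding path components are hypothetical):

# <log_dir>/checkpoints/<checkpoint-name>/
#     weights/   written by trainer.save_checkpoint(...); includes optimizer state
#                during training, but the final (max_steps) checkpoint is saved
#                weights-only when save_optim_on_train_end=False
#     context/   written by TrainerContext.from_trainer(trainer).io_dump(...) on
#                global rank 0, for every checkpoint when always_save_context=True,
#                or once at the end of training when save_context_on_train_end=True
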
14 changes: 7 additions & 7 deletions nemo/lightning/pytorch/strategies/fsdp_strategy.py
@@ -56,7 +56,7 @@ def __init__(
         self,
         auto_wrap_policy={TransformerLayer},
         state_dict_type="sharded",
-        ckpt_include_optimizer=False,
+        ckpt_include_optimizer=True,
         data_sampler=None,
         **kwargs,
     ):
@@ -189,11 +189,9 @@ def save_checkpoint(
         checkpoint["sharded_state_dict"] = pyt_to_mcore_state_dict(checkpoint.pop("state_dict"))
         checkpoint["state_dict"] = OrderedDict([])

-        # TODO: do we still need to keep this?
-        for optim_state in checkpoint['optimizer_states']:
-            optim_state.pop("state")
-
-        if self.trainer.state.fn == TrainerFn.FITTING and self.ckpt_include_optimizer:
+        ## replace unsharded optimizer_states with sharded dict
+        if "optimizer_states" in checkpoint:
+            del checkpoint["optimizer_states"]
             checkpoint['optimizer'] = get_optimizer_state_dict(self.model, self.optimizers)
             pyt_to_mcore_state_dict(checkpoint['optimizer']['state'], prefix="optimizer.state.")

@@ -224,7 +222,9 @@ def load_checkpoint(self, checkpoint_path: str | Path) -> Dict[str, Any]:
         pyt_to_mcore_state_dict(msd)
         sharded_state_dict["sharded_state_dict"] = msd

-        if self.ckpt_include_optimizer and self.trainer.state.fn == TrainerFn.FITTING:
+        if (
+            self.ckpt_include_optimizer and self.trainer.state.fn == TrainerFn.FITTING
+        ): ## TODO: remove ckpt_include_optimizer
             osd = get_optimizer_state_dict(self.model, self.optimizers, options=StateDictOptions(cpu_offload=True))
             pyt_to_mcore_state_dict(osd['state'], prefix="optimizer.state.")
             sharded_state_dict["optimizer"] = osd
7 changes: 5 additions & 2 deletions nemo/lightning/pytorch/strategies/megatron_strategy.py
@@ -602,7 +602,10 @@ def save_checkpoint(
         # retrieve `sharded_state_dict` if it has not already been configured in `on_save_checkpoint`
         if "sharded_state_dict" not in checkpoint:
             checkpoint["sharded_state_dict"] = self.megatron_parallel.sharded_state_dict()
-        if self.trainer.state.fn == TrainerFn.FITTING and self.ckpt_include_optimizer:
+
+        ## replace unsharded optimizer_states with sharded dict
+        if "optimizer_states" in checkpoint:
+            del checkpoint["optimizer_states"]
             checkpoint["optimizer"] = [self.optimizer_sharded_state_dict()]

         self.checkpoint_io.save_checkpoint(checkpoint, filepath, storage_options=storage_options)
@@ -630,7 +633,7 @@ def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]:

     @override
     def load_optimizer_state_dict(self, checkpoint: Mapping[str, Any]) -> None:
-        if not self.ckpt_include_optimizer:
+        if not self.ckpt_include_optimizer: ## TODO: remove ckpt_include_optimizer
             return

         optimizer_states = checkpoint["optimizer"]
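
In both the FSDP and Megatron strategies, optimizer state is now sharded and stored only when Lightning actually placed optimizer_states in the checkpoint dict, which is how the callback's save_weights_only decision propagates down; ckpt_include_optimizer remains only on the load path and is marked for removal. A standalone sketch of that contract (simplified names, not NeMo code):

def should_shard_optimizer(checkpoint: dict) -> bool:
    # Lightning adds "optimizer_states" only when a checkpoint is saved with
    # save_weights_only=False, so weights-only saves skip the optimizer path.
    return "optimizer_states" in checkpoint

assert should_shard_optimizer({"state_dict": {}, "optimizer_states": [{}]})
assert not should_shard_optimizer({"state_dict": {}})  # e.g. a final weights-only save
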
5 changes: 4 additions & 1 deletion nemo/lightning/resume.py
@@ -37,6 +37,8 @@ class AutoResume(Resume, io.IOMixin):
     checkpoints in NeMo.
     """

+    WEIGHTS_PATH = "weights"
+
     def __init__(
         self,
         path: Optional[str] = None, ## old resume_from_checkpoint
@@ -169,7 +171,8 @@ def nemo_path(self, model=None) -> Optional[Path]:
         if checkpoint:
             if self.adapter_path:
                 return AdapterPath(checkpoint, adapter_path=Path(self.adapter_path))
-            return Path(checkpoint)
+
+            return Path(checkpoint) / AutoResume.WEIGHTS_PATH

         return None
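
Because weights now live in a subdirectory of each checkpoint, AutoResume resolves the restore path one level deeper. A small sketch of the new resolution (hypothetical paths, adapter handling omitted):

from pathlib import Path

WEIGHTS_PATH = "weights"  # mirrors AutoResume.WEIGHTS_PATH / ModelCheckpoint.WEIGHTS_PATH

def resolved_restore_path(checkpoint_dir: str) -> Path:
    return Path(checkpoint_dir) / WEIGHTS_PATH

print(resolved_restore_path("results/checkpoints/step=5000-last"))
# results/checkpoints/step=5000-last/weights
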
3 changes: 1 addition & 2 deletions tests/collections/llm/test_mnist_model_nemo2.py
@@ -496,13 +496,12 @@ def run_train_mnist_litautoencoder_with_megatron_strategy_single_gpu():
         # Configure our custom Checkpointer
         name = "test_experiment"
         checkpoint_callback = nl_callbacks.ModelCheckpoint(
-            save_best_model=True,
             save_last=True,
             monitor="val_loss",
             save_top_k=1,
             every_n_train_steps=5,
             # Enables the .nemo file-like checkpointing where all IOMixins are under SerDe
-            enable_nemo_ckpt_io=True,
+            always_save_context=True,
         )
         root_dir = tmpdir
         save_dir = root_dir / name
3 changes: 1 addition & 2 deletions tests/collections/llm/test_mnist_model_nemo2_fsdp.py
@@ -519,13 +519,12 @@ def run_train_mnist_litautoencoder_with_fsdp_strategy_single_gpu():
         # Configure our custom Checkpointer
         name = "test_experiment"
         checkpoint_callback = nl_callbacks.ModelCheckpoint(
-            save_best_model=True,
             save_last=True,
             monitor="val_loss",
             save_top_k=1,
             every_n_train_steps=5,
             # Enables the .nemo file-like checkpointing where all IOMixins are under SerDe
-            enable_nemo_ckpt_io=True,
+            always_save_context=True,
         )
         root_dir = tmpdir
         save_dir = root_dir / name