Wrap CPU model init with megatron_lazy_init_context (#10219)
* Wrap CPU model init with megatron_lazy_init_context

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* Cleanup checkpoint-dir if saving fails

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: akoumpa <akoumpa@users.noreply.github.com>

---------

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Signed-off-by: akoumpa <akoumpa@users.noreply.github.com>
Co-authored-by: akoumpa <akoumpa@users.noreply.github.com>
2 people authored and Shanmugam Ramasamy committed Aug 27, 2024
Parent: 7a8c0e8 · Commit: ac5cb06
Showing 2 changed files with 18 additions and 8 deletions.
nemo/lightning/io/connector.py: 2 additions & 1 deletion

@@ -145,6 +145,7 @@ def nemo_setup(self, model: pl.LightningModule, trainer: Optional[pl.Trainer] =
             pl.Trainer: The trainer configured with the model and strategy.
         """
         from nemo.lightning import MegatronStrategy, Trainer
+        from nemo.lightning._strategy_lib import megatron_lazy_init_context
 
         _trainer = trainer or Trainer(
             devices=1, accelerator="cpu", strategy=MegatronStrategy(store_optimizer_states=False)
@@ -155,7 +156,7 @@ def nemo_setup(self, model: pl.LightningModule, trainer: Optional[pl.Trainer] =
 
         if not model.state_dict():
             _trainer.strategy.lazy_init = True
-            with _trainer.init_module():
+            with _trainer.init_module(), megatron_lazy_init_context(model.config):
                 model.configure_model()
 
         return _trainer
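For context on this first change: wrapping configure_model() in a lazy-init context lets the CPU-side setup path build the module structure without spending time on weight initialization that imported checkpoint weights will replace. The sketch below is a minimal, hypothetical illustration of that general pattern in plain PyTorch; skip_weight_init is an illustrative stand-in and is not NeMo's megatron_lazy_init_context.

import contextlib

import torch


@contextlib.contextmanager
def skip_weight_init():
    # Hypothetical stand-in for a lazy-init context: temporarily replace a few
    # torch.nn.init functions with no-ops so building a large model on CPU does
    # not pay for random initialization that a loaded checkpoint overwrites.
    saved = {
        "normal_": torch.nn.init.normal_,
        "uniform_": torch.nn.init.uniform_,
        "kaiming_uniform_": torch.nn.init.kaiming_uniform_,
    }
    try:
        for name in saved:
            setattr(torch.nn.init, name, lambda tensor, *args, **kwargs: tensor)
        yield
    finally:
        # Restore the real initializers even if model construction raised.
        for name, fn in saved.items():
            setattr(torch.nn.init, name, fn)


# Usage: construct cheaply on CPU, then load real weights afterwards.
with skip_weight_init():
    model = torch.nn.Linear(4096, 4096)
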
nemo/lightning/io/pl.py: 16 additions & 7 deletions

@@ -126,13 +126,22 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio
 
         validate_sharding_integrity = not (self.validated_consistency and self.assume_constant_structure)
         self.validated_consistency = True
-        return dist_checkpointing.save(
-            sharded_state_dict=checkpoint,
-            checkpoint_dir=checkpoint_dir,
-            sharded_strategy=self.save_sharded_strategy,
-            validate_access_integrity=validate_sharding_integrity,
-            async_sharded_save=self.async_save,
-        )
+
+        try:
+            return dist_checkpointing.save(
+                sharded_state_dict=checkpoint,
+                checkpoint_dir=checkpoint_dir,
+                sharded_strategy=self.save_sharded_strategy,
+                validate_access_integrity=validate_sharding_integrity,
+                async_sharded_save=self.async_save,
+            )
+        except:
+            logging.error(f"Failed to save checkpoint to {checkpoint_dir}")
+            # Do cleanup.
+            import shutil
+
+            shutil.rmtree(checkpoint_dir)
+            raise
 
     @override
     def load_checkpoint(
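The second change makes save_checkpoint clean up after itself: if dist_checkpointing.save raises, the error is logged, the partially written checkpoint directory is removed with shutil.rmtree, and the exception is re-raised. The same cleanup-on-failure behavior can also be expressed as a small reusable context manager; the sketch below is a hypothetical illustration (cleanup_dir_on_failure and save_fn are made-up names, not NeMo or Megatron APIs), not the code the commit adds.

import contextlib
import shutil
from pathlib import Path


@contextlib.contextmanager
def cleanup_dir_on_failure(checkpoint_dir):
    # Hypothetical helper: if the body of the `with` block raises, delete the
    # partially written directory so later runs do not pick up a broken
    # checkpoint, then re-raise so the caller still sees the original error.
    path = Path(checkpoint_dir)
    try:
        yield path
    except Exception:
        shutil.rmtree(path, ignore_errors=True)
        raise


# Usage sketch, with save_fn standing in for dist_checkpointing.save:
#
# with cleanup_dir_on_failure(checkpoint_dir) as ckpt_dir:
#     save_fn(sharded_state_dict=checkpoint, checkpoint_dir=ckpt_dir)

One design note: the diff uses a bare except:, which also catches KeyboardInterrupt and SystemExit, so the cleanup runs on interrupts as well; because the exception is re-raised either way, the caller still sees the original failure.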
