Apply isort and black reformatting
Signed-off-by: ShriyaPalsamudram <ShriyaPalsamudram@users.noreply.github.com>
ShriyaPalsamudram committed Aug 15, 2024
1 parent 5c4e11b commit f79a1b7
Showing 2 changed files with 14 additions and 6 deletions.
nemo/lightning/pytorch/callbacks/__init__.py (10 changes: 9 additions & 1 deletion)
@@ -7,4 +7,12 @@
 from nemo.lightning.pytorch.callbacks.progress import MegatronProgressBar
 
 
-__all__ = ["MemoryProfileCallback", "ModelCheckpoint", "ModelTransform", "PEFT", "NsysCallback", "MegatronProgressBar", "PreemptionCallback"]
+__all__ = [
+    "MemoryProfileCallback",
+    "ModelCheckpoint",
+    "ModelTransform",
+    "PEFT",
+    "NsysCallback",
+    "MegatronProgressBar",
+    "PreemptionCallback",
+]
nemo/lightning/pytorch/callbacks/memory_profiler.py (10 changes: 5 additions & 5 deletions)
@@ -11,7 +11,7 @@
 
 class MemoryProfileCallback(Callback, io.IOMixin):
     """
-    This callback enables recording a timeline of memory allocations during training. 
+    This callback enables recording a timeline of memory allocations during training.
     The generated .pickle profiles can be analyzed at https://pytorch.org/memory_viz
 
     More info about the profiles can be found [here](https://pytorch.org/blog/understanding-gpu-memory-1/).
@@ -30,7 +30,6 @@ def __init__(self, dir: str = "/mem_profile"):
         os.makedirs(self.dir, exist_ok=True)
         logging.info(f"Torch memory profiles will be written to: {self.dir},")
 
-
     def setup(self, trainer, pl_module, stage) -> None:
         """PyTorch Lightning hook:
         https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-end
@@ -40,19 +39,20 @@ def setup(self, trainer, pl_module, stage) -> None:
         if torch.distributed.is_initialized():
             torch.cuda.memory._record_memory_history(max_entries=100000)
 
-
     def on_train_end(self, trainer, pl_module) -> None:
         """PyTorch Lightning hook:
         https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-end
         We use it here to finish memory profiling and write the snapshot.
         """
 
-        logging.info(f"on_train_batch_end rank: {torch.distributed.get_rank()} mem: {torch.cuda.memory_allocated()/1024/1024/1024} / {torch.cuda.max_memory_reserved()/1024/1024/1024}")
+        logging.info(
+            f"on_train_batch_end rank: {torch.distributed.get_rank()} mem: {torch.cuda.memory_allocated()/1024/1024/1024} / {torch.cuda.max_memory_reserved()/1024/1024/1024}"
+        )
 
         if torch.distributed.is_initialized():
             rank = torch.distributed.get_rank()
             _snapshot_path = f"{self.dir}/memory_snapshot-rank{rank}.pickle"
             logging.info(f"Writing memory profile snapshot to {_snapshot_path}")
             torch.cuda.memory._dump_snapshot(f"{_snapshot_path}")
             torch.cuda.memory._record_memory_history(enabled=None)
-            logging.info(f"Finished writing memory profile snapshot: {_snapshot_path}")
+            logging.info(f"Finished writing memory profile snapshot: {_snapshot_path}")
