[LLM] add memory stats to logger of trainer #8269

Merged · 1 commit · Apr 17, 2024
18 changes: 16 additions & 2 deletions paddlenlp/trainer/trainer.py
@@ -39,6 +39,8 @@
 import paddle.distributed as dist
 import paddle.nn as nn
 from packaging import version
+from paddle import framework
+from paddle.base import core
 from paddle.distributed import fleet
 from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.hybrid_parallel_optimizer import (
     HybridParallelOptimizer,
@@ -1256,6 +1258,20 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval,
             logs["learning_rate"] = float("{0:.3e}".format(self._get_learning_rate()))
             logs["global_step"] = int(self.state.global_step)
 
+            divisor = 2**30
+            # TODO(@gexiao): replace these codes with unified APIs in Paddle
+            current_device = framework._current_expected_place_()
+            if str(current_device) != "Place(cpu)":
+                device_id = current_device.get_device_id()
+                current_memory_allocated = core.device_memory_stat_current_value("Allocated", device_id)
+                current_memory_reserved = core.device_memory_stat_current_value("Reserved", device_id)
+                max_memory_allocated = core.device_memory_stat_peak_value("Allocated", device_id)
+                max_memory_reserved = core.device_memory_stat_peak_value("Reserved", device_id)
+                logs["current_memory_allocated"] = current_memory_allocated / divisor
+                logs["current_memory_reserved"] = current_memory_reserved / divisor
+                logs["max_memory_allocated"] = max_memory_allocated / divisor
+                logs["max_memory_reserved"] = max_memory_reserved / divisor
+
             total_train_batch_size = (
                 self.args.train_batch_size * self.args.gradient_accumulation_steps * self.args.dataset_world_size
             )
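For reference, a minimal standalone sketch (not part of the diff) of the same query the new trainer code performs: it reads current and peak allocated/reserved device memory through the paddle.base.core helpers used above and converts bytes to GiB. It assumes a non-CPU default place and a Paddle build that exposes these stat helpers.

# Standalone sketch: query Paddle device memory stats as the trainer change does,
# reporting values in GiB. Assumes a GPU/XPU default place.
from paddle import framework
from paddle.base import core

GiB = 2**30
place = framework._current_expected_place_()
if str(place) != "Place(cpu)":
    device_id = place.get_device_id()
    memory_stats = {
        "current_memory_allocated": core.device_memory_stat_current_value("Allocated", device_id) / GiB,
        "current_memory_reserved": core.device_memory_stat_current_value("Reserved", device_id) / GiB,
        "max_memory_allocated": core.device_memory_stat_peak_value("Allocated", device_id) / GiB,
        "max_memory_reserved": core.device_memory_stat_peak_value("Reserved", device_id) / GiB,
    }
    print(memory_stats)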
@@ -1586,8 +1602,6 @@ def _load_rng_state(self, checkpoint):
         random.setstate(checkpoint_rng_state["python"])
         np.random.set_state(checkpoint_rng_state["numpy"])
 
-        core = paddle.framework.core
-
         core.default_cpu_generator().set_state(checkpoint_rng_state["cpu"])
         if core.is_compiled_with_cuda():
             if not len(checkpoint_rng_state["cuda"]) == core.get_cuda_device_count():
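A note on the second hunk: the local alias core = paddle.framework.core is dropped because core is now imported at module level (from paddle.base import core, added in the first hunk). A tiny sketch, under the assumption that paddle.framework re-exports the same core extension module as paddle.base:

# Sketch (assumption): paddle.framework.core and paddle.base.core resolve to the
# same extension module, so the module-level import makes the local alias redundant.
import paddle
from paddle.base import core

print(paddle.framework.core is core)  # expected to print True under that assumption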