From 7b97a6b0c33f528110fc390622629b41218d578d Mon Sep 17 00:00:00 2001
From: Anmol Gupta <14880251+anmolgupt@users.noreply.github.com>
Date: Mon, 8 May 2023 11:16:34 -0700
Subject: [PATCH 1/2] added train_samples keyword for compliance check

---
 large_language_model/megatron-lm/megatron/training.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/large_language_model/megatron-lm/megatron/training.py b/large_language_model/megatron-lm/megatron/training.py
index 6bbd8f407..63606ccd2 100755
--- a/large_language_model/megatron-lm/megatron/training.py
+++ b/large_language_model/megatron-lm/megatron/training.py
@@ -232,6 +232,9 @@ def pretrain(train_valid_test_dataset_provider,
     mllogger.event(key="trained_samples",
                    value=(args.consumed_train_samples - args.ext_lr_steps) * args.seq_length,
                    sync=False)
+    mllogger.event(key="train_samples",
+                   value=(args.consumed_train_samples - args.ext_lr_steps) * args.seq_length,
+                   sync=False)
     mllogger.end(key=mllogger.constants.BLOCK_STOP,
                  metadata={'first_epoch_num': 0},
                  sync=False)
@@ -813,6 +816,9 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
             mllogger.event(key="trained_samples",
                            value=(args.consumed_train_samples - args.ext_lr_steps) * args.seq_length,
                            sync=False)
+            mllogger.event(key="train_samples",
+                           value=(args.consumed_train_samples - args.ext_lr_steps) * args.seq_length,
+                           sync=False)
             if not saved_checkpoint:
                 save_checkpoint_and_time(iteration, model, optimizer,
                                          opt_param_scheduler)

From 40e35b3beff555b310bb1c2e3fec7db260a301f6 Mon Sep 17 00:00:00 2001
From: Anmol Gupta <14880251+anmolgupt@users.noreply.github.com>
Date: Tue, 9 May 2023 10:04:23 -0700
Subject: [PATCH 2/2] added cache clear logging

---
 large_language_model/megatron-lm/megatron/training.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/large_language_model/megatron-lm/megatron/training.py b/large_language_model/megatron-lm/megatron/training.py
index 63606ccd2..b4f5e1956 100755
--- a/large_language_model/megatron-lm/megatron/training.py
+++ b/large_language_model/megatron-lm/megatron/training.py
@@ -95,6 +95,9 @@ def pretrain(train_valid_test_dataset_provider,
         args_defaults: a dictionary from argument-name to argument-value. It
             to set already parse arguments.
     """
+    # The reference implementation does not clear the cache currently
+    # but the submissions are required to do so
+    mllogger.event(key=mllogger.constants.CACHE_CLEAR, value=True)
     mllogger.start(key=mllogger.constants.INIT_START, sync=False)
 
     # Initalize and get arguments, timers, and Tensorboard writer.
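
For reference outside the patches above, the sketch below shows how the same compliance events could be emitted with the upstream mlperf_logging package. The mllogger used in the diff is the benchmark's own wrapper (hence the sync keyword and the args.* fields), so those are not reproduced here; the numeric values are purely illustrative stand-ins.

# Illustrative sketch only -- not part of the patches above.
from mlperf_logging import mllog
from mlperf_logging.mllog import constants

mllogger = mllog.get_mllogger()

# Submissions must log that caches were cleared before initialization starts.
mllogger.event(key=constants.CACHE_CLEAR, value=True)
mllogger.start(key=constants.INIT_START)

# Hypothetical run statistics, standing in for args.consumed_train_samples,
# args.ext_lr_steps and args.seq_length from the reference implementation.
consumed_train_samples = 1_536_000
ext_lr_steps = 0
seq_length = 2048

# Report consumed samples scaled by sequence length, mirroring the patch.
mllogger.event(key=constants.TRAIN_SAMPLES,
               value=(consumed_train_samples - ext_lr_steps) * seq_length)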