NVIDIA · srivatsankrishnan · Feb 12, 2025 · Feb 5, 2025 · Feb 6, 2025 · Feb 6, 2025
diff --git a/conf/common/test/dse_nemo_run_llama3_8b_fp8.toml b/conf/common/test/dse_nemo_run_llama3_8b_fp8.toml
@@ -0,0 +1,49 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "dse_nemo_run_llama3_8b_fp8"
+description = "dse_nemo_run_llama3_8b"
+test_template_name = "NeMoRun"
+
+[cmd_args]
+docker_image_url = "nvcr.io/nvidia/nemo:24.12.rc3"
+task = "pretrain"
+recipe_name = "llama3_8b"
+
+  [cmd_args.data]
+  micro_batch_size = [1]
+  global_batch_size = [128, 256, 512]
+
+  [cmd_args.trainer]
+  max_steps = 100
+  val_check_interval = 1000
+  num_nodes = 1
+
+    [cmd_args.trainer.strategy]
+    tensor_model_parallel_size = [1]
+    pipeline_model_parallel_size = [1]
+    context_parallel_size = [2]
+
+    [cmd_args.trainer.plugins]
+    fp8 = "hybrid"
+    fp8_margin = 0
+    fp8_amax_history_len = 1024
+    fp8_amax_compute_algo = "max"
+    fp8_params = true
+
+  [cmd_args.log.ckpt]
+  save_on_train_epoch_end = false
+  save_last = false
diff --git a/src/cloudai/test_definitions/nemo_run.py b/src/cloudai/test_definitions/nemo_run.py
@@ -22,6 +22,17 @@
 from cloudai.installer.installables import DockerImage, Installable
 
 
+class Plugin(BaseModel):
+    """Plugin configuration for NeMoRun."""
+
+    fp8: Optional[str] = "hybrid"
+    fp8_margin: Optional[int] = 0
+    fp8_amax_history_len: Optional[int] = 1
+    fp8_amax_compute_algo: Optional[str] = "most_recent"
+    fp8_wgrad: Optional[bool] = True
+    fp8_params: Optional[bool] = True
+
+
 class Data(BaseModel):
     """Data configuration for NeMoRun."""
 
@@ -45,6 +56,7 @@ class Trainer(BaseModel):
     val_check_interval: Union[int, List[int]] = 1000
     num_nodes: Union[int, List[int]] = 1
     strategy: TrainerStrategy = Field(default_factory=TrainerStrategy)
+    plugins: Plugin = Field(default_factory=Plugin)
 
 
 class LogCkpt(BaseModel):