Introduce Pydantic to verify Test schema #145

Merged
merged 71 commits into main from am/schema
Sep 25, 2024
651b456
Use Pydantic as a schema validator for Tests
amaslenn Jul 11, 2024
362ba9b
Cleanup
amaslenn Jul 11, 2024
54807c5
Remove pprint
amaslenn Jul 11, 2024
daab5ab
Make pyright happy
amaslenn Jul 11, 2024
56cab57
Merge branch 'main' into am/schema
amaslenn Jul 16, 2024
e5805c0
Cleanup and use Registry
amaslenn Jul 17, 2024
8a59902
Merge branch 'main' into am/schema
amaslenn Jul 17, 2024
94703ae
Merge branch 'main' into am/schema
amaslenn Aug 27, 2024
13acc95
Format llama.toml
amaslenn Aug 27, 2024
2c1b700
Fix header tests
amaslenn Aug 27, 2024
2fd8863
Update doc and exported symbols
amaslenn Aug 27, 2024
66e8397
Remove needless method
amaslenn Aug 27, 2024
426c837
Fix tests collection, make it stricter
amaslenn Aug 27, 2024
240e839
Make ruff happy
amaslenn Aug 27, 2024
ed3349e
Make extra_cmd_args a dict[str, str]
amaslenn Aug 27, 2024
7cf1324
Add --mode verify-tests
amaslenn Aug 27, 2024
2869598
More tests
amaslenn Aug 27, 2024
75dac07
Improve verify-tests mode
amaslenn Aug 27, 2024
28294fb
Fix a typo
amaslenn Aug 27, 2024
c8589a2
Refactoring
amaslenn Aug 28, 2024
8506441
Add initial support for Grok and GPT
amaslenn Aug 28, 2024
b7f26f7
Add pydantic to pyproject.toml's requirements
amaslenn Aug 28, 2024
b2e826e
Re-org for test definitions
amaslenn Aug 28, 2024
13ac594
Small reorg in pyproject
amaslenn Aug 28, 2024
edb0c73
Support for more configs
amaslenn Aug 30, 2024
931970c
Update definitions
amaslenn Aug 30, 2024
168ca54
Merge branch 'main' into am/schema
amaslenn Aug 30, 2024
da22d24
Align NCCL extra args with existing implementation
amaslenn Aug 30, 2024
a1a34b2
Make pyright happy
amaslenn Aug 30, 2024
96416c6
Merge branch 'main' into am/schema
amaslenn Sep 10, 2024
fc7a50d
Fixes
amaslenn Sep 10, 2024
6884279
Refactor TestDefinitions into own modules
amaslenn Sep 10, 2024
0a472bc
Make ruff happy
amaslenn Sep 10, 2024
d2fe3af
Update lint-import rules
amaslenn Sep 10, 2024
43a9e18
Simplify test definitions
amaslenn Sep 10, 2024
8f6bb07
Fix taplo formatting
amaslenn Sep 10, 2024
389757c
Merge branch 'main' into am/schema
amaslenn Sep 16, 2024
3d9fcb5
Use format_validation_error() for formatting pydantic errors
amaslenn Sep 16, 2024
cb1fe02
Make ruff happy
amaslenn Sep 16, 2024
37127a0
Make docker_image_url a required argument in Test TOML for Chakra
amaslenn Sep 17, 2024
c1fc351
Do not enforce limitations for NeMo test
amaslenn Sep 17, 2024
98e6cb7
Simplify UCC definition, make ruff happy
amaslenn Sep 17, 2024
360a4dc
Update test definitions to work with external configs
amaslenn Sep 18, 2024
ae2f517
Get rid of test template TOMLs
amaslenn Sep 18, 2024
909b363
Fixes
amaslenn Sep 18, 2024
77c370f
Refactor tests for cmd_gen
amaslenn Sep 18, 2024
19b5ef5
More fixes and tests for JaxToolbox-based tests
amaslenn Sep 18, 2024
d54162f
Fixed tests
amaslenn Sep 18, 2024
ef8f3fa
Updates and fixes
amaslenn Sep 18, 2024
a576f14
Make vulture happy
amaslenn Sep 18, 2024
d49e519
Update docs
amaslenn Sep 18, 2024
8950b82
Fix XLA flags formatting
amaslenn Sep 18, 2024
564af34
Set default value for Grok.fdl_config
amaslenn Sep 18, 2024
8516dc2
Remove unused code
amaslenn Sep 18, 2024
b65cb85
Make xla boolean values lowcase
amaslenn Sep 19, 2024
e047ace
Fix tests
amaslenn Sep 19, 2024
dbdf4ac
Add --mode verify-tests example
amaslenn Sep 19, 2024
bfcaf6b
Merge branch 'main' into am/schema
amaslenn Sep 20, 2024
7587937
More alignments with original code
amaslenn Sep 23, 2024
8877f54
Remove Jax and related test definitions
amaslenn Sep 24, 2024
693ca32
Update USER_GUIDE.md on test template vs test, etc.
amaslenn Sep 24, 2024
57ee40f
Improve UI when there are parsing issues in Test configs
amaslenn Sep 24, 2024
938df4e
Fix imports
amaslenn Sep 24, 2024
b547094
Revert accidential change
amaslenn Sep 24, 2024
0513ba9
Path absolute path + bug fix for pre_test and load container flags
srivatsankrishnan Sep 24, 2024
4dda839
fix pre_test flag
srivatsankrishnan Sep 24, 2024
7611288
revert changes
srivatsankrishnan Sep 25, 2024
1fe9311
ruff fixes
srivatsankrishnan Sep 25, 2024
5594cf2
more ruff fixes
srivatsankrishnan Sep 25, 2024
a3e6d27
Use load_container directly
amaslenn Sep 25, 2024
7dcf8a7
Merge branch 'main' into am/schema
TaekyungHeo Sep 25, 2024
19 changes: 11 additions & 8 deletions README.md
@@ -63,13 +63,12 @@ CloudAI supports five modes: install, dry-run, run, generate-report, and uninstall
* Use the generate-report mode to generate reports under the test directories alongside the raw data.
* Use the uninstall mode to remove installed test templates.

To install test templates, run CloudAI CLI in install mode.
To install test prerequisites, run CloudAI CLI in install mode.
Please make sure to use the correct system configuration file that corresponds to your current setup for installation and experiments.
```bash
cloudai \
--mode install \
--system-config conf/common/system/example_slurm_cluster.toml \
--test-templates-dir conf/common/test_template \
--tests-dir conf/common/test
```

@@ -78,7 +77,6 @@ To simulate running experiments without execution, use the dry-run mode:
cloudai \
--mode dry-run \
--system-config conf/common/system/example_slurm_cluster.toml \
--test-templates-dir conf/common/test_template \
--tests-dir conf/common/test \
--test-scenario conf/common/test_scenario/sleep.toml
```
@@ -88,7 +86,6 @@ To run experiments, execute CloudAI CLI in run mode:
cloudai \
--mode run \
--system-config conf/common/system/example_slurm_cluster.toml \
--test-templates-dir conf/common/test_template \
--tests-dir conf/common/test \
--test-scenario conf/common/test_scenario/sleep.toml
```
@@ -98,19 +95,17 @@ To generate reports, execute CloudAI CLI in generate-report mode:
cloudai \
--mode generate-report \
--system-config conf/common/system/example_slurm_cluster.toml \
--test-templates-dir conf/common/test_template \
--tests-dir conf/common/test \
--output-dir /path/to/output_directory
```
In the generate-report mode, use the --output-dir argument to specify a subdirectory under the result directory.
This subdirectory is usually named with a timestamp for unique identification.

To uninstall test templates, run CloudAI CLI in uninstall mode:
To uninstall test prerequisites, run CloudAI CLI in uninstall mode:
```bash
cloudai\
--mode uninstall\
--system-config conf/common/system/example_slurm_cluster.toml\
--test-templates-dir conf/common/test_template\
--tests-dir conf/common/test
```

@@ -119,11 +114,19 @@ Verify if system configs are valid:
cloudai \
--mode verify-systems \
--tests-dir conf/common/test \
--test-templates-dir conf/common/test_template \
--system-config conf/common/system
```
`--system-config` can be a file or a directory to verify all configs in the directory.

Verify if test configs are valid:
```bash
cloudai \
--mode verify-tests \
--system-config conf/common/system/example_slurm_cluster.toml \
--tests-dir conf/common/test
```
`--tests-dir` can be a file or a directory to verify all configs in the directory.

## Contributing
Feel free to contribute to the CloudAI project. Your contributions are highly appreciated.

74 changes: 24 additions & 50 deletions USER_GUIDE.md
@@ -1,8 +1,5 @@
# CloudAI User Guide
This is a CloudAI user guide to help users use CloudAI, covering topics such as adding a new test template and downloading datasets for running NeMo-launcher.

## Adding a New Test Template
CloudAI allows users to package workloads as test templates to facilitate the automation of running experiments. This method involves packaging workloads as docker images, which is one of several approaches you can take with CloudAI. Users can run workloads using test templates. However, since docker images are not part of the CloudAI distribution, users must build their own docker image. This guide describes how to build a docker image and then run experiments.
This is a CloudAI user guide covering topics such as adding new tests and downloading datasets for running NeMo-launcher.

#### Step 1: Create a Docker Image
1. **Set Up the GitLab Repository**
@@ -49,49 +46,30 @@ CloudAI allows users to package workloads as test templates to facilitate the automation of running experiments.

#### Step 2: Prepare configuration files
CloudAI is fully configurable via a set of TOML configuration files. You can find examples of these files under `conf/common`. In this guide, we will use the following configuration files:
1. `myconfig/test_templates/nccl_template.toml` - Describes the test template configuration.
1. `myconfig/system.toml` - Describes the system configuration.
1. `myconfig/tests/nccl_test.toml` - Describes the test to run.
1. `myconfig/scenario.toml` - Describes the test scenario configuration.


#### Step 3: Test Template
Test template config describes all arguments of a test. Let's create a test template file for the NCCL test. You can find more examples of test templates under `conf/common/test_template/`. Our example will be small for demonstration purposes. Below is the `myconfig/test_templates/nccl_template.toml` file:
```toml
name = "NcclTest"
#### Step 3: Test Definition
A test definition is a Pydantic model that describes the arguments of a test. Such models should inherit from the `TestDefinition` class:
```py
class MyTestCmdArgs(CmdArgs):
an_arg: str
docker_image_url: str = "nvcr.io/nvidia/pytorch:24.02-py3"

[cmd_args]
[cmd_args.docker_image_url]
type = "str"
default = "nvcr.io/nvidia/pytorch:24.02-py3"

[cmd_args.subtest_name]
type = "preset"
values = ["all_reduce_perf_mpi"]
default = "all_reduce_perf_mpi"

[cmd_args.ngpus]
type = "int"
default = "1"

[cmd_args.minbytes]
type = "str"
default = "32M"

[cmd_args.maxbytes]
type = "str"
default = "32M"

[cmd_args.iters]
type = "int"
default = "20"

[cmd_args.warmup_iters]
type = "int"
default = "5"
class MyTestDefinition(TestDefinition):
cmd_args: MyTestCmdArgs
```
Notice that `cmd_args.docker_image_url` defaults to `nvcr.io/nvidia/pytorch:24.02-py3`, but you can use the Docker image from Step 1 instead.
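To see the validation in action, here is a self-contained sketch. It assumes plain `pydantic.BaseModel` as a stand-in for CloudAI's `CmdArgs` and `TestDefinition` base classes; field names mirror the example above:

```python
from pydantic import BaseModel, ValidationError

# Stand-ins for CloudAI's CmdArgs/TestDefinition base classes (assumption
# for illustration; the real bases add CloudAI-specific behavior).
class MyTestCmdArgs(BaseModel):
    an_arg: str
    docker_image_url: str = "nvcr.io/nvidia/pytorch:24.02-py3"

class MyTestDefinition(BaseModel):
    name: str
    cmd_args: MyTestCmdArgs

# A well-formed config (e.g. the [cmd_args] table parsed from a Test TOML)
# validates cleanly and picks up defaults:
ok = MyTestDefinition(name="my_test", cmd_args={"an_arg": "value"})

# A config missing a required field is rejected with a readable error:
try:
    MyTestDefinition(name="my_test", cmd_args={})
    error_count = 0
except ValidationError as exc:
    error_count = len(exc.errors())
```

Running the sketch, `ok.cmd_args.docker_image_url` falls back to the default image, while the second config fails validation with a single error for the missing `an_arg`.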

A custom test definition should be registered to handle the relevant Test Configs. For this, the `Registry()` object is used:
```py
Registry().add_test_definition("MyTest", MyTestDefinition)
Registry().add_test_template("MyTest", MyTest)
```
Relevant Test Configs should specify `test_template_name = "MyTest"` to use the custom test definition.
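The `Registry()` calls above can be pictured as a simple name-to-class mapping. A minimal sketch of how such a registry might work (hypothetical and simplified; CloudAI's actual `Registry` does more):

```python
class Registry:
    """Simplified stand-in for CloudAI's Registry: maps a test_template_name
    to its test definition class. The class-level dict makes every
    Registry() instance share the same mapping."""

    _test_definitions: dict = {}

    def add_test_definition(self, name: str, definition: type) -> None:
        if name in self._test_definitions:
            raise ValueError(f"Test definition '{name}' is already registered")
        self._test_definitions[name] = definition

    def get_test_definition(self, name: str) -> type:
        return self._test_definitions[name]


class MyTestDefinition:
    """Placeholder for the Pydantic model defined earlier."""


Registry().add_test_definition("MyTest", MyTestDefinition)

# Later, when a Test TOML with test_template_name = "MyTest" is parsed,
# the loader looks the definition class up by name:
definition_cls = Registry().get_test_definition("MyTest")
```

Sharing one class-level dict is what lets every `Registry()` call site see the same registrations without passing an instance around.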

#### Step 3: System Config
System config describes the system configuration. You can find more examples of system configs under `conf/common/system/`. Our example will be small for demonstration purposes. Below is the `myconfig/system.toml` file:
```toml
@@ -119,12 +97,11 @@ Once all configs are ready, it is time to install test requirements.
```bash
cloudai --mode install \
--system-config myconfig/system.toml \
--test-templates-dir myconfig/test_templates/ \
--tests-dir myconfig/tests/
```

#### Step 5: Test Configuration
Test Config describes a particular test configuration to be run. It is based on Test Template and will be used in Test Sceanrio. Below is the `myconfig/tests/nccl_test.toml` file:
Test Config describes a particular test configuration to be run. It is based on a test definition and will be used in a Test Scenario. Below is the `myconfig/tests/nccl_test.toml` file; it is based on the built-in `NcclTest` definition:
```toml
name = "nccl_test_all_reduce_single_node"
description = "all_reduce"
@@ -174,7 +151,6 @@ To generate NCCL test commands without actual execution, use the `dry-run` mode.
cloudai --mode dry-run \
--test-scenario myconfig/scenario.toml \
--system-config myconfig/system.toml \
--test-templates-dir myconfig/test_templates/ \
--tests-dir myconfig/tests/
```

@@ -183,7 +159,6 @@ You can run NCCL test experiments with the following command.
cloudai --mode run \
--test-scenario myconfig/scenario.toml \
--system-config myconfig/system.toml \
--test-templates-dir myconfig/test_templates/ \
--tests-dir myconfig/tests/
```

@@ -193,7 +168,6 @@ Once the test scenario is completed, you can generate reports using the following command:
cloudai --mode generate-report \
--test-scenario myconfig/scenario.toml \
--system-config myconfig/system.toml \
--test-templates-dir myconfig/test_templates/ \
--tests-dir myconfig/tests/ \
--output-dir results/2024-06-18_17-40-13/
```
@@ -257,14 +231,14 @@ cache_docker_images_locally = true
### Field Descriptions
- **name**: Specifies the name of the system. Users can choose any name that is convenient for them.
- **scheduler**: Indicates the type of system. It should be one of the supported types, currently `slurm` or `standalone`. `slurm` refers to a system with the Slurm scheduler, while `standalone` refers to a single-node system without any slave nodes.
- **install_path**: Specifies the path where test templates are installed. Docker images are downloaded to this path if the user chooses to cache Docker images.
- **install_path**: Specifies the path where test prerequisites are installed. Docker images are downloaded to this path if the user chooses to cache Docker images.
- **output_path**: Defines the default path where outputs are stored. Whenever a user runs a test scenario, a new subdirectory will be created under this path.
- **default_partition**: Specifies the default partition where jobs are scheduled.
- **partitions**: Describes the available partitions and nodes within those partitions.
- **groups**: Within the same partition, users can define groups of nodes. This is a logical grouping that does not overlap between groups. The group concept can be used to allocate nodes from specific groups in a test scenario schema.
- **mpi**: Indicates the Process Management Interface (PMI) implementation to be used for inter-process communication.
- **gpus_per_node** and **ntasks_per_node**: These are Slurm arguments passed to the `sbatch` script and `srun`.
- **cache_docker_images_locally**: Specifies whether CloudAI should cache remote Docker images locally during installation. If set to `true`, CloudAI will cache the Docker images, enabling local access without needing to download them each time a test template is run. This approach saves network bandwidth but requires more disk capacity. If set to `false`, CloudAI will allow Slurm to download the Docker images as needed when they are not cached locally by Slurm.
- **cache_docker_images_locally**: Specifies whether CloudAI should cache remote Docker images locally during installation. If set to `true`, CloudAI will cache the Docker images, enabling local access without needing to download them each time a test is run. This approach saves network bandwidth but requires more disk capacity. If set to `false`, CloudAI will allow Slurm to download the Docker images as needed when they are not cached locally by Slurm.
- **global_env_vars**: Lists all global environment variables that will be applied globally whenever tests are run.

## Describing a Test Scenario in the Test Scenario Schema
@@ -306,7 +280,7 @@ Dependencies of a test can be described as a subsection of the test.


## Downloading and Installing the NeMo Dataset (The Pile Dataset)
This section describes how you can download the NeMo datasets on your server. The install mode of CloudAI handles the installation of all test templates, but downloading and installing datasets is not the responsibility of the install mode. This is because any large datasets should be installed globally by the administrator and shared with multiple users, even if a user does not use CloudAI. For CloudAI users, we provide a detailed guide about downloading and installing the NeMo datasets in this section. To understand the datasets available in the NeMo framework, you can refer to the Data Preparation section of [the document](https://docs.nvidia.com/nemo-framework/user-guide/latest/llms/baichuan2/dataprep.html). According to the document, you can download and use the Pile dataset. The document also provides detailed instructions on how to download these datasets for various platforms. Let’s assume that we have a Slurm cluster.
This section describes how you can download the NeMo datasets on your server. The install mode of CloudAI handles the installation of all test prerequisites, but downloading and installing datasets is not the responsibility of the install mode. This is because any large datasets should be installed globally by the administrator and shared with multiple users, even if a user does not use CloudAI. For CloudAI users, we provide a detailed guide about downloading and installing the NeMo datasets in this section. To understand the datasets available in the NeMo framework, you can refer to the Data Preparation section of [the document](https://docs.nvidia.com/nemo-framework/user-guide/latest/llms/baichuan2/dataprep.html). According to the document, you can download and use the Pile dataset. The document also provides detailed instructions on how to download these datasets for various platforms. Let’s assume that we have a Slurm cluster.

You can download the datasets with the following command:
```bash
@@ -364,7 +338,7 @@ You can update the fields to adjust the behavior.
3. Replace "training.model.tokenizer.model=TOKENIZER_MODEL" with "training.model.tokenizer.model=YOUR_TOKENIZER_PATH" (the tokenizer should be a .model file) in conf/common/test/llama.toml.

## Troubleshooting
In this section, we will guide you through identifying the root cause of issues, determining whether they stem from system infrastructure or a bug in CloudAI. Users should closely follow the USER_GUIDE.md and README.md for installation, adding test templates, tests, and test scenarios.
In this section, we will guide you through identifying the root cause of issues, determining whether they stem from system infrastructure or a bug in CloudAI. Users should closely follow the USER_GUIDE.md and README.md for installation, tests, and test scenarios.

### Identifying the Root Cause
If you encounter issues running a command, start by reading the error message to understand the root cause. We strive to make our error messages and exception messages as readable and interpretable as possible.
@@ -373,7 +347,7 @@ If you encounter issues running a command, start by reading the error message to understand the root cause.
To determine whether an issue is due to system infrastructure or a CloudAI bug, follow these steps:

1. **Check stdout Messages**
If CloudAI fails to run a test template successfully, it will be indicated in the stdout messages that a test has failed.
If CloudAI fails to run a test successfully, it will be indicated in the stdout messages that a test has failed.

2. **Review Log Files**
- Navigate to the output directory and review `debug.log`, stdout, and stderr files.
@@ -392,8 +366,8 @@ To determine whether an issue is due to system infrastructure or a CloudAI bug, follow these steps:

If the problem persists, please report the issue at [https://github.com/NVIDIA/cloudai/issues/new/choose](https://github.com/NVIDIA/cloudai/issues/new/choose). When you report an issue, ensure it is reproducible. Follow the issue template and provide any necessary details, such as the hash commit used, system settings, any changes in the schema files, and the command.

### Test Template-Specific Troubleshooting Guides
In addition to the general troubleshooting steps, this section provides specific troubleshooting guides for each test template used in CloudAI. These guides help you identify and resolve issues unique to each template.
### Test-Specific Troubleshooting Guides
In addition to the general troubleshooting steps, this section provides specific troubleshooting guides for each test used in CloudAI. These guides help you identify and resolve issues unique to each test.

#### NeMo Launcher
* If your run is not successful, please review the stderr and stdout files generated under the results directory. Within the output directory, locate the run directory, and under the run directory, you will find stderr files like log-nemo-megatron-run_[job_id].err. Please review these files for any meaningful error messages.
5 changes: 4 additions & 1 deletion conf/common/test/chakra_replay.toml
@@ -17,7 +17,10 @@
name = "chakra_replay"
description = "chakra_replay"
test_template_name = "ChakraReplay"
extra_cmd_args = "--reuse-tensors"

[cmd_args]
"trace_path" = "TRACE_PATH"
docker_image_url = "DOCKER_IMAGE"

[extra_cmd_args]
"--reuse-tensors" = ""
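The `[extra_cmd_args]` table replaces the old `extra_cmd_args` string: each key is a flag, and an empty value marks a bare switch such as `--reuse-tensors`. A sketch of how such a table might be rendered back into a command-line fragment (hypothetical helper, not CloudAI's actual code):

```python
def render_extra_cmd_args(extra_cmd_args: dict) -> str:
    """Render a flag -> value table as a CLI fragment.
    An empty value means the flag is a bare switch."""
    parts = []
    for flag, value in extra_cmd_args.items():
        parts.append(flag if value == "" else f"{flag} {value}")
    return " ".join(parts)

# The chakra_replay config above:
print(render_extra_cmd_args({"--reuse-tensors": ""}))   # --reuse-tensors
# A flag with a value, as in the NCCL test configs below:
print(render_extra_cmd_args({"--stepfactor": "2"}))     # --stepfactor 2
```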
2 changes: 2 additions & 0 deletions conf/common/test/gpt.toml
@@ -17,3 +17,5 @@
name = "gpt"
description = "gpt"
test_template_name = "NeMoLauncher"

[cmd_args]
32 changes: 25 additions & 7 deletions conf/common/test/llama.toml
@@ -17,13 +17,31 @@
name = "llama"
description = "Llama2 70b"
test_template_name = "NeMoLauncher"

[cmd_args]
[cmd_args.training]
values = "llama/llama2_70b"
[cmd_args.training.trainer]
max_steps = "120"
[cmd_args.training.model]
global_batch_size = "256"
pipeline_model_parallel_size = "2"

# FIXME: ~training.model.position_embedding_type was added to extra_cmd_args to work around a bug in the NeMo repository (https://github.com/NVIDIA/NeMo).
# The commit that should fix this issue in NeMo is 5b296e8af832c67d361fdfb80a165db3affaf76a.
# Once a NeMoLauncher release includes this commit (check by downloading the corresponding container and looking inside /opt for this commit), ~training.model.position_embedding_type should be removed from the extra args.
extra_cmd_args = "~training.model.position_embedding_type +training.model.fsdp=True ~training.model.optim.bucket_cap_mb ~training.model.optim.overlap_grad_sync ~training.model.optim.overlap_param_sync ~training.model.optim.contiguous_grad_buffer training.model.virtual_pipeline_model_parallel_size=null training.model.megatron_amp_O2=False training.model.activations_checkpoint_num_layers=null training.model.gradient_accumulation_fusion=False training.model.use_cpu_initialization=True training.model.optim.name=fused_adam training.model.tokenizer.model=TOKENIZER_MODEL training.exp_manager.create_wandb_logger=False"

[cmd_args]
"training" = "llama/llama2_70b"
"training.trainer.max_steps" = "120"
"training.model.global_batch_size" = "256"
"training.model.pipeline_model_parallel_size" = "1"
[extra_cmd_args]
"~training.model.position_embedding_type" = ""
"+training.model.fsdp" = "True"
"~training.model.optim.bucket_cap_mb" = ""
"~training.model.optim.overlap_grad_sync" = ""
"~training.model.optim.overlap_param_sync" = ""
"~training.model.optim.contiguous_grad_buffer" = ""
"training.model.virtual_pipeline_model_parallel_size" = "null"
"training.model.megatron_amp_O2" = "False"
"training.model.activations_checkpoint_num_layers" = "null"
"training.model.gradient_accumulation_fusion" = "False"
"training.model.use_cpu_initialization" = "True"
"training.model.optim.name" = "fused_adam"
"training.model.tokenizer.model" = "TOKENIZER_MODEL"
"training.exp_manager.create_wandb_logger" = "False"
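The nested `[cmd_args.training.*]` tables in the new format carry the same information as the old flat dotted keys. A sketch of how the nested form can be flattened back into dotted Hydra-style overrides (hypothetical helper, not CloudAI's actual code):

```python
def flatten(nested: dict, prefix: str = "") -> dict:
    """Flatten nested cmd_args tables into dotted keys, e.g.
    {"training": {"trainer": {"max_steps": "120"}}} ->
    {"training.trainer.max_steps": "120"}."""
    flat = {}
    for key, value in nested.items():
        dotted = f"{prefix}.{key}" if prefix else key
        if isinstance(value, dict):
            flat.update(flatten(value, dotted))
        else:
            flat[dotted] = str(value)
    return flat

# The nested llama.toml cmd_args from above (preset "values" key omitted):
cmd_args = {
    "training": {
        "trainer": {"max_steps": "120"},
        "model": {
            "global_batch_size": "256",
            "pipeline_model_parallel_size": "2",
        },
    }
}
overrides = flatten(cmd_args)
print(overrides["training.trainer.max_steps"])  # 120
```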
4 changes: 3 additions & 1 deletion conf/common/test/nccl_test_all_gather.toml
@@ -17,7 +17,6 @@
name = "nccl_test_all_gather"
description = "all_gather"
test_template_name = "NcclTest"
extra_cmd_args = "--stepfactor 2"

[cmd_args]
"subtest_name" = "all_gather_perf_mpi"
@@ -27,5 +26,8 @@ extra_cmd_args = "--stepfactor 2"
"iters" = "100"
"warmup_iters" = "50"

[extra_cmd_args]
"--stepfactor" = "2"

[extra_env_vars]
"NCCL_TEST_SPLIT_MASK" = "0x7"
4 changes: 3 additions & 1 deletion conf/common/test/nccl_test_all_reduce.toml
@@ -17,7 +17,6 @@
name = "nccl_test_all_reduce"
description = "all_reduce"
test_template_name = "NcclTest"
extra_cmd_args = "--stepfactor 2"

[cmd_args]
"subtest_name" = "all_reduce_perf_mpi"
@@ -26,3 +25,6 @@ extra_cmd_args = "--stepfactor 2"
"maxbytes" = "16G"
"iters" = "100"
"warmup_iters" = "50"

[extra_cmd_args]
"--stepfactor" = "2"
4 changes: 3 additions & 1 deletion conf/common/test/nccl_test_alltoall.toml
@@ -17,7 +17,6 @@
name = "nccl_test_alltoall"
description = "alltoall"
test_template_name = "NcclTest"
extra_cmd_args = "--stepfactor 2"

[cmd_args]
"subtest_name" = "alltoall_perf_mpi"
@@ -26,3 +25,6 @@ extra_cmd_args = "--stepfactor 2"
"maxbytes" = "4G"
"iters" = "100"
"warmup_iters" = "50"

[extra_cmd_args]
"--stepfactor" = "2"
4 changes: 3 additions & 1 deletion conf/common/test/nccl_test_reduce_scatter.toml
@@ -17,7 +17,6 @@
name = "nccl_test_reduce_scatter"
description = "reduce_scatter"
test_template_name = "NcclTest"
extra_cmd_args = "--stepfactor 2"

[cmd_args]
"subtest_name" = "reduce_scatter_perf_mpi"
@@ -27,5 +26,8 @@ extra_cmd_args = "--stepfactor 2"
"iters" = "100"
"warmup_iters" = "50"

[extra_cmd_args]
"--stepfactor" = "2"

[extra_env_vars]
"NCCL_TEST_SPLIT_MASK" = "0x7"