NVIDIA · TaekyungHeo · Nov 7, 2024 · Oct 22, 2024 · Oct 22, 2024 · Oct 23, 2024
diff --git a/conf/common/test_scenario/nccl_test.toml b/conf/common/test_scenario/nccl_test.toml
@@ -15,6 +15,10 @@
 # limitations under the License.
 
 name = "nccl-test"
+
+prologue = "nccl_test_prologue"
+epilogue = "nccl_test_epilogue"
+
 [[Tests]]
 id = "Tests.1"
 test_name = "nccl_test_all_reduce"

diff --git a/conf/plugin/nccl_test_epilogue.toml b/conf/plugin/nccl_test_epilogue.toml
@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "nccl_test_epilogue"
+
+[[Tests]]
+id = "Tests.1"
+test_name = "nccl_test_all_gather"
+time_limit = "00:20:00"
diff --git a/conf/plugin/nccl_test_prologue.toml b/conf/plugin/nccl_test_prologue.toml
@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "nccl_test_prologue"
+
+[[Tests]]
+id = "Tests.1"
+test_name = "nccl_test_all_reduce"
+time_limit = "00:20:00"
diff --git a/conf/plugin/test/nccl_test_all_gather.toml b/conf/plugin/test/nccl_test_all_gather.toml
@@ -0,0 +1,33 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "nccl_test_all_gather"
+description = "all_gather"
+test_template_name = "NcclTest"
+
+[cmd_args]
+"subtest_name" = "all_gather_perf_mpi"
+"ngpus" = "1"
+"minbytes" = "128"
+"maxbytes" = "4G"
+"iters" = "100"
+"warmup_iters" = "50"
+
+[extra_cmd_args]
+"--stepfactor" = "2"
+
+[extra_env_vars]
+"NCCL_TEST_SPLIT_MASK" = "0x7"
diff --git a/conf/plugin/test/nccl_test_all_reduce.toml b/conf/plugin/test/nccl_test_all_reduce.toml
@@ -0,0 +1,30 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "nccl_test_all_reduce"
+description = "all_reduce"
+test_template_name = "NcclTest"
+
+[cmd_args]
+"subtest_name" = "all_reduce_perf_mpi"
+"ngpus" = "1"
+"minbytes" = "128"
+"maxbytes" = "16G"
+"iters" = "100"
+"warmup_iters" = "50"
+
+[extra_cmd_args]
+"--stepfactor" = "2"
diff --git a/src/cloudai/_core/command_gen_strategy.py b/src/cloudai/_core/command_gen_strategy.py
@@ -39,3 +39,29 @@ def gen_exec_command(self, tr: TestRun) -> str:
             str: The generated execution command.
         """
         pass
+
+    @abstractmethod
+    def gen_srun_command(self, tr: TestRun) -> str:
+        """
+        Generate the Slurm srun command for a test based on the given parameters.
+
+        Args:
+            tr (TestRun): Contains the test and its run-specific configurations.
+
+        Returns:
+            str: The generated Slurm srun command.
+        """
+        pass
+
+    @abstractmethod
+    def gen_srun_success_check(self, tr: TestRun) -> str:
+        """
+        Generate the Slurm success check command to verify if a test run was successful.
+
+        Args:
+            tr (TestRun): Contains the test and its run-specific configurations.
+
+        Returns:
+            str: The generated command to check the success of the test run.
+        """
+        pass
diff --git a/src/cloudai/_core/test_scenario.py b/src/cloudai/_core/test_scenario.py
@@ -58,6 +58,8 @@ class TestRun:
     weight: float = 0.0
     ideal_perf: float = 1.0
     dependencies: dict[str, TestDependency] = field(default_factory=dict)
+    prologue: Optional["TestScenario"] = None
+    epilogue: Optional["TestScenario"] = None
 
     def __hash__(self) -> int:
         return hash(self.name + self.test.name + str(self.iterations) + str(self.current_iteration))

diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py
@@ -54,6 +54,8 @@ class _TestScenarioTOML(BaseModel):
     name: str
     job_status_check: bool = True
     tests: list[_TestRunTOML] = Field(alias="Tests", min_length=1)
+    prologue: Optional[str] = None
+    epilogue: Optional[str] = None
 
     @model_validator(mode="after")
     def check_no_self_dependency(self):
@@ -99,9 +101,10 @@ class TestScenarioParser:
 
     __test__ = False
 
-    def __init__(self, file_path: Path, test_mapping: Dict[str, Test]) -> None:
+    def __init__(self, file_path: Path, test_mapping: Dict[str, Test], plugin_mapping: Dict[str, TestScenario]) -> None:
         self.file_path = file_path
         self.test_mapping = test_mapping
+        self.plugin_mapping = plugin_mapping
 
     def parse(self) -> TestScenario:
         """
@@ -136,8 +139,24 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario:
         total_weight = sum(tr.weight for tr in ts_model.tests)
         normalized_weight = 0 if total_weight == 0 else 100 / total_weight
 
+        prologue, epilogue = None, None
+        if ts_model.prologue:
+            prologue = self.plugin_mapping.get(ts_model.prologue)
+            if prologue is None:
+                logging.warning(
+                    f"Prologue '{ts_model.prologue}' not found in plugin mapping. "
+                    "Ensure that a proper plugin directory is set under the working directory."
+                )
+        if ts_model.epilogue:
+            epilogue = self.plugin_mapping.get(ts_model.epilogue)
+            if epilogue is None:
+                logging.warning(
+                    f"Epilogue '{ts_model.epilogue}' not found in plugin mapping. "
+                    "Ensure that a proper plugin directory is set under the working directory."
+                )
+
         test_runs_by_id: dict[str, TestRun] = {
-            tr.id: self._create_test_run(tr, normalized_weight) for tr in ts_model.tests
+            tr.id: self._create_test_run(tr, normalized_weight, prologue, epilogue) for tr in ts_model.tests
         }
 
         tests_data: dict[str, _TestRunTOML] = {tr.id: tr for tr in ts_model.tests}
@@ -153,13 +172,21 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario:
             job_status_check=ts_model.job_status_check,
         )
 
-    def _create_test_run(self, test_info: _TestRunTOML, normalized_weight: float) -> TestRun:
+    def _create_test_run(
+        self,
+        test_info: _TestRunTOML,
+        normalized_weight: float,
+        prologue: Optional[TestScenario] = None,
+        epilogue: Optional[TestScenario] = None,
+    ) -> TestRun:
         """
         Create a section-specific Test object by copying from the test mapping.
 
         Args:
             test_info (Dict[str, Any]): Information of the test.
             normalized_weight (float): Normalized weight for the test.
+            prologue (Optional[TestScenario]): TestScenario object representing the prologue sequence.
+            epilogue (Optional[TestScenario]): TestScenario object representing the epilogue sequence.
 
         Returns:
             Test: Copied and updated Test object for the section.
@@ -192,5 +219,7 @@ def _create_test_run(self, test_info: _TestRunTOML, normalized_weight: float) ->
             sol=test_info.sol,
             weight=test_info.weight * normalized_weight,
             ideal_perf=test_info.ideal_perf,
+            prologue=prologue,
+            epilogue=epilogue,
         )
         return tr
diff --git a/src/cloudai/_core/test_template.py b/src/cloudai/_core/test_template.py
@@ -94,6 +94,40 @@ def gen_exec_command(self, tr: TestRun) -> str:
             )
         return self.command_gen_strategy.gen_exec_command(tr)
 
+    def gen_srun_command(self, tr: TestRun) -> str:
+        """
+        Generate a Slurm srun command for a test using the provided command generation strategy.
+
+        Args:
+            tr (TestRun): Contains the test and its run-specific configurations.
+
+        Returns:
+            str: The generated Slurm srun command.
+        """
+        if self.command_gen_strategy is None:
+            raise ValueError(
+                "command_gen_strategy is missing. Ensure the strategy is registered in the Registry "
+                "by calling the appropriate registration function for the system type."
+            )
+        return self.command_gen_strategy.gen_srun_command(tr)
+
+    def gen_srun_success_check(self, tr: TestRun) -> str:
+        """
+        Generate a Slurm success check command for a test using the provided command generation strategy.
+
+        Args:
+            tr (TestRun): Contains the test and its run-specific configurations.
+
+        Returns:
+            str: The generated command to check the success of the test run.
+        """
+        if self.command_gen_strategy is None:
+            raise ValueError(
+                "command_gen_strategy is missing. Ensure the strategy is registered in the Registry "
+                "by calling the appropriate registration function for the system type."
+            )
+        return self.command_gen_strategy.gen_srun_success_check(tr)
+
     def gen_json(self, tr: TestRun) -> Dict[Any, Any]:
         """
         Generate a JSON string representing the Kubernetes job specification for this test using this template.

diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py
@@ -34,6 +34,9 @@
     format_validation_error,
 )
 
+PLUGIN_ROOT = Path("conf/plugin")
+PLUGIN_TEST_ROOT = PLUGIN_ROOT / "test"
+
 
 class Parser:
     """Main parser for parsing all types of configurations."""
@@ -49,14 +52,21 @@ def __init__(self, system_config_path: Path) -> None:
         self.system_config_path = system_config_path
 
     def parse(
-        self, test_path: Path, test_scenario_path: Optional[Path] = None
+        self,
+        test_path: Path,
+        test_scenario_path: Optional[Path] = None,
     ) -> Tuple[System, List[Test], Optional[TestScenario]]:
         """
         Parse configurations for system, test templates, and test scenarios.
 
-        Returns
-            Tuple[System, List[TestTemplate], TestScenario]: A tuple containing the system object, a list of test
-                template objects, and the test scenario object.
+        Args:
+            test_path (Path): The file path for tests.
+            test_scenario_path (Optional[Path]): The file path for the main test scenario.
+                If None, all tests are included.
+
+        Returns:
+            Tuple[System, List[Test], Optional[TestScenario]]: A tuple containing the system object, a list of filtered
+                test objects, and the main test scenario object if provided.
         """
         if not test_path.exists():
             raise FileNotFoundError(f"Test path '{test_path}' not found.")
@@ -71,24 +81,63 @@ def parse(
         except TestConfigParsingError:
             exit(1)  # exit right away to keep error message readable for users
 
-        logging.debug(f"Parsed {len(tests)} tests: {[t.name for t in tests]}")
-        test_mapping = {t.name: t for t in tests}
+        try:
+            plugin_tests = (
+                self.parse_tests(list(PLUGIN_TEST_ROOT.glob("*.toml")), system) if PLUGIN_TEST_ROOT.exists() else []
+            )
+        except TestConfigParsingError:
+            exit(1)  # exit right away to keep error message readable for users
 
-        filtered_tests = tests
-        test_scenario: Optional[TestScenario] = None
-        if test_scenario_path:
+        if not test_scenario_path:
+            all_tests = list({test.name: test for test in tests + plugin_tests}.values())
+            return system, all_tests, None
+
+        test_mapping = {t.name: t for t in tests}
+        plugin_test_scenario_mapping = {}
+        if PLUGIN_ROOT.exists() and list(PLUGIN_ROOT.glob("*.toml")):
             try:
-                test_scenario = self.parse_test_scenario(test_scenario_path, test_mapping)
+                plugin_test_scenario_mapping = self.parse_plugins(
+                    list(PLUGIN_ROOT.glob("*.toml")), {t.name: t for t in plugin_tests}
+                )
             except TestScenarioParsingError:
                 exit(1)  # exit right away to keep error message readable for users
-            scenario_tests = set(tr.test.name for tr in test_scenario.test_runs)
-            filtered_tests = [t for t in tests if t.name in scenario_tests]
+
+        try:
+            test_scenario = self.parse_test_scenario(test_scenario_path, test_mapping, plugin_test_scenario_mapping)
+        except TestScenarioParsingError:
+            exit(1)  # exit right away to keep error message readable for users
+
+        scenario_tests = {tr.test.name for tr in test_scenario.test_runs}
+        plugin_scenario_tests = {
+            tr.test.name
+            for plugin_scenario in plugin_test_scenario_mapping.values()
+            for tr in plugin_scenario.test_runs
+        }
+
+        relevant_test_names = scenario_tests.union(plugin_scenario_tests)
+        filtered_tests = [t for t in tests if t.name in relevant_test_names] + plugin_tests
+        filtered_tests = list({test.name: test for test in filtered_tests}.values())
 
         return system, filtered_tests, test_scenario
 
     @staticmethod
-    def parse_test_scenario(test_scenario_path: Path, test_mapping: Dict[str, Test]) -> TestScenario:
-        test_scenario_parser = TestScenarioParser(test_scenario_path, test_mapping)
+    def parse_plugins(plugin_tomls: List[Path], test_mapping: Dict[str, Test]) -> Dict[str, TestScenario]:
+        plugin_mapping = {}
+        for plugin_test_scenario_path in plugin_tomls:
+            plugin_scenario = Parser.parse_test_scenario(plugin_test_scenario_path, test_mapping)
+            plugin_mapping[plugin_scenario.name] = plugin_scenario
+        return plugin_mapping
+
+    @staticmethod
+    def parse_test_scenario(
+        test_scenario_path: Path,
+        test_mapping: Dict[str, Test],
+        plugin_mapping: Optional[Dict[str, TestScenario]] = None,
+    ) -> TestScenario:
+        if plugin_mapping is None:
+            plugin_mapping = {}
+
+        test_scenario_parser = TestScenarioParser(test_scenario_path, test_mapping, plugin_mapping)
         test_scenario = test_scenario_parser.parse()
         return test_scenario