Test hook support (#263)
TaekyungHeo authored Nov 7, 2024
1 parent 12a807c commit 95b3681
Showing 27 changed files with 653 additions and 406 deletions.
4 changes: 4 additions & 0 deletions conf/common/test_scenario/nccl_test.toml
@@ -15,6 +15,10 @@
# limitations under the License.

name = "nccl-test"

pre_test = "nccl_test"
post_test = "nccl_test"

[[Tests]]
id = "Tests.1"
test_name = "nccl_test_all_reduce"
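The two new keys name hook scenarios that run before and after each test. A minimal sketch of how such a scenario file parses, using only the standard library — the TOML snippet below is illustrative, not a file from this commit:

# --- illustrative sketch (not part of the commit) ---
import tomllib  # stdlib since Python 3.11

scenario = tomllib.loads("""
name = "nccl-test"
pre_test = "nccl_test"
post_test = "nccl_test"

[[Tests]]
id = "Tests.1"
test_name = "nccl_test_all_reduce"
""")

# Both keys are plain strings naming a hook defined under conf/hook/.
assert scenario["pre_test"] == "nccl_test"
assert scenario["post_test"] == "nccl_test"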
22 changes: 22 additions & 0 deletions conf/hook/nccl_test.toml
@@ -0,0 +1,22 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name = "nccl_test"

[[Tests]]
id = "Tests.1"
test_name = "nccl_test_all_gather"
time_limit = "00:20:00"
33 changes: 33 additions & 0 deletions conf/hook/test/nccl_test_all_gather.toml
@@ -0,0 +1,33 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name = "nccl_test_all_gather"
description = "all_gather"
test_template_name = "NcclTest"

[cmd_args]
"subtest_name" = "all_gather_perf_mpi"
"ngpus" = "1"
"minbytes" = "128"
"maxbytes" = "4G"
"iters" = "100"
"warmup_iters" = "50"

[extra_cmd_args]
"--stepfactor" = "2"

[extra_env_vars]
"NCCL_TEST_SPLIT_MASK" = "0x7"
26 changes: 26 additions & 0 deletions src/cloudai/_core/command_gen_strategy.py
@@ -39,3 +39,29 @@ def gen_exec_command(self, tr: TestRun) -> str:
            str: The generated execution command.
        """
        pass

    @abstractmethod
    def gen_srun_command(self, tr: TestRun) -> str:
        """
        Generate the Slurm srun command for a test based on the given parameters.

        Args:
            tr (TestRun): Contains the test and its run-specific configurations.

        Returns:
            str: The generated Slurm srun command.
        """
        pass

    @abstractmethod
    def gen_srun_success_check(self, tr: TestRun) -> str:
        """
        Generate the Slurm success check command to verify if a test run was successful.

        Args:
            tr (TestRun): Contains the test and its run-specific configurations.

        Returns:
            str: The generated command to check the success of the test run.
        """
        pass
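For reference, a minimal concrete strategy satisfying the two new abstract methods might look like the sketch below. Everything here other than the method names and signatures is an assumption for illustration; the real Slurm strategies live elsewhere in the repository.

# --- illustrative sketch (not part of the commit) ---
from dataclasses import dataclass


@dataclass
class SimpleTestRun:
    # Stand-in for the real TestRun; only the fields this sketch needs.
    name: str
    output_path: str


class ExampleSlurmCommandGenStrategy:
    def gen_srun_command(self, tr: SimpleTestRun) -> str:
        # Hypothetical flags; real strategies derive these from system/test config.
        return f"srun --job-name={tr.name} --output={tr.output_path}/stdout.txt ./run.sh"

    def gen_srun_success_check(self, tr: SimpleTestRun) -> str:
        # grep's exit status doubles as the pass/fail signal for the run.
        return f'grep -q "Avg bus bandwidth" {tr.output_path}/stdout.txt'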
2 changes: 2 additions & 0 deletions src/cloudai/_core/test_scenario.py
@@ -58,6 +58,8 @@ class TestRun:
    weight: float = 0.0
    ideal_perf: float = 1.0
    dependencies: dict[str, TestDependency] = field(default_factory=dict)
    pre_test: Optional["TestScenario"] = None
    post_test: Optional["TestScenario"] = None

    def __hash__(self) -> int:
        return hash(self.name + self.test.name + str(self.iterations) + str(self.current_iteration))
42 changes: 39 additions & 3 deletions src/cloudai/_core/test_scenario_parser.py
@@ -54,6 +54,8 @@ class _TestScenarioTOML(BaseModel):
    name: str
    job_status_check: bool = True
    tests: list[_TestRunTOML] = Field(alias="Tests", min_length=1)
    pre_test: Optional[str] = None
    post_test: Optional[str] = None

    @model_validator(mode="after")
    def check_no_self_dependency(self):
@@ -99,9 +101,10 @@ class TestScenarioParser:

    __test__ = False

-    def __init__(self, file_path: Path, test_mapping: Dict[str, Test]) -> None:
+    def __init__(self, file_path: Path, test_mapping: Dict[str, Test], hook_mapping: Dict[str, TestScenario]) -> None:
        self.file_path = file_path
        self.test_mapping = test_mapping
        self.hook_mapping = hook_mapping

    def parse(self) -> TestScenario:
        """
@@ -136,8 +139,31 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario:
        total_weight = sum(tr.weight for tr in ts_model.tests)
        normalized_weight = 0 if total_weight == 0 else 100 / total_weight

        pre_test, post_test = None, None
        if ts_model.pre_test:
            pre_test = self.hook_mapping.get(ts_model.pre_test)
            if pre_test is None:
                msg = (
                    f"Pre-test hook '{ts_model.pre_test}' not found in hook mapping. "
                    "A corresponding hook should exist under 'conf/hook'. "
                    "Ensure that a proper hook directory is set under the working directory."
                )
                logging.error(msg)
                raise TestScenarioParsingError(msg)

        if ts_model.post_test:
            post_test = self.hook_mapping.get(ts_model.post_test)
            if post_test is None:
                msg = (
                    f"Post-test hook '{ts_model.post_test}' not found in hook mapping. "
                    "A corresponding hook should exist under 'conf/hook'. "
                    "Ensure that a proper hook directory is set under the working directory."
                )
                logging.error(msg)
                raise TestScenarioParsingError(msg)

        test_runs_by_id: dict[str, TestRun] = {
-            tr.id: self._create_test_run(tr, normalized_weight) for tr in ts_model.tests
+            tr.id: self._create_test_run(tr, normalized_weight, pre_test, post_test) for tr in ts_model.tests
        }

        tests_data: dict[str, _TestRunTOML] = {tr.id: tr for tr in ts_model.tests}
@@ -153,13 +179,21 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario:
            job_status_check=ts_model.job_status_check,
        )

-    def _create_test_run(self, test_info: _TestRunTOML, normalized_weight: float) -> TestRun:
+    def _create_test_run(
+        self,
+        test_info: _TestRunTOML,
+        normalized_weight: float,
+        pre_test: Optional[TestScenario] = None,
+        post_test: Optional[TestScenario] = None,
+    ) -> TestRun:
        """
        Create a section-specific Test object by copying from the test mapping.

        Args:
            test_info (Dict[str, Any]): Information of the test.
            normalized_weight (float): Normalized weight for the test.
            pre_test (Optional[TestScenario]): TestScenario object representing the pre-test sequence.
            post_test (Optional[TestScenario]): TestScenario object representing the post-test sequence.

        Returns:
            Test: Copied and updated Test object for the section.
@@ -192,5 +226,7 @@ def _create_test_run(self, test_info: _TestRunTOML, normalized_weight: float) ->
            sol=test_info.sol,
            weight=test_info.weight * normalized_weight,
            ideal_perf=test_info.ideal_perf,
            pre_test=pre_test,
            post_test=post_test,
        )
        return tr
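In use, the third constructor argument supplies the hooks parsed from conf/hook, keyed by name. A sketch under the assumption that the class is importable from the module path shown in this diff and that hook values are TestScenario objects:

# --- illustrative sketch (not part of the commit) ---
from pathlib import Path
from unittest.mock import MagicMock

# Assumed import path, based on this file's location in the diff.
from cloudai._core.test_scenario_parser import TestScenarioParser

test_mapping = {"nccl_test_all_reduce": MagicMock()}  # name -> Test
hook_mapping = {"nccl_test": MagicMock()}             # name -> TestScenario

parser = TestScenarioParser(
    Path("conf/common/test_scenario/nccl_test.toml"), test_mapping, hook_mapping
)
# parser.parse() would now resolve pre_test/post_test through hook_mapping
# and raise TestScenarioParsingError if a named hook is missing.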
34 changes: 34 additions & 0 deletions src/cloudai/_core/test_template.py
@@ -93,6 +93,40 @@ def gen_exec_command(self, tr: TestRun) -> str:
            )
        return self.command_gen_strategy.gen_exec_command(tr)

    def gen_srun_command(self, tr: TestRun) -> str:
        """
        Generate a Slurm srun command for a test using the provided command generation strategy.

        Args:
            tr (TestRun): Contains the test and its run-specific configurations.

        Returns:
            str: The generated Slurm srun command.
        """
        if self.command_gen_strategy is None:
            raise ValueError(
                "command_gen_strategy is missing. Ensure the strategy is registered in the Registry "
                "by calling the appropriate registration function for the system type."
            )
        return self.command_gen_strategy.gen_srun_command(tr)

    def gen_srun_success_check(self, tr: TestRun) -> str:
        """
        Generate a Slurm success check command for a test using the provided command generation strategy.

        Args:
            tr (TestRun): Contains the test and its run-specific configurations.

        Returns:
            str: The generated command to check the success of the test run.
        """
        if self.command_gen_strategy is None:
            raise ValueError(
                "command_gen_strategy is missing. Ensure the strategy is registered in the Registry "
                "by calling the appropriate registration function for the system type."
            )
        return self.command_gen_strategy.gen_srun_success_check(tr)

    def gen_json(self, tr: TestRun) -> Dict[Any, Any]:
        """
        Generate a JSON string representing the Kubernetes job specification for this test using this template.
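The two methods are designed to compose: a runner can chain the srun command with its success check so that a failing hook short-circuits the surrounding job script. A hypothetical composition — the && / exit chaining here is an assumption, not code from this commit:

# --- illustrative sketch (not part of the commit) ---
def guarded_step(srun_cmd: str, success_check: str) -> str:
    # Run the hook, then verify it; a nonzero exit aborts the surrounding script.
    return f"{srun_cmd}\n{success_check} || exit 1"

print(guarded_step("srun -N2 ./all_gather_perf_mpi -b 128 -e 4G",
                   "grep -q 'Avg bus bandwidth' stdout.txt"))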
43 changes: 39 additions & 4 deletions src/cloudai/cli/handlers.py
@@ -23,6 +23,8 @@

from cloudai import Installable, Parser, Registry, ReportGenerator, Runner, System

from ..parser import HOOK_ROOT


def handle_install_and_uninstall(args: argparse.Namespace) -> int:
"""
@@ -212,7 +214,11 @@ def verify_test_configs(test_tomls: List[Path]) -> int:


def verify_test_scenarios(
-    scenario_tomls: List[Path], test_tomls: list[Path], system_config: Optional[Path] = None
+    scenario_tomls: List[Path],
+    test_tomls: list[Path],
+    hook_tomls: List[Path],
+    hook_test_tomls: list[Path],
+    system_config: Optional[Path] = None,
) -> int:
    system = Mock(spec=System)
    if system_config:
@@ -225,7 +231,9 @@
logging.debug(f"Verifying Test Scenario: {scenario_file}...")
try:
tests = Parser.parse_tests(test_tomls, system)
Parser.parse_test_scenario(scenario_file, {t.name: t for t in tests})
hook_tests = Parser.parse_tests(hook_test_tomls, system)
hooks = Parser.parse_hooks(hook_tomls, {t.name: t for t in hook_tests})
Parser.parse_test_scenario(scenario_file, {t.name: t for t in tests}, hooks)
except Exception:
nfailed += 1

@@ -243,6 +251,9 @@ def handle_verify_all_configs(args: argparse.Namespace) -> int:
    if err:
        return err

    err, hook_tomls = expand_file_list(HOOK_ROOT, glob="**/*.toml")
    tomls += hook_tomls

    files = load_tomls_by_type(tomls)

    test_tomls = files["test"]
@@ -259,7 +270,9 @@
if files["test"]:
nfailed += verify_test_configs(files["test"])
if files["scenario"]:
nfailed += verify_test_scenarios(files["scenario"], test_tomls, args.system_config)
nfailed += verify_test_scenarios(
files["scenario"], test_tomls, files["hook"], files["hook_test"], args.system_config
)
if files["unknown"]:
logging.error(f"Unknown configuration files: {[str(f) for f in files['unknown']]}")
nfailed += len(files["unknown"])
@@ -273,9 +286,31 @@


def load_tomls_by_type(tomls: List[Path]) -> dict[str, List[Path]]:
-    files: dict[str, List[Path]] = {"system": [], "test": [], "scenario": [], "unknown": []}
+    files: dict[str, List[Path]] = {
+        "system": [],
+        "test": [],
+        "scenario": [],
+        "hook_test": [],
+        "hook": [],
+        "unknown": [],
+    }
    for toml_file in tomls:
        content = toml_file.read_text()

        is_in_hook_root = False
        try:
            toml_file.relative_to(HOOK_ROOT)
            is_in_hook_root = True
        except ValueError:
            pass

        if is_in_hook_root:
            if "test" in toml_file.parts:
                files["hook_test"].append(toml_file)
            else:
                files["hook"].append(toml_file)
            continue

        if "scheduler =" in content:
            files["system"].append(toml_file)
        elif "test_template_name =" in content:
