Test hook support (#263)
TaekyungHeo authored Nov 7, 2024
1 parent 12a807c commit 95b3681
Showing 27 changed files with 653 additions and 406 deletions.
4 changes: 4 additions & 0 deletions conf/common/test_scenario/nccl_test.toml
@@ -15,6 +15,10 @@
# limitations under the License.

name = "nccl-test"

pre_test = "nccl_test"
post_test = "nccl_test"

[[Tests]]
id = "Tests.1"
test_name = "nccl_test_all_reduce"
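The two new keys name hook scenarios that run before and after each test. A minimal sketch of how such a scenario file parses, using only the standard library — the TOML snippet below is illustrative, not a file from this commit:

# --- illustrative sketch (not part of the commit) ---
import tomllib  # stdlib since Python 3.11

scenario = tomllib.loads("""
name = "nccl-test"
pre_test = "nccl_test"
post_test = "nccl_test"

[[Tests]]
id = "Tests.1"
test_name = "nccl_test_all_reduce"
""")

# Both keys are plain strings naming a hook defined under conf/hook/.
assert scenario["pre_test"] == "nccl_test"
assert scenario["post_test"] == "nccl_test"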
22 changes: 22 additions & 0 deletions conf/hook/nccl_test.toml
@@ -0,0 +1,22 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name = "nccl_test"

[[Tests]]
id = "Tests.1"
test_name = "nccl_test_all_gather"
time_limit = "00:20:00"
33 changes: 33 additions & 0 deletions conf/hook/test/nccl_test_all_gather.toml
@@ -0,0 +1,33 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name = "nccl_test_all_gather"
description = "all_gather"
test_template_name = "NcclTest"

[cmd_args]
"subtest_name" = "all_gather_perf_mpi"
"ngpus" = "1"
"minbytes" = "128"
"maxbytes" = "4G"
"iters" = "100"
"warmup_iters" = "50"

[extra_cmd_args]
"--stepfactor" = "2"

[extra_env_vars]
"NCCL_TEST_SPLIT_MASK" = "0x7"
26 changes: 26 additions & 0 deletions src/cloudai/_core/command_gen_strategy.py
@@ -39,3 +39,29 @@ def gen_exec_command(self, tr: TestRun) -> str:
            str: The generated execution command.
        """
        pass

    @abstractmethod
    def gen_srun_command(self, tr: TestRun) -> str:
        """
        Generate the Slurm srun command for a test based on the given parameters.

        Args:
            tr (TestRun): Contains the test and its run-specific configurations.

        Returns:
            str: The generated Slurm srun command.
        """
        pass

    @abstractmethod
    def gen_srun_success_check(self, tr: TestRun) -> str:
        """
        Generate the Slurm success check command to verify if a test run was successful.

        Args:
            tr (TestRun): Contains the test and its run-specific configurations.

        Returns:
            str: The generated command to check the success of the test run.
        """
        pass
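For reference, a minimal concrete strategy satisfying the two new abstract methods might look like the sketch below. Everything here other than the method names and signatures is an assumption for illustration; the real Slurm strategies live elsewhere in the repository.

# --- illustrative sketch (not part of the commit) ---
from dataclasses import dataclass


@dataclass
class SimpleTestRun:
    # Stand-in for the real TestRun; only the fields this sketch needs.
    name: str
    output_path: str


class ExampleSlurmCommandGenStrategy:
    def gen_srun_command(self, tr: SimpleTestRun) -> str:
        # Hypothetical flags; real strategies derive these from system/test config.
        return f"srun --job-name={tr.name} --output={tr.output_path}/stdout.txt ./run.sh"

    def gen_srun_success_check(self, tr: SimpleTestRun) -> str:
        # grep's exit status doubles as the pass/fail signal for the run.
        return f'grep -q "Avg bus bandwidth" {tr.output_path}/stdout.txt'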
2 changes: 2 additions & 0 deletions src/cloudai/_core/test_scenario.py
@@ -58,6 +58,8 @@ class TestRun:
    weight: float = 0.0
    ideal_perf: float = 1.0
    dependencies: dict[str, TestDependency] = field(default_factory=dict)
    pre_test: Optional["TestScenario"] = None
    post_test: Optional["TestScenario"] = None

    def __hash__(self) -> int:
        return hash(self.name + self.test.name + str(self.iterations) + str(self.current_iteration))
42 changes: 39 additions & 3 deletions src/cloudai/_core/test_scenario_parser.py
@@ -54,6 +54,8 @@ class _TestScenarioTOML(BaseModel):
    name: str
    job_status_check: bool = True
    tests: list[_TestRunTOML] = Field(alias="Tests", min_length=1)
    pre_test: Optional[str] = None
    post_test: Optional[str] = None

    @model_validator(mode="after")
    def check_no_self_dependency(self):
@@ -99,9 +101,10 @@ class TestScenarioParser:

    __test__ = False

-    def __init__(self, file_path: Path, test_mapping: Dict[str, Test]) -> None:
+    def __init__(self, file_path: Path, test_mapping: Dict[str, Test], hook_mapping: Dict[str, TestScenario]) -> None:
        self.file_path = file_path
        self.test_mapping = test_mapping
        self.hook_mapping = hook_mapping

    def parse(self) -> TestScenario:
        """
@@ -136,8 +139,31 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario:
        total_weight = sum(tr.weight for tr in ts_model.tests)
        normalized_weight = 0 if total_weight == 0 else 100 / total_weight

        pre_test, post_test = None, None
        if ts_model.pre_test:
            pre_test = self.hook_mapping.get(ts_model.pre_test)
            if pre_test is None:
                msg = (
                    f"Pre-test hook '{ts_model.pre_test}' not found in hook mapping. "
                    "A corresponding hook should exist under 'conf/hook'. "
                    "Ensure that a proper hook directory is set under the working directory."
                )
                logging.error(msg)
                raise TestScenarioParsingError(msg)

        if ts_model.post_test:
            post_test = self.hook_mapping.get(ts_model.post_test)
            if post_test is None:
                msg = (
                    f"Post-test hook '{ts_model.post_test}' not found in hook mapping. "
                    "A corresponding hook should exist under 'conf/hook'. "
                    "Ensure that a proper hook directory is set under the working directory."
                )
                logging.error(msg)
                raise TestScenarioParsingError(msg)

        test_runs_by_id: dict[str, TestRun] = {
-            tr.id: self._create_test_run(tr, normalized_weight) for tr in ts_model.tests
+            tr.id: self._create_test_run(tr, normalized_weight, pre_test, post_test) for tr in ts_model.tests
        }

        tests_data: dict[str, _TestRunTOML] = {tr.id: tr for tr in ts_model.tests}
@@ -153,13 +179,21 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario:
            job_status_check=ts_model.job_status_check,
        )

-    def _create_test_run(self, test_info: _TestRunTOML, normalized_weight: float) -> TestRun:
+    def _create_test_run(
+        self,
+        test_info: _TestRunTOML,
+        normalized_weight: float,
+        pre_test: Optional[TestScenario] = None,
+        post_test: Optional[TestScenario] = None,
+    ) -> TestRun:
        """
        Create a section-specific Test object by copying from the test mapping.

        Args:
            test_info (Dict[str, Any]): Information of the test.
            normalized_weight (float): Normalized weight for the test.
            pre_test (Optional[TestScenario]): TestScenario object representing the pre-test sequence.
            post_test (Optional[TestScenario]): TestScenario object representing the post-test sequence.

        Returns:
            Test: Copied and updated Test object for the section.
@@ -192,5 +226,7 @@ def _create_test_run(self, test_info: _TestRunTOML, normalized_weight: float) ->
            sol=test_info.sol,
            weight=test_info.weight * normalized_weight,
            ideal_perf=test_info.ideal_perf,
            pre_test=pre_test,
            post_test=post_test,
        )
        return tr
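In use, the third constructor argument supplies the hooks parsed from conf/hook, keyed by name. A sketch under the assumption that the class is importable from the module path shown in this diff and that hook values are TestScenario objects:

# --- illustrative sketch (not part of the commit) ---
from pathlib import Path
from unittest.mock import MagicMock

# Assumed import path, based on this file's location in the diff.
from cloudai._core.test_scenario_parser import TestScenarioParser

test_mapping = {"nccl_test_all_reduce": MagicMock()}  # name -> Test
hook_mapping = {"nccl_test": MagicMock()}             # name -> TestScenario

parser = TestScenarioParser(
    Path("conf/common/test_scenario/nccl_test.toml"), test_mapping, hook_mapping
)
# parser.parse() would now resolve pre_test/post_test through hook_mapping
# and raise TestScenarioParsingError if a named hook is missing.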
34 changes: 34 additions & 0 deletions src/cloudai/_core/test_template.py
@@ -93,6 +93,40 @@ def gen_exec_command(self, tr: TestRun) -> str:
            )
        return self.command_gen_strategy.gen_exec_command(tr)

    def gen_srun_command(self, tr: TestRun) -> str:
        """
        Generate a Slurm srun command for a test using the provided command generation strategy.

        Args:
            tr (TestRun): Contains the test and its run-specific configurations.

        Returns:
            str: The generated Slurm srun command.
        """
        if self.command_gen_strategy is None:
            raise ValueError(
                "command_gen_strategy is missing. Ensure the strategy is registered in the Registry "
                "by calling the appropriate registration function for the system type."
            )
        return self.command_gen_strategy.gen_srun_command(tr)

    def gen_srun_success_check(self, tr: TestRun) -> str:
        """
        Generate a Slurm success check command for a test using the provided command generation strategy.

        Args:
            tr (TestRun): Contains the test and its run-specific configurations.

        Returns:
            str: The generated command to check the success of the test run.
        """
        if self.command_gen_strategy is None:
            raise ValueError(
                "command_gen_strategy is missing. Ensure the strategy is registered in the Registry "
                "by calling the appropriate registration function for the system type."
            )
        return self.command_gen_strategy.gen_srun_success_check(tr)

    def gen_json(self, tr: TestRun) -> Dict[Any, Any]:
        """
        Generate a JSON string representing the Kubernetes job specification for this test using this template.
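The two methods are designed to compose: a runner can chain the srun command with its success check so that a failing hook short-circuits the surrounding job script. A hypothetical composition — the && / exit chaining here is an assumption, not code from this commit:

# --- illustrative sketch (not part of the commit) ---
def guarded_step(srun_cmd: str, success_check: str) -> str:
    # Run the hook, then verify it; a nonzero exit aborts the surrounding script.
    return f"{srun_cmd}\n{success_check} || exit 1"

print(guarded_step("srun -N2 ./all_gather_perf_mpi -b 128 -e 4G",
                   "grep -q 'Avg bus bandwidth' stdout.txt"))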
43 changes: 39 additions & 4 deletions src/cloudai/cli/handlers.py
@@ -23,6 +23,8 @@

from cloudai import Installable, Parser, Registry, ReportGenerator, Runner, System

from ..parser import HOOK_ROOT


def handle_install_and_uninstall(args: argparse.Namespace) -> int:
"""
@@ -212,7 +214,11 @@ def verify_test_configs(test_tomls: List[Path]) -> int:


def verify_test_scenarios(
-    scenario_tomls: List[Path], test_tomls: list[Path], system_config: Optional[Path] = None
+    scenario_tomls: List[Path],
+    test_tomls: list[Path],
+    hook_tomls: List[Path],
+    hook_test_tomls: list[Path],
+    system_config: Optional[Path] = None,
) -> int:
    system = Mock(spec=System)
    if system_config:
@@ -225,7 +231,9 @@
logging.debug(f"Verifying Test Scenario: {scenario_file}...")
try:
tests = Parser.parse_tests(test_tomls, system)
Parser.parse_test_scenario(scenario_file, {t.name: t for t in tests})
hook_tests = Parser.parse_tests(hook_test_tomls, system)
hooks = Parser.parse_hooks(hook_tomls, {t.name: t for t in hook_tests})
Parser.parse_test_scenario(scenario_file, {t.name: t for t in tests}, hooks)
except Exception:
nfailed += 1

@@ -243,6 +251,9 @@ def handle_verify_all_configs(args: argparse.Namespace) -> int:
    if err:
        return err

    err, hook_tomls = expand_file_list(HOOK_ROOT, glob="**/*.toml")
    tomls += hook_tomls

    files = load_tomls_by_type(tomls)

    test_tomls = files["test"]
@@ -259,7 +270,9 @@
if files["test"]:
nfailed += verify_test_configs(files["test"])
if files["scenario"]:
nfailed += verify_test_scenarios(files["scenario"], test_tomls, args.system_config)
nfailed += verify_test_scenarios(
files["scenario"], test_tomls, files["hook"], files["hook_test"], args.system_config
)
if files["unknown"]:
logging.error(f"Unknown configuration files: {[str(f) for f in files['unknown']]}")
nfailed += len(files["unknown"])
@@ -273,9 +286,31 @@


def load_tomls_by_type(tomls: List[Path]) -> dict[str, List[Path]]:
-    files: dict[str, List[Path]] = {"system": [], "test": [], "scenario": [], "unknown": []}
+    files: dict[str, List[Path]] = {
+        "system": [],
+        "test": [],
+        "scenario": [],
+        "hook_test": [],
+        "hook": [],
+        "unknown": [],
+    }
    for toml_file in tomls:
        content = toml_file.read_text()

        is_in_hook_root = False
        try:
            toml_file.relative_to(HOOK_ROOT)
            is_in_hook_root = True
        except ValueError:
            pass

        if is_in_hook_root:
            if "test" in toml_file.parts:
                files["hook_test"].append(toml_file)
            else:
                files["hook"].append(toml_file)
            continue

        if "scheduler =" in content:
            files["system"].append(toml_file)
        elif "test_template_name =" in content:
