From bf1d9fb497286a504e45968290f14b49b982f250 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 22 Oct 2024 12:48:01 -0400 Subject: [PATCH 01/64] Reorder SlurmCommandGenStrategy methods --- .../strategy/slurm_command_gen_strategy.py | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index 506e83f1..71562496 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -57,22 +57,14 @@ def __init__(self, system: SlurmSystem, cmd_args: Dict[str, Any]) -> None: ) self.docker_image_url = self.cmd_args.get("docker_image_url", "") - def _format_env_vars(self, env_vars: Dict[str, Any]) -> str: - """ - Format environment variables for inclusion in a batch script. - - Args: - env_vars (Dict[str, Any]): Environment variables to format. - - Returns: - str: A string representation of the formatted environment variables. - """ - formatted_vars = [] - for key in sorted(env_vars.keys()): - value = env_vars[key] - formatted_value = str(value["default"]) if isinstance(value, dict) and "default" in value else str(value) - formatted_vars.append(f"export {key}={formatted_value}") - return "\n".join(formatted_vars) + def gen_exec_command(self, tr: TestRun) -> str: + env_vars = self._override_env_vars(self.system.global_env_vars, tr.test.extra_env_vars) + cmd_args = self._override_cmd_args(self.default_cmd_args, tr.test.cmd_args) + slurm_args = self._parse_slurm_args( + tr.test.test_template.__class__.__name__, env_vars, cmd_args, tr.num_nodes, tr.nodes + ) + srun_command = self.generate_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) + return self._write_sbatch_script(slurm_args, env_vars, srun_command, tr.output_path) def _parse_slurm_args( self, @@ -139,15 +131,6 @@ def generate_srun_prefix(self, slurm_args: Dict[str, Any]) -> List[str]: return srun_command_parts - def gen_exec_command(self, tr: TestRun) -> str: - env_vars = self._override_env_vars(self.system.global_env_vars, tr.test.extra_env_vars) - cmd_args = self._override_cmd_args(self.default_cmd_args, tr.test.cmd_args) - slurm_args = self._parse_slurm_args( - tr.test.test_template.__class__.__name__, env_vars, cmd_args, tr.num_nodes, tr.nodes - ) - srun_command = self.generate_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) - return self._write_sbatch_script(slurm_args, env_vars, srun_command, tr.output_path) - def generate_test_command( self, env_vars: Dict[str, str], cmd_args: Dict[str, str], extra_cmd_args: str ) -> List[str]: @@ -237,3 +220,20 @@ def _append_sbatch_directives( batch_script_content.append( "\nexport SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)" ) + + def _format_env_vars(self, env_vars: Dict[str, Any]) -> str: + """ + Format environment variables for inclusion in a batch script. + + Args: + env_vars (Dict[str, Any]): Environment variables to format. + + Returns: + str: A string representation of the formatted environment variables. 
+ """ + formatted_vars = [] + for key in sorted(env_vars.keys()): + value = env_vars[key] + formatted_value = str(value["default"]) if isinstance(value, dict) and "default" in value else str(value) + formatted_vars.append(f"export {key}={formatted_value}") + return "\n".join(formatted_vars) From 38bb8a7b5a23431aace8a3803a63ce8889565293 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 22 Oct 2024 13:45:01 -0400 Subject: [PATCH 02/64] Rename generate_srun_command to _gen_srun_command --- .../jax_toolbox/slurm_command_gen_strategy.py | 2 +- .../systems/slurm/strategy/slurm_command_gen_strategy.py | 8 ++++---- .../test_common_slurm_command_gen_strategy.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py index fc21554e..7a2616e9 100644 --- a/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py @@ -152,7 +152,7 @@ def _parse_slurm_args( return base_args - def generate_srun_command( + def _gen_srun_command( self, slurm_args: Dict[str, Any], env_vars: Dict[str, str], cmd_args: Dict[str, Any], extra_cmd_args: str ) -> str: self._create_run_script(env_vars, cmd_args, extra_cmd_args) diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index 71562496..3b7a0649 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -63,7 +63,7 @@ def gen_exec_command(self, tr: TestRun) -> str: slurm_args = self._parse_slurm_args( tr.test.test_template.__class__.__name__, env_vars, cmd_args, tr.num_nodes, tr.nodes ) - srun_command = self.generate_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) + srun_command = self._gen_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) return self._write_sbatch_script(slurm_args, env_vars, srun_command, tr.output_path) def _parse_slurm_args( @@ -112,14 +112,14 @@ def job_name(self, job_name_prefix: str) -> str: job_name = f"{self.system.account}-{job_name_prefix}.{datetime.now().strftime('%Y%m%d_%H%M%S')}" return job_name - def generate_srun_command( + def _gen_srun_command( self, slurm_args: Dict[str, Any], env_vars: Dict[str, str], cmd_args: Dict[str, str], extra_cmd_args: str ) -> str: - srun_command_parts = self.generate_srun_prefix(slurm_args) + srun_command_parts = self.gen_srun_prefix(slurm_args) test_command_parts = self.generate_test_command(env_vars, cmd_args, extra_cmd_args) return " \\\n".join(srun_command_parts + test_command_parts) - def generate_srun_prefix(self, slurm_args: Dict[str, Any]) -> List[str]: + def gen_srun_prefix(self, slurm_args: Dict[str, Any]) -> List[str]: srun_command_parts = ["srun", f"--mpi={self.system.mpi}"] if slurm_args.get("image_path"): srun_command_parts.append(f'--container-image={slurm_args["image_path"]}') diff --git a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py index 0ea7fc38..36db4473 100644 --- a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py @@ -56,7 +56,7 @@ def 
test_filename_generation(strategy_fixture: SlurmCommandGenStrategy, testrun_ slurm_args = strategy_fixture._parse_slurm_args( job_name_prefix, env_vars, cmd_args, testrun_fixture.num_nodes, testrun_fixture.nodes ) - srun_command = strategy_fixture.generate_srun_command(slurm_args, env_vars, cmd_args, "") + srun_command = strategy_fixture._gen_srun_command(slurm_args, env_vars, cmd_args, "") sbatch_command = strategy_fixture._write_sbatch_script( slurm_args, env_vars, srun_command, testrun_fixture.output_path From 57192301307b4155e54b94ab83ee023b695a3f25 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 23 Oct 2024 12:05:42 -0400 Subject: [PATCH 03/64] Remove pre-test implementation from JaxToolbox --- .../jax_toolbox/slurm_command_gen_strategy.py | 93 ---------------- src/cloudai/test_definitions/gpt.py | 5 +- src/cloudai/test_definitions/grok.py | 5 +- src/cloudai/test_definitions/jax_toolbox.py | 34 +----- tests/ref_data/gpt-pretest.sbatch | 55 --------- .../{gpt-no-pretest.sbatch => gpt.sbatch} | 2 +- tests/ref_data/grok-pretest.sbatch | 55 --------- .../{grok-no-pretest.sbatch => grok.sbatch} | 2 +- ..._jax_toolbox_slurm_command_gen_strategy.py | 105 +----------------- tests/test_acceptance.py | 18 +-- 10 files changed, 16 insertions(+), 358 deletions(-) delete mode 100644 tests/ref_data/gpt-pretest.sbatch rename tests/ref_data/{gpt-no-pretest.sbatch => gpt.sbatch} (96%) delete mode 100644 tests/ref_data/grok-pretest.sbatch rename tests/ref_data/{grok-no-pretest.sbatch => grok.sbatch} (98%) diff --git a/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py index 7a2616e9..49f4e772 100644 --- a/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py @@ -158,25 +158,11 @@ def _gen_srun_command( self._create_run_script(env_vars, cmd_args, extra_cmd_args) commands = [] - - run_pre_test = cmd_args.get("pre_test.enable", False) - - if run_pre_test: - output_path = Path(cmd_args["output_path"]).resolve() / "output_pretest-%j-%n-%t.txt" - error_path = Path(cmd_args["output_path"]).resolve() / "error_pretest-%j-%n-%t.txt" - commands.append(self._generate_pre_test_command(cmd_args, output_path, error_path)) - commands.append(self._generate_pre_test_check_command(cmd_args, output_path)) - commands.append('if [ "$PRE_TEST_SUCCESS" = true ]; then') - load_container = cmd_args.get("load_container", False) if load_container: commands += self._generate_container_load_command(slurm_args) - commands += self._generate_run_command(slurm_args) - if run_pre_test: - commands.append("fi") - return "\n".join(commands) def _create_run_script( @@ -347,85 +333,6 @@ def _create_pgo_nsys_converter_command(self, stage: str, cmd_args: Dict[str, str ["", 'if [ "$SLURM_NODEID" -eq 0 ] && [ "$SLURM_PROCID" -eq 0 ]; then', f" {command}", "fi"] ) - def _generate_pre_test_command(self, cmd_args: Dict[str, Any], output_path: Path, error_path: Path) -> str: - """ - Generate the pre-test command for running a test. - - This method constructs the pre-test command based on the command-line - arguments provided. - - Args: - cmd_args (Dict[str, Any]): A dictionary containing command arguments. - output_path (Path): The path to the output file. - error_path (Path): The path to the error file. - - Returns: - str: The generated pre-test command. 
- """ - nccl_test_prefix = "pre_test.nccl_test." - nccl_test = {} - - for key, value in cmd_args.items(): - if key.startswith(nccl_test_prefix): - flag_name = key[len(nccl_test_prefix) :] - nccl_test[flag_name] = value - pre_test_command_parts = [ - "srun", - "--mpi=pmix", - f"-N {nccl_test.get('num_nodes', 2)}", - f"-o {output_path}", - f"-e {error_path}", - f"--container-image={nccl_test.get('docker_image_url', 'nvcr.io/nvidia/pytorch:24.02-py3')}", - f"/usr/local/bin/{nccl_test.get('subtest_name', 'all_gather_perf_mpi')}", - f"--nthreads {nccl_test.get('nthreads', 1)}", - f"--ngpus {nccl_test.get('ngpus', 1)}", - f"--minbytes {nccl_test.get('minbytes', '32M')}", - f"--maxbytes {nccl_test.get('maxbytes', '16G')}", - f"--stepbytes {nccl_test.get('stepbytes', '1M')}", - f"--op {nccl_test.get('op', 'sum')}", - f"--datatype {nccl_test.get('datatype', 'float')}", - f"--root {nccl_test.get('root', 0)}", - f"--iters {nccl_test.get('iters', 20)}", - f"--warmup_iters {nccl_test.get('warmup_iters', 5)}", - f"--agg_iters {nccl_test.get('agg_iters', 1)}", - f"--average {nccl_test.get('average', 1)}", - f"--parallel_init {nccl_test.get('parallel_init', 0)}", - f"--check {nccl_test.get('check', 1)}", - f"--blocking {nccl_test.get('blocking', 0)}", - f"--cudagraph {nccl_test.get('cudagraph', 0)}", - f"--stepfactor {nccl_test.get('stepfactor', 2)}", - ] - return " \\\n".join(pre_test_command_parts) - - def _generate_pre_test_check_command(self, cmd_args: Dict[str, str], output_path: Path) -> str: - """ - Generate the command for pre-test check. - - This method generates the command that checks the output of the pre-test to determine if the main test should - be run. - - Args: - cmd_args (Dict[str, str]): Command-line arguments for the job. - output_path (str): The path to the output file. - - Returns: - str: The generated command for pre-test check. 
- """ - pretest_output_files = str(Path(output_path).parent / "output_pretest-*.txt") - keyword = cmd_args.get("keyword", "Avg bus bandwidth") - - return "\n".join( - [ - f'PRETEST_OUTPUT_FILES="{pretest_output_files}"', - f'keyword="{keyword}"', - "", - "# Use grep to search for the keyword in the files", - 'if grep -q "$keyword" $PRETEST_OUTPUT_FILES; then', - " PRE_TEST_SUCCESS=true", - "fi", - ] - ) - def _generate_container_load_command(self, slurm_args: Dict[str, Any]) -> List[str]: """Generate the command for loading a container.""" container_image = slurm_args.get("image_path") diff --git a/src/cloudai/test_definitions/gpt.py b/src/cloudai/test_definitions/gpt.py index 35736ebb..5b003d55 100644 --- a/src/cloudai/test_definitions/gpt.py +++ b/src/cloudai/test_definitions/gpt.py @@ -16,7 +16,7 @@ from pydantic import Field -from .jax_toolbox import JaxFdl, JaxToolboxCmdArgs, JaxToolboxTestDefinition, PreTest, SetupFlags, XLAFlags +from .jax_toolbox import JaxFdl, JaxToolboxCmdArgs, JaxToolboxTestDefinition, SetupFlags, XLAFlags class GPTFdl(JaxFdl): @@ -43,7 +43,6 @@ class GPTCmdArgs(JaxToolboxCmdArgs): fdl_config: str fdl: GPTFdl = Field(default_factory=GPTFdl) - pre_test: PreTest = Field(default_factory=PreTest) xla_flags: GPTXLAFlags = Field(default_factory=GPTXLAFlags) setup_flags: GPTSetupFlags = Field(default_factory=GPTSetupFlags) @@ -58,7 +57,7 @@ def cmd_args_dict(self): d = self.cmd_args.model_dump() res = {} for k, v in d.items(): - if k in {"pre_test", "docker_image_url", "load_container", "output_path"}: + if k in {"docker_image_url", "load_container", "output_path"}: res[k] = v else: if k == "xla_flags": diff --git a/src/cloudai/test_definitions/grok.py b/src/cloudai/test_definitions/grok.py index f5e7f19c..9e42d7a2 100644 --- a/src/cloudai/test_definitions/grok.py +++ b/src/cloudai/test_definitions/grok.py @@ -16,7 +16,7 @@ from pydantic import ConfigDict, Field -from .jax_toolbox import JaxFdl, JaxToolboxCmdArgs, JaxToolboxTestDefinition, PreTest, SetupFlags, XLAFlags +from .jax_toolbox import JaxFdl, JaxToolboxCmdArgs, JaxToolboxTestDefinition, SetupFlags, XLAFlags class GrokFdl(JaxFdl): @@ -72,7 +72,6 @@ class GrokCmdArgs(JaxToolboxCmdArgs): setup_flags: SetupFlags = Field(default_factory=SetupFlags) profile: GrokProfileXLAFlags = Field(default_factory=GrokProfileXLAFlags) perf: GrokPerfXLAFlags = Field(default_factory=GrokPerfXLAFlags) - pre_test: PreTest = Field(default_factory=PreTest) class GrokTestDefinition(JaxToolboxTestDefinition): @@ -91,7 +90,7 @@ def cmd_args_dict(self): if k in {"profile", "perf"}: res.setdefault(f"Grok.{k}", {}) res[f"Grok.{k}"]["XLA_FLAGS"] = v - elif k in {"pre_test", "docker_image_url", "load_container", "output_path"}: + elif k in {"docker_image_url", "load_container", "output_path"}: res[k] = v else: res[f"Grok.{k}"] = v diff --git a/src/cloudai/test_definitions/jax_toolbox.py b/src/cloudai/test_definitions/jax_toolbox.py index 079e5b4e..4593028a 100644 --- a/src/cloudai/test_definitions/jax_toolbox.py +++ b/src/cloudai/test_definitions/jax_toolbox.py @@ -14,12 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any, Optional +from typing import Optional -from pydantic import BaseModel, ConfigDict, Field, field_serializer +from pydantic import BaseModel, ConfigDict, field_serializer from cloudai import CmdArgs, TestDefinition -from cloudai.test_definitions.nccl import NCCLCmdArgs class JaxFdl(BaseModel): @@ -54,35 +53,6 @@ def checkpoint_policy_serializer(self, value: str) -> str: return f'\\"{value}\\"' -class NCCLCmdAgrsPreTest(NCCLCmdArgs): - """NCCL pre-test command arguments.""" - - num_nodes: int = 8 - stepfactor: int = 2 - minbytes: str = "8M" - maxbytes: str = "16G" - blocking: int = 1 - - def model_post_init(self, _: Any) -> None: - self.subtest_name = "all_gather_perf_mpi" - self.docker_image_url = "nvcr.io/nvidia/pytorch:24.02-py3" - - -class PreTest(BaseModel): - """Pre-test configuration.""" - - model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) - enable: bool = True - nccl_test: NCCLCmdAgrsPreTest = Field(default_factory=NCCLCmdAgrsPreTest) - - -class NCCLPreTest(BaseModel): - """Pre-test configuration.""" - - model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) - nccl_test: Optional[NCCLCmdAgrsPreTest] = None - - class JaxToolboxCmdArgs(CmdArgs): """JAX Toolbox test command arguments.""" diff --git a/tests/ref_data/gpt-pretest.sbatch b/tests/ref_data/gpt-pretest.sbatch deleted file mode 100644 index 17c2f53b..00000000 --- a/tests/ref_data/gpt-pretest.sbatch +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=__JOB_NAME__ -#SBATCH -N 1 -#SBATCH --partition=main - -export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) -export COMBINE_THRESHOLD=1 -export PER_GPU_COMBINE_THRESHOLD=0 -export XLA_FLAGS="--xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD" - -srun \ ---mpi=pmix \ --N 8 \ --o __OUTPUT_DIR__/output_pretest-%j-%n-%t.txt \ --e __OUTPUT_DIR__/error_pretest-%j-%n-%t.txt \ ---container-image=nvcr.io/nvidia/pytorch:24.02-py3 \ -/usr/local/bin/all_gather_perf_mpi \ ---nthreads 1 \ ---ngpus 1 \ ---minbytes 8M \ ---maxbytes 16G \ ---stepbytes 1M \ ---op sum \ ---datatype float \ ---root 0 \ ---iters 20 \ ---warmup_iters 5 \ ---agg_iters 1 \ ---average 1 \ ---parallel_init 0 \ ---check 1 \ ---blocking 1 \ ---cudagraph 0 \ ---stepfactor 2 -PRETEST_OUTPUT_FILES="__OUTPUT_DIR__/output_pretest-*.txt" -keyword="Avg bus bandwidth" - -# Use grep to search for the keyword in the files -if grep -q "$keyword" $PRETEST_OUTPUT_FILES; then - PRE_TEST_SUCCESS=true -fi -if [ "$PRE_TEST_SUCCESS" = true ]; then - echo "Loading container with srun command" - srun --mpi=none --container-image=https:/docker/url --container-name=cont true - echo "Running srun command" - srun \ - --mpi=none \ - \ - --export=ALL \ - -o __OUTPUT_DIR__/output-%j-%n-%t.txt \ - -e __OUTPUT_DIR__/error-%j-%n-%t.txt \ - --container-name=cont \ - --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ - /opt/paxml/workspace/run.sh -fi \ No newline at end of file diff --git a/tests/ref_data/gpt-no-pretest.sbatch b/tests/ref_data/gpt.sbatch similarity index 96% rename from tests/ref_data/gpt-no-pretest.sbatch rename to tests/ref_data/gpt.sbatch index 30b48294..d8789804 100644 --- a/tests/ref_data/gpt-no-pretest.sbatch +++ b/tests/ref_data/gpt.sbatch @@ -19,4 +19,4 @@ export XLA_FLAGS="--xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOL -e 
__OUTPUT_DIR__/error-%j-%n-%t.txt \ --container-name=cont \ --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ - /opt/paxml/workspace/run.sh \ No newline at end of file + /opt/paxml/workspace/run.sh diff --git a/tests/ref_data/grok-pretest.sbatch b/tests/ref_data/grok-pretest.sbatch deleted file mode 100644 index 661ddd0b..00000000 --- a/tests/ref_data/grok-pretest.sbatch +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=__JOB_NAME__ -#SBATCH -N 1 -#SBATCH --partition=main - -export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) -export COMBINE_THRESHOLD=1 -export PER_GPU_COMBINE_THRESHOLD=0 -export XLA_FLAGS="--xla_disable_hlo_passes=rematerialization --xla_dump_hlo_pass_re=.* --xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_pipelined_all_gather=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_gpu_enable_triton_gemm=false --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_graph_level=0 --xla_gpu_pgle_profile_file_or_directory_path=/opt/paxml/workspace/pgle_output_profile.pbtxt --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD --xla_gpu_run_post_layout_collective_pipeliner=false --xla_gpu_use_memcpy_local_p2p=false" - -srun \ ---mpi=pmix \ --N 8 \ --o __OUTPUT_DIR__/output_pretest-%j-%n-%t.txt \ --e __OUTPUT_DIR__/error_pretest-%j-%n-%t.txt \ ---container-image=nvcr.io/nvidia/pytorch:24.02-py3 \ -/usr/local/bin/all_gather_perf_mpi \ ---nthreads 1 \ ---ngpus 1 \ ---minbytes 8M \ ---maxbytes 16G \ ---stepbytes 1M \ ---op sum \ ---datatype float \ ---root 0 \ ---iters 20 \ ---warmup_iters 5 \ ---agg_iters 1 \ ---average 1 \ ---parallel_init 0 \ ---check 1 \ ---blocking 1 \ ---cudagraph 0 \ ---stepfactor 2 -PRETEST_OUTPUT_FILES="__OUTPUT_DIR__/output_pretest-*.txt" -keyword="Avg bus bandwidth" - -# Use grep to search for the keyword in the files -if grep -q "$keyword" $PRETEST_OUTPUT_FILES; then - PRE_TEST_SUCCESS=true -fi -if [ "$PRE_TEST_SUCCESS" = true ]; then - echo "Loading container with srun command" - srun --mpi=none --container-image=https:/docker/url --container-name=cont true - echo "Running srun command" - srun \ - --mpi=none \ - \ - --export=ALL \ - -o __OUTPUT_DIR__/output-%j-%n-%t.txt \ - -e __OUTPUT_DIR__/error-%j-%n-%t.txt \ - --container-name=cont \ - --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ - /opt/paxml/workspace/run.sh -fi \ No newline at end of file diff --git a/tests/ref_data/grok-no-pretest.sbatch b/tests/ref_data/grok.sbatch similarity index 98% rename from tests/ref_data/grok-no-pretest.sbatch rename to tests/ref_data/grok.sbatch index 725d29fa..808973bb 100644 --- a/tests/ref_data/grok-no-pretest.sbatch +++ b/tests/ref_data/grok.sbatch @@ -19,4 +19,4 @@ export XLA_FLAGS="--xla_disable_hlo_passes=rematerialization --xla_dump_hlo_pass -e __OUTPUT_DIR__/error-%j-%n-%t.txt \ --container-name=cont \ --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ - /opt/paxml/workspace/run.sh \ No newline at end of file + /opt/paxml/workspace/run.sh diff --git a/tests/slurm_command_gen_strategy/test_jax_toolbox_slurm_command_gen_strategy.py 
b/tests/slurm_command_gen_strategy/test_jax_toolbox_slurm_command_gen_strategy.py index 9e134c26..99935121 100644 --- a/tests/slurm_command_gen_strategy/test_jax_toolbox_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_jax_toolbox_slurm_command_gen_strategy.py @@ -25,7 +25,7 @@ from cloudai.systems import SlurmSystem from cloudai.test_definitions.gpt import GPTCmdArgs, GPTTestDefinition from cloudai.test_definitions.grok import GrokCmdArgs, GrokTestDefinition -from cloudai.test_definitions.jax_toolbox import JaxFdl, PreTest +from cloudai.test_definitions.jax_toolbox import JaxFdl class TestJaxToolboxSlurmCommandGenStrategy: @@ -63,7 +63,6 @@ def test_gen_exec_command( test_fixture, ) -> None: test_def = request.getfixturevalue(test_fixture) - test_def.cmd_args.pre_test = PreTest(enable=True) test = Test(test_definition=test_def, test_template=JaxToolbox(slurm_system, "name")) test_run = TestRun( @@ -74,14 +73,10 @@ def test_gen_exec_command( name="test-job", ) - cmd_gen_strategy._generate_pre_test_command = MagicMock(return_value="pre_test_command") cmd = cmd_gen_strategy.gen_exec_command(test_run) assert cmd == f"sbatch {test_run.output_path}/cloudai_sbatch_script.sh" assert (test_run.output_path / "run.sh").exists() - content = Path(f"{test_run.output_path}/cloudai_sbatch_script.sh").read_text() - assert "pre_test_command" in content - @pytest.mark.parametrize( "cmd_args, expected", [ @@ -215,100 +210,6 @@ def test_generate_python_command( "fi", ] - def test_generate_pre_test_command( - self, cmd_gen_strategy: JaxToolboxSlurmCommandGenStrategy, grok_test: GrokTestDefinition, tmp_path: Path - ) -> None: - grok_test.cmd_args.pre_test = PreTest(enable=True) - - nccl_test = grok_test.cmd_args.pre_test.nccl_test - nccl_test.num_nodes = 2 - nccl_test.minbytes = "32M" - nccl_test.blocking = 0 - - cargs = {"output_path": str(tmp_path), **grok_test.cmd_args_dict} - - pre_test_cli = cmd_gen_strategy._generate_pre_test_command(cargs, tmp_path, tmp_path).splitlines() - - expected_pre_test_cli = [ - "srun \\", - "--mpi=pmix \\", - f"-N {nccl_test.num_nodes} \\", - f"-o {tmp_path} \\", - f"-e {tmp_path} \\", - f"--container-image={nccl_test.docker_image_url} \\", - f"/usr/local/bin/{nccl_test.subtest_name} \\", - f"--nthreads {nccl_test.nthreads} \\", - f"--ngpus {nccl_test.ngpus} \\", - f"--minbytes {nccl_test.minbytes} \\", - f"--maxbytes {nccl_test.maxbytes} \\", - f"--stepbytes {nccl_test.stepbytes} \\", - f"--op {nccl_test.op} \\", - f"--datatype {nccl_test.datatype} \\", - f"--root {nccl_test.root} \\", - f"--iters {nccl_test.iters} \\", - f"--warmup_iters {nccl_test.warmup_iters} \\", - f"--agg_iters {nccl_test.agg_iters} \\", - f"--average {nccl_test.average} \\", - f"--parallel_init {nccl_test.parallel_init} \\", - f"--check {nccl_test.check} \\", - f"--blocking {nccl_test.blocking} \\", - f"--cudagraph {nccl_test.cudagraph} \\", - f"--stepfactor {nccl_test.stepfactor}", - ] - - assert pre_test_cli == expected_pre_test_cli, ( - "The generated pre-test command did not match the expected command.\n" - f"Expected: {expected_pre_test_cli}\n" - f"Actual: {pre_test_cli}" - ) - - def test_generate_srun_command(self, slurm_system, cmd_gen_strategy, grok_test): - cmd_gen_strategy.test_name = grok_test.name - Path("/tmp/output").mkdir(parents=True, exist_ok=True) - - output_path = Path("/tmp/output/output") - output_path.mkdir(parents=True, exist_ok=True) - - # Use the existing setup for mocking internal methods - cmd_gen_strategy._generate_pre_test_command = 
MagicMock(return_value="srun --mpi=none pre_test_command") - cmd_gen_strategy._generate_run_command = MagicMock(return_value="srun --mpi=none run_command") - cmd_gen_strategy._generate_container_load_command = MagicMock( - return_value="srun --mpi=none container_load_command" - ) - - slurm_args = { - "output": "/tmp/output/output-%j.txt", - "error": "/tmp/output/error-%j.txt", - "image_path": "fake_image_url", - "container_mounts": "/tmp/output:/workspace", - } - cmd_args = { - "output_path": "/tmp/output", - "pre_test": {"enable": True}, - f"{grok_test.name}.setup_flags.docker_workspace_dir": "/workspace/docker", - f"{grok_test.name}.setup_flags.tfds_data_dir": "/workspace/tfds", - f"{grok_test.name}.setup_flags.enable_checkpoint_saving": True, - } - - pre_test_command = cmd_gen_strategy._generate_pre_test_command( - cmd_args, Path("/tmp/output"), Path("/tmp/output") - ) - run_command = cmd_gen_strategy._generate_run_command(slurm_args) - container_load_command = cmd_gen_strategy._generate_container_load_command(slurm_args) - - result_command = f"{pre_test_command}\n{container_load_command}\n{run_command}" - - # Assert expected parts of the command are in the generated result - assert "pre_test_command" in result_command - assert "container_load_command" in result_command - assert "run_command" in result_command - assert "srun" in result_command - assert "--mpi=none" in result_command - - cmd_gen_strategy._generate_pre_test_command.assert_called_once() - cmd_gen_strategy._generate_run_command.assert_called_once() - cmd_gen_strategy._generate_container_load_command.assert_called_once() - def test_gpt_test_definition_cmd_args_dict(): gpt = GPTTestDefinition( @@ -324,7 +225,7 @@ def test_gpt_test_definition_cmd_args_dict(): assert "GPT.setup_flags" in cargs assert "GPT.XLA_FLAGS" in cargs - for k in {"pre_test", "docker_image_url", "load_container"}: + for k in {"docker_image_url", "load_container"}: assert k in cargs assert f"GPT.{k}" not in cargs @@ -348,7 +249,7 @@ def test_grok_test_definition_cmd_args_dict(): assert "Grok.perf" in cargs assert "XLA_FLAGS" in cargs["Grok.perf"] - for k in {"pre_test", "docker_image_url", "load_container"}: + for k in {"docker_image_url", "load_container"}: assert k in cargs assert f"Grok.{k}" not in cargs diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 36f5e6cd..384c22c5 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -142,7 +142,7 @@ def partial_tr(slurm_system: SlurmSystem) -> partial[TestRun]: return partial(TestRun, num_nodes=1, nodes=[], output_path=slurm_system.output_path) -@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt-pretest", "gpt-no-pretest", "grok-pretest", "grok-no-pretest"]) +@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt", "grok"]) def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -> tuple[TestRun, str, Optional[str]]: if request.param == "ucc": tr = partial_tr( @@ -192,7 +192,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") return (tr, "sleep.sbatch", None) - elif request.param.startswith("gpt-"): + elif request.param.startswith("gpt"): tr = partial_tr( name="gpt", test=Test( @@ -210,13 +210,9 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict ) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - if "no-pretest" in 
request.param: - tr.test.test_definition.cmd_args.pre_test.enable = False - else: - tr.test.test_definition.cmd_args.pre_test.enable = True return (tr, f"{request.param}.sbatch", "gpt.run") - elif request.param.startswith("grok-"): + elif request.param.startswith("grok"): tr = partial_tr( name="grok", test=Test( @@ -234,10 +230,6 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict ) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - if "no-pretest" in request.param: - tr.test.test_definition.cmd_args.pre_test.enable = False - else: - tr.test.test_definition.cmd_args.pre_test.enable = True return (tr, f"{request.param}.sbatch", "grok.run") @@ -251,8 +243,8 @@ def test_sbatch_generation(slurm_system: SlurmSystem, test_req: tuple[TestRun, s sbatch_script = tr.test.test_template.gen_exec_command(tr).split()[-1] - curr = Path(sbatch_script).read_text() - ref = (Path(__file__).parent / "ref_data" / test_req[1]).read_text() + curr = Path(sbatch_script).read_text().strip() + ref = (Path(__file__).parent / "ref_data" / test_req[1]).read_text().strip() ref = ref.replace("__OUTPUT_DIR__", str(slurm_system.output_path)).replace("__JOB_NAME__", "job_name") assert curr == ref From 8e8ee3ec33d5ec1001dfbe4225c2cf0c72adcdbf Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 23 Oct 2024 07:24:23 -0400 Subject: [PATCH 04/64] Add prologue and epilogue to _TestScenarioTOML --- src/cloudai/_core/test_scenario_parser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index 08526dca..16302a2a 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -54,6 +54,8 @@ class _TestScenarioTOML(BaseModel): name: str job_status_check: bool = True tests: list[_TestRunTOML] = Field(alias="Tests", min_length=1) + prologue: str = "" + epilogue: str = "" @model_validator(mode="after") def check_no_self_dependency(self): From 32d7d93a30ec2c1139de646af0dc66ec12b79847 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 22 Oct 2024 14:50:35 -0400 Subject: [PATCH 05/64] Add example plugin files --- conf/common/plugin/nccl_test_epilogue.toml | 22 ++++++++++++++++++++++ conf/common/plugin/nccl_test_prologue.toml | 22 ++++++++++++++++++++++ conf/common/test_scenario/nccl_test.toml | 4 ++++ 3 files changed, 48 insertions(+) create mode 100644 conf/common/plugin/nccl_test_epilogue.toml create mode 100644 conf/common/plugin/nccl_test_prologue.toml diff --git a/conf/common/plugin/nccl_test_epilogue.toml b/conf/common/plugin/nccl_test_epilogue.toml new file mode 100644 index 00000000..346dc8e4 --- /dev/null +++ b/conf/common/plugin/nccl_test_epilogue.toml @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +name = "nccl_test_epilogue" + +[[Tests]] +id = "Tests.1" +test_name = "nccl_test_all_gather" +time_limit = "00:20:00" diff --git a/conf/common/plugin/nccl_test_prologue.toml b/conf/common/plugin/nccl_test_prologue.toml new file mode 100644 index 00000000..e5c1a1e4 --- /dev/null +++ b/conf/common/plugin/nccl_test_prologue.toml @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "nccl_test_prologue" + +[[Tests]] +id = "Tests.1" +test_name = "nccl_test_all_reduce" +time_limit = "00:20:00" diff --git a/conf/common/test_scenario/nccl_test.toml b/conf/common/test_scenario/nccl_test.toml index f6ccf02c..9b731e96 100644 --- a/conf/common/test_scenario/nccl_test.toml +++ b/conf/common/test_scenario/nccl_test.toml @@ -15,6 +15,10 @@ # limitations under the License. name = "nccl-test" + +prologue = "nccl_test_prologue" +epilogue = "nccl_test_epilogue" + [[Tests]] id = "Tests.1" test_name = "nccl_test_all_reduce" From 28a38b841b4cf3473c585fc95e11f09f73a9ff23 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 23 Oct 2024 07:35:01 -0400 Subject: [PATCH 06/64] Add plugin option to CLI --- src/cloudai/cli/cli.py | 4 ++ tests/test_cli.py | 92 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 94 insertions(+), 2 deletions(-) diff --git a/src/cloudai/cli/cli.py b/src/cloudai/cli/cli.py index 53059799..f962a2d5 100644 --- a/src/cloudai/cli/cli.py +++ b/src/cloudai/cli/cli.py @@ -60,6 +60,7 @@ def add_command( handler: Callable[[argparse.Namespace], int], system_config: Optional[bool] = None, tests_dir: Optional[bool] = None, + plugin_dir: Optional[bool] = None, test_scenario: Optional[bool] = None, output_dir: Optional[bool] = None, result_dir: Optional[bool] = None, @@ -74,6 +75,8 @@ def add_command( p.add_argument( "--tests-dir", help="Path to the test configuration directory.", required=tests_dir, type=Path ) + if plugin_dir is not None: + p.add_argument("--plugin-dir", help="Path to the plugin directory.", required=plugin_dir, type=Path) if test_scenario is not None: p.add_argument("--test-scenario", help="Path to the test scenario file.", required=test_scenario, type=Path) if output_dir is not None: @@ -127,6 +130,7 @@ def add_run_and_dry_run(self): handle_dry_run_and_run, system_config=True, tests_dir=True, + plugin_dir=False, test_scenario=True, output_dir=False, ) diff --git a/tests/test_cli.py b/tests/test_cli.py index 538e497f..bb6c1a5d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -20,8 +20,8 @@ import pytest -from cloudai.cli import CloudAICLI, handle_generate_report, handle_install_and_uninstall -from cloudai.cli.handlers import handle_verify_all_configs +from cloudai.cli import CloudAICLI +from cloudai.cli.handlers import 
handle_generate_report, handle_install_and_uninstall, handle_verify_all_configs def test_help_message(capsys: pytest.CaptureFixture[str]) -> None: @@ -108,6 +108,7 @@ def test_add_command_all_optional(): lambda _: 0, system_config=False, tests_dir=False, + plugin_dir=False, test_scenario=False, output_dir=False, ) @@ -118,6 +119,7 @@ def test_add_command_all_optional(): mode="test", system_config=None, tests_dir=None, + plugin_dir=None, test_scenario=None, output_dir=None, ) @@ -132,6 +134,7 @@ def test_add_command_all_required(): lambda _: 0, system_config=True, tests_dir=True, + plugin_dir=True, test_scenario=True, output_dir=True, ) @@ -142,6 +145,8 @@ def test_add_command_all_required(): "system_config", "--tests-dir", "tests_dir", + "--plugin-dir", + "plugin_dir", "--test-scenario", "test_scenario", "--output-dir", @@ -154,11 +159,91 @@ def test_add_command_all_required(): mode="test", system_config=Path("system_config"), tests_dir=Path("tests_dir"), + plugin_dir=Path("plugin_dir"), test_scenario=Path("test_scenario"), output_dir=Path("output_dir"), ) +@pytest.mark.parametrize( + "mode,args,expected_plugin_dir", + [ + ( + "run", + [ + "run", + "--system-config", + "system_config", + "--tests-dir", + "tests_dir", + "--plugin-dir", + "plugin_dir", + "--test-scenario", + "test_scenario", + ], + Path("plugin_dir"), + ), + ( + "run", + [ + "run", + "--system-config", + "system_config", + "--tests-dir", + "tests_dir", + "--test-scenario", + "test_scenario", + ], + None, + ), + ( + "dry-run", + [ + "dry-run", + "--system-config", + "system_config", + "--tests-dir", + "tests_dir", + "--plugin-dir", + "plugin_dir", + "--test-scenario", + "test_scenario", + ], + Path("plugin_dir"), + ), + ( + "dry-run", + [ + "dry-run", + "--system-config", + "system_config", + "--tests-dir", + "tests_dir", + "--test-scenario", + "test_scenario", + ], + None, + ), + ], +) +def test_modes_with_or_without_plugin_dir(mode, args, expected_plugin_dir): + cli = CloudAICLI() + + cli.add_command( + mode, + f"{mode} command", + lambda _: 0, + system_config=True, + tests_dir=True, + plugin_dir=False, + test_scenario=True, + output_dir=False, + ) + + parsed_args = cli.parser.parse_args(args) + assert parsed_args.plugin_dir == expected_plugin_dir + + def test_real_uninstall(): cli = CloudAICLI() cli.init_default_args() @@ -277,6 +362,8 @@ def test_run_dry_run_modes(self, cli: CloudAICLI): "tests_dir", "--test-scenario", "test_scenario", + "--plugin-dir", + "plugin_dir", ] ) @@ -286,6 +373,7 @@ def test_run_dry_run_modes(self, cli: CloudAICLI): mode=mode, system_config=Path("system_config"), tests_dir=Path("tests_dir"), + plugin_dir=Path("plugin_dir"), test_scenario=Path("test_scenario"), output_dir=None, ) From bb3275fc6ea511f0a723add1909c46db1f3fa8a0 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 23 Oct 2024 07:43:01 -0400 Subject: [PATCH 07/64] Parse plugins and pass them to TestRun --- src/cloudai/_core/test_scenario.py | 2 + src/cloudai/_core/test_scenario_parser.py | 23 +++++++- src/cloudai/cli/handlers.py | 2 +- src/cloudai/parser.py | 47 ++++++++++++--- tests/test_acceptance.py | 1 + tests/test_parser.py | 70 ++++++++++++++++++++++- tests/test_test_scenario.py | 2 +- 7 files changed, 131 insertions(+), 16 deletions(-) diff --git a/src/cloudai/_core/test_scenario.py b/src/cloudai/_core/test_scenario.py index 3a60c036..97c89994 100644 --- a/src/cloudai/_core/test_scenario.py +++ b/src/cloudai/_core/test_scenario.py @@ -58,6 +58,8 @@ class TestRun: weight: float = 
0.0 ideal_perf: float = 1.0 dependencies: dict[str, TestDependency] = field(default_factory=dict) + prologue: Optional["TestScenario"] = None + epilogue: Optional["TestScenario"] = None def __hash__(self) -> int: return hash(self.name + self.test.name + str(self.iterations) + str(self.current_iteration)) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index 16302a2a..c59adeba 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -101,9 +101,10 @@ class TestScenarioParser: __test__ = False - def __init__(self, file_path: Path, test_mapping: Dict[str, Test]) -> None: + def __init__(self, file_path: Path, test_mapping: Dict[str, Test], plugin_mapping: Dict[str, TestScenario]) -> None: self.file_path = file_path self.test_mapping = test_mapping + self.plugin_mapping = plugin_mapping def parse(self) -> TestScenario: """ @@ -138,8 +139,14 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario: total_weight = sum(tr.weight for tr in ts_model.tests) normalized_weight = 0 if total_weight == 0 else 100 / total_weight + prologue_name = data.get("prologue", "") + epilogue_name = data.get("epilogue", "") + + prologue = self.plugin_mapping.get(prologue_name, None) if prologue_name else None + epilogue = self.plugin_mapping.get(epilogue_name, None) if epilogue_name else None + testruns_by_id: dict[str, TestRun] = { - tr.id: self._create_section_test_run(tr, normalized_weight) for tr in ts_model.tests + tr.id: self._create_section_test_run(tr, normalized_weight, prologue, epilogue) for tr in ts_model.tests } tests_data: dict[str, _TestRunTOML] = {tr.id: tr for tr in ts_model.tests} @@ -155,13 +162,21 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario: job_status_check=ts_model.job_status_check, ) - def _create_section_test_run(self, test_info: _TestRunTOML, normalized_weight: float) -> TestRun: + def _create_section_test_run( + self, + test_info: _TestRunTOML, + normalized_weight: float, + prologue: Optional[TestScenario], + epilogue: Optional[TestScenario], + ) -> TestRun: """ Create a section-specific Test object by copying from the test mapping. Args: test_info (Dict[str, Any]): Information of the test. normalized_weight (float): Normalized weight for the test. + prologue (Optional[TestScenario]): TestScenario object representing the prologue sequence. + epilogue (Optional[TestScenario]): TestScenario object representing the epilogue sequence. Returns: Test: Copied and updated Test object for the section. @@ -194,5 +209,7 @@ def _create_section_test_run(self, test_info: _TestRunTOML, normalized_weight: f sol=test_info.sol, weight=test_info.weight * normalized_weight, ideal_perf=test_info.ideal_perf, + prologue=prologue if prologue is not None else TestScenario(name="default_prologue", test_runs=[]), + epilogue=epilogue if epilogue is not None else TestScenario(name="default_epilogue", test_runs=[]), ) return tr diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index 34fa9b0b..a204f70e 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -114,7 +114,7 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int: args (argparse.Namespace): The parsed command-line arguments. 
""" parser = Parser(args.system_config) - system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario) + system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario, args.plugin_dir) assert test_scenario is not None if args.output_dir: diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index a627b312..5b21ab3f 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -16,7 +16,7 @@ import logging from pathlib import Path -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Set, Tuple import toml from pydantic import ValidationError @@ -49,7 +49,7 @@ def __init__(self, system_config_path: Path) -> None: self.system_config_path = system_config_path def parse( - self, test_path: Path, test_scenario_path: Optional[Path] = None + self, test_path: Path, test_scenario_path: Optional[Path] = None, plugin_path: Optional[Path] = None ) -> Tuple[System, List[Test], Optional[TestScenario]]: """ Parse configurations for system, test templates, and test scenarios. @@ -74,21 +74,50 @@ def parse( logging.debug(f"Parsed {len(tests)} tests: {[t.name for t in tests]}") test_mapping = {t.name: t for t in tests} - filtered_tests = tests test_scenario: Optional[TestScenario] = None + scenario_test_names: Set[str] = set() if test_scenario_path: + plugin_mapping: Dict[str, TestScenario] = {} + plugin_test_names: Set[str] = set() + if plugin_path and plugin_path.exists(): + try: + plugin_mapping = self.parse_plugins(list(plugin_path.glob("*.toml")), test_mapping) + for plugin_scenario in plugin_mapping.values(): + plugin_test_names.update(tr.test.name for tr in plugin_scenario.test_runs) + except TestScenarioParsingError: + exit(1) + try: - test_scenario = self.parse_test_scenario(test_scenario_path, test_mapping) + test_scenario = self.parse_test_scenario(test_scenario_path, test_mapping, plugin_mapping) + scenario_test_names = set(tr.test.name for tr in test_scenario.test_runs) except TestScenarioParsingError: - exit(1) # exit right away to keep error message readable for users - scenario_tests = set(tr.test.name for tr in test_scenario.test_runs) - filtered_tests = [t for t in tests if t.name in scenario_tests] + exit(1) + + all_used_test_names = plugin_test_names.union(scenario_test_names) + filtered_tests = [t for t in tests if t.name in all_used_test_names] + else: + filtered_tests = tests return system, filtered_tests, test_scenario @staticmethod - def parse_test_scenario(test_scenario_path: Path, test_mapping: Dict[str, Test]) -> TestScenario: - test_scenario_parser = TestScenarioParser(test_scenario_path, test_mapping) + def parse_plugins(plugin_tomls: List[Path], test_mapping: Dict[str, Test]) -> Dict[str, TestScenario]: + plugin_mapping = {} + for plugin_path in plugin_tomls: + plugin_scenario = Parser.parse_test_scenario(plugin_path, test_mapping) + plugin_mapping[plugin_scenario.name] = plugin_scenario + return plugin_mapping + + @staticmethod + def parse_test_scenario( + test_scenario_path: Path, + test_mapping: Dict[str, Test], + plugin_mapping: Optional[Dict[str, TestScenario]] = None, + ) -> TestScenario: + if plugin_mapping is None: + plugin_mapping = {} + + test_scenario_parser = TestScenarioParser(test_scenario_path, test_mapping, plugin_mapping) test_scenario = test_scenario_parser.parse() return test_scenario diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 384c22c5..7c017ce5 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -60,6 +60,7 @@ def 
test_slurm(tmp_path: Path, scenario: Dict): system_config=Path("conf/common/system/example_slurm_cluster.toml"), test_templates_dir=Path("conf/common/test_template"), tests_dir=Path("conf/common/test"), + plugin_dir=Path("conf/common/plugin"), test_scenario=test_scenario_path, output_dir=tmp_path, ) diff --git a/tests/test_parser.py b/tests/test_parser.py index d35896a9..cb809d36 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -34,7 +34,7 @@ def parser(self, tmp_path: Path) -> Parser: def test_no_tests_dir(self, parser: Parser): tests_dir = parser.system_config_path.parent / "tests" with pytest.raises(FileNotFoundError) as exc_info: - parser.parse(tests_dir, None) + parser.parse(tests_dir, None, None) assert "Test path" in str(exc_info.value) @patch("cloudai._core.test_parser.TestParser.parse_all") @@ -50,19 +50,85 @@ def test_no_scenario(self, test_parser: Mock, parser: Parser): @patch("cloudai._core.test_parser.TestParser.parse_all") @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") - def test_scenario_filters_tests(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): + def test_scenario_without_plugin(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): tests_dir = parser.system_config_path.parent.parent / "test" + fake_tests = [] for i in range(3): fake_tests.append(Mock()) fake_tests[-1].name = f"test-{i}" test_parser.return_value = fake_tests + fake_scenario = Mock() fake_scenario.test_runs = [Mock()] fake_scenario.test_runs[0].test.name = "test-1" test_scenario_parser.return_value = fake_scenario + _, tests, _ = parser.parse(tests_dir, Path()) + + assert len(tests) == 1 + assert tests[0].name == "test-1" + + @patch("cloudai._core.test_parser.TestParser.parse_all") + @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") + @patch("cloudai.parser.Parser.parse_plugins") + def test_scenario_with_plugin_common_tests( + self, parse_plugins: Mock, test_scenario_parser: Mock, test_parser: Mock, parser: Parser + ): + tests_dir = parser.system_config_path.parent.parent / "test" + + fake_tests = [] + for i in range(3): + fake_tests.append(Mock()) + fake_tests[-1].name = f"test-{i}" + test_parser.return_value = fake_tests + + fake_scenario = Mock() + fake_scenario.test_runs = [Mock()] + fake_scenario.test_runs[0].test.name = "test-1" + test_scenario_parser.return_value = fake_scenario + + fake_plugin = Mock() + fake_plugin.test_runs = [Mock()] + fake_plugin.test_runs[0].test.name = "test-1" + parse_plugins.return_value = {"plugin-1": fake_plugin} + + _, tests, _ = parser.parse(tests_dir, Path(), Path()) + assert len(tests) == 1 + assert tests[0].name == "test-1" + + @patch("cloudai._core.test_parser.TestParser.parse_all") + @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") + @patch("cloudai.parser.Parser.parse_plugins") + def test_scenario_with_plugin_exclusive_tests( + self, parse_plugins: Mock, test_scenario_parser: Mock, test_parser: Mock, parser: Parser + ): + tests_dir = parser.system_config_path.parent.parent / "test" + + fake_tests = [] + for i in range(4): + fake_tests.append(Mock()) + fake_tests[-1].name = f"test-{i}" + test_parser.return_value = fake_tests + + fake_scenario = Mock() + fake_scenario.test_runs = [Mock()] + fake_scenario.test_runs[0].test.name = "test-1" + test_scenario_parser.return_value = fake_scenario + + fake_plugin = Mock() + fake_plugin.test_runs = [Mock()] + fake_plugin.test_runs[0].test.name = "test-2" + parse_plugins.return_value = {"plugin-1": fake_plugin} 
+ + _, tests, _ = parser.parse(tests_dir, Path(), Path()) + + assert len(tests) == 2 + assert "test-1" in [t.name for t in tests] + assert "test-2" in [t.name for t in tests] + assert "test-0" not in [t.name for t in tests] + assert "test-3" not in [t.name for t in tests] def test_parse_system(self, parser: Parser): parser.system_config_path = Path("conf/common/system/example_slurm_cluster.toml") diff --git a/tests/test_test_scenario.py b/tests/test_test_scenario.py index 6e2e1504..87a96aaf 100644 --- a/tests/test_test_scenario.py +++ b/tests/test_test_scenario.py @@ -27,7 +27,7 @@ @pytest.fixture def test_scenario_parser(tmp_path: Path) -> TestScenarioParser: - tsp = TestScenarioParser(Path(""), {}) + tsp = TestScenarioParser(Path(""), {}, {}) return tsp From 3bc3822fad704bea61615d5bcd2e998ac9b3e924 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 25 Oct 2024 06:28:41 -0400 Subject: [PATCH 08/64] Generate plugin commands --- src/cloudai/_core/command_gen_strategy.py | 26 ++++ src/cloudai/_core/test_template.py | 34 +++++ .../nccl_test/slurm_command_gen_strategy.py | 5 + .../strategy/slurm_command_gen_strategy.py | 110 +++++++++++++- tests/ref_data/gpt.sbatch | 2 +- tests/ref_data/grok.sbatch | 2 +- tests/ref_data/nccl.sbatch | 21 +-- tests/ref_data/sleep.sbatch | 4 +- tests/ref_data/ucc.sbatch | 10 +- .../test_common_slurm_command_gen_strategy.py | 137 +++++++++++++++++- 10 files changed, 312 insertions(+), 39 deletions(-) diff --git a/src/cloudai/_core/command_gen_strategy.py b/src/cloudai/_core/command_gen_strategy.py index 16bd04f9..9c8bb389 100644 --- a/src/cloudai/_core/command_gen_strategy.py +++ b/src/cloudai/_core/command_gen_strategy.py @@ -39,3 +39,29 @@ def gen_exec_command(self, tr: TestRun) -> str: str: The generated execution command. """ pass + + @abstractmethod + def gen_srun_command(self, tr: TestRun) -> str: + """ + Generate the Slurm srun command for a test based on the given parameters. + + Args: + tr (TestRun): Contains the test and its run-specific configurations. + + Returns: + str: The generated Slurm srun command. + """ + pass + + @abstractmethod + def gen_srun_success_check(self, tr: TestRun) -> str: + """ + Generate the Slurm success check command to verify if a test run was successful. + + Args: + tr (TestRun): Contains the test and its run-specific configurations. + + Returns: + str: The generated command to check the success of the test run. + """ + pass diff --git a/src/cloudai/_core/test_template.py b/src/cloudai/_core/test_template.py index c0227d3b..0b90b737 100644 --- a/src/cloudai/_core/test_template.py +++ b/src/cloudai/_core/test_template.py @@ -133,6 +133,40 @@ def gen_exec_command(self, tr: TestRun) -> str: ) return self.command_gen_strategy.gen_exec_command(tr) + def gen_srun_command(self, tr: TestRun) -> str: + """ + Generate an Slurm srun command for a test using the provided command generation strategy. + + Args: + tr (TestRun): Contains the test and its run-specific configurations. + + Returns: + str: The generated Slurm srun command. + """ + if self.command_gen_strategy is None: + raise ValueError( + "command_gen_strategy is missing. Ensure the strategy is registered in the Registry " + "by calling the appropriate registration function for the system type." + ) + return self.command_gen_strategy.gen_srun_command(tr) + + def gen_srun_success_check(self, tr: TestRun) -> str: + """ + Generate a Slurm success check command for a test using the provided command generation strategy. 
+ + Args: + tr (TestRun): Contains the test and its run-specific configurations. + + Returns: + str: The generated command to check the success of the test run. + """ + if self.command_gen_strategy is None: + raise ValueError( + "command_gen_strategy is missing. Ensure the strategy is registered in the Registry " + "by calling the appropriate registration function for the system type." + ) + return self.command_gen_strategy.gen_srun_success_check(tr) + def gen_json(self, tr: TestRun) -> Dict[Any, Any]: """ Generate a JSON string representing the Kubernetes job specification for this test using this template. diff --git a/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py index e982c28a..b63ab35c 100644 --- a/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py @@ -17,6 +17,7 @@ from pathlib import Path from typing import Any, Dict, List +from cloudai import TestRun from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy from .slurm_install_strategy import NcclTestSlurmInstallStrategy @@ -83,3 +84,7 @@ def generate_test_command( srun_command_parts.append(extra_cmd_args) return srun_command_parts + + def gen_srun_success_check(self, tr: TestRun) -> str: + output_file = Path(tr.output_path) / "stdout.txt" + return f'grep -q "Avg bus bandwidth" {output_file} && echo 1 || echo 0' diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index 3b7a0649..d60a16ef 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -18,7 +18,7 @@ from pathlib import Path from typing import Any, Dict, List -from cloudai import CommandGenStrategy, TestRun +from cloudai import CommandGenStrategy, TestRun, TestScenario from cloudai.systems import SlurmSystem from cloudai.util.docker_image_cache_manager import DockerImageCacheManager @@ -63,8 +63,35 @@ def gen_exec_command(self, tr: TestRun) -> str: slurm_args = self._parse_slurm_args( tr.test.test_template.__class__.__name__, env_vars, cmd_args, tr.num_nodes, tr.nodes ) - srun_command = self._gen_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) - return self._write_sbatch_script(slurm_args, env_vars, srun_command, tr.output_path) + + if tr.prologue: + prologue_command = self.gen_prologue(tr.prologue, tr.output_path) + srun_command = self._gen_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) + command_list = [prologue_command, "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", f" {srun_command}"] + + if tr.epilogue: + epilogue_command = self.gen_epilogue(tr.epilogue, tr.output_path) + command_list.append(f" {epilogue_command}") + + command_list.append("fi") + else: + srun_command = self._gen_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) + command_list = [srun_command] + + if tr.epilogue: + epilogue_command = self.gen_epilogue(tr.epilogue, tr.output_path) + command_list.append(epilogue_command) + + full_command = "\n".join(command_list).strip() + return self._write_sbatch_script(slurm_args, env_vars, full_command, tr.output_path) + + def gen_srun_command(self, tr: TestRun) -> str: + env_vars = self._override_env_vars(self.system.global_env_vars, tr.test.extra_env_vars) + cmd_args = self._override_cmd_args(self.default_cmd_args, 
tr.test.cmd_args) + slurm_args = self._parse_slurm_args( + tr.test.test_template.__class__.__name__, env_vars, cmd_args, tr.num_nodes, tr.nodes + ) + return self._gen_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) def _parse_slurm_args( self, @@ -112,12 +139,87 @@ def job_name(self, job_name_prefix: str) -> str: job_name = f"{self.system.account}-{job_name_prefix}.{datetime.now().strftime('%Y%m%d_%H%M%S')}" return job_name + def gen_prologue(self, prologue: TestScenario, base_output_path: Path) -> str: + """ + Generate the prologue command by running all tests defined in the prologue test scenario. + + Args: + prologue (TestScenario): The prologue test scenario containing the tests to be run. + base_output_path (Path): The base output directory path for storing prologue outputs. + + Returns: + str: A string with all the Slurm srun commands generated for the prologue. + """ + if not prologue.test_runs: + return "PROLOGUE_SUCCESS=1\n" + + prologue_output_dir = base_output_path / "prologue" + prologue_output_dir.mkdir(parents=True, exist_ok=True) + + prologue_commands = [] + success_vars = [] + + for idx, tr in enumerate(prologue.test_runs): + plugin_dir = prologue_output_dir / tr.test.name + plugin_dir.mkdir(parents=True, exist_ok=True) + tr.output_path = plugin_dir + + srun_command = tr.test.test_template.gen_srun_command(tr) + srun_command_with_output = srun_command.replace( + "srun ", f"srun --output={plugin_dir / 'stdout.txt'} --error={plugin_dir / 'stderr.txt'} " + ) + prologue_commands.append(srun_command_with_output) + + success_var = f"SUCCESS_{idx}" + success_vars.append(success_var) + + success_check_command = tr.test.test_template.gen_srun_success_check(tr) + prologue_commands.append(f"{success_var}=$({success_check_command})") + + combined_success_var = " && ".join([f"[ ${var} -eq 1 ]" for var in success_vars]) + + prologue_commands.append(f"PROLOGUE_SUCCESS=$( {combined_success_var} && echo 1 || echo 0 )") + + return "\n".join(prologue_commands) + + def gen_epilogue(self, epilogue: TestScenario, base_output_path: Path) -> str: + """ + Generate the epilogue command by running all tests defined in the epilogue test scenario. + + Args: + epilogue (TestScenario): The epilogue test scenario containing the tests to be run. + base_output_path (Path): The base output directory path for storing epilogue outputs. + + Returns: + str: A string with all the Slurm srun commands generated for the epilogue. 
+ """ + if not epilogue.test_runs: + return "" + + epilogue_output_dir = base_output_path / "epilogue" + epilogue_output_dir.mkdir(parents=True, exist_ok=True) + + epilogue_commands = [] + + for tr in epilogue.test_runs: + plugin_dir = epilogue_output_dir / tr.test.name + plugin_dir.mkdir(parents=True, exist_ok=True) + tr.output_path = plugin_dir + + srun_command = tr.test.test_template.gen_srun_command(tr) + srun_command_with_output = srun_command.replace( + "srun ", f"srun --output={plugin_dir / 'stdout.txt'} --error={plugin_dir / 'stderr.txt'} " + ) + epilogue_commands.append(srun_command_with_output) + + return "\n".join(epilogue_commands) + def _gen_srun_command( self, slurm_args: Dict[str, Any], env_vars: Dict[str, str], cmd_args: Dict[str, str], extra_cmd_args: str ) -> str: srun_command_parts = self.gen_srun_prefix(slurm_args) test_command_parts = self.generate_test_command(env_vars, cmd_args, extra_cmd_args) - return " \\\n".join(srun_command_parts + test_command_parts) + return " ".join(srun_command_parts + test_command_parts) def gen_srun_prefix(self, slurm_args: Dict[str, Any]) -> List[str]: srun_command_parts = ["srun", f"--mpi={self.system.mpi}"] diff --git a/tests/ref_data/gpt.sbatch b/tests/ref_data/gpt.sbatch index d8789804..ec00c0d1 100644 --- a/tests/ref_data/gpt.sbatch +++ b/tests/ref_data/gpt.sbatch @@ -8,7 +8,7 @@ export COMBINE_THRESHOLD=1 export PER_GPU_COMBINE_THRESHOLD=0 export XLA_FLAGS="--xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD" - echo "Loading container with srun command" +echo "Loading container with srun command" srun --mpi=none --container-image=https:/docker/url --container-name=cont true echo "Running srun command" srun \ diff --git a/tests/ref_data/grok.sbatch b/tests/ref_data/grok.sbatch index 808973bb..8ca5ebbe 100644 --- a/tests/ref_data/grok.sbatch +++ b/tests/ref_data/grok.sbatch @@ -8,7 +8,7 @@ export COMBINE_THRESHOLD=1 export PER_GPU_COMBINE_THRESHOLD=0 export XLA_FLAGS="--xla_disable_hlo_passes=rematerialization --xla_dump_hlo_pass_re=.* --xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_pipelined_all_gather=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_gpu_enable_triton_gemm=false --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_graph_level=0 --xla_gpu_pgle_profile_file_or_directory_path=/opt/paxml/workspace/pgle_output_profile.pbtxt --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD --xla_gpu_run_post_layout_collective_pipeliner=false --xla_gpu_use_memcpy_local_p2p=false" - echo "Loading container with srun command" +echo "Loading container with srun command" srun --mpi=none --container-image=https:/docker/url --container-name=cont true echo "Running srun command" srun \ diff --git a/tests/ref_data/nccl.sbatch b/tests/ref_data/nccl.sbatch index 3ac39077..dc179ba9 100644 --- a/tests/ref_data/nccl.sbatch +++ b/tests/ref_data/nccl.sbatch @@ -8,23 +8,4 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) -srun \ ---mpi=pmix \ 
---container-image=nvcr.io/nvidia/pytorch:24.02-py3 \ -/usr/local/bin/all_reduce_perf_mpi \ ---nthreads 1 \ ---ngpus 1 \ ---minbytes 32M \ ---maxbytes 32M \ ---stepbytes 1M \ ---op sum \ ---datatype float \ ---root 0 \ ---iters 20 \ ---warmup_iters 5 \ ---agg_iters 1 \ ---average 1 \ ---parallel_init 0 \ ---check 1 \ ---blocking 0 \ ---cudagraph 0 \ No newline at end of file +srun --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 diff --git a/tests/ref_data/sleep.sbatch b/tests/ref_data/sleep.sbatch index 7c24ec14..9262001b 100644 --- a/tests/ref_data/sleep.sbatch +++ b/tests/ref_data/sleep.sbatch @@ -8,6 +8,4 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) -srun \ ---mpi=pmix \ -sleep 5 \ No newline at end of file +srun --mpi=pmix sleep 5 diff --git a/tests/ref_data/ucc.sbatch b/tests/ref_data/ucc.sbatch index 74fa7799..a9f9e686 100644 --- a/tests/ref_data/ucc.sbatch +++ b/tests/ref_data/ucc.sbatch @@ -8,12 +8,4 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) -srun \ ---mpi=pmix \ ---container-image=nvcr.io/nvidia/pytorch:24.02-py3 \ -/opt/hpcx/ucc/bin/ucc_perftest \ --c alltoall \ --b 1 \ --e 8M \ --m cuda \ --F \ No newline at end of file +srun --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /opt/hpcx/ucc/bin/ucc_perftest -c alltoall -b 1 -e 8M -m cuda -F diff --git a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py index 36db4473..fb59cc38 100644 --- a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py @@ -19,7 +19,7 @@ import pytest -from cloudai import Test, TestDefinition, TestRun, TestTemplate +from cloudai import Test, TestDefinition, TestRun, TestScenario, TestTemplate from cloudai.systems import SlurmSystem from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy @@ -120,3 +120,138 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): "system configuration. Please ensure that 'default_partition' is set correctly " "in the corresponding system configuration (e.g., system.toml)." 
) in str(exc_info.value) + + +@pytest.mark.parametrize( + "prologue,epilogue,expected_script_lines", + [ + # No prologue, no epilogue + (None, None, ["srun"]), + # One prologue, no epilogue + ( + [Mock(test=Mock(name="test1", test_template=Mock()))], + None, + [ + "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + " srun", + "fi", + ], + ), + # No prologue, one epilogue + ( + None, + [Mock(test=Mock(name="test2", test_template=Mock()))], + [ + "srun", + "epilogue", + ], + ), + # One prologue, one epilogue + ( + [Mock(test=Mock(name="test1", test_template=Mock()))], + [Mock(test=Mock(name="test2", test_template=Mock()))], + [ + "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + " srun", + " epilogue", + "fi", + ], + ), + # Multiple prologues, multiple epilogues + ( + [Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], + [Mock(test=Mock(name="test3", test_template=Mock())), Mock(test=Mock(name="test4", test_template=Mock()))], + [ + "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "SUCCESS_1=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + " srun", + " epilogue", + " epilogue", + "fi", + ], + ), + # Multiple prologues, no epilogue + ( + [Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], + None, + [ + "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "SUCCESS_1=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + " srun", + "fi", + ], + ), + # No prologue, multiple epilogues + ( + None, + [Mock(test=Mock(name="test3", test_template=Mock())), Mock(test=Mock(name="test4", test_template=Mock()))], + [ + "srun", + "epilogue", + "epilogue", + ], + ), + # Multiple prologues, single epilogue + ( + [Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], + [Mock(test=Mock(name="test3", test_template=Mock()))], + [ + "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "SUCCESS_1=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + " srun", + " epilogue", + "fi", + ], + ), + ], +) +def test_prologue_epilogue_combinations( + strategy_fixture: SlurmCommandGenStrategy, + testrun_fixture: TestRun, + prologue, + epilogue, + expected_script_lines, + tmp_path, +): + testrun_fixture.prologue = Mock(spec=TestScenario) if prologue else None + testrun_fixture.epilogue = Mock(spec=TestScenario) if epilogue else None + + if prologue is not None: + testrun_fixture.prologue = Mock(spec=TestScenario) + testrun_fixture.prologue.test_runs = prologue + for idx, run in enumerate(prologue): + run.test.test_template.gen_srun_success_check.return_value = ( + "grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0" + ) + run.test.test_template.gen_srun_command.return_value = 
"srun" + run.test.name = f"test{idx+1}" + else: + testrun_fixture.prologue = None + + if epilogue is not None: + testrun_fixture.epilogue = Mock(spec=TestScenario) + testrun_fixture.epilogue.test_runs = epilogue + for idx, run in enumerate(epilogue): + run.test.test_template.gen_srun_command.return_value = "epilogue" + run.test.name = f"test{idx+1}" + else: + testrun_fixture.epilogue = None + + sbatch_command = strategy_fixture.gen_exec_command(testrun_fixture) + script_file_path = sbatch_command.split()[-1] + + with open(script_file_path, "r") as script_file: + script_content = script_file.read() + + for expected_line in expected_script_lines: + assert expected_line in script_content, f"Expected '{expected_line}' in generated script but it was missing." From 06d4b7dc3d0bcfcf19ac5578e295fddb6bc9a45d Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 25 Oct 2024 11:33:12 -0400 Subject: [PATCH 09/64] Remove plugin option from CLI --- src/cloudai/cli/cli.py | 4 -- tests/test_cli.py | 92 +----------------------------------------- 2 files changed, 2 insertions(+), 94 deletions(-) diff --git a/src/cloudai/cli/cli.py b/src/cloudai/cli/cli.py index f962a2d5..53059799 100644 --- a/src/cloudai/cli/cli.py +++ b/src/cloudai/cli/cli.py @@ -60,7 +60,6 @@ def add_command( handler: Callable[[argparse.Namespace], int], system_config: Optional[bool] = None, tests_dir: Optional[bool] = None, - plugin_dir: Optional[bool] = None, test_scenario: Optional[bool] = None, output_dir: Optional[bool] = None, result_dir: Optional[bool] = None, @@ -75,8 +74,6 @@ def add_command( p.add_argument( "--tests-dir", help="Path to the test configuration directory.", required=tests_dir, type=Path ) - if plugin_dir is not None: - p.add_argument("--plugin-dir", help="Path to the plugin directory.", required=plugin_dir, type=Path) if test_scenario is not None: p.add_argument("--test-scenario", help="Path to the test scenario file.", required=test_scenario, type=Path) if output_dir is not None: @@ -130,7 +127,6 @@ def add_run_and_dry_run(self): handle_dry_run_and_run, system_config=True, tests_dir=True, - plugin_dir=False, test_scenario=True, output_dir=False, ) diff --git a/tests/test_cli.py b/tests/test_cli.py index bb6c1a5d..538e497f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -20,8 +20,8 @@ import pytest -from cloudai.cli import CloudAICLI -from cloudai.cli.handlers import handle_generate_report, handle_install_and_uninstall, handle_verify_all_configs +from cloudai.cli import CloudAICLI, handle_generate_report, handle_install_and_uninstall +from cloudai.cli.handlers import handle_verify_all_configs def test_help_message(capsys: pytest.CaptureFixture[str]) -> None: @@ -108,7 +108,6 @@ def test_add_command_all_optional(): lambda _: 0, system_config=False, tests_dir=False, - plugin_dir=False, test_scenario=False, output_dir=False, ) @@ -119,7 +118,6 @@ def test_add_command_all_optional(): mode="test", system_config=None, tests_dir=None, - plugin_dir=None, test_scenario=None, output_dir=None, ) @@ -134,7 +132,6 @@ def test_add_command_all_required(): lambda _: 0, system_config=True, tests_dir=True, - plugin_dir=True, test_scenario=True, output_dir=True, ) @@ -145,8 +142,6 @@ def test_add_command_all_required(): "system_config", "--tests-dir", "tests_dir", - "--plugin-dir", - "plugin_dir", "--test-scenario", "test_scenario", "--output-dir", @@ -159,91 +154,11 @@ def test_add_command_all_required(): mode="test", system_config=Path("system_config"), 
tests_dir=Path("tests_dir"), - plugin_dir=Path("plugin_dir"), test_scenario=Path("test_scenario"), output_dir=Path("output_dir"), ) -@pytest.mark.parametrize( - "mode,args,expected_plugin_dir", - [ - ( - "run", - [ - "run", - "--system-config", - "system_config", - "--tests-dir", - "tests_dir", - "--plugin-dir", - "plugin_dir", - "--test-scenario", - "test_scenario", - ], - Path("plugin_dir"), - ), - ( - "run", - [ - "run", - "--system-config", - "system_config", - "--tests-dir", - "tests_dir", - "--test-scenario", - "test_scenario", - ], - None, - ), - ( - "dry-run", - [ - "dry-run", - "--system-config", - "system_config", - "--tests-dir", - "tests_dir", - "--plugin-dir", - "plugin_dir", - "--test-scenario", - "test_scenario", - ], - Path("plugin_dir"), - ), - ( - "dry-run", - [ - "dry-run", - "--system-config", - "system_config", - "--tests-dir", - "tests_dir", - "--test-scenario", - "test_scenario", - ], - None, - ), - ], -) -def test_modes_with_or_without_plugin_dir(mode, args, expected_plugin_dir): - cli = CloudAICLI() - - cli.add_command( - mode, - f"{mode} command", - lambda _: 0, - system_config=True, - tests_dir=True, - plugin_dir=False, - test_scenario=True, - output_dir=False, - ) - - parsed_args = cli.parser.parse_args(args) - assert parsed_args.plugin_dir == expected_plugin_dir - - def test_real_uninstall(): cli = CloudAICLI() cli.init_default_args() @@ -362,8 +277,6 @@ def test_run_dry_run_modes(self, cli: CloudAICLI): "tests_dir", "--test-scenario", "test_scenario", - "--plugin-dir", - "plugin_dir", ] ) @@ -373,7 +286,6 @@ def test_run_dry_run_modes(self, cli: CloudAICLI): mode=mode, system_config=Path("system_config"), tests_dir=Path("tests_dir"), - plugin_dir=Path("plugin_dir"), test_scenario=Path("test_scenario"), output_dir=None, ) From 9174f0027709dfc92dde69ab5e85dcfc8eb5c2cb Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 25 Oct 2024 11:38:04 -0400 Subject: [PATCH 10/64] Make plugin directory self-contained --- .../plugin/test/nccl_test_all_gather.toml | 33 +++++++++++++++++++ .../plugin/test/nccl_test_all_reduce.toml | 30 +++++++++++++++++ .../nccl_test_epilogue.toml | 0 .../nccl_test_prologue.toml | 0 4 files changed, 63 insertions(+) create mode 100644 conf/common/plugin/test/nccl_test_all_gather.toml create mode 100644 conf/common/plugin/test/nccl_test_all_reduce.toml rename conf/common/plugin/{ => test_scenario}/nccl_test_epilogue.toml (100%) rename conf/common/plugin/{ => test_scenario}/nccl_test_prologue.toml (100%) diff --git a/conf/common/plugin/test/nccl_test_all_gather.toml b/conf/common/plugin/test/nccl_test_all_gather.toml new file mode 100644 index 00000000..4fec288a --- /dev/null +++ b/conf/common/plugin/test/nccl_test_all_gather.toml @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name = "nccl_test_all_gather" +description = "all_gather" +test_template_name = "NcclTest" + +[cmd_args] +"subtest_name" = "all_gather_perf_mpi" +"ngpus" = "1" +"minbytes" = "128" +"maxbytes" = "4G" +"iters" = "100" +"warmup_iters" = "50" + +[extra_cmd_args] +"--stepfactor" = "2" + +[extra_env_vars] +"NCCL_TEST_SPLIT_MASK" = "0x7" diff --git a/conf/common/plugin/test/nccl_test_all_reduce.toml b/conf/common/plugin/test/nccl_test_all_reduce.toml new file mode 100644 index 00000000..9074b2b8 --- /dev/null +++ b/conf/common/plugin/test/nccl_test_all_reduce.toml @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "nccl_test_all_reduce" +description = "all_reduce" +test_template_name = "NcclTest" + +[cmd_args] +"subtest_name" = "all_reduce_perf_mpi" +"ngpus" = "1" +"minbytes" = "128" +"maxbytes" = "16G" +"iters" = "100" +"warmup_iters" = "50" + +[extra_cmd_args] +"--stepfactor" = "2" diff --git a/conf/common/plugin/nccl_test_epilogue.toml b/conf/common/plugin/test_scenario/nccl_test_epilogue.toml similarity index 100% rename from conf/common/plugin/nccl_test_epilogue.toml rename to conf/common/plugin/test_scenario/nccl_test_epilogue.toml diff --git a/conf/common/plugin/nccl_test_prologue.toml b/conf/common/plugin/test_scenario/nccl_test_prologue.toml similarity index 100% rename from conf/common/plugin/nccl_test_prologue.toml rename to conf/common/plugin/test_scenario/nccl_test_prologue.toml From 7afa73fa20d7eeecc8259d266545f426685c3f19 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 25 Oct 2024 11:36:06 -0400 Subject: [PATCH 11/64] Update Parser to support self-contained plugin directory --- src/cloudai/cli/handlers.py | 4 +- src/cloudai/parser.py | 105 ++++++++++++++++++++++++++---------- tests/test_parser.py | 58 +++++++++++++------- 3 files changed, 117 insertions(+), 50 deletions(-) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index a204f70e..ac42363f 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -114,7 +114,9 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int: args (argparse.Namespace): The parsed command-line arguments. 
""" parser = Parser(args.system_config) - system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario, args.plugin_dir) + system, tests, test_scenario = parser.parse( + args.tests_dir, args.test_scenario, Path("conf/common/plugin/test"), Path("conf/common/plugin/test_scenario") + ) assert test_scenario is not None if args.output_dir: diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index 5b21ab3f..73ab8717 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -49,14 +49,25 @@ def __init__(self, system_config_path: Path) -> None: self.system_config_path = system_config_path def parse( - self, test_path: Path, test_scenario_path: Optional[Path] = None, plugin_path: Optional[Path] = None + self, + test_path: Path, + test_scenario_path: Optional[Path] = None, + plugin_test_path: Optional[Path] = None, + plugin_test_scenario_path: Optional[Path] = None, ) -> Tuple[System, List[Test], Optional[TestScenario]]: """ Parse configurations for system, test templates, and test scenarios. - Returns - Tuple[System, List[TestTemplate], TestScenario]: A tuple containing the system object, a list of test - template objects, and the test scenario object. + Args: + test_path (Path): The file path for tests. + test_scenario_path (Optional[Path]): The file path for the main test scenario. + If None, all tests are included. + plugin_test_path (Optional[Path]): The file path for plugin-specific tests. + plugin_test_scenario_path (Optional[Path]): The file path for plugin-specific test scenarios. + + Returns: + Tuple[System, List[Test], Optional[TestScenario]]: A tuple containing the system object, a list of filtered + test template objects, and the main test scenario object if provided. """ if not test_path.exists(): raise FileNotFoundError(f"Test path '{test_path}' not found.") @@ -64,47 +75,83 @@ def parse( try: system = self.parse_system(self.system_config_path) except SystemConfigParsingError: - exit(1) # exit right away to keep error message readable for users + exit(1) try: tests = self.parse_tests(list(test_path.glob("*.toml")), system) except TestConfigParsingError: - exit(1) # exit right away to keep error message readable for users + exit(1) - logging.debug(f"Parsed {len(tests)} tests: {[t.name for t in tests]}") - test_mapping = {t.name: t for t in tests} + plugin_tests = ( + self.parse_tests(list(plugin_test_path.glob("*.toml")), system) + if plugin_test_path and plugin_test_path.exists() + else [] + ) - test_scenario: Optional[TestScenario] = None - scenario_test_names: Set[str] = set() if test_scenario_path: - plugin_mapping: Dict[str, TestScenario] = {} - plugin_test_names: Set[str] = set() - if plugin_path and plugin_path.exists(): - try: - plugin_mapping = self.parse_plugins(list(plugin_path.glob("*.toml")), test_mapping) - for plugin_scenario in plugin_mapping.values(): - plugin_test_names.update(tr.test.name for tr in plugin_scenario.test_runs) - except TestScenarioParsingError: - exit(1) + return self._parse_with_scenario(system, tests, test_scenario_path, plugin_tests, plugin_test_scenario_path) + + return system, tests + plugin_tests, None + + def _parse_with_scenario( + self, + system: System, + tests: List[Test], + test_scenario_path: Path, + plugin_tests: List[Test], + plugin_test_scenario_path: Optional[Path], + ) -> Tuple[System, List[Test], Optional[TestScenario]]: + """Parse tests and scenarios with a main test scenario path specified.""" + test_mapping = {t.name: t for t in tests} + plugin_test_mapping = {t.name: t for t in plugin_tests} + + 
plugin_test_scenario_mapping = self._load_plugin_scenarios(plugin_test_scenario_path, plugin_test_mapping) + test_scenario = self._load_main_scenario(test_scenario_path, test_mapping, plugin_test_scenario_mapping) + all_used_test_names = self._collect_used_test_names(plugin_test_scenario_mapping, test_scenario) + filtered_tests = [t for t in tests if t.name in all_used_test_names] + + return system, filtered_tests, test_scenario + + def _load_plugin_scenarios( + self, plugin_test_scenario_path: Optional[Path], plugin_test_mapping: Dict[str, Test] + ) -> Dict[str, TestScenario]: + """Load plugin-specific test scenarios from the specified path.""" + if plugin_test_scenario_path and plugin_test_scenario_path.exists(): try: - test_scenario = self.parse_test_scenario(test_scenario_path, test_mapping, plugin_mapping) - scenario_test_names = set(tr.test.name for tr in test_scenario.test_runs) + return self.parse_plugins(list(plugin_test_scenario_path.glob("*.toml")), plugin_test_mapping) except TestScenarioParsingError: exit(1) + return {} - all_used_test_names = plugin_test_names.union(scenario_test_names) - filtered_tests = [t for t in tests if t.name in all_used_test_names] - else: - filtered_tests = tests - - return system, filtered_tests, test_scenario + def _load_main_scenario( + self, + test_scenario_path: Path, + test_mapping: Dict[str, Test], + plugin_test_scenario_mapping: Dict[str, TestScenario], + ) -> Optional[TestScenario]: + """Load the main test scenario using provided mappings.""" + try: + return self.parse_test_scenario(test_scenario_path, test_mapping, plugin_test_scenario_mapping) + except TestScenarioParsingError: + exit(1) + + def _collect_used_test_names( + self, plugin_test_scenario_mapping: Dict[str, TestScenario], test_scenario: Optional[TestScenario] + ) -> Set[str]: + """Collect test names used in both plugin and main test scenarios.""" + # TODO: collect test names in the plugin test scenarios only + plugin_test_names = { + tr.test.name for scenario in plugin_test_scenario_mapping.values() for tr in scenario.test_runs + } + scenario_test_names = {tr.test.name for tr in test_scenario.test_runs} if test_scenario else set() + return plugin_test_names.union(scenario_test_names) @staticmethod def parse_plugins(plugin_tomls: List[Path], test_mapping: Dict[str, Test]) -> Dict[str, TestScenario]: plugin_mapping = {} - for plugin_path in plugin_tomls: - plugin_scenario = Parser.parse_test_scenario(plugin_path, test_mapping) + for plugin_test_scenario_path in plugin_tomls: + plugin_scenario = Parser.parse_test_scenario(plugin_test_scenario_path, test_mapping) plugin_mapping[plugin_scenario.name] = plugin_scenario return plugin_mapping diff --git a/tests/test_parser.py b/tests/test_parser.py index cb809d36..12372755 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -15,13 +15,13 @@ # limitations under the License. 
from pathlib import Path -from typing import cast +from typing import Dict, cast from unittest.mock import Mock, patch import pytest from pydantic_core import ErrorDetails -from cloudai import Parser, format_validation_error +from cloudai import Parser, TestScenario, format_validation_error from cloudai.systems.slurm.slurm_system import SlurmSystem @@ -100,16 +100,14 @@ def test_scenario_with_plugin_common_tests( @patch("cloudai._core.test_parser.TestParser.parse_all") @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") - @patch("cloudai.parser.Parser.parse_plugins") - def test_scenario_with_plugin_exclusive_tests( - self, parse_plugins: Mock, test_scenario_parser: Mock, test_parser: Mock, parser: Parser - ): + def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): tests_dir = parser.system_config_path.parent.parent / "test" + test_scenario_path = Path("/mock/test_scenario.toml") + plugin_test_scenario_path = Path("/mock/plugin_scenarios") - fake_tests = [] - for i in range(4): - fake_tests.append(Mock()) - fake_tests[-1].name = f"test-{i}" + fake_tests = [Mock() for _ in range(4)] + for i, test in enumerate(fake_tests): + test.name = f"test-{i}" test_parser.return_value = fake_tests fake_scenario = Mock() @@ -117,18 +115,38 @@ def test_scenario_with_plugin_exclusive_tests( fake_scenario.test_runs[0].test.name = "test-1" test_scenario_parser.return_value = fake_scenario - fake_plugin = Mock() - fake_plugin.test_runs = [Mock()] - fake_plugin.test_runs[0].test.name = "test-2" - parse_plugins.return_value = {"plugin-1": fake_plugin} + fake_plugin_scenarios = {"plugin-1": Mock(test_runs=[Mock()])} + fake_plugin_scenarios["plugin-1"].test_runs[0].test.name = "test-2" - _, tests, _ = parser.parse(tests_dir, Path(), Path()) + with patch.object(parser, "_load_plugin_scenarios", return_value=fake_plugin_scenarios): + _, filtered_tests, _ = parser.parse(tests_dir, test_scenario_path, tests_dir, plugin_test_scenario_path) + + filtered_test_names = {t.name for t in filtered_tests} + assert len(filtered_tests) == 2 + assert "test-1" in filtered_test_names + assert "test-2" in filtered_test_names + assert "test-0" not in filtered_test_names + assert "test-3" not in filtered_test_names + + def test_collect_used_test_names(self, parser: Parser): + fake_scenario = Mock() + fake_scenario.test_runs = [Mock()] + fake_scenario.test_runs[0].test.name = "test-1" + + fake_plugin_scenario_1 = Mock(spec=TestScenario) + fake_plugin_scenario_1.test_runs = [Mock()] + fake_plugin_scenario_1.test_runs[0].test.name = "test-2" + + fake_plugin_scenario_2 = Mock(spec=TestScenario) + fake_plugin_scenario_2.test_runs = [Mock()] + fake_plugin_scenario_2.test_runs[0].test.name = "test-3" + + fake_plugin_scenarios = cast( + Dict[str, TestScenario], {"plugin-1": fake_plugin_scenario_1, "plugin-2": fake_plugin_scenario_2} + ) - assert len(tests) == 2 - assert "test-1" in [t.name for t in tests] - assert "test-2" in [t.name for t in tests] - assert "test-0" not in [t.name for t in tests] - assert "test-3" not in [t.name for t in tests] + used_test_names = parser._collect_used_test_names(fake_plugin_scenarios, fake_scenario) + assert used_test_names == {"test-1", "test-2", "test-3"} def test_parse_system(self, parser: Parser): parser.system_config_path = Path("conf/common/system/example_slurm_cluster.toml") From 3e45b25b7d6cb2047cf9ad37dc787696630af925 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 28 
Oct 2024 13:35:01 -0400 Subject: [PATCH 12/64] Refactor plugin path handling in parse to use a single plugin_path param --- src/cloudai/cli/handlers.py | 4 +--- src/cloudai/parser.py | 9 +++++---- tests/test_parser.py | 4 ++-- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index ac42363f..3382ff4b 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -114,9 +114,7 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int: args (argparse.Namespace): The parsed command-line arguments. """ parser = Parser(args.system_config) - system, tests, test_scenario = parser.parse( - args.tests_dir, args.test_scenario, Path("conf/common/plugin/test"), Path("conf/common/plugin/test_scenario") - ) + system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario, Path("conf/common/plugin")) assert test_scenario is not None if args.output_dir: diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index 73ab8717..a9227f88 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -52,8 +52,7 @@ def parse( self, test_path: Path, test_scenario_path: Optional[Path] = None, - plugin_test_path: Optional[Path] = None, - plugin_test_scenario_path: Optional[Path] = None, + plugin_path: Optional[Path] = None, ) -> Tuple[System, List[Test], Optional[TestScenario]]: """ Parse configurations for system, test templates, and test scenarios. @@ -62,8 +61,7 @@ def parse( test_path (Path): The file path for tests. test_scenario_path (Optional[Path]): The file path for the main test scenario. If None, all tests are included. - plugin_test_path (Optional[Path]): The file path for plugin-specific tests. - plugin_test_scenario_path (Optional[Path]): The file path for plugin-specific test scenarios. + plugin_path (Optional[Path]): The base file path for plugin-specific tests and scenarios. 
Returns: Tuple[System, List[Test], Optional[TestScenario]]: A tuple containing the system object, a list of filtered @@ -82,6 +80,9 @@ def parse( except TestConfigParsingError: exit(1) + plugin_test_scenario_path = plugin_path + plugin_test_path = plugin_path / "test" if plugin_path else None + plugin_tests = ( self.parse_tests(list(plugin_test_path.glob("*.toml")), system) if plugin_test_path and plugin_test_path.exists() diff --git a/tests/test_parser.py b/tests/test_parser.py index 12372755..bcfd63a3 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -103,7 +103,7 @@ def test_scenario_with_plugin_common_tests( def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): tests_dir = parser.system_config_path.parent.parent / "test" test_scenario_path = Path("/mock/test_scenario.toml") - plugin_test_scenario_path = Path("/mock/plugin_scenarios") + plugin_path = Path("/mock/plugin_scenarios") fake_tests = [Mock() for _ in range(4)] for i, test in enumerate(fake_tests): @@ -119,7 +119,7 @@ def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, fake_plugin_scenarios["plugin-1"].test_runs[0].test.name = "test-2" with patch.object(parser, "_load_plugin_scenarios", return_value=fake_plugin_scenarios): - _, filtered_tests, _ = parser.parse(tests_dir, test_scenario_path, tests_dir, plugin_test_scenario_path) + _, filtered_tests, _ = parser.parse(tests_dir, test_scenario_path, plugin_path) filtered_test_names = {t.name for t in filtered_tests} assert len(filtered_tests) == 2 From 5634f743463bb157bb4b843941f4b7fe4d1c0311 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 28 Oct 2024 13:29:04 -0400 Subject: [PATCH 13/64] Remove test_scenario directory from conf/common/plugin/ --- conf/common/plugin/{test_scenario => }/nccl_test_epilogue.toml | 0 conf/common/plugin/{test_scenario => }/nccl_test_prologue.toml | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename conf/common/plugin/{test_scenario => }/nccl_test_epilogue.toml (100%) rename conf/common/plugin/{test_scenario => }/nccl_test_prologue.toml (100%) diff --git a/conf/common/plugin/test_scenario/nccl_test_epilogue.toml b/conf/common/plugin/nccl_test_epilogue.toml similarity index 100% rename from conf/common/plugin/test_scenario/nccl_test_epilogue.toml rename to conf/common/plugin/nccl_test_epilogue.toml diff --git a/conf/common/plugin/test_scenario/nccl_test_prologue.toml b/conf/common/plugin/nccl_test_prologue.toml similarity index 100% rename from conf/common/plugin/test_scenario/nccl_test_prologue.toml rename to conf/common/plugin/nccl_test_prologue.toml From b7989812201c1c94141b9621dd542d18141377a4 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 09:26:27 -0400 Subject: [PATCH 14/64] Restore comments in src/cloudai/parser.py --- src/cloudai/parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index a9227f88..00f494aa 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -73,12 +73,12 @@ def parse( try: system = self.parse_system(self.system_config_path) except SystemConfigParsingError: - exit(1) + exit(1) # exit right away to keep error message readable for users try: tests = self.parse_tests(list(test_path.glob("*.toml")), system) except TestConfigParsingError: - exit(1) + exit(1) # exit right away to keep error message readable for users 
plugin_test_scenario_path = plugin_path plugin_test_path = plugin_path / "test" if plugin_path else None From da9abbfddc46f308785f3f5852849ffd3bc413a4 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 09:38:47 -0400 Subject: [PATCH 15/64] Remove unused tmp_path from unit tests --- .../test_common_slurm_command_gen_strategy.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py index fb59cc38..dac84a1d 100644 --- a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py @@ -221,7 +221,6 @@ def test_prologue_epilogue_combinations( prologue, epilogue, expected_script_lines, - tmp_path, ): testrun_fixture.prologue = Mock(spec=TestScenario) if prologue else None testrun_fixture.epilogue = Mock(spec=TestScenario) if epilogue else None From 6b11f54c8b033a4801d182c1403950da69a4a09c Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 09:39:19 -0400 Subject: [PATCH 16/64] Set prologue and epilogue to None by default --- src/cloudai/_core/test_scenario_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index c59adeba..c3cbca99 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -166,8 +166,8 @@ def _create_section_test_run( self, test_info: _TestRunTOML, normalized_weight: float, - prologue: Optional[TestScenario], - epilogue: Optional[TestScenario], + prologue: Optional[TestScenario] = None, + epilogue: Optional[TestScenario] = None, ) -> TestRun: """ Create a section-specific Test object by copying from the test mapping. 
From b84c16fef9665de9c0fd4eed91f9de070f06674b Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 09:45:08 -0400 Subject: [PATCH 17/64] Add validation to ensure 'prologue' and 'epilogue' are not empty strings --- src/cloudai/_core/test_scenario_parser.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index c3cbca99..03ac1288 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -54,8 +54,8 @@ class _TestScenarioTOML(BaseModel): name: str job_status_check: bool = True tests: list[_TestRunTOML] = Field(alias="Tests", min_length=1) - prologue: str = "" - epilogue: str = "" + prologue: Optional[str] = None + epilogue: Optional[str] = None @model_validator(mode="after") def check_no_self_dependency(self): @@ -89,6 +89,20 @@ def check_all_dependencies_are_known(self): return self + @model_validator(mode="after") + def check_prologue_not_empty(self): + """Ensure that prologue is not an empty string if provided.""" + if self.prologue == "": + raise ValueError("The 'prologue' field should not be an empty string.") + return self + + @model_validator(mode="after") + def check_epilogue_not_empty(self): + """Ensure that epilogue is not an empty string if provided.""" + if self.epilogue == "": + raise ValueError("The 'epilogue' field should not be an empty string.") + return self + class TestScenarioParser: """ From 8d840cf4b36398d9767b6be1916d563d32e16ae4 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 22 Oct 2024 12:48:01 -0400 Subject: [PATCH 18/64] Reorder SlurmCommandGenStrategy methods --- .../strategy/slurm_command_gen_strategy.py | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index 4a052a47..7dc786f9 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -51,22 +51,12 @@ def __init__(self, system: SlurmSystem, cmd_args: Dict[str, Any]) -> None: self.docker_image_url = self.cmd_args.get("docker_image_url", "") - def _format_env_vars(self, env_vars: Dict[str, Any]) -> str: - """ - Format environment variables for inclusion in a batch script. - - Args: - env_vars (Dict[str, Any]): Environment variables to format. - - Returns: - str: A string representation of the formatted environment variables. 
- """ - formatted_vars = [] - for key in sorted(env_vars.keys()): - value = env_vars[key] - formatted_value = str(value["default"]) if isinstance(value, dict) and "default" in value else str(value) - formatted_vars.append(f"export {key}={formatted_value}") - return "\n".join(formatted_vars) + def gen_exec_command(self, tr: TestRun) -> str: + env_vars = self._override_env_vars(self.system.global_env_vars, tr.test.extra_env_vars) + cmd_args = self._override_cmd_args(self.default_cmd_args, tr.test.cmd_args) + slurm_args = self._parse_slurm_args(tr.test.test_template.__class__.__name__, env_vars, cmd_args, tr) + srun_command = self.generate_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) + return self._write_sbatch_script(slurm_args, env_vars, srun_command, tr.output_path) def _parse_slurm_args( self, job_name_prefix: str, env_vars: Dict[str, str], cmd_args: Dict[str, str], tr: TestRun @@ -127,13 +117,6 @@ def generate_srun_prefix(self, slurm_args: Dict[str, Any]) -> List[str]: return srun_command_parts - def gen_exec_command(self, tr: TestRun) -> str: - env_vars = self._override_env_vars(self.system.global_env_vars, tr.test.extra_env_vars) - cmd_args = self._override_cmd_args(self.default_cmd_args, tr.test.cmd_args) - slurm_args = self._parse_slurm_args(tr.test.test_template.__class__.__name__, env_vars, cmd_args, tr) - srun_command = self.generate_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) - return self._write_sbatch_script(slurm_args, env_vars, srun_command, tr.output_path) - def generate_test_command( self, env_vars: Dict[str, str], cmd_args: Dict[str, str], extra_cmd_args: str ) -> List[str]: @@ -223,3 +206,20 @@ def _append_sbatch_directives( batch_script_content.append( "\nexport SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)" ) + + def _format_env_vars(self, env_vars: Dict[str, Any]) -> str: + """ + Format environment variables for inclusion in a batch script. + + Args: + env_vars (Dict[str, Any]): Environment variables to format. + + Returns: + str: A string representation of the formatted environment variables. 
+ """ + formatted_vars = [] + for key in sorted(env_vars.keys()): + value = env_vars[key] + formatted_value = str(value["default"]) if isinstance(value, dict) and "default" in value else str(value) + formatted_vars.append(f"export {key}={formatted_value}") + return "\n".join(formatted_vars) From 9725b365141bdc905ae62949f5a4075b31a9ab19 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 22 Oct 2024 13:45:01 -0400 Subject: [PATCH 19/64] Rename generate_srun_command to _gen_srun_command --- .../test_template/jax_toolbox/slurm_command_gen_strategy.py | 2 +- .../systems/slurm/strategy/slurm_command_gen_strategy.py | 6 +++--- .../test_common_slurm_command_gen_strategy.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py index ff70b5c4..c5a98509 100644 --- a/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py @@ -146,7 +146,7 @@ def _parse_slurm_args( return base_args - def generate_srun_command( + def _gen_srun_command( self, slurm_args: Dict[str, Any], env_vars: Dict[str, str], cmd_args: Dict[str, Any], extra_cmd_args: str ) -> str: self._create_run_script(env_vars, cmd_args, extra_cmd_args) diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index 7dc786f9..5c33a141 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -98,14 +98,14 @@ def job_name(self, job_name_prefix: str) -> str: job_name = f"{self.system.account}-{job_name_prefix}.{datetime.now().strftime('%Y%m%d_%H%M%S')}" return job_name - def generate_srun_command( + def _gen_srun_command( self, slurm_args: Dict[str, Any], env_vars: Dict[str, str], cmd_args: Dict[str, str], extra_cmd_args: str ) -> str: - srun_command_parts = self.generate_srun_prefix(slurm_args) + srun_command_parts = self.gen_srun_prefix(slurm_args) test_command_parts = self.generate_test_command(env_vars, cmd_args, extra_cmd_args) return " \\\n".join(srun_command_parts + test_command_parts) - def generate_srun_prefix(self, slurm_args: Dict[str, Any]) -> List[str]: + def gen_srun_prefix(self, slurm_args: Dict[str, Any]) -> List[str]: srun_command_parts = ["srun", f"--mpi={self.system.mpi}"] if slurm_args.get("image_path"): srun_command_parts.append(f'--container-image={slurm_args["image_path"]}') diff --git a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py index 37d6a962..f2aae181 100644 --- a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py @@ -54,7 +54,7 @@ def test_filename_generation(strategy_fixture: SlurmCommandGenStrategy, testrun_ env_vars = {"TEST_VAR": "VALUE"} cmd_args = {"test_arg": "test_value"} slurm_args = strategy_fixture._parse_slurm_args(job_name_prefix, env_vars, cmd_args, testrun_fixture) - srun_command = strategy_fixture.generate_srun_command(slurm_args, env_vars, cmd_args, "") + srun_command = strategy_fixture._gen_srun_command(slurm_args, env_vars, cmd_args, "") sbatch_command = strategy_fixture._write_sbatch_script( slurm_args, 
env_vars, srun_command, testrun_fixture.output_path From 5a658c3b44890755e9690521b1756fd2e903828d Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 23 Oct 2024 12:05:42 -0400 Subject: [PATCH 20/64] Remove pre-test implementation from JaxToolbox --- .../jax_toolbox/slurm_command_gen_strategy.py | 93 ---------------- src/cloudai/test_definitions/gpt.py | 5 +- src/cloudai/test_definitions/grok.py | 5 +- src/cloudai/test_definitions/jax_toolbox.py | 34 +----- .../{gpt-no-pretest.sbatch => gpt.sbatch} | 2 +- .../{grok-no-pretest.sbatch => grok.sbatch} | 2 +- ..._jax_toolbox_slurm_command_gen_strategy.py | 105 +----------------- tests/test_acceptance.py | 18 +-- 8 files changed, 16 insertions(+), 248 deletions(-) rename tests/ref_data/{gpt-no-pretest.sbatch => gpt.sbatch} (96%) rename tests/ref_data/{grok-no-pretest.sbatch => grok.sbatch} (98%) diff --git a/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py index c5a98509..b27d878d 100644 --- a/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py @@ -152,25 +152,11 @@ def _gen_srun_command( self._create_run_script(env_vars, cmd_args, extra_cmd_args) commands = [] - - run_pre_test = cmd_args.get("pre_test.enable", False) - - if run_pre_test: - output_path = Path(cmd_args["output_path"]).resolve() / "output_pretest-%j-%n-%t.txt" - error_path = Path(cmd_args["output_path"]).resolve() / "error_pretest-%j-%n-%t.txt" - commands.append(self._generate_pre_test_command(cmd_args, output_path, error_path)) - commands.append(self._generate_pre_test_check_command(cmd_args, output_path)) - commands.append('if [ "$PRE_TEST_SUCCESS" = true ]; then') - load_container = cmd_args.get("load_container", False) if load_container: commands += self._generate_container_load_command(slurm_args) - commands += self._generate_run_command(slurm_args) - if run_pre_test: - commands.append("fi") - return "\n".join(commands) def _create_run_script( @@ -341,85 +327,6 @@ def _create_pgo_nsys_converter_command(self, stage: str, cmd_args: Dict[str, str ["", 'if [ "$SLURM_NODEID" -eq 0 ] && [ "$SLURM_PROCID" -eq 0 ]; then', f" {command}", "fi"] ) - def _generate_pre_test_command(self, cmd_args: Dict[str, Any], output_path: Path, error_path: Path) -> str: - """ - Generate the pre-test command for running a test. - - This method constructs the pre-test command based on the command-line - arguments provided. - - Args: - cmd_args (Dict[str, Any]): A dictionary containing command arguments. - output_path (Path): The path to the output file. - error_path (Path): The path to the error file. - - Returns: - str: The generated pre-test command. - """ - nccl_test_prefix = "pre_test.nccl_test." 
- nccl_test = {} - - for key, value in cmd_args.items(): - if key.startswith(nccl_test_prefix): - flag_name = key[len(nccl_test_prefix) :] - nccl_test[flag_name] = value - pre_test_command_parts = [ - "srun", - "--mpi=pmix", - f"-N {nccl_test.get('num_nodes', 2)}", - f"-o {output_path}", - f"-e {error_path}", - f"--container-image={nccl_test.get('docker_image_url', 'nvcr.io/nvidia/pytorch:24.02-py3')}", - f"/usr/local/bin/{nccl_test.get('subtest_name', 'all_gather_perf_mpi')}", - f"--nthreads {nccl_test.get('nthreads', 1)}", - f"--ngpus {nccl_test.get('ngpus', 1)}", - f"--minbytes {nccl_test.get('minbytes', '32M')}", - f"--maxbytes {nccl_test.get('maxbytes', '16G')}", - f"--stepbytes {nccl_test.get('stepbytes', '1M')}", - f"--op {nccl_test.get('op', 'sum')}", - f"--datatype {nccl_test.get('datatype', 'float')}", - f"--root {nccl_test.get('root', 0)}", - f"--iters {nccl_test.get('iters', 20)}", - f"--warmup_iters {nccl_test.get('warmup_iters', 5)}", - f"--agg_iters {nccl_test.get('agg_iters', 1)}", - f"--average {nccl_test.get('average', 1)}", - f"--parallel_init {nccl_test.get('parallel_init', 0)}", - f"--check {nccl_test.get('check', 1)}", - f"--blocking {nccl_test.get('blocking', 0)}", - f"--cudagraph {nccl_test.get('cudagraph', 0)}", - f"--stepfactor {nccl_test.get('stepfactor', 2)}", - ] - return " \\\n".join(pre_test_command_parts) - - def _generate_pre_test_check_command(self, cmd_args: Dict[str, str], output_path: Path) -> str: - """ - Generate the command for pre-test check. - - This method generates the command that checks the output of the pre-test to determine if the main test should - be run. - - Args: - cmd_args (Dict[str, str]): Command-line arguments for the job. - output_path (str): The path to the output file. - - Returns: - str: The generated command for pre-test check. 
- """ - pretest_output_files = str(Path(output_path).parent / "output_pretest-*.txt") - keyword = cmd_args.get("keyword", "Avg bus bandwidth") - - return "\n".join( - [ - f'PRETEST_OUTPUT_FILES="{pretest_output_files}"', - f'keyword="{keyword}"', - "", - "# Use grep to search for the keyword in the files", - 'if grep -q "$keyword" $PRETEST_OUTPUT_FILES; then', - " PRE_TEST_SUCCESS=true", - "fi", - ] - ) - def _generate_container_load_command(self, slurm_args: Dict[str, Any]) -> List[str]: """Generate the command for loading a container.""" container_image = slurm_args.get("image_path") diff --git a/src/cloudai/test_definitions/gpt.py b/src/cloudai/test_definitions/gpt.py index ff1e8f1e..353d97fe 100644 --- a/src/cloudai/test_definitions/gpt.py +++ b/src/cloudai/test_definitions/gpt.py @@ -21,7 +21,7 @@ from cloudai import Installable from cloudai.installer.installables import DockerImage -from .jax_toolbox import JaxFdl, JaxToolboxCmdArgs, JaxToolboxTestDefinition, PreTest, SetupFlags, XLAFlags +from .jax_toolbox import JaxFdl, JaxToolboxCmdArgs, JaxToolboxTestDefinition, SetupFlags, XLAFlags class GPTFdl(JaxFdl): @@ -48,7 +48,6 @@ class GPTCmdArgs(JaxToolboxCmdArgs): fdl_config: str fdl: GPTFdl = Field(default_factory=GPTFdl) - pre_test: PreTest = Field(default_factory=PreTest) xla_flags: GPTXLAFlags = Field(default_factory=GPTXLAFlags) setup_flags: GPTSetupFlags = Field(default_factory=GPTSetupFlags) @@ -64,7 +63,7 @@ def cmd_args_dict(self): d = self.cmd_args.model_dump() res = {} for k, v in d.items(): - if k in {"pre_test", "docker_image_url", "load_container", "output_path"}: + if k in {"docker_image_url", "load_container", "output_path"}: res[k] = v else: if k == "xla_flags": diff --git a/src/cloudai/test_definitions/grok.py b/src/cloudai/test_definitions/grok.py index c87c6e44..88a358be 100644 --- a/src/cloudai/test_definitions/grok.py +++ b/src/cloudai/test_definitions/grok.py @@ -21,7 +21,7 @@ from cloudai import Installable from cloudai.installer.installables import DockerImage -from .jax_toolbox import JaxFdl, JaxToolboxCmdArgs, JaxToolboxTestDefinition, PreTest, SetupFlags, XLAFlags +from .jax_toolbox import JaxFdl, JaxToolboxCmdArgs, JaxToolboxTestDefinition, SetupFlags, XLAFlags class GrokFdl(JaxFdl): @@ -77,7 +77,6 @@ class GrokCmdArgs(JaxToolboxCmdArgs): setup_flags: SetupFlags = Field(default_factory=SetupFlags) profile: GrokProfileXLAFlags = Field(default_factory=GrokProfileXLAFlags) perf: GrokPerfXLAFlags = Field(default_factory=GrokPerfXLAFlags) - pre_test: PreTest = Field(default_factory=PreTest) class GrokTestDefinition(JaxToolboxTestDefinition): @@ -97,7 +96,7 @@ def cmd_args_dict(self): if k in {"profile", "perf"}: res.setdefault(f"Grok.{k}", {}) res[f"Grok.{k}"]["XLA_FLAGS"] = v - elif k in {"pre_test", "docker_image_url", "load_container", "output_path"}: + elif k in {"docker_image_url", "load_container", "output_path"}: res[k] = v else: res[f"Grok.{k}"] = v diff --git a/src/cloudai/test_definitions/jax_toolbox.py b/src/cloudai/test_definitions/jax_toolbox.py index 079e5b4e..4593028a 100644 --- a/src/cloudai/test_definitions/jax_toolbox.py +++ b/src/cloudai/test_definitions/jax_toolbox.py @@ -14,12 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any, Optional +from typing import Optional -from pydantic import BaseModel, ConfigDict, Field, field_serializer +from pydantic import BaseModel, ConfigDict, field_serializer from cloudai import CmdArgs, TestDefinition -from cloudai.test_definitions.nccl import NCCLCmdArgs class JaxFdl(BaseModel): @@ -54,35 +53,6 @@ def checkpoint_policy_serializer(self, value: str) -> str: return f'\\"{value}\\"' -class NCCLCmdAgrsPreTest(NCCLCmdArgs): - """NCCL pre-test command arguments.""" - - num_nodes: int = 8 - stepfactor: int = 2 - minbytes: str = "8M" - maxbytes: str = "16G" - blocking: int = 1 - - def model_post_init(self, _: Any) -> None: - self.subtest_name = "all_gather_perf_mpi" - self.docker_image_url = "nvcr.io/nvidia/pytorch:24.02-py3" - - -class PreTest(BaseModel): - """Pre-test configuration.""" - - model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) - enable: bool = True - nccl_test: NCCLCmdAgrsPreTest = Field(default_factory=NCCLCmdAgrsPreTest) - - -class NCCLPreTest(BaseModel): - """Pre-test configuration.""" - - model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) - nccl_test: Optional[NCCLCmdAgrsPreTest] = None - - class JaxToolboxCmdArgs(CmdArgs): """JAX Toolbox test command arguments.""" diff --git a/tests/ref_data/gpt-no-pretest.sbatch b/tests/ref_data/gpt.sbatch similarity index 96% rename from tests/ref_data/gpt-no-pretest.sbatch rename to tests/ref_data/gpt.sbatch index edc4d19c..3cd84b21 100644 --- a/tests/ref_data/gpt-no-pretest.sbatch +++ b/tests/ref_data/gpt.sbatch @@ -19,4 +19,4 @@ export XLA_FLAGS="--xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOL -e __OUTPUT_DIR__/error-%j-%n-%t.txt \ --container-name=cont \ --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ - /opt/paxml/workspace/run.sh \ No newline at end of file + /opt/paxml/workspace/run.sh diff --git a/tests/ref_data/grok-no-pretest.sbatch b/tests/ref_data/grok.sbatch similarity index 98% rename from tests/ref_data/grok-no-pretest.sbatch rename to tests/ref_data/grok.sbatch index a8274477..f5d32243 100644 --- a/tests/ref_data/grok-no-pretest.sbatch +++ b/tests/ref_data/grok.sbatch @@ -19,4 +19,4 @@ export XLA_FLAGS="--xla_disable_hlo_passes=rematerialization --xla_dump_hlo_pass -e __OUTPUT_DIR__/error-%j-%n-%t.txt \ --container-name=cont \ --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ - /opt/paxml/workspace/run.sh \ No newline at end of file + /opt/paxml/workspace/run.sh diff --git a/tests/slurm_command_gen_strategy/test_jax_toolbox_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_jax_toolbox_slurm_command_gen_strategy.py index 5db0d1bd..131e4a55 100644 --- a/tests/slurm_command_gen_strategy/test_jax_toolbox_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_jax_toolbox_slurm_command_gen_strategy.py @@ -25,7 +25,7 @@ from cloudai.systems import SlurmSystem from cloudai.test_definitions.gpt import GPTCmdArgs, GPTTestDefinition from cloudai.test_definitions.grok import GrokCmdArgs, GrokTestDefinition -from cloudai.test_definitions.jax_toolbox import JaxFdl, PreTest +from cloudai.test_definitions.jax_toolbox import JaxFdl class TestJaxToolboxSlurmCommandGenStrategy: @@ -63,7 +63,6 @@ def test_gen_exec_command( test_fixture, ) -> None: test_def = request.getfixturevalue(test_fixture) - test_def.cmd_args.pre_test = PreTest(enable=True) test = Test(test_definition=test_def, test_template=JaxToolbox(slurm_system, "name")) test_run = TestRun( @@ -74,14 +73,10 @@ def test_gen_exec_command( 
name="test-job", ) - cmd_gen_strategy._generate_pre_test_command = MagicMock(return_value="pre_test_command") cmd = cmd_gen_strategy.gen_exec_command(test_run) assert cmd == f"sbatch {test_run.output_path}/cloudai_sbatch_script.sh" assert (test_run.output_path / "run.sh").exists() - content = Path(f"{test_run.output_path}/cloudai_sbatch_script.sh").read_text() - assert "pre_test_command" in content - @pytest.mark.parametrize( "cmd_args, expected", [ @@ -215,100 +210,6 @@ def test_generate_python_command( "fi", ] - def test_generate_pre_test_command( - self, cmd_gen_strategy: JaxToolboxSlurmCommandGenStrategy, grok_test: GrokTestDefinition, tmp_path: Path - ) -> None: - grok_test.cmd_args.pre_test = PreTest(enable=True) - - nccl_test = grok_test.cmd_args.pre_test.nccl_test - nccl_test.num_nodes = 2 - nccl_test.minbytes = "32M" - nccl_test.blocking = 0 - - cargs = {"output_path": str(tmp_path), **grok_test.cmd_args_dict} - - pre_test_cli = cmd_gen_strategy._generate_pre_test_command(cargs, tmp_path, tmp_path).splitlines() - - expected_pre_test_cli = [ - "srun \\", - "--mpi=pmix \\", - f"-N {nccl_test.num_nodes} \\", - f"-o {tmp_path} \\", - f"-e {tmp_path} \\", - f"--container-image={nccl_test.docker_image_url} \\", - f"/usr/local/bin/{nccl_test.subtest_name} \\", - f"--nthreads {nccl_test.nthreads} \\", - f"--ngpus {nccl_test.ngpus} \\", - f"--minbytes {nccl_test.minbytes} \\", - f"--maxbytes {nccl_test.maxbytes} \\", - f"--stepbytes {nccl_test.stepbytes} \\", - f"--op {nccl_test.op} \\", - f"--datatype {nccl_test.datatype} \\", - f"--root {nccl_test.root} \\", - f"--iters {nccl_test.iters} \\", - f"--warmup_iters {nccl_test.warmup_iters} \\", - f"--agg_iters {nccl_test.agg_iters} \\", - f"--average {nccl_test.average} \\", - f"--parallel_init {nccl_test.parallel_init} \\", - f"--check {nccl_test.check} \\", - f"--blocking {nccl_test.blocking} \\", - f"--cudagraph {nccl_test.cudagraph} \\", - f"--stepfactor {nccl_test.stepfactor}", - ] - - assert pre_test_cli == expected_pre_test_cli, ( - "The generated pre-test command did not match the expected command.\n" - f"Expected: {expected_pre_test_cli}\n" - f"Actual: {pre_test_cli}" - ) - - def test_generate_srun_command(self, slurm_system, cmd_gen_strategy, grok_test): - cmd_gen_strategy.test_name = grok_test.name - Path("/tmp/output").mkdir(parents=True, exist_ok=True) - - output_path = Path("/tmp/output/output") - output_path.mkdir(parents=True, exist_ok=True) - - # Use the existing setup for mocking internal methods - cmd_gen_strategy._generate_pre_test_command = MagicMock(return_value="srun --mpi=none pre_test_command") - cmd_gen_strategy._generate_run_command = MagicMock(return_value="srun --mpi=none run_command") - cmd_gen_strategy._generate_container_load_command = MagicMock( - return_value="srun --mpi=none container_load_command" - ) - - slurm_args = { - "output": "/tmp/output/output-%j.txt", - "error": "/tmp/output/error-%j.txt", - "image_path": "fake_image_url", - "container_mounts": "/tmp/output:/workspace", - } - cmd_args = { - "output_path": "/tmp/output", - "pre_test": {"enable": True}, - f"{grok_test.name}.setup_flags.docker_workspace_dir": "/workspace/docker", - f"{grok_test.name}.setup_flags.tfds_data_dir": "/workspace/tfds", - f"{grok_test.name}.setup_flags.enable_checkpoint_saving": True, - } - - pre_test_command = cmd_gen_strategy._generate_pre_test_command( - cmd_args, Path("/tmp/output"), Path("/tmp/output") - ) - run_command = cmd_gen_strategy._generate_run_command(slurm_args) - container_load_command = 
cmd_gen_strategy._generate_container_load_command(slurm_args) - - result_command = f"{pre_test_command}\n{container_load_command}\n{run_command}" - - # Assert expected parts of the command are in the generated result - assert "pre_test_command" in result_command - assert "container_load_command" in result_command - assert "run_command" in result_command - assert "srun" in result_command - assert "--mpi=none" in result_command - - cmd_gen_strategy._generate_pre_test_command.assert_called_once() - cmd_gen_strategy._generate_run_command.assert_called_once() - cmd_gen_strategy._generate_container_load_command.assert_called_once() - def test_gpt_test_definition_cmd_args_dict(): gpt = GPTTestDefinition( @@ -324,7 +225,7 @@ def test_gpt_test_definition_cmd_args_dict(): assert "GPT.setup_flags" in cargs assert "GPT.XLA_FLAGS" in cargs - for k in {"pre_test", "docker_image_url", "load_container"}: + for k in {"docker_image_url", "load_container"}: assert k in cargs assert f"GPT.{k}" not in cargs @@ -348,7 +249,7 @@ def test_grok_test_definition_cmd_args_dict(): assert "Grok.perf" in cargs assert "XLA_FLAGS" in cargs["Grok.perf"] - for k in {"pre_test", "docker_image_url", "load_container"}: + for k in {"docker_image_url", "load_container"}: assert k in cargs assert f"Grok.{k}" not in cargs diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index be5f1299..e18d5d60 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -90,7 +90,7 @@ def partial_tr(slurm_system: SlurmSystem) -> partial[TestRun]: return partial(TestRun, num_nodes=1, nodes=[], output_path=slurm_system.output_path) -@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt-pretest", "gpt-no-pretest", "grok-pretest", "grok-no-pretest"]) +@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt", "grok"]) def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -> tuple[TestRun, str, Optional[str]]: if request.param == "ucc": tr = partial_tr( @@ -140,7 +140,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") return (tr, "sleep.sbatch", None) - elif request.param.startswith("gpt-"): + elif request.param.startswith("gpt"): tr = partial_tr( name="gpt", test=Test( @@ -158,13 +158,9 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict ) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - if "no-pretest" in request.param: - tr.test.test_definition.cmd_args.pre_test.enable = False - else: - tr.test.test_definition.cmd_args.pre_test.enable = True return (tr, f"{request.param}.sbatch", "gpt.run") - elif request.param.startswith("grok-"): + elif request.param.startswith("grok"): tr = partial_tr( name="grok", test=Test( @@ -182,10 +178,6 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict ) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - if "no-pretest" in request.param: - tr.test.test_definition.cmd_args.pre_test.enable = False - else: - tr.test.test_definition.cmd_args.pre_test.enable = True return (tr, f"{request.param}.sbatch", "grok.run") @@ -199,8 +191,8 @@ def test_sbatch_generation(slurm_system: SlurmSystem, test_req: tuple[TestRun, s sbatch_script = tr.test.test_template.gen_exec_command(tr).split()[-1] - curr = Path(sbatch_script).read_text() - 
ref = (Path(__file__).parent / "ref_data" / test_req[1]).read_text() + curr = Path(sbatch_script).read_text().strip() + ref = (Path(__file__).parent / "ref_data" / test_req[1]).read_text().strip() ref = ref.replace("__OUTPUT_DIR__", str(slurm_system.output_path)).replace("__JOB_NAME__", "job_name") assert curr == ref From cac548409e262164f74fdd8b7c336bc6f038c2f6 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 23 Oct 2024 07:24:23 -0400 Subject: [PATCH 21/64] Add prologue and epilogue to _TestScenarioTOML --- src/cloudai/_core/test_scenario_parser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index 08526dca..16302a2a 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -54,6 +54,8 @@ class _TestScenarioTOML(BaseModel): name: str job_status_check: bool = True tests: list[_TestRunTOML] = Field(alias="Tests", min_length=1) + prologue: str = "" + epilogue: str = "" @model_validator(mode="after") def check_no_self_dependency(self): From aab165bb1e31f75d84dce21815947e4828e56b08 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 22 Oct 2024 14:50:35 -0400 Subject: [PATCH 22/64] Add example plugin files --- conf/common/plugin/nccl_test_epilogue.toml | 22 ++++++++++++++++++++++ conf/common/plugin/nccl_test_prologue.toml | 22 ++++++++++++++++++++++ conf/common/test_scenario/nccl_test.toml | 4 ++++ 3 files changed, 48 insertions(+) create mode 100644 conf/common/plugin/nccl_test_epilogue.toml create mode 100644 conf/common/plugin/nccl_test_prologue.toml diff --git a/conf/common/plugin/nccl_test_epilogue.toml b/conf/common/plugin/nccl_test_epilogue.toml new file mode 100644 index 00000000..346dc8e4 --- /dev/null +++ b/conf/common/plugin/nccl_test_epilogue.toml @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "nccl_test_epilogue" + +[[Tests]] +id = "Tests.1" +test_name = "nccl_test_all_gather" +time_limit = "00:20:00" diff --git a/conf/common/plugin/nccl_test_prologue.toml b/conf/common/plugin/nccl_test_prologue.toml new file mode 100644 index 00000000..e5c1a1e4 --- /dev/null +++ b/conf/common/plugin/nccl_test_prologue.toml @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "nccl_test_prologue" + +[[Tests]] +id = "Tests.1" +test_name = "nccl_test_all_reduce" +time_limit = "00:20:00" diff --git a/conf/common/test_scenario/nccl_test.toml b/conf/common/test_scenario/nccl_test.toml index f6ccf02c..9b731e96 100644 --- a/conf/common/test_scenario/nccl_test.toml +++ b/conf/common/test_scenario/nccl_test.toml @@ -15,6 +15,10 @@ # limitations under the License. name = "nccl-test" + +prologue = "nccl_test_prologue" +epilogue = "nccl_test_epilogue" + [[Tests]] id = "Tests.1" test_name = "nccl_test_all_reduce" From 265e42e651197452e9040e26086471218bcfaab9 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 23 Oct 2024 07:35:01 -0400 Subject: [PATCH 23/64] Add plugin option to CLI --- src/cloudai/cli/cli.py | 4 ++ tests/test_cli.py | 92 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 94 insertions(+), 2 deletions(-) diff --git a/src/cloudai/cli/cli.py b/src/cloudai/cli/cli.py index 53059799..f962a2d5 100644 --- a/src/cloudai/cli/cli.py +++ b/src/cloudai/cli/cli.py @@ -60,6 +60,7 @@ def add_command( handler: Callable[[argparse.Namespace], int], system_config: Optional[bool] = None, tests_dir: Optional[bool] = None, + plugin_dir: Optional[bool] = None, test_scenario: Optional[bool] = None, output_dir: Optional[bool] = None, result_dir: Optional[bool] = None, @@ -74,6 +75,8 @@ def add_command( p.add_argument( "--tests-dir", help="Path to the test configuration directory.", required=tests_dir, type=Path ) + if plugin_dir is not None: + p.add_argument("--plugin-dir", help="Path to the plugin directory.", required=plugin_dir, type=Path) if test_scenario is not None: p.add_argument("--test-scenario", help="Path to the test scenario file.", required=test_scenario, type=Path) if output_dir is not None: @@ -127,6 +130,7 @@ def add_run_and_dry_run(self): handle_dry_run_and_run, system_config=True, tests_dir=True, + plugin_dir=False, test_scenario=True, output_dir=False, ) diff --git a/tests/test_cli.py b/tests/test_cli.py index 538e497f..bb6c1a5d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -20,8 +20,8 @@ import pytest -from cloudai.cli import CloudAICLI, handle_generate_report, handle_install_and_uninstall -from cloudai.cli.handlers import handle_verify_all_configs +from cloudai.cli import CloudAICLI +from cloudai.cli.handlers import handle_generate_report, handle_install_and_uninstall, handle_verify_all_configs def test_help_message(capsys: pytest.CaptureFixture[str]) -> None: @@ -108,6 +108,7 @@ def test_add_command_all_optional(): lambda _: 0, system_config=False, tests_dir=False, + plugin_dir=False, test_scenario=False, output_dir=False, ) @@ -118,6 +119,7 @@ def test_add_command_all_optional(): mode="test", system_config=None, tests_dir=None, + plugin_dir=None, test_scenario=None, output_dir=None, ) @@ -132,6 +134,7 @@ def test_add_command_all_required(): lambda _: 0, system_config=True, tests_dir=True, + plugin_dir=True, test_scenario=True, output_dir=True, ) @@ -142,6 +145,8 @@ def test_add_command_all_required(): "system_config", "--tests-dir", "tests_dir", + "--plugin-dir", + 
"plugin_dir", "--test-scenario", "test_scenario", "--output-dir", @@ -154,11 +159,91 @@ def test_add_command_all_required(): mode="test", system_config=Path("system_config"), tests_dir=Path("tests_dir"), + plugin_dir=Path("plugin_dir"), test_scenario=Path("test_scenario"), output_dir=Path("output_dir"), ) +@pytest.mark.parametrize( + "mode,args,expected_plugin_dir", + [ + ( + "run", + [ + "run", + "--system-config", + "system_config", + "--tests-dir", + "tests_dir", + "--plugin-dir", + "plugin_dir", + "--test-scenario", + "test_scenario", + ], + Path("plugin_dir"), + ), + ( + "run", + [ + "run", + "--system-config", + "system_config", + "--tests-dir", + "tests_dir", + "--test-scenario", + "test_scenario", + ], + None, + ), + ( + "dry-run", + [ + "dry-run", + "--system-config", + "system_config", + "--tests-dir", + "tests_dir", + "--plugin-dir", + "plugin_dir", + "--test-scenario", + "test_scenario", + ], + Path("plugin_dir"), + ), + ( + "dry-run", + [ + "dry-run", + "--system-config", + "system_config", + "--tests-dir", + "tests_dir", + "--test-scenario", + "test_scenario", + ], + None, + ), + ], +) +def test_modes_with_or_without_plugin_dir(mode, args, expected_plugin_dir): + cli = CloudAICLI() + + cli.add_command( + mode, + f"{mode} command", + lambda _: 0, + system_config=True, + tests_dir=True, + plugin_dir=False, + test_scenario=True, + output_dir=False, + ) + + parsed_args = cli.parser.parse_args(args) + assert parsed_args.plugin_dir == expected_plugin_dir + + def test_real_uninstall(): cli = CloudAICLI() cli.init_default_args() @@ -277,6 +362,8 @@ def test_run_dry_run_modes(self, cli: CloudAICLI): "tests_dir", "--test-scenario", "test_scenario", + "--plugin-dir", + "plugin_dir", ] ) @@ -286,6 +373,7 @@ def test_run_dry_run_modes(self, cli: CloudAICLI): mode=mode, system_config=Path("system_config"), tests_dir=Path("tests_dir"), + plugin_dir=Path("plugin_dir"), test_scenario=Path("test_scenario"), output_dir=None, ) From d9b2e83f8ee4e5dab3e22618533411eba2a16081 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 23 Oct 2024 07:43:01 -0400 Subject: [PATCH 24/64] Parse plugins and pass them to TestRun --- src/cloudai/_core/test_scenario.py | 2 + src/cloudai/_core/test_scenario_parser.py | 23 +++++++- src/cloudai/cli/handlers.py | 2 +- src/cloudai/parser.py | 47 ++++++++++++--- tests/test_acceptance.py | 1 + tests/test_parser.py | 70 ++++++++++++++++++++++- tests/test_test_scenario.py | 2 +- 7 files changed, 131 insertions(+), 16 deletions(-) diff --git a/src/cloudai/_core/test_scenario.py b/src/cloudai/_core/test_scenario.py index 3a60c036..97c89994 100644 --- a/src/cloudai/_core/test_scenario.py +++ b/src/cloudai/_core/test_scenario.py @@ -58,6 +58,8 @@ class TestRun: weight: float = 0.0 ideal_perf: float = 1.0 dependencies: dict[str, TestDependency] = field(default_factory=dict) + prologue: Optional["TestScenario"] = None + epilogue: Optional["TestScenario"] = None def __hash__(self) -> int: return hash(self.name + self.test.name + str(self.iterations) + str(self.current_iteration)) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index 16302a2a..c59adeba 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -101,9 +101,10 @@ class TestScenarioParser: __test__ = False - def __init__(self, file_path: Path, test_mapping: Dict[str, Test]) -> None: + def __init__(self, file_path: Path, test_mapping: Dict[str, Test], plugin_mapping: 
Dict[str, TestScenario]) -> None: self.file_path = file_path self.test_mapping = test_mapping + self.plugin_mapping = plugin_mapping def parse(self) -> TestScenario: """ @@ -138,8 +139,14 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario: total_weight = sum(tr.weight for tr in ts_model.tests) normalized_weight = 0 if total_weight == 0 else 100 / total_weight + prologue_name = data.get("prologue", "") + epilogue_name = data.get("epilogue", "") + + prologue = self.plugin_mapping.get(prologue_name, None) if prologue_name else None + epilogue = self.plugin_mapping.get(epilogue_name, None) if epilogue_name else None + testruns_by_id: dict[str, TestRun] = { - tr.id: self._create_section_test_run(tr, normalized_weight) for tr in ts_model.tests + tr.id: self._create_section_test_run(tr, normalized_weight, prologue, epilogue) for tr in ts_model.tests } tests_data: dict[str, _TestRunTOML] = {tr.id: tr for tr in ts_model.tests} @@ -155,13 +162,21 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario: job_status_check=ts_model.job_status_check, ) - def _create_section_test_run(self, test_info: _TestRunTOML, normalized_weight: float) -> TestRun: + def _create_section_test_run( + self, + test_info: _TestRunTOML, + normalized_weight: float, + prologue: Optional[TestScenario], + epilogue: Optional[TestScenario], + ) -> TestRun: """ Create a section-specific Test object by copying from the test mapping. Args: test_info (Dict[str, Any]): Information of the test. normalized_weight (float): Normalized weight for the test. + prologue (Optional[TestScenario]): TestScenario object representing the prologue sequence. + epilogue (Optional[TestScenario]): TestScenario object representing the epilogue sequence. Returns: Test: Copied and updated Test object for the section. @@ -194,5 +209,7 @@ def _create_section_test_run(self, test_info: _TestRunTOML, normalized_weight: f sol=test_info.sol, weight=test_info.weight * normalized_weight, ideal_perf=test_info.ideal_perf, + prologue=prologue if prologue is not None else TestScenario(name="default_prologue", test_runs=[]), + epilogue=epilogue if epilogue is not None else TestScenario(name="default_epilogue", test_runs=[]), ) return tr diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index 6105bc24..e654cf03 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -90,7 +90,7 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int: args (argparse.Namespace): The parsed command-line arguments. 
""" parser = Parser(args.system_config) - system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario) + system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario, args.plugin_dir) assert test_scenario is not None if args.output_dir: diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index a627b312..5b21ab3f 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -16,7 +16,7 @@ import logging from pathlib import Path -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Set, Tuple import toml from pydantic import ValidationError @@ -49,7 +49,7 @@ def __init__(self, system_config_path: Path) -> None: self.system_config_path = system_config_path def parse( - self, test_path: Path, test_scenario_path: Optional[Path] = None + self, test_path: Path, test_scenario_path: Optional[Path] = None, plugin_path: Optional[Path] = None ) -> Tuple[System, List[Test], Optional[TestScenario]]: """ Parse configurations for system, test templates, and test scenarios. @@ -74,21 +74,50 @@ def parse( logging.debug(f"Parsed {len(tests)} tests: {[t.name for t in tests]}") test_mapping = {t.name: t for t in tests} - filtered_tests = tests test_scenario: Optional[TestScenario] = None + scenario_test_names: Set[str] = set() if test_scenario_path: + plugin_mapping: Dict[str, TestScenario] = {} + plugin_test_names: Set[str] = set() + if plugin_path and plugin_path.exists(): + try: + plugin_mapping = self.parse_plugins(list(plugin_path.glob("*.toml")), test_mapping) + for plugin_scenario in plugin_mapping.values(): + plugin_test_names.update(tr.test.name for tr in plugin_scenario.test_runs) + except TestScenarioParsingError: + exit(1) + try: - test_scenario = self.parse_test_scenario(test_scenario_path, test_mapping) + test_scenario = self.parse_test_scenario(test_scenario_path, test_mapping, plugin_mapping) + scenario_test_names = set(tr.test.name for tr in test_scenario.test_runs) except TestScenarioParsingError: - exit(1) # exit right away to keep error message readable for users - scenario_tests = set(tr.test.name for tr in test_scenario.test_runs) - filtered_tests = [t for t in tests if t.name in scenario_tests] + exit(1) + + all_used_test_names = plugin_test_names.union(scenario_test_names) + filtered_tests = [t for t in tests if t.name in all_used_test_names] + else: + filtered_tests = tests return system, filtered_tests, test_scenario @staticmethod - def parse_test_scenario(test_scenario_path: Path, test_mapping: Dict[str, Test]) -> TestScenario: - test_scenario_parser = TestScenarioParser(test_scenario_path, test_mapping) + def parse_plugins(plugin_tomls: List[Path], test_mapping: Dict[str, Test]) -> Dict[str, TestScenario]: + plugin_mapping = {} + for plugin_path in plugin_tomls: + plugin_scenario = Parser.parse_test_scenario(plugin_path, test_mapping) + plugin_mapping[plugin_scenario.name] = plugin_scenario + return plugin_mapping + + @staticmethod + def parse_test_scenario( + test_scenario_path: Path, + test_mapping: Dict[str, Test], + plugin_mapping: Optional[Dict[str, TestScenario]] = None, + ) -> TestScenario: + if plugin_mapping is None: + plugin_mapping = {} + + test_scenario_parser = TestScenarioParser(test_scenario_path, test_mapping, plugin_mapping) test_scenario = test_scenario_parser.parse() return test_scenario diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index e18d5d60..e11ff50b 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -60,6 +60,7 @@ def 
test_slurm(tmp_path: Path, scenario: Dict): system_config=Path("conf/common/system/example_slurm_cluster.toml"), test_templates_dir=Path("conf/common/test_template"), tests_dir=Path("conf/common/test"), + plugin_dir=Path("conf/common/plugin"), test_scenario=test_scenario_path, output_dir=tmp_path, ) diff --git a/tests/test_parser.py b/tests/test_parser.py index d35896a9..cb809d36 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -34,7 +34,7 @@ def parser(self, tmp_path: Path) -> Parser: def test_no_tests_dir(self, parser: Parser): tests_dir = parser.system_config_path.parent / "tests" with pytest.raises(FileNotFoundError) as exc_info: - parser.parse(tests_dir, None) + parser.parse(tests_dir, None, None) assert "Test path" in str(exc_info.value) @patch("cloudai._core.test_parser.TestParser.parse_all") @@ -50,19 +50,85 @@ def test_no_scenario(self, test_parser: Mock, parser: Parser): @patch("cloudai._core.test_parser.TestParser.parse_all") @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") - def test_scenario_filters_tests(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): + def test_scenario_without_plugin(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): tests_dir = parser.system_config_path.parent.parent / "test" + fake_tests = [] for i in range(3): fake_tests.append(Mock()) fake_tests[-1].name = f"test-{i}" test_parser.return_value = fake_tests + fake_scenario = Mock() fake_scenario.test_runs = [Mock()] fake_scenario.test_runs[0].test.name = "test-1" test_scenario_parser.return_value = fake_scenario + _, tests, _ = parser.parse(tests_dir, Path()) + + assert len(tests) == 1 + assert tests[0].name == "test-1" + + @patch("cloudai._core.test_parser.TestParser.parse_all") + @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") + @patch("cloudai.parser.Parser.parse_plugins") + def test_scenario_with_plugin_common_tests( + self, parse_plugins: Mock, test_scenario_parser: Mock, test_parser: Mock, parser: Parser + ): + tests_dir = parser.system_config_path.parent.parent / "test" + + fake_tests = [] + for i in range(3): + fake_tests.append(Mock()) + fake_tests[-1].name = f"test-{i}" + test_parser.return_value = fake_tests + + fake_scenario = Mock() + fake_scenario.test_runs = [Mock()] + fake_scenario.test_runs[0].test.name = "test-1" + test_scenario_parser.return_value = fake_scenario + + fake_plugin = Mock() + fake_plugin.test_runs = [Mock()] + fake_plugin.test_runs[0].test.name = "test-1" + parse_plugins.return_value = {"plugin-1": fake_plugin} + + _, tests, _ = parser.parse(tests_dir, Path(), Path()) + assert len(tests) == 1 + assert tests[0].name == "test-1" + + @patch("cloudai._core.test_parser.TestParser.parse_all") + @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") + @patch("cloudai.parser.Parser.parse_plugins") + def test_scenario_with_plugin_exclusive_tests( + self, parse_plugins: Mock, test_scenario_parser: Mock, test_parser: Mock, parser: Parser + ): + tests_dir = parser.system_config_path.parent.parent / "test" + + fake_tests = [] + for i in range(4): + fake_tests.append(Mock()) + fake_tests[-1].name = f"test-{i}" + test_parser.return_value = fake_tests + + fake_scenario = Mock() + fake_scenario.test_runs = [Mock()] + fake_scenario.test_runs[0].test.name = "test-1" + test_scenario_parser.return_value = fake_scenario + + fake_plugin = Mock() + fake_plugin.test_runs = [Mock()] + fake_plugin.test_runs[0].test.name = "test-2" + parse_plugins.return_value = {"plugin-1": fake_plugin} 
+ + _, tests, _ = parser.parse(tests_dir, Path(), Path()) + + assert len(tests) == 2 + assert "test-1" in [t.name for t in tests] + assert "test-2" in [t.name for t in tests] + assert "test-0" not in [t.name for t in tests] + assert "test-3" not in [t.name for t in tests] def test_parse_system(self, parser: Parser): parser.system_config_path = Path("conf/common/system/example_slurm_cluster.toml") diff --git a/tests/test_test_scenario.py b/tests/test_test_scenario.py index ab81bdbd..72639068 100644 --- a/tests/test_test_scenario.py +++ b/tests/test_test_scenario.py @@ -27,7 +27,7 @@ @pytest.fixture def test_scenario_parser(tmp_path: Path) -> TestScenarioParser: - tsp = TestScenarioParser(Path(""), {}) + tsp = TestScenarioParser(Path(""), {}, {}) return tsp From bfb653f935910fb266da508dcf0bd1fe3722c101 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 25 Oct 2024 06:28:41 -0400 Subject: [PATCH 25/64] Generate plugin commands --- src/cloudai/_core/command_gen_strategy.py | 26 ++++ src/cloudai/_core/test_template.py | 34 +++++ .../nccl_test/slurm_command_gen_strategy.py | 4 + .../strategy/slurm_command_gen_strategy.py | 108 +++++++++++++- tests/ref_data/gpt.sbatch | 2 +- tests/ref_data/grok.sbatch | 2 +- tests/ref_data/nccl.sbatch | 21 +-- tests/ref_data/sleep.sbatch | 4 +- tests/ref_data/ucc.sbatch | 10 +- .../test_common_slurm_command_gen_strategy.py | 137 +++++++++++++++++- 10 files changed, 309 insertions(+), 39 deletions(-) diff --git a/src/cloudai/_core/command_gen_strategy.py b/src/cloudai/_core/command_gen_strategy.py index 16bd04f9..9c8bb389 100644 --- a/src/cloudai/_core/command_gen_strategy.py +++ b/src/cloudai/_core/command_gen_strategy.py @@ -39,3 +39,29 @@ def gen_exec_command(self, tr: TestRun) -> str: str: The generated execution command. """ pass + + @abstractmethod + def gen_srun_command(self, tr: TestRun) -> str: + """ + Generate the Slurm srun command for a test based on the given parameters. + + Args: + tr (TestRun): Contains the test and its run-specific configurations. + + Returns: + str: The generated Slurm srun command. + """ + pass + + @abstractmethod + def gen_srun_success_check(self, tr: TestRun) -> str: + """ + Generate the Slurm success check command to verify if a test run was successful. + + Args: + tr (TestRun): Contains the test and its run-specific configurations. + + Returns: + str: The generated command to check the success of the test run. + """ + pass diff --git a/src/cloudai/_core/test_template.py b/src/cloudai/_core/test_template.py index de98b1a8..2ccd872a 100644 --- a/src/cloudai/_core/test_template.py +++ b/src/cloudai/_core/test_template.py @@ -94,6 +94,40 @@ def gen_exec_command(self, tr: TestRun) -> str: ) return self.command_gen_strategy.gen_exec_command(tr) + def gen_srun_command(self, tr: TestRun) -> str: + """ + Generate an Slurm srun command for a test using the provided command generation strategy. + + Args: + tr (TestRun): Contains the test and its run-specific configurations. + + Returns: + str: The generated Slurm srun command. + """ + if self.command_gen_strategy is None: + raise ValueError( + "command_gen_strategy is missing. Ensure the strategy is registered in the Registry " + "by calling the appropriate registration function for the system type." + ) + return self.command_gen_strategy.gen_srun_command(tr) + + def gen_srun_success_check(self, tr: TestRun) -> str: + """ + Generate a Slurm success check command for a test using the provided command generation strategy. 
+ + Args: + tr (TestRun): Contains the test and its run-specific configurations. + + Returns: + str: The generated command to check the success of the test run. + """ + if self.command_gen_strategy is None: + raise ValueError( + "command_gen_strategy is missing. Ensure the strategy is registered in the Registry " + "by calling the appropriate registration function for the system type." + ) + return self.command_gen_strategy.gen_srun_success_check(tr) + def gen_json(self, tr: TestRun) -> Dict[Any, Any]: """ Generate a JSON string representing the Kubernetes job specification for this test using this template. diff --git a/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py index 8805202c..28281841 100644 --- a/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py @@ -73,3 +73,7 @@ def generate_test_command( srun_command_parts.append(extra_cmd_args) return srun_command_parts + + def gen_srun_success_check(self, tr: TestRun) -> str: + output_file = Path(tr.output_path) / "stdout.txt" + return f'grep -q "Avg bus bandwidth" {output_file} && echo 1 || echo 0' diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index 5c33a141..57c37812 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -18,7 +18,7 @@ from pathlib import Path from typing import Any, Dict, List -from cloudai import CommandGenStrategy, TestRun +from cloudai import CommandGenStrategy, TestRun, TestScenario from cloudai.systems import SlurmSystem @@ -55,8 +55,33 @@ def gen_exec_command(self, tr: TestRun) -> str: env_vars = self._override_env_vars(self.system.global_env_vars, tr.test.extra_env_vars) cmd_args = self._override_cmd_args(self.default_cmd_args, tr.test.cmd_args) slurm_args = self._parse_slurm_args(tr.test.test_template.__class__.__name__, env_vars, cmd_args, tr) - srun_command = self.generate_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) - return self._write_sbatch_script(slurm_args, env_vars, srun_command, tr.output_path) + + if tr.prologue: + prologue_command = self.gen_prologue(tr.prologue, tr.output_path) + srun_command = self._gen_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) + command_list = [prologue_command, "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", f" {srun_command}"] + + if tr.epilogue: + epilogue_command = self.gen_epilogue(tr.epilogue, tr.output_path) + command_list.append(f" {epilogue_command}") + + command_list.append("fi") + else: + srun_command = self._gen_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) + command_list = [srun_command] + + if tr.epilogue: + epilogue_command = self.gen_epilogue(tr.epilogue, tr.output_path) + command_list.append(epilogue_command) + + full_command = "\n".join(command_list).strip() + return self._write_sbatch_script(slurm_args, env_vars, full_command, tr.output_path) + + def gen_srun_command(self, tr: TestRun) -> str: + env_vars = self._override_env_vars(self.system.global_env_vars, tr.test.extra_env_vars) + cmd_args = self._override_cmd_args(self.default_cmd_args, tr.test.cmd_args) + slurm_args = self._parse_slurm_args(tr.test.test_template.__class__.__name__, env_vars, cmd_args, tr) + return self._gen_srun_command(slurm_args, 
env_vars, cmd_args, tr.test.extra_cmd_args) def _parse_slurm_args( self, job_name_prefix: str, env_vars: Dict[str, str], cmd_args: Dict[str, str], tr: TestRun @@ -98,12 +123,87 @@ def job_name(self, job_name_prefix: str) -> str: job_name = f"{self.system.account}-{job_name_prefix}.{datetime.now().strftime('%Y%m%d_%H%M%S')}" return job_name + def gen_prologue(self, prologue: TestScenario, base_output_path: Path) -> str: + """ + Generate the prologue command by running all tests defined in the prologue test scenario. + + Args: + prologue (TestScenario): The prologue test scenario containing the tests to be run. + base_output_path (Path): The base output directory path for storing prologue outputs. + + Returns: + str: A string with all the Slurm srun commands generated for the prologue. + """ + if not prologue.test_runs: + return "PROLOGUE_SUCCESS=1\n" + + prologue_output_dir = base_output_path / "prologue" + prologue_output_dir.mkdir(parents=True, exist_ok=True) + + prologue_commands = [] + success_vars = [] + + for idx, tr in enumerate(prologue.test_runs): + plugin_dir = prologue_output_dir / tr.test.name + plugin_dir.mkdir(parents=True, exist_ok=True) + tr.output_path = plugin_dir + + srun_command = tr.test.test_template.gen_srun_command(tr) + srun_command_with_output = srun_command.replace( + "srun ", f"srun --output={plugin_dir / 'stdout.txt'} --error={plugin_dir / 'stderr.txt'} " + ) + prologue_commands.append(srun_command_with_output) + + success_var = f"SUCCESS_{idx}" + success_vars.append(success_var) + + success_check_command = tr.test.test_template.gen_srun_success_check(tr) + prologue_commands.append(f"{success_var}=$({success_check_command})") + + combined_success_var = " && ".join([f"[ ${var} -eq 1 ]" for var in success_vars]) + + prologue_commands.append(f"PROLOGUE_SUCCESS=$( {combined_success_var} && echo 1 || echo 0 )") + + return "\n".join(prologue_commands) + + def gen_epilogue(self, epilogue: TestScenario, base_output_path: Path) -> str: + """ + Generate the epilogue command by running all tests defined in the epilogue test scenario. + + Args: + epilogue (TestScenario): The epilogue test scenario containing the tests to be run. + base_output_path (Path): The base output directory path for storing epilogue outputs. + + Returns: + str: A string with all the Slurm srun commands generated for the epilogue. 
+ """ + if not epilogue.test_runs: + return "" + + epilogue_output_dir = base_output_path / "epilogue" + epilogue_output_dir.mkdir(parents=True, exist_ok=True) + + epilogue_commands = [] + + for tr in epilogue.test_runs: + plugin_dir = epilogue_output_dir / tr.test.name + plugin_dir.mkdir(parents=True, exist_ok=True) + tr.output_path = plugin_dir + + srun_command = tr.test.test_template.gen_srun_command(tr) + srun_command_with_output = srun_command.replace( + "srun ", f"srun --output={plugin_dir / 'stdout.txt'} --error={plugin_dir / 'stderr.txt'} " + ) + epilogue_commands.append(srun_command_with_output) + + return "\n".join(epilogue_commands) + def _gen_srun_command( self, slurm_args: Dict[str, Any], env_vars: Dict[str, str], cmd_args: Dict[str, str], extra_cmd_args: str ) -> str: srun_command_parts = self.gen_srun_prefix(slurm_args) test_command_parts = self.generate_test_command(env_vars, cmd_args, extra_cmd_args) - return " \\\n".join(srun_command_parts + test_command_parts) + return " ".join(srun_command_parts + test_command_parts) def gen_srun_prefix(self, slurm_args: Dict[str, Any]) -> List[str]: srun_command_parts = ["srun", f"--mpi={self.system.mpi}"] diff --git a/tests/ref_data/gpt.sbatch b/tests/ref_data/gpt.sbatch index 3cd84b21..f01e9222 100644 --- a/tests/ref_data/gpt.sbatch +++ b/tests/ref_data/gpt.sbatch @@ -8,7 +8,7 @@ export COMBINE_THRESHOLD=1 export PER_GPU_COMBINE_THRESHOLD=0 export XLA_FLAGS="--xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD" - echo "Loading container with srun command" +echo "Loading container with srun command" srun --mpi=none --container-image=https://docker/url --container-name=cont true echo "Running srun command" srun \ diff --git a/tests/ref_data/grok.sbatch b/tests/ref_data/grok.sbatch index f5d32243..7e7adfc2 100644 --- a/tests/ref_data/grok.sbatch +++ b/tests/ref_data/grok.sbatch @@ -8,7 +8,7 @@ export COMBINE_THRESHOLD=1 export PER_GPU_COMBINE_THRESHOLD=0 export XLA_FLAGS="--xla_disable_hlo_passes=rematerialization --xla_dump_hlo_pass_re=.* --xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_pipelined_all_gather=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_gpu_enable_triton_gemm=false --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_graph_level=0 --xla_gpu_pgle_profile_file_or_directory_path=/opt/paxml/workspace/pgle_output_profile.pbtxt --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD --xla_gpu_run_post_layout_collective_pipeliner=false --xla_gpu_use_memcpy_local_p2p=false" - echo "Loading container with srun command" +echo "Loading container with srun command" srun --mpi=none --container-image=https://docker/url --container-name=cont true echo "Running srun command" srun \ diff --git a/tests/ref_data/nccl.sbatch b/tests/ref_data/nccl.sbatch index 3ac39077..dc179ba9 100644 --- a/tests/ref_data/nccl.sbatch +++ b/tests/ref_data/nccl.sbatch @@ -8,23 +8,4 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) -srun \ ---mpi=pmix \ 
---container-image=nvcr.io/nvidia/pytorch:24.02-py3 \ -/usr/local/bin/all_reduce_perf_mpi \ ---nthreads 1 \ ---ngpus 1 \ ---minbytes 32M \ ---maxbytes 32M \ ---stepbytes 1M \ ---op sum \ ---datatype float \ ---root 0 \ ---iters 20 \ ---warmup_iters 5 \ ---agg_iters 1 \ ---average 1 \ ---parallel_init 0 \ ---check 1 \ ---blocking 0 \ ---cudagraph 0 \ No newline at end of file +srun --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 diff --git a/tests/ref_data/sleep.sbatch b/tests/ref_data/sleep.sbatch index 7c24ec14..9262001b 100644 --- a/tests/ref_data/sleep.sbatch +++ b/tests/ref_data/sleep.sbatch @@ -8,6 +8,4 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) -srun \ ---mpi=pmix \ -sleep 5 \ No newline at end of file +srun --mpi=pmix sleep 5 diff --git a/tests/ref_data/ucc.sbatch b/tests/ref_data/ucc.sbatch index 74fa7799..a9f9e686 100644 --- a/tests/ref_data/ucc.sbatch +++ b/tests/ref_data/ucc.sbatch @@ -8,12 +8,4 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) -srun \ ---mpi=pmix \ ---container-image=nvcr.io/nvidia/pytorch:24.02-py3 \ -/opt/hpcx/ucc/bin/ucc_perftest \ --c alltoall \ --b 1 \ --e 8M \ --m cuda \ --F \ No newline at end of file +srun --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /opt/hpcx/ucc/bin/ucc_perftest -c alltoall -b 1 -e 8M -m cuda -F diff --git a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py index f2aae181..4484b6e1 100644 --- a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py @@ -19,7 +19,7 @@ import pytest -from cloudai import Test, TestDefinition, TestRun, TestTemplate +from cloudai import Test, TestDefinition, TestRun, TestScenario, TestTemplate from cloudai.systems import SlurmSystem from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy @@ -122,3 +122,138 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): "system configuration. Please ensure that 'default_partition' is set correctly " "in the corresponding system configuration (e.g., system.toml)." 
) in str(exc_info.value) + + +@pytest.mark.parametrize( + "prologue,epilogue,expected_script_lines", + [ + # No prologue, no epilogue + (None, None, ["srun"]), + # One prologue, no epilogue + ( + [Mock(test=Mock(name="test1", test_template=Mock()))], + None, + [ + "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + " srun", + "fi", + ], + ), + # No prologue, one epilogue + ( + None, + [Mock(test=Mock(name="test2", test_template=Mock()))], + [ + "srun", + "epilogue", + ], + ), + # One prologue, one epilogue + ( + [Mock(test=Mock(name="test1", test_template=Mock()))], + [Mock(test=Mock(name="test2", test_template=Mock()))], + [ + "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + " srun", + " epilogue", + "fi", + ], + ), + # Multiple prologues, multiple epilogues + ( + [Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], + [Mock(test=Mock(name="test3", test_template=Mock())), Mock(test=Mock(name="test4", test_template=Mock()))], + [ + "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "SUCCESS_1=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + " srun", + " epilogue", + " epilogue", + "fi", + ], + ), + # Multiple prologues, no epilogue + ( + [Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], + None, + [ + "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "SUCCESS_1=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + " srun", + "fi", + ], + ), + # No prologue, multiple epilogues + ( + None, + [Mock(test=Mock(name="test3", test_template=Mock())), Mock(test=Mock(name="test4", test_template=Mock()))], + [ + "srun", + "epilogue", + "epilogue", + ], + ), + # Multiple prologues, single epilogue + ( + [Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], + [Mock(test=Mock(name="test3", test_template=Mock()))], + [ + "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "SUCCESS_1=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + " srun", + " epilogue", + "fi", + ], + ), + ], +) +def test_prologue_epilogue_combinations( + strategy_fixture: SlurmCommandGenStrategy, + testrun_fixture: TestRun, + prologue, + epilogue, + expected_script_lines, + tmp_path, +): + testrun_fixture.prologue = Mock(spec=TestScenario) if prologue else None + testrun_fixture.epilogue = Mock(spec=TestScenario) if epilogue else None + + if prologue is not None: + testrun_fixture.prologue = Mock(spec=TestScenario) + testrun_fixture.prologue.test_runs = prologue + for idx, run in enumerate(prologue): + run.test.test_template.gen_srun_success_check.return_value = ( + "grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0" + ) + run.test.test_template.gen_srun_command.return_value = 
"srun" + run.test.name = f"test{idx+1}" + else: + testrun_fixture.prologue = None + + if epilogue is not None: + testrun_fixture.epilogue = Mock(spec=TestScenario) + testrun_fixture.epilogue.test_runs = epilogue + for idx, run in enumerate(epilogue): + run.test.test_template.gen_srun_command.return_value = "epilogue" + run.test.name = f"test{idx+1}" + else: + testrun_fixture.epilogue = None + + sbatch_command = strategy_fixture.gen_exec_command(testrun_fixture) + script_file_path = sbatch_command.split()[-1] + + with open(script_file_path, "r") as script_file: + script_content = script_file.read() + + for expected_line in expected_script_lines: + assert expected_line in script_content, f"Expected '{expected_line}' in generated script but it was missing." From 9f83cd58d70fc9011f8871e44ff65ff3782ae52c Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 25 Oct 2024 11:33:12 -0400 Subject: [PATCH 26/64] Remove plugin option from CLI --- src/cloudai/cli/cli.py | 4 -- tests/test_cli.py | 92 +----------------------------------------- 2 files changed, 2 insertions(+), 94 deletions(-) diff --git a/src/cloudai/cli/cli.py b/src/cloudai/cli/cli.py index f962a2d5..53059799 100644 --- a/src/cloudai/cli/cli.py +++ b/src/cloudai/cli/cli.py @@ -60,7 +60,6 @@ def add_command( handler: Callable[[argparse.Namespace], int], system_config: Optional[bool] = None, tests_dir: Optional[bool] = None, - plugin_dir: Optional[bool] = None, test_scenario: Optional[bool] = None, output_dir: Optional[bool] = None, result_dir: Optional[bool] = None, @@ -75,8 +74,6 @@ def add_command( p.add_argument( "--tests-dir", help="Path to the test configuration directory.", required=tests_dir, type=Path ) - if plugin_dir is not None: - p.add_argument("--plugin-dir", help="Path to the plugin directory.", required=plugin_dir, type=Path) if test_scenario is not None: p.add_argument("--test-scenario", help="Path to the test scenario file.", required=test_scenario, type=Path) if output_dir is not None: @@ -130,7 +127,6 @@ def add_run_and_dry_run(self): handle_dry_run_and_run, system_config=True, tests_dir=True, - plugin_dir=False, test_scenario=True, output_dir=False, ) diff --git a/tests/test_cli.py b/tests/test_cli.py index bb6c1a5d..538e497f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -20,8 +20,8 @@ import pytest -from cloudai.cli import CloudAICLI -from cloudai.cli.handlers import handle_generate_report, handle_install_and_uninstall, handle_verify_all_configs +from cloudai.cli import CloudAICLI, handle_generate_report, handle_install_and_uninstall +from cloudai.cli.handlers import handle_verify_all_configs def test_help_message(capsys: pytest.CaptureFixture[str]) -> None: @@ -108,7 +108,6 @@ def test_add_command_all_optional(): lambda _: 0, system_config=False, tests_dir=False, - plugin_dir=False, test_scenario=False, output_dir=False, ) @@ -119,7 +118,6 @@ def test_add_command_all_optional(): mode="test", system_config=None, tests_dir=None, - plugin_dir=None, test_scenario=None, output_dir=None, ) @@ -134,7 +132,6 @@ def test_add_command_all_required(): lambda _: 0, system_config=True, tests_dir=True, - plugin_dir=True, test_scenario=True, output_dir=True, ) @@ -145,8 +142,6 @@ def test_add_command_all_required(): "system_config", "--tests-dir", "tests_dir", - "--plugin-dir", - "plugin_dir", "--test-scenario", "test_scenario", "--output-dir", @@ -159,91 +154,11 @@ def test_add_command_all_required(): mode="test", system_config=Path("system_config"), 
tests_dir=Path("tests_dir"), - plugin_dir=Path("plugin_dir"), test_scenario=Path("test_scenario"), output_dir=Path("output_dir"), ) -@pytest.mark.parametrize( - "mode,args,expected_plugin_dir", - [ - ( - "run", - [ - "run", - "--system-config", - "system_config", - "--tests-dir", - "tests_dir", - "--plugin-dir", - "plugin_dir", - "--test-scenario", - "test_scenario", - ], - Path("plugin_dir"), - ), - ( - "run", - [ - "run", - "--system-config", - "system_config", - "--tests-dir", - "tests_dir", - "--test-scenario", - "test_scenario", - ], - None, - ), - ( - "dry-run", - [ - "dry-run", - "--system-config", - "system_config", - "--tests-dir", - "tests_dir", - "--plugin-dir", - "plugin_dir", - "--test-scenario", - "test_scenario", - ], - Path("plugin_dir"), - ), - ( - "dry-run", - [ - "dry-run", - "--system-config", - "system_config", - "--tests-dir", - "tests_dir", - "--test-scenario", - "test_scenario", - ], - None, - ), - ], -) -def test_modes_with_or_without_plugin_dir(mode, args, expected_plugin_dir): - cli = CloudAICLI() - - cli.add_command( - mode, - f"{mode} command", - lambda _: 0, - system_config=True, - tests_dir=True, - plugin_dir=False, - test_scenario=True, - output_dir=False, - ) - - parsed_args = cli.parser.parse_args(args) - assert parsed_args.plugin_dir == expected_plugin_dir - - def test_real_uninstall(): cli = CloudAICLI() cli.init_default_args() @@ -362,8 +277,6 @@ def test_run_dry_run_modes(self, cli: CloudAICLI): "tests_dir", "--test-scenario", "test_scenario", - "--plugin-dir", - "plugin_dir", ] ) @@ -373,7 +286,6 @@ def test_run_dry_run_modes(self, cli: CloudAICLI): mode=mode, system_config=Path("system_config"), tests_dir=Path("tests_dir"), - plugin_dir=Path("plugin_dir"), test_scenario=Path("test_scenario"), output_dir=None, ) From f656eee58c81ed56a36db44c968a4b0e1626fa6a Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 25 Oct 2024 11:38:04 -0400 Subject: [PATCH 27/64] Make plugin directory self-contained --- .../plugin/test/nccl_test_all_gather.toml | 33 +++++++++++++++++++ .../plugin/test/nccl_test_all_reduce.toml | 30 +++++++++++++++++ .../nccl_test_epilogue.toml | 0 .../nccl_test_prologue.toml | 0 4 files changed, 63 insertions(+) create mode 100644 conf/common/plugin/test/nccl_test_all_gather.toml create mode 100644 conf/common/plugin/test/nccl_test_all_reduce.toml rename conf/common/plugin/{ => test_scenario}/nccl_test_epilogue.toml (100%) rename conf/common/plugin/{ => test_scenario}/nccl_test_prologue.toml (100%) diff --git a/conf/common/plugin/test/nccl_test_all_gather.toml b/conf/common/plugin/test/nccl_test_all_gather.toml new file mode 100644 index 00000000..4fec288a --- /dev/null +++ b/conf/common/plugin/test/nccl_test_all_gather.toml @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name = "nccl_test_all_gather" +description = "all_gather" +test_template_name = "NcclTest" + +[cmd_args] +"subtest_name" = "all_gather_perf_mpi" +"ngpus" = "1" +"minbytes" = "128" +"maxbytes" = "4G" +"iters" = "100" +"warmup_iters" = "50" + +[extra_cmd_args] +"--stepfactor" = "2" + +[extra_env_vars] +"NCCL_TEST_SPLIT_MASK" = "0x7" diff --git a/conf/common/plugin/test/nccl_test_all_reduce.toml b/conf/common/plugin/test/nccl_test_all_reduce.toml new file mode 100644 index 00000000..9074b2b8 --- /dev/null +++ b/conf/common/plugin/test/nccl_test_all_reduce.toml @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "nccl_test_all_reduce" +description = "all_reduce" +test_template_name = "NcclTest" + +[cmd_args] +"subtest_name" = "all_reduce_perf_mpi" +"ngpus" = "1" +"minbytes" = "128" +"maxbytes" = "16G" +"iters" = "100" +"warmup_iters" = "50" + +[extra_cmd_args] +"--stepfactor" = "2" diff --git a/conf/common/plugin/nccl_test_epilogue.toml b/conf/common/plugin/test_scenario/nccl_test_epilogue.toml similarity index 100% rename from conf/common/plugin/nccl_test_epilogue.toml rename to conf/common/plugin/test_scenario/nccl_test_epilogue.toml diff --git a/conf/common/plugin/nccl_test_prologue.toml b/conf/common/plugin/test_scenario/nccl_test_prologue.toml similarity index 100% rename from conf/common/plugin/nccl_test_prologue.toml rename to conf/common/plugin/test_scenario/nccl_test_prologue.toml From 5af7113c9f22848f48f287adaa10516417392a04 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 25 Oct 2024 11:36:06 -0400 Subject: [PATCH 28/64] Update Parser to support self-contained plugin directory --- src/cloudai/cli/handlers.py | 4 +- src/cloudai/parser.py | 105 ++++++++++++++++++++++++++---------- tests/test_parser.py | 58 +++++++++++++------- 3 files changed, 117 insertions(+), 50 deletions(-) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index e654cf03..b76609ef 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -90,7 +90,9 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int: args (argparse.Namespace): The parsed command-line arguments. 
""" parser = Parser(args.system_config) - system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario, args.plugin_dir) + system, tests, test_scenario = parser.parse( + args.tests_dir, args.test_scenario, Path("conf/common/plugin/test"), Path("conf/common/plugin/test_scenario") + ) assert test_scenario is not None if args.output_dir: diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index 5b21ab3f..73ab8717 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -49,14 +49,25 @@ def __init__(self, system_config_path: Path) -> None: self.system_config_path = system_config_path def parse( - self, test_path: Path, test_scenario_path: Optional[Path] = None, plugin_path: Optional[Path] = None + self, + test_path: Path, + test_scenario_path: Optional[Path] = None, + plugin_test_path: Optional[Path] = None, + plugin_test_scenario_path: Optional[Path] = None, ) -> Tuple[System, List[Test], Optional[TestScenario]]: """ Parse configurations for system, test templates, and test scenarios. - Returns - Tuple[System, List[TestTemplate], TestScenario]: A tuple containing the system object, a list of test - template objects, and the test scenario object. + Args: + test_path (Path): The file path for tests. + test_scenario_path (Optional[Path]): The file path for the main test scenario. + If None, all tests are included. + plugin_test_path (Optional[Path]): The file path for plugin-specific tests. + plugin_test_scenario_path (Optional[Path]): The file path for plugin-specific test scenarios. + + Returns: + Tuple[System, List[Test], Optional[TestScenario]]: A tuple containing the system object, a list of filtered + test template objects, and the main test scenario object if provided. """ if not test_path.exists(): raise FileNotFoundError(f"Test path '{test_path}' not found.") @@ -64,47 +75,83 @@ def parse( try: system = self.parse_system(self.system_config_path) except SystemConfigParsingError: - exit(1) # exit right away to keep error message readable for users + exit(1) try: tests = self.parse_tests(list(test_path.glob("*.toml")), system) except TestConfigParsingError: - exit(1) # exit right away to keep error message readable for users + exit(1) - logging.debug(f"Parsed {len(tests)} tests: {[t.name for t in tests]}") - test_mapping = {t.name: t for t in tests} + plugin_tests = ( + self.parse_tests(list(plugin_test_path.glob("*.toml")), system) + if plugin_test_path and plugin_test_path.exists() + else [] + ) - test_scenario: Optional[TestScenario] = None - scenario_test_names: Set[str] = set() if test_scenario_path: - plugin_mapping: Dict[str, TestScenario] = {} - plugin_test_names: Set[str] = set() - if plugin_path and plugin_path.exists(): - try: - plugin_mapping = self.parse_plugins(list(plugin_path.glob("*.toml")), test_mapping) - for plugin_scenario in plugin_mapping.values(): - plugin_test_names.update(tr.test.name for tr in plugin_scenario.test_runs) - except TestScenarioParsingError: - exit(1) + return self._parse_with_scenario(system, tests, test_scenario_path, plugin_tests, plugin_test_scenario_path) + + return system, tests + plugin_tests, None + + def _parse_with_scenario( + self, + system: System, + tests: List[Test], + test_scenario_path: Path, + plugin_tests: List[Test], + plugin_test_scenario_path: Optional[Path], + ) -> Tuple[System, List[Test], Optional[TestScenario]]: + """Parse tests and scenarios with a main test scenario path specified.""" + test_mapping = {t.name: t for t in tests} + plugin_test_mapping = {t.name: t for t in plugin_tests} + + 
plugin_test_scenario_mapping = self._load_plugin_scenarios(plugin_test_scenario_path, plugin_test_mapping) + test_scenario = self._load_main_scenario(test_scenario_path, test_mapping, plugin_test_scenario_mapping) + all_used_test_names = self._collect_used_test_names(plugin_test_scenario_mapping, test_scenario) + filtered_tests = [t for t in tests if t.name in all_used_test_names] + + return system, filtered_tests, test_scenario + + def _load_plugin_scenarios( + self, plugin_test_scenario_path: Optional[Path], plugin_test_mapping: Dict[str, Test] + ) -> Dict[str, TestScenario]: + """Load plugin-specific test scenarios from the specified path.""" + if plugin_test_scenario_path and plugin_test_scenario_path.exists(): try: - test_scenario = self.parse_test_scenario(test_scenario_path, test_mapping, plugin_mapping) - scenario_test_names = set(tr.test.name for tr in test_scenario.test_runs) + return self.parse_plugins(list(plugin_test_scenario_path.glob("*.toml")), plugin_test_mapping) except TestScenarioParsingError: exit(1) + return {} - all_used_test_names = plugin_test_names.union(scenario_test_names) - filtered_tests = [t for t in tests if t.name in all_used_test_names] - else: - filtered_tests = tests - - return system, filtered_tests, test_scenario + def _load_main_scenario( + self, + test_scenario_path: Path, + test_mapping: Dict[str, Test], + plugin_test_scenario_mapping: Dict[str, TestScenario], + ) -> Optional[TestScenario]: + """Load the main test scenario using provided mappings.""" + try: + return self.parse_test_scenario(test_scenario_path, test_mapping, plugin_test_scenario_mapping) + except TestScenarioParsingError: + exit(1) + + def _collect_used_test_names( + self, plugin_test_scenario_mapping: Dict[str, TestScenario], test_scenario: Optional[TestScenario] + ) -> Set[str]: + """Collect test names used in both plugin and main test scenarios.""" + # TODO: collect test names in the plugin test scenarios only + plugin_test_names = { + tr.test.name for scenario in plugin_test_scenario_mapping.values() for tr in scenario.test_runs + } + scenario_test_names = {tr.test.name for tr in test_scenario.test_runs} if test_scenario else set() + return plugin_test_names.union(scenario_test_names) @staticmethod def parse_plugins(plugin_tomls: List[Path], test_mapping: Dict[str, Test]) -> Dict[str, TestScenario]: plugin_mapping = {} - for plugin_path in plugin_tomls: - plugin_scenario = Parser.parse_test_scenario(plugin_path, test_mapping) + for plugin_test_scenario_path in plugin_tomls: + plugin_scenario = Parser.parse_test_scenario(plugin_test_scenario_path, test_mapping) plugin_mapping[plugin_scenario.name] = plugin_scenario return plugin_mapping diff --git a/tests/test_parser.py b/tests/test_parser.py index cb809d36..12372755 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -15,13 +15,13 @@ # limitations under the License. 
from pathlib import Path -from typing import cast +from typing import Dict, cast from unittest.mock import Mock, patch import pytest from pydantic_core import ErrorDetails -from cloudai import Parser, format_validation_error +from cloudai import Parser, TestScenario, format_validation_error from cloudai.systems.slurm.slurm_system import SlurmSystem @@ -100,16 +100,14 @@ def test_scenario_with_plugin_common_tests( @patch("cloudai._core.test_parser.TestParser.parse_all") @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") - @patch("cloudai.parser.Parser.parse_plugins") - def test_scenario_with_plugin_exclusive_tests( - self, parse_plugins: Mock, test_scenario_parser: Mock, test_parser: Mock, parser: Parser - ): + def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): tests_dir = parser.system_config_path.parent.parent / "test" + test_scenario_path = Path("/mock/test_scenario.toml") + plugin_test_scenario_path = Path("/mock/plugin_scenarios") - fake_tests = [] - for i in range(4): - fake_tests.append(Mock()) - fake_tests[-1].name = f"test-{i}" + fake_tests = [Mock() for _ in range(4)] + for i, test in enumerate(fake_tests): + test.name = f"test-{i}" test_parser.return_value = fake_tests fake_scenario = Mock() @@ -117,18 +115,38 @@ def test_scenario_with_plugin_exclusive_tests( fake_scenario.test_runs[0].test.name = "test-1" test_scenario_parser.return_value = fake_scenario - fake_plugin = Mock() - fake_plugin.test_runs = [Mock()] - fake_plugin.test_runs[0].test.name = "test-2" - parse_plugins.return_value = {"plugin-1": fake_plugin} + fake_plugin_scenarios = {"plugin-1": Mock(test_runs=[Mock()])} + fake_plugin_scenarios["plugin-1"].test_runs[0].test.name = "test-2" - _, tests, _ = parser.parse(tests_dir, Path(), Path()) + with patch.object(parser, "_load_plugin_scenarios", return_value=fake_plugin_scenarios): + _, filtered_tests, _ = parser.parse(tests_dir, test_scenario_path, tests_dir, plugin_test_scenario_path) + + filtered_test_names = {t.name for t in filtered_tests} + assert len(filtered_tests) == 2 + assert "test-1" in filtered_test_names + assert "test-2" in filtered_test_names + assert "test-0" not in filtered_test_names + assert "test-3" not in filtered_test_names + + def test_collect_used_test_names(self, parser: Parser): + fake_scenario = Mock() + fake_scenario.test_runs = [Mock()] + fake_scenario.test_runs[0].test.name = "test-1" + + fake_plugin_scenario_1 = Mock(spec=TestScenario) + fake_plugin_scenario_1.test_runs = [Mock()] + fake_plugin_scenario_1.test_runs[0].test.name = "test-2" + + fake_plugin_scenario_2 = Mock(spec=TestScenario) + fake_plugin_scenario_2.test_runs = [Mock()] + fake_plugin_scenario_2.test_runs[0].test.name = "test-3" + + fake_plugin_scenarios = cast( + Dict[str, TestScenario], {"plugin-1": fake_plugin_scenario_1, "plugin-2": fake_plugin_scenario_2} + ) - assert len(tests) == 2 - assert "test-1" in [t.name for t in tests] - assert "test-2" in [t.name for t in tests] - assert "test-0" not in [t.name for t in tests] - assert "test-3" not in [t.name for t in tests] + used_test_names = parser._collect_used_test_names(fake_plugin_scenarios, fake_scenario) + assert used_test_names == {"test-1", "test-2", "test-3"} def test_parse_system(self, parser: Parser): parser.system_config_path = Path("conf/common/system/example_slurm_cluster.toml") From b22c2f2d47d0bda68de6c9cebbc0e1453d1c5432 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 28 
Oct 2024 13:35:01 -0400 Subject: [PATCH 29/64] Refactor plugin path handling in parse to use a single plugin_path param --- src/cloudai/cli/handlers.py | 4 +--- src/cloudai/parser.py | 9 +++++---- tests/test_parser.py | 4 ++-- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index b76609ef..1085440f 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -90,9 +90,7 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int: args (argparse.Namespace): The parsed command-line arguments. """ parser = Parser(args.system_config) - system, tests, test_scenario = parser.parse( - args.tests_dir, args.test_scenario, Path("conf/common/plugin/test"), Path("conf/common/plugin/test_scenario") - ) + system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario, Path("conf/common/plugin")) assert test_scenario is not None if args.output_dir: diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index 73ab8717..a9227f88 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -52,8 +52,7 @@ def parse( self, test_path: Path, test_scenario_path: Optional[Path] = None, - plugin_test_path: Optional[Path] = None, - plugin_test_scenario_path: Optional[Path] = None, + plugin_path: Optional[Path] = None, ) -> Tuple[System, List[Test], Optional[TestScenario]]: """ Parse configurations for system, test templates, and test scenarios. @@ -62,8 +61,7 @@ def parse( test_path (Path): The file path for tests. test_scenario_path (Optional[Path]): The file path for the main test scenario. If None, all tests are included. - plugin_test_path (Optional[Path]): The file path for plugin-specific tests. - plugin_test_scenario_path (Optional[Path]): The file path for plugin-specific test scenarios. + plugin_path (Optional[Path]): The base file path for plugin-specific tests and scenarios. 
Returns: Tuple[System, List[Test], Optional[TestScenario]]: A tuple containing the system object, a list of filtered @@ -82,6 +80,9 @@ def parse( except TestConfigParsingError: exit(1) + plugin_test_scenario_path = plugin_path + plugin_test_path = plugin_path / "test" if plugin_path else None + plugin_tests = ( self.parse_tests(list(plugin_test_path.glob("*.toml")), system) if plugin_test_path and plugin_test_path.exists() diff --git a/tests/test_parser.py b/tests/test_parser.py index 12372755..bcfd63a3 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -103,7 +103,7 @@ def test_scenario_with_plugin_common_tests( def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): tests_dir = parser.system_config_path.parent.parent / "test" test_scenario_path = Path("/mock/test_scenario.toml") - plugin_test_scenario_path = Path("/mock/plugin_scenarios") + plugin_path = Path("/mock/plugin_scenarios") fake_tests = [Mock() for _ in range(4)] for i, test in enumerate(fake_tests): @@ -119,7 +119,7 @@ def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, fake_plugin_scenarios["plugin-1"].test_runs[0].test.name = "test-2" with patch.object(parser, "_load_plugin_scenarios", return_value=fake_plugin_scenarios): - _, filtered_tests, _ = parser.parse(tests_dir, test_scenario_path, tests_dir, plugin_test_scenario_path) + _, filtered_tests, _ = parser.parse(tests_dir, test_scenario_path, plugin_path) filtered_test_names = {t.name for t in filtered_tests} assert len(filtered_tests) == 2 From c88fe2ef42986e0c5fd502de9d45e5b7c2c42844 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 28 Oct 2024 13:29:04 -0400 Subject: [PATCH 30/64] Remove test_scenario directory from conf/common/plugin/ --- conf/common/plugin/{test_scenario => }/nccl_test_epilogue.toml | 0 conf/common/plugin/{test_scenario => }/nccl_test_prologue.toml | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename conf/common/plugin/{test_scenario => }/nccl_test_epilogue.toml (100%) rename conf/common/plugin/{test_scenario => }/nccl_test_prologue.toml (100%) diff --git a/conf/common/plugin/test_scenario/nccl_test_epilogue.toml b/conf/common/plugin/nccl_test_epilogue.toml similarity index 100% rename from conf/common/plugin/test_scenario/nccl_test_epilogue.toml rename to conf/common/plugin/nccl_test_epilogue.toml diff --git a/conf/common/plugin/test_scenario/nccl_test_prologue.toml b/conf/common/plugin/nccl_test_prologue.toml similarity index 100% rename from conf/common/plugin/test_scenario/nccl_test_prologue.toml rename to conf/common/plugin/nccl_test_prologue.toml From c814ccb65258a5b77c4a7e1ddb8d4a3c9d4624ac Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 10:11:34 -0400 Subject: [PATCH 31/64] Use Pydantic model to load prologue and epilogue --- src/cloudai/_core/test_scenario_parser.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index c59adeba..4d192a7a 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -139,11 +139,11 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario: total_weight = sum(tr.weight for tr in ts_model.tests) normalized_weight = 0 if total_weight == 0 else 100 / total_weight - prologue_name = data.get("prologue", "") - epilogue_name = 
data.get("epilogue", "") - - prologue = self.plugin_mapping.get(prologue_name, None) if prologue_name else None - epilogue = self.plugin_mapping.get(epilogue_name, None) if epilogue_name else None + prologue, epilogue = None, None + if ts_model.prologue: + prologue = self.plugin_mapping.get(ts_model.prologue) + if ts_model.epilogue: + epilogue = self.plugin_mapping.get(ts_model.epilogue) testruns_by_id: dict[str, TestRun] = { tr.id: self._create_section_test_run(tr, normalized_weight, prologue, epilogue) for tr in ts_model.tests From a6d3efc631368bd6314d14efc00a3cf71d1a416e Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 10:48:42 -0400 Subject: [PATCH 32/64] Recover acceptance tests with plugin --- .../{gpt.sbatch => gpt-no-plugin.sbatch} | 0 .../{gpt-pretest.sbatch => gpt-plugin.sbatch} | 40 +++---------------- .../{grok.sbatch => grok-no-plugin.sbatch} | 0 ...grok-pretest.sbatch => grok-plugin.sbatch} | 40 +++---------------- tests/test_acceptance.py | 38 ++++++++++++++++-- 5 files changed, 46 insertions(+), 72 deletions(-) rename tests/ref_data/{gpt.sbatch => gpt-no-plugin.sbatch} (100%) rename tests/ref_data/{gpt-pretest.sbatch => gpt-plugin.sbatch} (52%) rename tests/ref_data/{grok.sbatch => grok-no-plugin.sbatch} (100%) rename tests/ref_data/{grok-pretest.sbatch => grok-plugin.sbatch} (67%) diff --git a/tests/ref_data/gpt.sbatch b/tests/ref_data/gpt-no-plugin.sbatch similarity index 100% rename from tests/ref_data/gpt.sbatch rename to tests/ref_data/gpt-no-plugin.sbatch diff --git a/tests/ref_data/gpt-pretest.sbatch b/tests/ref_data/gpt-plugin.sbatch similarity index 52% rename from tests/ref_data/gpt-pretest.sbatch rename to tests/ref_data/gpt-plugin.sbatch index 3a64823c..08a3a87a 100644 --- a/tests/ref_data/gpt-pretest.sbatch +++ b/tests/ref_data/gpt-plugin.sbatch @@ -8,39 +8,11 @@ export COMBINE_THRESHOLD=1 export PER_GPU_COMBINE_THRESHOLD=0 export XLA_FLAGS="--xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD" -srun \ ---mpi=pmix \ --N 8 \ --o __OUTPUT_DIR__/output_pretest-%j-%n-%t.txt \ --e __OUTPUT_DIR__/error_pretest-%j-%n-%t.txt \ ---container-image=nvcr.io/nvidia/pytorch:24.02-py3 \ -/usr/local/bin/all_gather_perf_mpi \ ---nthreads 1 \ ---ngpus 1 \ ---minbytes 8M \ ---maxbytes 16G \ ---stepbytes 1M \ ---op sum \ ---datatype float \ ---root 0 \ ---iters 20 \ ---warmup_iters 5 \ ---agg_iters 1 \ ---average 1 \ ---parallel_init 0 \ ---check 1 \ ---blocking 1 \ ---cudagraph 0 \ ---stepfactor 2 -PRETEST_OUTPUT_FILES="__OUTPUT_DIR__/output_pretest-*.txt" -keyword="Avg bus bandwidth" - -# Use grep to search for the keyword in the files -if grep -q "$keyword" $PRETEST_OUTPUT_FILES; then - PRE_TEST_SUCCESS=true -fi -if [ "$PRE_TEST_SUCCESS" = true ]; then - echo "Loading container with srun command" +srun --output=__OUTPUT_DIR__/prologue/nccl/stdout.txt --error=__OUTPUT_DIR__/prologue/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 +SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/prologue/nccl/stdout.txt && echo 1 || echo 0) +PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 ) 
+if [ $PROLOGUE_SUCCESS -eq 1 ]; then + echo "Loading container with srun command" srun --mpi=none --container-image=https://docker/url --container-name=cont true echo "Running srun command" srun \ @@ -52,4 +24,4 @@ if [ "$PRE_TEST_SUCCESS" = true ]; then --container-name=cont \ --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ /opt/paxml/workspace/run.sh -fi \ No newline at end of file +fi diff --git a/tests/ref_data/grok.sbatch b/tests/ref_data/grok-no-plugin.sbatch similarity index 100% rename from tests/ref_data/grok.sbatch rename to tests/ref_data/grok-no-plugin.sbatch diff --git a/tests/ref_data/grok-pretest.sbatch b/tests/ref_data/grok-plugin.sbatch similarity index 67% rename from tests/ref_data/grok-pretest.sbatch rename to tests/ref_data/grok-plugin.sbatch index 0e2672d5..e75d3d77 100644 --- a/tests/ref_data/grok-pretest.sbatch +++ b/tests/ref_data/grok-plugin.sbatch @@ -8,39 +8,11 @@ export COMBINE_THRESHOLD=1 export PER_GPU_COMBINE_THRESHOLD=0 export XLA_FLAGS="--xla_disable_hlo_passes=rematerialization --xla_dump_hlo_pass_re=.* --xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_pipelined_all_gather=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_gpu_enable_triton_gemm=false --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_graph_level=0 --xla_gpu_pgle_profile_file_or_directory_path=/opt/paxml/workspace/pgle_output_profile.pbtxt --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD --xla_gpu_run_post_layout_collective_pipeliner=false --xla_gpu_use_memcpy_local_p2p=false" -srun \ ---mpi=pmix \ --N 8 \ --o __OUTPUT_DIR__/output_pretest-%j-%n-%t.txt \ --e __OUTPUT_DIR__/error_pretest-%j-%n-%t.txt \ ---container-image=nvcr.io/nvidia/pytorch:24.02-py3 \ -/usr/local/bin/all_gather_perf_mpi \ ---nthreads 1 \ ---ngpus 1 \ ---minbytes 8M \ ---maxbytes 16G \ ---stepbytes 1M \ ---op sum \ ---datatype float \ ---root 0 \ ---iters 20 \ ---warmup_iters 5 \ ---agg_iters 1 \ ---average 1 \ ---parallel_init 0 \ ---check 1 \ ---blocking 1 \ ---cudagraph 0 \ ---stepfactor 2 -PRETEST_OUTPUT_FILES="__OUTPUT_DIR__/output_pretest-*.txt" -keyword="Avg bus bandwidth" - -# Use grep to search for the keyword in the files -if grep -q "$keyword" $PRETEST_OUTPUT_FILES; then - PRE_TEST_SUCCESS=true -fi -if [ "$PRE_TEST_SUCCESS" = true ]; then - echo "Loading container with srun command" +srun --output=__OUTPUT_DIR__/prologue/nccl/stdout.txt --error=__OUTPUT_DIR__/prologue/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 +SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/prologue/nccl/stdout.txt && echo 1 || echo 0) +PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 ) +if [ $PROLOGUE_SUCCESS -eq 1 ]; then + echo "Loading container with srun command" srun --mpi=none --container-image=https://docker/url --container-name=cont true echo "Running srun command" srun \ @@ -52,4 +24,4 @@ if [ 
"$PRE_TEST_SUCCESS" = true ]; then --container-name=cont \ --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ /opt/paxml/workspace/run.sh -fi \ No newline at end of file +fi diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index e11ff50b..19e7acb9 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -22,7 +22,7 @@ import pytest -from cloudai import NcclTest, Test, TestRun, UCCTest +from cloudai import NcclTest, Test, TestRun, TestScenario, UCCTest from cloudai.cli import handle_dry_run_and_run, setup_logging from cloudai.schema.test_template.jax_toolbox.slurm_command_gen_strategy import JaxToolboxSlurmCommandGenStrategy from cloudai.schema.test_template.jax_toolbox.template import JaxToolbox @@ -91,7 +91,7 @@ def partial_tr(slurm_system: SlurmSystem) -> partial[TestRun]: return partial(TestRun, num_nodes=1, nodes=[], output_path=slurm_system.output_path) -@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt", "grok"]) +@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt-plugin", "gpt-no-plugin", "grok-plugin", "grok-no-plugin"]) def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -> tuple[TestRun, str, Optional[str]]: if request.param == "ucc": tr = partial_tr( @@ -141,7 +141,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") return (tr, "sleep.sbatch", None) - elif request.param.startswith("gpt"): + elif request.param.startswith("gpt-"): tr = partial_tr( name="gpt", test=Test( @@ -159,9 +159,24 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict ) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") + if "no-plugin" not in request.param: + prologue_tr = partial_tr( + name="nccl", + test=Test( + test_definition=NCCLTestDefinition( + name="nccl", description="nccl", test_template_name="nccl", cmd_args=NCCLCmdArgs() + ), + test_template=NcclTest(slurm_system, name="nccl"), + ), + ) + prologue_tr.test.test_template.command_gen_strategy = NcclTestSlurmCommandGenStrategy( + slurm_system, prologue_tr.test.test_definition.cmd_args_dict + ) + prologue_tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") + tr.prologue = TestScenario(name=f"{prologue_tr.name} NCCL Prologue", test_runs=[prologue_tr]) return (tr, f"{request.param}.sbatch", "gpt.run") - elif request.param.startswith("grok"): + elif request.param.startswith("grok-"): tr = partial_tr( name="grok", test=Test( @@ -179,6 +194,21 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict ) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") + if "no-plugin" not in request.param: + prologue_tr = partial_tr( + name="nccl", + test=Test( + test_definition=NCCLTestDefinition( + name="nccl", description="nccl", test_template_name="nccl", cmd_args=NCCLCmdArgs() + ), + test_template=NcclTest(slurm_system, name="nccl"), + ), + ) + prologue_tr.test.test_template.command_gen_strategy = NcclTestSlurmCommandGenStrategy( + slurm_system, prologue_tr.test.test_definition.cmd_args_dict + ) + prologue_tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") + tr.prologue = TestScenario(name=f"{prologue_tr.name} NCCL Prologue", test_runs=[prologue_tr]) return (tr, f"{request.param}.sbatch", 
"grok.run") From 46cabe912bfd22d39330b75df7665e48919432ed Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 10:50:38 -0400 Subject: [PATCH 33/64] Clean up unit tests --- .../test_common_slurm_command_gen_strategy.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py index 4484b6e1..0f7821a3 100644 --- a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py @@ -237,8 +237,6 @@ def test_prologue_epilogue_combinations( ) run.test.test_template.gen_srun_command.return_value = "srun" run.test.name = f"test{idx+1}" - else: - testrun_fixture.prologue = None if epilogue is not None: testrun_fixture.epilogue = Mock(spec=TestScenario) @@ -246,8 +244,6 @@ def test_prologue_epilogue_combinations( for idx, run in enumerate(epilogue): run.test.test_template.gen_srun_command.return_value = "epilogue" run.test.name = f"test{idx+1}" - else: - testrun_fixture.epilogue = None sbatch_command = strategy_fixture.gen_exec_command(testrun_fixture) script_file_path = sbatch_command.split()[-1] From 764e18150e5b00f3ff610b42e845853f35e765c6 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 11:08:11 -0400 Subject: [PATCH 34/64] Refactor parser to remove explicit plugin_path argument, use default Path --- src/cloudai/cli/handlers.py | 2 +- src/cloudai/parser.py | 8 +++----- tests/test_parser.py | 7 +++---- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index 1085440f..6105bc24 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -90,7 +90,7 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int: args (argparse.Namespace): The parsed command-line arguments. """ parser = Parser(args.system_config) - system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario, Path("conf/common/plugin")) + system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario) assert test_scenario is not None if args.output_dir: diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index a9227f88..9e9a6766 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -52,7 +52,6 @@ def parse( self, test_path: Path, test_scenario_path: Optional[Path] = None, - plugin_path: Optional[Path] = None, ) -> Tuple[System, List[Test], Optional[TestScenario]]: """ Parse configurations for system, test templates, and test scenarios. @@ -61,7 +60,6 @@ def parse( test_path (Path): The file path for tests. test_scenario_path (Optional[Path]): The file path for the main test scenario. If None, all tests are included. - plugin_path (Optional[Path]): The base file path for plugin-specific tests and scenarios. 
Returns: Tuple[System, List[Test], Optional[TestScenario]]: A tuple containing the system object, a list of filtered @@ -80,8 +78,8 @@ def parse( except TestConfigParsingError: exit(1) - plugin_test_scenario_path = plugin_path - plugin_test_path = plugin_path / "test" if plugin_path else None + plugin_test_scenario_path = Path("conf/common/plugin") + plugin_test_path = Path("conf/common/plugin/test") plugin_tests = ( self.parse_tests(list(plugin_test_path.glob("*.toml")), system) @@ -92,7 +90,7 @@ def parse( if test_scenario_path: return self._parse_with_scenario(system, tests, test_scenario_path, plugin_tests, plugin_test_scenario_path) - return system, tests + plugin_tests, None + return system, list(set(tests + plugin_tests)), None def _parse_with_scenario( self, diff --git a/tests/test_parser.py b/tests/test_parser.py index bcfd63a3..f347c7ee 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -34,7 +34,7 @@ def parser(self, tmp_path: Path) -> Parser: def test_no_tests_dir(self, parser: Parser): tests_dir = parser.system_config_path.parent / "tests" with pytest.raises(FileNotFoundError) as exc_info: - parser.parse(tests_dir, None, None) + parser.parse(tests_dir, None) assert "Test path" in str(exc_info.value) @patch("cloudai._core.test_parser.TestParser.parse_all") @@ -93,7 +93,7 @@ def test_scenario_with_plugin_common_tests( fake_plugin.test_runs[0].test.name = "test-1" parse_plugins.return_value = {"plugin-1": fake_plugin} - _, tests, _ = parser.parse(tests_dir, Path(), Path()) + _, tests, _ = parser.parse(tests_dir, Path()) assert len(tests) == 1 assert tests[0].name == "test-1" @@ -103,7 +103,6 @@ def test_scenario_with_plugin_common_tests( def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): tests_dir = parser.system_config_path.parent.parent / "test" test_scenario_path = Path("/mock/test_scenario.toml") - plugin_path = Path("/mock/plugin_scenarios") fake_tests = [Mock() for _ in range(4)] for i, test in enumerate(fake_tests): @@ -119,7 +118,7 @@ def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, fake_plugin_scenarios["plugin-1"].test_runs[0].test.name = "test-2" with patch.object(parser, "_load_plugin_scenarios", return_value=fake_plugin_scenarios): - _, filtered_tests, _ = parser.parse(tests_dir, test_scenario_path, plugin_path) + _, filtered_tests, _ = parser.parse(tests_dir, test_scenario_path) filtered_test_names = {t.name for t in filtered_tests} assert len(filtered_tests) == 2 From d9e8c1fb84c0aff19eccaee5de99d674d6cd9f66 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 11:21:40 -0400 Subject: [PATCH 35/64] Refactor gen_exec_command to simplify indentation logic for readability --- .../strategy/slurm_command_gen_strategy.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index 57c37812..1d58e4bf 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -56,23 +56,23 @@ def gen_exec_command(self, tr: TestRun) -> str: cmd_args = self._override_cmd_args(self.default_cmd_args, tr.test.cmd_args) slurm_args = self._parse_slurm_args(tr.test.test_template.__class__.__name__, env_vars, cmd_args, tr) + srun_command = self._gen_srun_command(slurm_args, 
env_vars, cmd_args, tr.test.extra_cmd_args) + command_list = [] + indent = "" + if tr.prologue: prologue_command = self.gen_prologue(tr.prologue, tr.output_path) - srun_command = self._gen_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) - command_list = [prologue_command, "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", f" {srun_command}"] + command_list = [prologue_command, "if [ $PROLOGUE_SUCCESS -eq 1 ]; then"] + indent = " " - if tr.epilogue: - epilogue_command = self.gen_epilogue(tr.epilogue, tr.output_path) - command_list.append(f" {epilogue_command}") + command_list.append(f"{indent}{srun_command}") - command_list.append("fi") - else: - srun_command = self._gen_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) - command_list = [srun_command] + if tr.epilogue: + epilogue_command = self.gen_epilogue(tr.epilogue, tr.output_path) + command_list.append(f"{indent}{epilogue_command}") - if tr.epilogue: - epilogue_command = self.gen_epilogue(tr.epilogue, tr.output_path) - command_list.append(epilogue_command) + if tr.prologue: + command_list.append("fi") full_command = "\n".join(command_list).strip() return self._write_sbatch_script(slurm_args, env_vars, full_command, tr.output_path) From 897a7da60ef7db0d7e6ead059d619076199b2e93 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 15:29:52 -0400 Subject: [PATCH 36/64] Make prologue and epilogue fields optional --- src/cloudai/_core/test_scenario_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index 4d192a7a..67f52eac 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -54,8 +54,8 @@ class _TestScenarioTOML(BaseModel): name: str job_status_check: bool = True tests: list[_TestRunTOML] = Field(alias="Tests", min_length=1) - prologue: str = "" - epilogue: str = "" + prologue: Optional[str] = None + epilogue: Optional[str] = None @model_validator(mode="after") def check_no_self_dependency(self): From d44023bf9923ec6f5a1edd879029d423acfac753 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 15:31:29 -0400 Subject: [PATCH 37/64] Set prologue and epilogue to None by default --- src/cloudai/_core/test_scenario_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index 67f52eac..6ff81dc9 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -166,8 +166,8 @@ def _create_section_test_run( self, test_info: _TestRunTOML, normalized_weight: float, - prologue: Optional[TestScenario], - epilogue: Optional[TestScenario], + prologue: Optional[TestScenario] = None, + epilogue: Optional[TestScenario] = None, ) -> TestRun: """ Create a section-specific Test object by copying from the test mapping. 
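
With both fields now optional, a test scenario TOML may reference a prologue or epilogue plugin by name or leave the keys out entirely. A minimal sketch of such a scenario, reusing the bundled nccl_test_prologue and nccl_test_epilogue plugin scenarios (the scenario name here is illustrative, and the remaining per-test fields are omitted):

    name = "example_scenario"
    prologue = "nccl_test_prologue"   # optional; omit to run without a prologue
    epilogue = "nccl_test_epilogue"   # optional; the parsed field is None when absent

    [[Tests]]
    id = "Tests.1"
    # further per-test fields as in conf/common/test_scenario/nccl_test.toml
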
From 12022de61f5655eb303c22157a4d4aec10b5fb9e Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 15:33:58 -0400 Subject: [PATCH 38/64] Recover comments --- src/cloudai/parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index 9e9a6766..cbb21d78 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -71,12 +71,12 @@ def parse( try: system = self.parse_system(self.system_config_path) except SystemConfigParsingError: - exit(1) + exit(1) # exit right away to keep error message readable for users try: tests = self.parse_tests(list(test_path.glob("*.toml")), system) except TestConfigParsingError: - exit(1) + exit(1) # exit right away to keep error message readable for users plugin_test_scenario_path = Path("conf/common/plugin") plugin_test_path = Path("conf/common/plugin/test") From 3cf27df886de2fff6a40de1cdeda406647e8ab11 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 15:34:32 -0400 Subject: [PATCH 39/64] Remove unused tmp_path from unit tests --- .../test_common_slurm_command_gen_strategy.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py index 0f7821a3..c5a6d0f0 100644 --- a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py @@ -223,7 +223,6 @@ def test_prologue_epilogue_combinations( prologue, epilogue, expected_script_lines, - tmp_path, ): testrun_fixture.prologue = Mock(spec=TestScenario) if prologue else None testrun_fixture.epilogue = Mock(spec=TestScenario) if epilogue else None From 9244bd6e510e6807f246b0d2c4c0a6ec71eac5f1 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 30 Oct 2024 10:52:54 -0400 Subject: [PATCH 40/64] Do not allow empty test runs in plugins --- .../systems/slurm/strategy/slurm_command_gen_strategy.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index a06dd33f..e4b2f0c3 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -134,9 +134,6 @@ def gen_prologue(self, prologue: TestScenario, base_output_path: Path) -> str: Returns: str: A string with all the Slurm srun commands generated for the prologue. """ - if not prologue.test_runs: - return "PROLOGUE_SUCCESS=1\n" - prologue_output_dir = base_output_path / "prologue" prologue_output_dir.mkdir(parents=True, exist_ok=True) @@ -177,9 +174,6 @@ def gen_epilogue(self, epilogue: TestScenario, base_output_path: Path) -> str: Returns: str: A string with all the Slurm srun commands generated for the epilogue. 
""" - if not epilogue.test_runs: - return "" - epilogue_output_dir = base_output_path / "epilogue" epilogue_output_dir.mkdir(parents=True, exist_ok=True) From 00f34f2bdd349e802d7d053619610fdad677af33 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 30 Oct 2024 12:15:17 -0400 Subject: [PATCH 41/64] Simplify prologue unit tests --- .../test_common_slurm_command_gen_strategy.py | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py index 25efdf16..e388f0d9 100644 --- a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py @@ -132,7 +132,7 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): [Mock(test=Mock(name="test1", test_template=Mock()))], None, [ - "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "prologue", "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", " srun", @@ -153,7 +153,7 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): [Mock(test=Mock(name="test1", test_template=Mock()))], [Mock(test=Mock(name="test2", test_template=Mock()))], [ - "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "prologue", "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", " srun", @@ -166,8 +166,8 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): [Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], [Mock(test=Mock(name="test3", test_template=Mock())), Mock(test=Mock(name="test4", test_template=Mock()))], [ - "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", - "SUCCESS_1=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "prologue", + "prologue", "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", " srun", @@ -181,8 +181,8 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): [Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], None, [ - "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", - "SUCCESS_1=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "prologue", + "prologue", "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", " srun", @@ -204,8 +204,8 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): [Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], [Mock(test=Mock(name="test3", test_template=Mock()))], [ - "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", - "SUCCESS_1=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "prologue", + "prologue", "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", " srun", @@ -229,9 +229,7 @@ def test_prologue_epilogue_combinations( testrun_fixture.prologue = Mock(spec=TestScenario) testrun_fixture.prologue.test_runs = prologue for idx, run in enumerate(prologue): - 
run.test.test_template.gen_srun_success_check.return_value = ( - "grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0" - ) + run.test.test_template.gen_srun_success_check.return_value = "prologue" run.test.test_template.gen_srun_command.return_value = "srun" run.test.name = f"test{idx+1}" From 7de11857a2b815f2ad06fa7851f50673ea1e53a4 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 30 Oct 2024 12:16:31 -0400 Subject: [PATCH 42/64] Move plugin directory to conf --- conf/{common => }/plugin/nccl_test_epilogue.toml | 0 conf/{common => }/plugin/nccl_test_prologue.toml | 0 conf/{common => }/plugin/test/nccl_test_all_gather.toml | 0 conf/{common => }/plugin/test/nccl_test_all_reduce.toml | 0 src/cloudai/parser.py | 4 ++-- 5 files changed, 2 insertions(+), 2 deletions(-) rename conf/{common => }/plugin/nccl_test_epilogue.toml (100%) rename conf/{common => }/plugin/nccl_test_prologue.toml (100%) rename conf/{common => }/plugin/test/nccl_test_all_gather.toml (100%) rename conf/{common => }/plugin/test/nccl_test_all_reduce.toml (100%) diff --git a/conf/common/plugin/nccl_test_epilogue.toml b/conf/plugin/nccl_test_epilogue.toml similarity index 100% rename from conf/common/plugin/nccl_test_epilogue.toml rename to conf/plugin/nccl_test_epilogue.toml diff --git a/conf/common/plugin/nccl_test_prologue.toml b/conf/plugin/nccl_test_prologue.toml similarity index 100% rename from conf/common/plugin/nccl_test_prologue.toml rename to conf/plugin/nccl_test_prologue.toml diff --git a/conf/common/plugin/test/nccl_test_all_gather.toml b/conf/plugin/test/nccl_test_all_gather.toml similarity index 100% rename from conf/common/plugin/test/nccl_test_all_gather.toml rename to conf/plugin/test/nccl_test_all_gather.toml diff --git a/conf/common/plugin/test/nccl_test_all_reduce.toml b/conf/plugin/test/nccl_test_all_reduce.toml similarity index 100% rename from conf/common/plugin/test/nccl_test_all_reduce.toml rename to conf/plugin/test/nccl_test_all_reduce.toml diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index cbb21d78..0db9147e 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -78,8 +78,8 @@ def parse( except TestConfigParsingError: exit(1) # exit right away to keep error message readable for users - plugin_test_scenario_path = Path("conf/common/plugin") - plugin_test_path = Path("conf/common/plugin/test") + plugin_test_scenario_path = Path("conf/plugin") + plugin_test_path = Path("conf/plugin/test") plugin_tests = ( self.parse_tests(list(plugin_test_path.glob("*.toml")), system) From c2b8d834fd0b3fe5ffe31dd9fe79eb7d88f4ddde Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 30 Oct 2024 12:31:21 -0400 Subject: [PATCH 43/64] Reflect Andrei's comments --- src/cloudai/parser.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index 0db9147e..42186ad8 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -78,19 +78,20 @@ def parse( except TestConfigParsingError: exit(1) # exit right away to keep error message readable for users - plugin_test_scenario_path = Path("conf/plugin") plugin_test_path = Path("conf/plugin/test") + try: + plugin_tests = ( + self.parse_tests(list(plugin_test_path.glob("*.toml")), system) if plugin_test_path.exists() else [] + ) + except TestConfigParsingError: + exit(1) # exit right away to keep error message readable for users - plugin_tests = ( - 
self.parse_tests(list(plugin_test_path.glob("*.toml")), system) - if plugin_test_path and plugin_test_path.exists() - else [] - ) - + plugin_test_scenario_path = Path("conf/plugin") if test_scenario_path: return self._parse_with_scenario(system, tests, test_scenario_path, plugin_tests, plugin_test_scenario_path) - return system, list(set(tests + plugin_tests)), None + combined_tests = list(set(tests + plugin_tests)) + return system, combined_tests, None def _parse_with_scenario( self, From e1534d14963b6248c7edfa42a3ce570514bd45e0 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Thu, 31 Oct 2024 11:02:29 -0400 Subject: [PATCH 44/64] Reflect Andrei's comments --- src/cloudai/_core/test_scenario_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index 19f7e4c1..a3ea8d9a 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -209,7 +209,7 @@ def _create_test_run( sol=test_info.sol, weight=test_info.weight * normalized_weight, ideal_perf=test_info.ideal_perf, - prologue=prologue if prologue is not None else TestScenario(name="default_prologue", test_runs=[]), - epilogue=epilogue if epilogue is not None else TestScenario(name="default_epilogue", test_runs=[]), + prologue=prologue, + epilogue=epilogue, ) return tr From 42080e25f2cbf5d9fac7a0d2272784d934b8b5fd Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Thu, 31 Oct 2024 11:06:45 -0400 Subject: [PATCH 45/64] Print out warning when plugins are missing --- src/cloudai/_core/test_scenario_parser.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index a3ea8d9a..8db8622d 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -142,8 +142,18 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario: prologue, epilogue = None, None if ts_model.prologue: prologue = self.plugin_mapping.get(ts_model.prologue) + if prologue is None: + logging.warning( + f"Prologue '{ts_model.prologue}' not found in plugin mapping. " + "Ensure that a proper plugin directory is set under the working directory." + ) if ts_model.epilogue: epilogue = self.plugin_mapping.get(ts_model.epilogue) + if epilogue is None: + logging.warning( + f"Epilogue '{ts_model.epilogue}' not found in plugin mapping. " + "Ensure that a proper plugin directory is set under the working directory." 
+ ) test_runs_by_id: dict[str, TestRun] = { tr.id: self._create_test_run(tr, normalized_weight, prologue, epilogue) for tr in ts_model.tests From 3dbb4d3d2893e2b6faca2c1543580035d3455cfc Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Thu, 31 Oct 2024 11:09:06 -0400 Subject: [PATCH 46/64] Update acceptance test sbatch script names --- tests/ref_data/{gpt-plugin.sbatch => gpt-prologue.sbatch} | 0 tests/ref_data/{grok-plugin.sbatch => grok-prologue.sbatch} | 0 tests/test_acceptance.py | 6 +++--- 3 files changed, 3 insertions(+), 3 deletions(-) rename tests/ref_data/{gpt-plugin.sbatch => gpt-prologue.sbatch} (100%) rename tests/ref_data/{grok-plugin.sbatch => grok-prologue.sbatch} (100%) diff --git a/tests/ref_data/gpt-plugin.sbatch b/tests/ref_data/gpt-prologue.sbatch similarity index 100% rename from tests/ref_data/gpt-plugin.sbatch rename to tests/ref_data/gpt-prologue.sbatch diff --git a/tests/ref_data/grok-plugin.sbatch b/tests/ref_data/grok-prologue.sbatch similarity index 100% rename from tests/ref_data/grok-plugin.sbatch rename to tests/ref_data/grok-prologue.sbatch diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 19e7acb9..bb6f7897 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -91,7 +91,7 @@ def partial_tr(slurm_system: SlurmSystem) -> partial[TestRun]: return partial(TestRun, num_nodes=1, nodes=[], output_path=slurm_system.output_path) -@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt-plugin", "gpt-no-plugin", "grok-plugin", "grok-no-plugin"]) +@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt-prologue", "gpt-no-plugin", "grok-prologue", "grok-no-plugin"]) def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -> tuple[TestRun, str, Optional[str]]: if request.param == "ucc": tr = partial_tr( @@ -159,7 +159,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict ) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - if "no-plugin" not in request.param: + if "prologue" in request.param: prologue_tr = partial_tr( name="nccl", test=Test( @@ -194,7 +194,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict ) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - if "no-plugin" not in request.param: + if "prologue" in request.param: prologue_tr = partial_tr( name="nccl", test=Test( From a9f5c979a7835aa8b5aab2eae8664951447dd13d Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Thu, 31 Oct 2024 11:59:36 -0400 Subject: [PATCH 47/64] Reflect Andrei's comments --- src/cloudai/parser.py | 83 +++++++++++++++---------------------------- tests/test_parser.py | 73 +++++++++++++++---------------------- 2 files changed, 57 insertions(+), 99 deletions(-) diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index 42186ad8..71435e90 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -16,7 +16,7 @@ import logging from pathlib import Path -from typing import Dict, List, Optional, Set, Tuple +from typing import Dict, List, Optional, Tuple import toml from pydantic import ValidationError @@ -34,6 +34,9 @@ format_validation_error, ) +PLUGIN_ROOT = Path("conf/plugin") +PLUGIN_TEST_ROOT = PLUGIN_ROOT / "test" + class Parser: """Main parser for parsing all types of configurations.""" @@ -78,74 
+81,44 @@ def parse( except TestConfigParsingError: exit(1) # exit right away to keep error message readable for users - plugin_test_path = Path("conf/plugin/test") try: plugin_tests = ( - self.parse_tests(list(plugin_test_path.glob("*.toml")), system) if plugin_test_path.exists() else [] + self.parse_tests(list(PLUGIN_TEST_ROOT.glob("*.toml")), system) if PLUGIN_TEST_ROOT.exists() else [] ) except TestConfigParsingError: exit(1) # exit right away to keep error message readable for users - plugin_test_scenario_path = Path("conf/plugin") - if test_scenario_path: - return self._parse_with_scenario(system, tests, test_scenario_path, plugin_tests, plugin_test_scenario_path) - - combined_tests = list(set(tests + plugin_tests)) - return system, combined_tests, None + if not test_scenario_path: + all_tests = list({test.name: test for test in tests + plugin_tests}.values()) + return system, all_tests, None - def _parse_with_scenario( - self, - system: System, - tests: List[Test], - test_scenario_path: Path, - plugin_tests: List[Test], - plugin_test_scenario_path: Optional[Path], - ) -> Tuple[System, List[Test], Optional[TestScenario]]: - """Parse tests and scenarios with a main test scenario path specified.""" test_mapping = {t.name: t for t in tests} - plugin_test_mapping = {t.name: t for t in plugin_tests} - - plugin_test_scenario_mapping = self._load_plugin_scenarios(plugin_test_scenario_path, plugin_test_mapping) - test_scenario = self._load_main_scenario(test_scenario_path, test_mapping, plugin_test_scenario_mapping) - - all_used_test_names = self._collect_used_test_names(plugin_test_scenario_mapping, test_scenario) - filtered_tests = [t for t in tests if t.name in all_used_test_names] - - return system, filtered_tests, test_scenario - - def _load_plugin_scenarios( - self, plugin_test_scenario_path: Optional[Path], plugin_test_mapping: Dict[str, Test] - ) -> Dict[str, TestScenario]: - """Load plugin-specific test scenarios from the specified path.""" - if plugin_test_scenario_path and plugin_test_scenario_path.exists(): + plugin_test_scenario_mapping = {} + if PLUGIN_ROOT.exists() and list(PLUGIN_ROOT.glob("*.toml")): try: - return self.parse_plugins(list(plugin_test_scenario_path.glob("*.toml")), plugin_test_mapping) + plugin_test_scenario_mapping = self.parse_plugins( + list(PLUGIN_ROOT.glob("*.toml")), {t.name: t for t in plugin_tests} + ) except TestScenarioParsingError: - exit(1) - return {} + exit(1) # exit right away to keep error message readable for users - def _load_main_scenario( - self, - test_scenario_path: Path, - test_mapping: Dict[str, Test], - plugin_test_scenario_mapping: Dict[str, TestScenario], - ) -> Optional[TestScenario]: - """Load the main test scenario using provided mappings.""" try: - return self.parse_test_scenario(test_scenario_path, test_mapping, plugin_test_scenario_mapping) + test_scenario = self.parse_test_scenario(test_scenario_path, test_mapping, plugin_test_scenario_mapping) except TestScenarioParsingError: - exit(1) - - def _collect_used_test_names( - self, plugin_test_scenario_mapping: Dict[str, TestScenario], test_scenario: Optional[TestScenario] - ) -> Set[str]: - """Collect test names used in both plugin and main test scenarios.""" - # TODO: collect test names in the plugin test scenarios only - plugin_test_names = { - tr.test.name for scenario in plugin_test_scenario_mapping.values() for tr in scenario.test_runs + exit(1) # exit right away to keep error message readable for users + + scenario_tests = {tr.test.name for tr in test_scenario.test_runs} + 
plugin_scenario_tests = { + tr.test.name + for plugin_scenario in plugin_test_scenario_mapping.values() + for tr in plugin_scenario.test_runs } - scenario_test_names = {tr.test.name for tr in test_scenario.test_runs} if test_scenario else set() - return plugin_test_names.union(scenario_test_names) + + relevant_test_names = scenario_tests.union(plugin_scenario_tests) + filtered_tests = [t for t in tests if t.name in relevant_test_names] + plugin_tests + filtered_tests = list({test.name: test for test in filtered_tests}.values()) + + return system, filtered_tests, test_scenario @staticmethod def parse_plugins(plugin_tomls: List[Path], test_mapping: Dict[str, Test]) -> Dict[str, TestScenario]: diff --git a/tests/test_parser.py b/tests/test_parser.py index f347c7ee..2b709938 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -15,13 +15,13 @@ # limitations under the License. from pathlib import Path -from typing import Dict, cast +from typing import cast from unittest.mock import Mock, patch import pytest from pydantic_core import ErrorDetails -from cloudai import Parser, TestScenario, format_validation_error +from cloudai import Parser, format_validation_error from cloudai.systems.slurm.slurm_system import SlurmSystem @@ -53,11 +53,11 @@ def test_no_scenario(self, test_parser: Mock, parser: Parser): def test_scenario_without_plugin(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): tests_dir = parser.system_config_path.parent.parent / "test" - fake_tests = [] - for i in range(3): - fake_tests.append(Mock()) - fake_tests[-1].name = f"test-{i}" - test_parser.return_value = fake_tests + fake_tests = [Mock(name=f"test-{i}") for i in range(3)] + for i, test in enumerate(fake_tests): + test.name = f"test-{i}" + + test_parser.side_effect = [fake_tests, []] fake_scenario = Mock() fake_scenario.test_runs = [Mock()] @@ -77,11 +77,13 @@ def test_scenario_with_plugin_common_tests( ): tests_dir = parser.system_config_path.parent.parent / "test" - fake_tests = [] - for i in range(3): - fake_tests.append(Mock()) - fake_tests[-1].name = f"test-{i}" - test_parser.return_value = fake_tests + main_tests = [Mock() for _ in range(3)] + for i, test in enumerate(main_tests): + test.name = f"test-{i}" + plugin_tests = [Mock()] + plugin_tests[0].name = "test-1" + + test_parser.side_effect = [main_tests, plugin_tests] fake_scenario = Mock() fake_scenario.test_runs = [Mock()] @@ -95,57 +97,40 @@ def test_scenario_with_plugin_common_tests( _, tests, _ = parser.parse(tests_dir, Path()) + filtered_test_names = {t.name for t in tests} assert len(tests) == 1 - assert tests[0].name == "test-1" + assert "test-1" in filtered_test_names @patch("cloudai._core.test_parser.TestParser.parse_all") @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") - def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): + @patch("pathlib.Path.exists", return_value=True) + def test_scenario_with_plugin_exclusive_tests( + self, path_exists_mock: Mock, test_scenario_parser: Mock, test_parser: Mock, parser: Parser + ): tests_dir = parser.system_config_path.parent.parent / "test" test_scenario_path = Path("/mock/test_scenario.toml") - fake_tests = [Mock() for _ in range(4)] - for i, test in enumerate(fake_tests): + main_tests = [Mock() for _ in range(3)] + plugin_tests = [Mock()] + for i, test in enumerate(main_tests): test.name = f"test-{i}" - test_parser.return_value = fake_tests + plugin_tests[0].name = "plugin-test-1" + + 
test_parser.side_effect = [main_tests, plugin_tests] fake_scenario = Mock() fake_scenario.test_runs = [Mock()] fake_scenario.test_runs[0].test.name = "test-1" test_scenario_parser.return_value = fake_scenario - fake_plugin_scenarios = {"plugin-1": Mock(test_runs=[Mock()])} - fake_plugin_scenarios["plugin-1"].test_runs[0].test.name = "test-2" - - with patch.object(parser, "_load_plugin_scenarios", return_value=fake_plugin_scenarios): - _, filtered_tests, _ = parser.parse(tests_dir, test_scenario_path) + _, filtered_tests, _ = parser.parse(tests_dir, test_scenario_path) filtered_test_names = {t.name for t in filtered_tests} assert len(filtered_tests) == 2 assert "test-1" in filtered_test_names - assert "test-2" in filtered_test_names + assert "plugin-test-1" in filtered_test_names assert "test-0" not in filtered_test_names - assert "test-3" not in filtered_test_names - - def test_collect_used_test_names(self, parser: Parser): - fake_scenario = Mock() - fake_scenario.test_runs = [Mock()] - fake_scenario.test_runs[0].test.name = "test-1" - - fake_plugin_scenario_1 = Mock(spec=TestScenario) - fake_plugin_scenario_1.test_runs = [Mock()] - fake_plugin_scenario_1.test_runs[0].test.name = "test-2" - - fake_plugin_scenario_2 = Mock(spec=TestScenario) - fake_plugin_scenario_2.test_runs = [Mock()] - fake_plugin_scenario_2.test_runs[0].test.name = "test-3" - - fake_plugin_scenarios = cast( - Dict[str, TestScenario], {"plugin-1": fake_plugin_scenario_1, "plugin-2": fake_plugin_scenario_2} - ) - - used_test_names = parser._collect_used_test_names(fake_plugin_scenarios, fake_scenario) - assert used_test_names == {"test-1", "test-2", "test-3"} + assert "test-2" not in filtered_test_names def test_parse_system(self, parser: Parser): parser.system_config_path = Path("conf/common/system/example_slurm_cluster.toml") From d3c7cfd224f6fd80b019b31d55069fd5e90893b3 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Thu, 31 Oct 2024 12:02:35 -0400 Subject: [PATCH 48/64] Make vulture happy --- tests/test_parser.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/test_parser.py b/tests/test_parser.py index 2b709938..c7d4f873 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -103,10 +103,7 @@ def test_scenario_with_plugin_common_tests( @patch("cloudai._core.test_parser.TestParser.parse_all") @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") - @patch("pathlib.Path.exists", return_value=True) - def test_scenario_with_plugin_exclusive_tests( - self, path_exists_mock: Mock, test_scenario_parser: Mock, test_parser: Mock, parser: Parser - ): + def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): tests_dir = parser.system_config_path.parent.parent / "test" test_scenario_path = Path("/mock/test_scenario.toml") From f12309976c154adb7041cbfd884c371483651f9c Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 1 Nov 2024 07:03:26 -0400 Subject: [PATCH 49/64] Add logging messages to parser.parse --- src/cloudai/parser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index 71435e90..a4c59f19 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -95,10 +95,12 @@ def parse( test_mapping = {t.name: t for t in tests} plugin_test_scenario_mapping = {} if PLUGIN_ROOT.exists() and list(PLUGIN_ROOT.glob("*.toml")): + logging.debug("PLUGIN_ROOT exists and 
contains .toml files. Proceeding with plugin test scenario parsing.") try: plugin_test_scenario_mapping = self.parse_plugins( list(PLUGIN_ROOT.glob("*.toml")), {t.name: t for t in plugin_tests} ) + logging.debug("Plugin test scenarios successfully parsed from PLUGIN_ROOT.") except TestScenarioParsingError: exit(1) # exit right away to keep error message readable for users From e886bf69e4daaccc99c35cad1db85cd9882d11ca Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 1 Nov 2024 07:09:36 -0400 Subject: [PATCH 50/64] Simplify unit test for readability --- tests/test_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_parser.py b/tests/test_parser.py index c7d4f873..e662e9f7 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -97,7 +97,7 @@ def test_scenario_with_plugin_common_tests( _, tests, _ = parser.parse(tests_dir, Path()) - filtered_test_names = {t.name for t in tests} + filtered_test_names = {"test-1"} assert len(tests) == 1 assert "test-1" in filtered_test_names From 59d5cb36b02f799b360c6ea282d3a9231e9874d9 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 1 Nov 2024 08:56:35 -0400 Subject: [PATCH 51/64] Reflect Andrei's comments --- src/cloudai/parser.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index a4c59f19..b87518e3 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -81,6 +81,9 @@ def parse( except TestConfigParsingError: exit(1) # exit right away to keep error message readable for users + if not PLUGIN_ROOT.exists(): + logger.debug(f"PLUGIN_ROOT path '{PLUGIN_ROOT}' does not exist. Plugins will not be enabled.") + try: plugin_tests = ( self.parse_tests(list(PLUGIN_TEST_ROOT.glob("*.toml")), system) if PLUGIN_TEST_ROOT.exists() else [] @@ -95,12 +98,10 @@ def parse( test_mapping = {t.name: t for t in tests} plugin_test_scenario_mapping = {} if PLUGIN_ROOT.exists() and list(PLUGIN_ROOT.glob("*.toml")): - logging.debug("PLUGIN_ROOT exists and contains .toml files. Proceeding with plugin test scenario parsing.") try: plugin_test_scenario_mapping = self.parse_plugins( list(PLUGIN_ROOT.glob("*.toml")), {t.name: t for t in plugin_tests} ) - logging.debug("Plugin test scenarios successfully parsed from PLUGIN_ROOT.") except TestScenarioParsingError: exit(1) # exit right away to keep error message readable for users From 4894972f6c439366c293a6d6b6e11dd13acf4b24 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 1 Nov 2024 08:57:30 -0400 Subject: [PATCH 52/64] Reflect Andrei's comments --- src/cloudai/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index b87518e3..eb3270c3 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -82,7 +82,7 @@ def parse( exit(1) # exit right away to keep error message readable for users if not PLUGIN_ROOT.exists(): - logger.debug(f"PLUGIN_ROOT path '{PLUGIN_ROOT}' does not exist. Plugins will not be enabled.") + logging.debug(f"PLUGIN_ROOT path '{PLUGIN_ROOT}' does not exist. 
Plugins will not be enabled.") try: plugin_tests = ( From c19f24b07784ba890b0230cd0c141a32683c92c4 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:03:03 -0600 Subject: [PATCH 53/64] Rename plugin to hook --- conf/common/test_scenario/nccl_test.toml | 4 +- .../nccl_test.toml} | 0 .../test/nccl_test_all_gather.toml | 0 .../test/nccl_test_all_reduce.toml | 0 conf/plugin/nccl_test_prologue.toml | 22 ----- src/cloudai/_core/test_scenario.py | 4 +- src/cloudai/_core/test_scenario_parser.py | 44 +++++----- src/cloudai/parser.py | 44 +++++----- .../strategy/slurm_command_gen_strategy.py | 78 +++++++++--------- ...pt-no-plugin.sbatch => gpt-no-hook.sbatch} | 0 ...pt-prologue.sbatch => gpt-pre-test.sbatch} | 0 ...k-no-plugin.sbatch => grok-no-hook.sbatch} | 0 ...k-prologue.sbatch => grok-pre-test.sbatch} | 0 .../test_common_slurm_command_gen_strategy.py | 80 +++++++++---------- tests/test_acceptance.py | 28 +++---- tests/test_parser.py | 32 ++++---- 16 files changed, 156 insertions(+), 180 deletions(-) rename conf/{plugin/nccl_test_epilogue.toml => hook/nccl_test.toml} (100%) rename conf/{plugin => hook}/test/nccl_test_all_gather.toml (100%) rename conf/{plugin => hook}/test/nccl_test_all_reduce.toml (100%) delete mode 100644 conf/plugin/nccl_test_prologue.toml rename tests/ref_data/{gpt-no-plugin.sbatch => gpt-no-hook.sbatch} (100%) rename tests/ref_data/{gpt-prologue.sbatch => gpt-pre-test.sbatch} (100%) rename tests/ref_data/{grok-no-plugin.sbatch => grok-no-hook.sbatch} (100%) rename tests/ref_data/{grok-prologue.sbatch => grok-pre-test.sbatch} (100%) diff --git a/conf/common/test_scenario/nccl_test.toml b/conf/common/test_scenario/nccl_test.toml index 9b731e96..15064561 100644 --- a/conf/common/test_scenario/nccl_test.toml +++ b/conf/common/test_scenario/nccl_test.toml @@ -16,8 +16,8 @@ name = "nccl-test" -prologue = "nccl_test_prologue" -epilogue = "nccl_test_epilogue" +pre_test = "nccl_test" +post_test = "nccl_test" [[Tests]] id = "Tests.1" diff --git a/conf/plugin/nccl_test_epilogue.toml b/conf/hook/nccl_test.toml similarity index 100% rename from conf/plugin/nccl_test_epilogue.toml rename to conf/hook/nccl_test.toml diff --git a/conf/plugin/test/nccl_test_all_gather.toml b/conf/hook/test/nccl_test_all_gather.toml similarity index 100% rename from conf/plugin/test/nccl_test_all_gather.toml rename to conf/hook/test/nccl_test_all_gather.toml diff --git a/conf/plugin/test/nccl_test_all_reduce.toml b/conf/hook/test/nccl_test_all_reduce.toml similarity index 100% rename from conf/plugin/test/nccl_test_all_reduce.toml rename to conf/hook/test/nccl_test_all_reduce.toml diff --git a/conf/plugin/nccl_test_prologue.toml b/conf/plugin/nccl_test_prologue.toml deleted file mode 100644 index e5c1a1e4..00000000 --- a/conf/plugin/nccl_test_prologue.toml +++ /dev/null @@ -1,22 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -name = "nccl_test_prologue" - -[[Tests]] -id = "Tests.1" -test_name = "nccl_test_all_reduce" -time_limit = "00:20:00" diff --git a/src/cloudai/_core/test_scenario.py b/src/cloudai/_core/test_scenario.py index 97c89994..39f1bd21 100644 --- a/src/cloudai/_core/test_scenario.py +++ b/src/cloudai/_core/test_scenario.py @@ -58,8 +58,8 @@ class TestRun: weight: float = 0.0 ideal_perf: float = 1.0 dependencies: dict[str, TestDependency] = field(default_factory=dict) - prologue: Optional["TestScenario"] = None - epilogue: Optional["TestScenario"] = None + pre_test: Optional["TestScenario"] = None + post_test: Optional["TestScenario"] = None def __hash__(self) -> int: return hash(self.name + self.test.name + str(self.iterations) + str(self.current_iteration)) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index 8db8622d..93047d29 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -54,8 +54,8 @@ class _TestScenarioTOML(BaseModel): name: str job_status_check: bool = True tests: list[_TestRunTOML] = Field(alias="Tests", min_length=1) - prologue: Optional[str] = None - epilogue: Optional[str] = None + pre_test: Optional[str] = None + post_test: Optional[str] = None @model_validator(mode="after") def check_no_self_dependency(self): @@ -101,10 +101,10 @@ class TestScenarioParser: __test__ = False - def __init__(self, file_path: Path, test_mapping: Dict[str, Test], plugin_mapping: Dict[str, TestScenario]) -> None: + def __init__(self, file_path: Path, test_mapping: Dict[str, Test], hook_mapping: Dict[str, TestScenario]) -> None: self.file_path = file_path self.test_mapping = test_mapping - self.plugin_mapping = plugin_mapping + self.hook_mapping = hook_mapping def parse(self) -> TestScenario: """ @@ -139,24 +139,24 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario: total_weight = sum(tr.weight for tr in ts_model.tests) normalized_weight = 0 if total_weight == 0 else 100 / total_weight - prologue, epilogue = None, None - if ts_model.prologue: - prologue = self.plugin_mapping.get(ts_model.prologue) - if prologue is None: + pre_test, post_test = None, None + if ts_model.pre_test: + pre_test = self.hook_mapping.get(ts_model.pre_test) + if pre_test is None: logging.warning( - f"Prologue '{ts_model.prologue}' not found in plugin mapping. " - "Ensure that a proper plugin directory is set under the working directory." + f"Prologue '{ts_model.pre_test}' not found in hook mapping. " + "Ensure that a proper hook directory is set under the working directory." ) - if ts_model.epilogue: - epilogue = self.plugin_mapping.get(ts_model.epilogue) - if epilogue is None: + if ts_model.post_test: + post_test = self.hook_mapping.get(ts_model.post_test) + if post_test is None: logging.warning( - f"Epilogue '{ts_model.epilogue}' not found in plugin mapping. " - "Ensure that a proper plugin directory is set under the working directory." + f"Epilogue '{ts_model.post_test}' not found in hook mapping. " + "Ensure that a proper hook directory is set under the working directory." 
) test_runs_by_id: dict[str, TestRun] = { - tr.id: self._create_test_run(tr, normalized_weight, prologue, epilogue) for tr in ts_model.tests + tr.id: self._create_test_run(tr, normalized_weight, pre_test, post_test) for tr in ts_model.tests } tests_data: dict[str, _TestRunTOML] = {tr.id: tr for tr in ts_model.tests} @@ -176,8 +176,8 @@ def _create_test_run( self, test_info: _TestRunTOML, normalized_weight: float, - prologue: Optional[TestScenario] = None, - epilogue: Optional[TestScenario] = None, + pre_test: Optional[TestScenario] = None, + post_test: Optional[TestScenario] = None, ) -> TestRun: """ Create a section-specific Test object by copying from the test mapping. @@ -185,8 +185,8 @@ def _create_test_run( Args: test_info (Dict[str, Any]): Information of the test. normalized_weight (float): Normalized weight for the test. - prologue (Optional[TestScenario]): TestScenario object representing the prologue sequence. - epilogue (Optional[TestScenario]): TestScenario object representing the epilogue sequence. + pre_test (Optional[TestScenario]): TestScenario object representing the pre_test sequence. + post_test (Optional[TestScenario]): TestScenario object representing the post_test sequence. Returns: Test: Copied and updated Test object for the section. @@ -219,7 +219,7 @@ def _create_test_run( sol=test_info.sol, weight=test_info.weight * normalized_weight, ideal_perf=test_info.ideal_perf, - prologue=prologue, - epilogue=epilogue, + pre_test=pre_test, + post_test=post_test, ) return tr diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index eb3270c3..950eff6c 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -34,7 +34,7 @@ format_validation_error, ) -PLUGIN_ROOT = Path("conf/plugin") +PLUGIN_ROOT = Path("conf/hook") PLUGIN_TEST_ROOT = PLUGIN_ROOT / "test" @@ -85,62 +85,60 @@ def parse( logging.debug(f"PLUGIN_ROOT path '{PLUGIN_ROOT}' does not exist. 
Plugins will not be enabled.") try: - plugin_tests = ( + hook_tests = ( self.parse_tests(list(PLUGIN_TEST_ROOT.glob("*.toml")), system) if PLUGIN_TEST_ROOT.exists() else [] ) except TestConfigParsingError: exit(1) # exit right away to keep error message readable for users if not test_scenario_path: - all_tests = list({test.name: test for test in tests + plugin_tests}.values()) + all_tests = list({test.name: test for test in tests + hook_tests}.values()) return system, all_tests, None test_mapping = {t.name: t for t in tests} - plugin_test_scenario_mapping = {} + hook_test_scenario_mapping = {} if PLUGIN_ROOT.exists() and list(PLUGIN_ROOT.glob("*.toml")): try: - plugin_test_scenario_mapping = self.parse_plugins( - list(PLUGIN_ROOT.glob("*.toml")), {t.name: t for t in plugin_tests} + hook_test_scenario_mapping = self.parse_hooks( + list(PLUGIN_ROOT.glob("*.toml")), {t.name: t for t in hook_tests} ) except TestScenarioParsingError: exit(1) # exit right away to keep error message readable for users try: - test_scenario = self.parse_test_scenario(test_scenario_path, test_mapping, plugin_test_scenario_mapping) + test_scenario = self.parse_test_scenario(test_scenario_path, test_mapping, hook_test_scenario_mapping) except TestScenarioParsingError: exit(1) # exit right away to keep error message readable for users scenario_tests = {tr.test.name for tr in test_scenario.test_runs} - plugin_scenario_tests = { - tr.test.name - for plugin_scenario in plugin_test_scenario_mapping.values() - for tr in plugin_scenario.test_runs + hook_scenario_tests = { + tr.test.name for hook_scenario in hook_test_scenario_mapping.values() for tr in hook_scenario.test_runs } - relevant_test_names = scenario_tests.union(plugin_scenario_tests) - filtered_tests = [t for t in tests if t.name in relevant_test_names] + plugin_tests + relevant_test_names = scenario_tests.union(hook_scenario_tests) + filtered_tests = [t for t in tests if t.name in relevant_test_names] + hook_tests filtered_tests = list({test.name: test for test in filtered_tests}.values()) return system, filtered_tests, test_scenario @staticmethod - def parse_plugins(plugin_tomls: List[Path], test_mapping: Dict[str, Test]) -> Dict[str, TestScenario]: - plugin_mapping = {} - for plugin_test_scenario_path in plugin_tomls: - plugin_scenario = Parser.parse_test_scenario(plugin_test_scenario_path, test_mapping) - plugin_mapping[plugin_scenario.name] = plugin_scenario - return plugin_mapping + def parse_hooks(hook_tomls: List[Path], test_mapping: Dict[str, Test]) -> Dict[str, TestScenario]: + hook_mapping = {} + for hook_test_scenario_path in hook_tomls: + hook_scenario = Parser.parse_test_scenario(hook_test_scenario_path, test_mapping) + hook_mapping[hook_scenario.name] = hook_scenario + return hook_mapping @staticmethod def parse_test_scenario( test_scenario_path: Path, test_mapping: Dict[str, Test], - plugin_mapping: Optional[Dict[str, TestScenario]] = None, + hook_mapping: Optional[Dict[str, TestScenario]] = None, ) -> TestScenario: - if plugin_mapping is None: - plugin_mapping = {} + if hook_mapping is None: + hook_mapping = {} - test_scenario_parser = TestScenarioParser(test_scenario_path, test_mapping, plugin_mapping) + test_scenario_parser = TestScenarioParser(test_scenario_path, test_mapping, hook_mapping) test_scenario = test_scenario_parser.parse() return test_scenario diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index e4b2f0c3..10e5ef3e 100644 --- 
a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -60,18 +60,18 @@ def gen_exec_command(self, tr: TestRun) -> str: command_list = [] indent = "" - if tr.prologue: - prologue_command = self.gen_prologue(tr.prologue, tr.output_path) - command_list = [prologue_command, "if [ $PROLOGUE_SUCCESS -eq 1 ]; then"] + if tr.pre_test: + pre_test_command = self.gen_pre_test(tr.pre_test, tr.output_path) + command_list = [pre_test_command, "if [ $PROLOGUE_SUCCESS -eq 1 ]; then"] indent = " " command_list.append(f"{indent}{srun_command}") - if tr.epilogue: - epilogue_command = self.gen_epilogue(tr.epilogue, tr.output_path) - command_list.append(f"{indent}{epilogue_command}") + if tr.post_test: + post_test_command = self.gen_post_test(tr.post_test, tr.output_path) + command_list.append(f"{indent}{post_test_command}") - if tr.prologue: + if tr.pre_test: command_list.append("fi") full_command = "\n".join(command_list).strip() @@ -123,74 +123,74 @@ def job_name(self, job_name_prefix: str) -> str: job_name = f"{self.system.account}-{job_name_prefix}.{datetime.now().strftime('%Y%m%d_%H%M%S')}" return job_name - def gen_prologue(self, prologue: TestScenario, base_output_path: Path) -> str: + def gen_pre_test(self, pre_test: TestScenario, base_output_path: Path) -> str: """ - Generate the prologue command by running all tests defined in the prologue test scenario. + Generate the pre_test command by running all tests defined in the pre_test test scenario. Args: - prologue (TestScenario): The prologue test scenario containing the tests to be run. - base_output_path (Path): The base output directory path for storing prologue outputs. + pre_test (TestScenario): The pre_test test scenario containing the tests to be run. + base_output_path (Path): The base output directory path for storing pre_test outputs. Returns: - str: A string with all the Slurm srun commands generated for the prologue. + str: A string with all the Slurm srun commands generated for the pre_test. 
""" - prologue_output_dir = base_output_path / "prologue" - prologue_output_dir.mkdir(parents=True, exist_ok=True) + pre_test_output_dir = base_output_path / "pre_test" + pre_test_output_dir.mkdir(parents=True, exist_ok=True) - prologue_commands = [] + pre_test_commands = [] success_vars = [] - for idx, tr in enumerate(prologue.test_runs): - plugin_dir = prologue_output_dir / tr.test.name - plugin_dir.mkdir(parents=True, exist_ok=True) - tr.output_path = plugin_dir + for idx, tr in enumerate(pre_test.test_runs): + hook_dir = pre_test_output_dir / tr.test.name + hook_dir.mkdir(parents=True, exist_ok=True) + tr.output_path = hook_dir srun_command = tr.test.test_template.gen_srun_command(tr) srun_command_with_output = srun_command.replace( - "srun ", f"srun --output={plugin_dir / 'stdout.txt'} --error={plugin_dir / 'stderr.txt'} " + "srun ", f"srun --output={hook_dir / 'stdout.txt'} --error={hook_dir / 'stderr.txt'} " ) - prologue_commands.append(srun_command_with_output) + pre_test_commands.append(srun_command_with_output) success_var = f"SUCCESS_{idx}" success_vars.append(success_var) success_check_command = tr.test.test_template.gen_srun_success_check(tr) - prologue_commands.append(f"{success_var}=$({success_check_command})") + pre_test_commands.append(f"{success_var}=$({success_check_command})") combined_success_var = " && ".join([f"[ ${var} -eq 1 ]" for var in success_vars]) - prologue_commands.append(f"PROLOGUE_SUCCESS=$( {combined_success_var} && echo 1 || echo 0 )") + pre_test_commands.append(f"PROLOGUE_SUCCESS=$( {combined_success_var} && echo 1 || echo 0 )") - return "\n".join(prologue_commands) + return "\n".join(pre_test_commands) - def gen_epilogue(self, epilogue: TestScenario, base_output_path: Path) -> str: + def gen_post_test(self, post_test: TestScenario, base_output_path: Path) -> str: """ - Generate the epilogue command by running all tests defined in the epilogue test scenario. + Generate the post_test command by running all tests defined in the post_test test scenario. Args: - epilogue (TestScenario): The epilogue test scenario containing the tests to be run. - base_output_path (Path): The base output directory path for storing epilogue outputs. + post_test (TestScenario): The post_test test scenario containing the tests to be run. + base_output_path (Path): The base output directory path for storing post_test outputs. Returns: - str: A string with all the Slurm srun commands generated for the epilogue. + str: A string with all the Slurm srun commands generated for the post_test. 
""" - epilogue_output_dir = base_output_path / "epilogue" - epilogue_output_dir.mkdir(parents=True, exist_ok=True) + post_test_output_dir = base_output_path / "post_test" + post_test_output_dir.mkdir(parents=True, exist_ok=True) - epilogue_commands = [] + post_test_commands = [] - for tr in epilogue.test_runs: - plugin_dir = epilogue_output_dir / tr.test.name - plugin_dir.mkdir(parents=True, exist_ok=True) - tr.output_path = plugin_dir + for tr in post_test.test_runs: + hook_dir = post_test_output_dir / tr.test.name + hook_dir.mkdir(parents=True, exist_ok=True) + tr.output_path = hook_dir srun_command = tr.test.test_template.gen_srun_command(tr) srun_command_with_output = srun_command.replace( - "srun ", f"srun --output={plugin_dir / 'stdout.txt'} --error={plugin_dir / 'stderr.txt'} " + "srun ", f"srun --output={hook_dir / 'stdout.txt'} --error={hook_dir / 'stderr.txt'} " ) - epilogue_commands.append(srun_command_with_output) + post_test_commands.append(srun_command_with_output) - return "\n".join(epilogue_commands) + return "\n".join(post_test_commands) def _gen_srun_command( self, slurm_args: Dict[str, Any], env_vars: Dict[str, str], cmd_args: Dict[str, str], tr: TestRun diff --git a/tests/ref_data/gpt-no-plugin.sbatch b/tests/ref_data/gpt-no-hook.sbatch similarity index 100% rename from tests/ref_data/gpt-no-plugin.sbatch rename to tests/ref_data/gpt-no-hook.sbatch diff --git a/tests/ref_data/gpt-prologue.sbatch b/tests/ref_data/gpt-pre-test.sbatch similarity index 100% rename from tests/ref_data/gpt-prologue.sbatch rename to tests/ref_data/gpt-pre-test.sbatch diff --git a/tests/ref_data/grok-no-plugin.sbatch b/tests/ref_data/grok-no-hook.sbatch similarity index 100% rename from tests/ref_data/grok-no-plugin.sbatch rename to tests/ref_data/grok-no-hook.sbatch diff --git a/tests/ref_data/grok-prologue.sbatch b/tests/ref_data/grok-pre-test.sbatch similarity index 100% rename from tests/ref_data/grok-prologue.sbatch rename to tests/ref_data/grok-pre-test.sbatch diff --git a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py index e388f0d9..07b8f2e4 100644 --- a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py @@ -123,121 +123,121 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): @pytest.mark.parametrize( - "prologue,epilogue,expected_script_lines", + "pre_test,post_test,expected_script_lines", [ - # No prologue, no epilogue + # No pre_test, no post_test (None, None, ["srun"]), - # One prologue, no epilogue + # One pre_test, no post_test ( [Mock(test=Mock(name="test1", test_template=Mock()))], None, [ - "prologue", + "pre_test", "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", " srun", "fi", ], ), - # No prologue, one epilogue + # No pre_test, one post_test ( None, [Mock(test=Mock(name="test2", test_template=Mock()))], [ "srun", - "epilogue", + "post_test", ], ), - # One prologue, one epilogue + # One pre_test, one post_test ( [Mock(test=Mock(name="test1", test_template=Mock()))], [Mock(test=Mock(name="test2", test_template=Mock()))], [ - "prologue", + "pre_test", "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", " srun", - " epilogue", + " post_test", "fi", ], ), - # Multiple prologues, multiple epilogues + # Multiple pre_tests, multiple post_tests ( 
[Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], [Mock(test=Mock(name="test3", test_template=Mock())), Mock(test=Mock(name="test4", test_template=Mock()))], [ - "prologue", - "prologue", + "pre_test", + "pre_test", "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", " srun", - " epilogue", - " epilogue", + " post_test", + " post_test", "fi", ], ), - # Multiple prologues, no epilogue + # Multiple pre_tests, no post_test ( [Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], None, [ - "prologue", - "prologue", + "pre_test", + "pre_test", "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", " srun", "fi", ], ), - # No prologue, multiple epilogues + # No pre_test, multiple post_tests ( None, [Mock(test=Mock(name="test3", test_template=Mock())), Mock(test=Mock(name="test4", test_template=Mock()))], [ "srun", - "epilogue", - "epilogue", + "post_test", + "post_test", ], ), - # Multiple prologues, single epilogue + # Multiple pre_tests, single post_test ( [Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], [Mock(test=Mock(name="test3", test_template=Mock()))], [ - "prologue", - "prologue", + "pre_test", + "pre_test", "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", " srun", - " epilogue", + " post_test", "fi", ], ), ], ) -def test_prologue_epilogue_combinations( +def test_pre_test_post_test_combinations( strategy_fixture: SlurmCommandGenStrategy, testrun_fixture: TestRun, - prologue, - epilogue, + pre_test, + post_test, expected_script_lines, ): - testrun_fixture.prologue = Mock(spec=TestScenario) if prologue else None - testrun_fixture.epilogue = Mock(spec=TestScenario) if epilogue else None - - if prologue is not None: - testrun_fixture.prologue = Mock(spec=TestScenario) - testrun_fixture.prologue.test_runs = prologue - for idx, run in enumerate(prologue): - run.test.test_template.gen_srun_success_check.return_value = "prologue" + testrun_fixture.pre_test = Mock(spec=TestScenario) if pre_test else None + testrun_fixture.post_test = Mock(spec=TestScenario) if post_test else None + + if pre_test is not None: + testrun_fixture.pre_test = Mock(spec=TestScenario) + testrun_fixture.pre_test.test_runs = pre_test + for idx, run in enumerate(pre_test): + run.test.test_template.gen_srun_success_check.return_value = "pre_test" run.test.test_template.gen_srun_command.return_value = "srun" run.test.name = f"test{idx+1}" - if epilogue is not None: - testrun_fixture.epilogue = Mock(spec=TestScenario) - testrun_fixture.epilogue.test_runs = epilogue - for idx, run in enumerate(epilogue): - run.test.test_template.gen_srun_command.return_value = "epilogue" + if post_test is not None: + testrun_fixture.post_test = Mock(spec=TestScenario) + testrun_fixture.post_test.test_runs = post_test + for idx, run in enumerate(post_test): + run.test.test_template.gen_srun_command.return_value = "post_test" run.test.name = f"test{idx+1}" sbatch_command = strategy_fixture.gen_exec_command(testrun_fixture) diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index bb6f7897..862c59f6 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -60,7 +60,7 @@ def test_slurm(tmp_path: Path, scenario: Dict): 
system_config=Path("conf/common/system/example_slurm_cluster.toml"), test_templates_dir=Path("conf/common/test_template"), tests_dir=Path("conf/common/test"), - plugin_dir=Path("conf/common/plugin"), + hook_dir=Path("conf/common/hook"), test_scenario=test_scenario_path, output_dir=tmp_path, ) @@ -91,7 +91,7 @@ def partial_tr(slurm_system: SlurmSystem) -> partial[TestRun]: return partial(TestRun, num_nodes=1, nodes=[], output_path=slurm_system.output_path) -@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt-prologue", "gpt-no-plugin", "grok-prologue", "grok-no-plugin"]) +@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt-pre_test", "gpt-no-hook", "grok-pre_test", "grok-no-hook"]) def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -> tuple[TestRun, str, Optional[str]]: if request.param == "ucc": tr = partial_tr( @@ -159,8 +159,8 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict ) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - if "prologue" in request.param: - prologue_tr = partial_tr( + if "pre_test" in request.param: + pre_test_tr = partial_tr( name="nccl", test=Test( test_definition=NCCLTestDefinition( @@ -169,11 +169,11 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - test_template=NcclTest(slurm_system, name="nccl"), ), ) - prologue_tr.test.test_template.command_gen_strategy = NcclTestSlurmCommandGenStrategy( - slurm_system, prologue_tr.test.test_definition.cmd_args_dict + pre_test_tr.test.test_template.command_gen_strategy = NcclTestSlurmCommandGenStrategy( + slurm_system, pre_test_tr.test.test_definition.cmd_args_dict ) - prologue_tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - tr.prologue = TestScenario(name=f"{prologue_tr.name} NCCL Prologue", test_runs=[prologue_tr]) + pre_test_tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") + tr.pre_test = TestScenario(name=f"{pre_test_tr.name} NCCL Prologue", test_runs=[pre_test_tr]) return (tr, f"{request.param}.sbatch", "gpt.run") elif request.param.startswith("grok-"): @@ -194,8 +194,8 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict ) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - if "prologue" in request.param: - prologue_tr = partial_tr( + if "pre_test" in request.param: + pre_test_tr = partial_tr( name="nccl", test=Test( test_definition=NCCLTestDefinition( @@ -204,11 +204,11 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - test_template=NcclTest(slurm_system, name="nccl"), ), ) - prologue_tr.test.test_template.command_gen_strategy = NcclTestSlurmCommandGenStrategy( - slurm_system, prologue_tr.test.test_definition.cmd_args_dict + pre_test_tr.test.test_template.command_gen_strategy = NcclTestSlurmCommandGenStrategy( + slurm_system, pre_test_tr.test.test_definition.cmd_args_dict ) - prologue_tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - tr.prologue = TestScenario(name=f"{prologue_tr.name} NCCL Prologue", test_runs=[prologue_tr]) + pre_test_tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") + tr.pre_test = TestScenario(name=f"{pre_test_tr.name} NCCL Prologue", test_runs=[pre_test_tr]) return (tr, f"{request.param}.sbatch", "grok.run") diff --git 
a/tests/test_parser.py b/tests/test_parser.py index e662e9f7..3f901e0d 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -50,7 +50,7 @@ def test_no_scenario(self, test_parser: Mock, parser: Parser): @patch("cloudai._core.test_parser.TestParser.parse_all") @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") - def test_scenario_without_plugin(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): + def test_scenario_without_hook(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): tests_dir = parser.system_config_path.parent.parent / "test" fake_tests = [Mock(name=f"test-{i}") for i in range(3)] @@ -71,29 +71,29 @@ def test_scenario_without_plugin(self, test_scenario_parser: Mock, test_parser: @patch("cloudai._core.test_parser.TestParser.parse_all") @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") - @patch("cloudai.parser.Parser.parse_plugins") - def test_scenario_with_plugin_common_tests( - self, parse_plugins: Mock, test_scenario_parser: Mock, test_parser: Mock, parser: Parser + @patch("cloudai.parser.Parser.parse_hooks") + def test_scenario_with_hook_common_tests( + self, parse_hooks: Mock, test_scenario_parser: Mock, test_parser: Mock, parser: Parser ): tests_dir = parser.system_config_path.parent.parent / "test" main_tests = [Mock() for _ in range(3)] for i, test in enumerate(main_tests): test.name = f"test-{i}" - plugin_tests = [Mock()] - plugin_tests[0].name = "test-1" + hook_tests = [Mock()] + hook_tests[0].name = "test-1" - test_parser.side_effect = [main_tests, plugin_tests] + test_parser.side_effect = [main_tests, hook_tests] fake_scenario = Mock() fake_scenario.test_runs = [Mock()] fake_scenario.test_runs[0].test.name = "test-1" test_scenario_parser.return_value = fake_scenario - fake_plugin = Mock() - fake_plugin.test_runs = [Mock()] - fake_plugin.test_runs[0].test.name = "test-1" - parse_plugins.return_value = {"plugin-1": fake_plugin} + fake_hook = Mock() + fake_hook.test_runs = [Mock()] + fake_hook.test_runs[0].test.name = "test-1" + parse_hooks.return_value = {"hook-1": fake_hook} _, tests, _ = parser.parse(tests_dir, Path()) @@ -103,17 +103,17 @@ def test_scenario_with_plugin_common_tests( @patch("cloudai._core.test_parser.TestParser.parse_all") @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") - def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): + def test_scenario_with_hook_exclusive_tests(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): tests_dir = parser.system_config_path.parent.parent / "test" test_scenario_path = Path("/mock/test_scenario.toml") main_tests = [Mock() for _ in range(3)] - plugin_tests = [Mock()] + hook_tests = [Mock()] for i, test in enumerate(main_tests): test.name = f"test-{i}" - plugin_tests[0].name = "plugin-test-1" + hook_tests[0].name = "hook-test-1" - test_parser.side_effect = [main_tests, plugin_tests] + test_parser.side_effect = [main_tests, hook_tests] fake_scenario = Mock() fake_scenario.test_runs = [Mock()] @@ -125,7 +125,7 @@ def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, filtered_test_names = {t.name for t in filtered_tests} assert len(filtered_tests) == 2 assert "test-1" in filtered_test_names - assert "plugin-test-1" in filtered_test_names + assert "hook-test-1" in filtered_test_names assert "test-0" not in filtered_test_names assert "test-2" not in filtered_test_names From f53420cbc3c4d1bb7ee1bdabbd60292adf5091fc 
Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:08:11 -0600 Subject: [PATCH 54/64] Rename plugin to hook --- src/cloudai/_core/test_scenario_parser.py | 4 ++-- .../strategy/slurm_command_gen_strategy.py | 4 ++-- tests/ref_data/gpt-pre-test.sbatch | 8 ++++---- tests/ref_data/grok-pre-test.sbatch | 8 ++++---- .../test_common_slurm_command_gen_strategy.py | 20 +++++++++---------- tests/test_acceptance.py | 6 +++--- 6 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index 93047d29..ddfb5fa0 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -144,14 +144,14 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario: pre_test = self.hook_mapping.get(ts_model.pre_test) if pre_test is None: logging.warning( - f"Prologue '{ts_model.pre_test}' not found in hook mapping. " + f"Pre-test hook '{ts_model.pre_test}' not found in hook mapping. " "Ensure that a proper hook directory is set under the working directory." ) if ts_model.post_test: post_test = self.hook_mapping.get(ts_model.post_test) if post_test is None: logging.warning( - f"Epilogue '{ts_model.post_test}' not found in hook mapping. " + f"Post-test hook '{ts_model.post_test}' not found in hook mapping. " "Ensure that a proper hook directory is set under the working directory." ) diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index 10e5ef3e..8b03379d 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -62,7 +62,7 @@ def gen_exec_command(self, tr: TestRun) -> str: if tr.pre_test: pre_test_command = self.gen_pre_test(tr.pre_test, tr.output_path) - command_list = [pre_test_command, "if [ $PROLOGUE_SUCCESS -eq 1 ]; then"] + command_list = [pre_test_command, "if [ $PRE_TEST_SUCCESS -eq 1 ]; then"] indent = " " command_list.append(f"{indent}{srun_command}") @@ -159,7 +159,7 @@ def gen_pre_test(self, pre_test: TestScenario, base_output_path: Path) -> str: combined_success_var = " && ".join([f"[ ${var} -eq 1 ]" for var in success_vars]) - pre_test_commands.append(f"PROLOGUE_SUCCESS=$( {combined_success_var} && echo 1 || echo 0 )") + pre_test_commands.append(f"PRE_TEST_SUCCESS=$( {combined_success_var} && echo 1 || echo 0 )") return "\n".join(pre_test_commands) diff --git a/tests/ref_data/gpt-pre-test.sbatch b/tests/ref_data/gpt-pre-test.sbatch index 08a3a87a..c0f6114f 100644 --- a/tests/ref_data/gpt-pre-test.sbatch +++ b/tests/ref_data/gpt-pre-test.sbatch @@ -8,10 +8,10 @@ export COMBINE_THRESHOLD=1 export PER_GPU_COMBINE_THRESHOLD=0 export XLA_FLAGS="--xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD" -srun --output=__OUTPUT_DIR__/prologue/nccl/stdout.txt --error=__OUTPUT_DIR__/prologue/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 -SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/prologue/nccl/stdout.txt 
&& echo 1 || echo 0) -PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 ) -if [ $PROLOGUE_SUCCESS -eq 1 ]; then +srun --output=__OUTPUT_DIR__/pre_test/nccl/stdout.txt --error=__OUTPUT_DIR__/pre_test/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 +SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/pre_test/nccl/stdout.txt && echo 1 || echo 0) +PRE_TEST_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 ) +if [ $PRE_TEST_SUCCESS -eq 1 ]; then echo "Loading container with srun command" srun --mpi=none --container-image=https://docker/url --container-name=cont true echo "Running srun command" diff --git a/tests/ref_data/grok-pre-test.sbatch b/tests/ref_data/grok-pre-test.sbatch index e75d3d77..51730bd7 100644 --- a/tests/ref_data/grok-pre-test.sbatch +++ b/tests/ref_data/grok-pre-test.sbatch @@ -8,10 +8,10 @@ export COMBINE_THRESHOLD=1 export PER_GPU_COMBINE_THRESHOLD=0 export XLA_FLAGS="--xla_disable_hlo_passes=rematerialization --xla_dump_hlo_pass_re=.* --xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_pipelined_all_gather=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_gpu_enable_triton_gemm=false --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_graph_level=0 --xla_gpu_pgle_profile_file_or_directory_path=/opt/paxml/workspace/pgle_output_profile.pbtxt --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD --xla_gpu_run_post_layout_collective_pipeliner=false --xla_gpu_use_memcpy_local_p2p=false" -srun --output=__OUTPUT_DIR__/prologue/nccl/stdout.txt --error=__OUTPUT_DIR__/prologue/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 -SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/prologue/nccl/stdout.txt && echo 1 || echo 0) -PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 ) -if [ $PROLOGUE_SUCCESS -eq 1 ]; then +srun --output=__OUTPUT_DIR__/pre_test/nccl/stdout.txt --error=__OUTPUT_DIR__/pre_test/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 +SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/pre_test/nccl/stdout.txt && echo 1 || echo 0) +PRE_TEST_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 ) +if [ $PRE_TEST_SUCCESS -eq 1 ]; then echo "Loading container with srun command" srun --mpi=none --container-image=https://docker/url --container-name=cont true echo "Running srun command" diff --git 
a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py index 07b8f2e4..534d9cd1 100644 --- a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py @@ -133,8 +133,8 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): None, [ "pre_test", - "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", - "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + "PRE_TEST_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PRE_TEST_SUCCESS -eq 1 ]; then", " srun", "fi", ], @@ -154,8 +154,8 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): [Mock(test=Mock(name="test2", test_template=Mock()))], [ "pre_test", - "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", - "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + "PRE_TEST_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PRE_TEST_SUCCESS -eq 1 ]; then", " srun", " post_test", "fi", @@ -168,8 +168,8 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): [ "pre_test", "pre_test", - "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", - "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + "PRE_TEST_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PRE_TEST_SUCCESS -eq 1 ]; then", " srun", " post_test", " post_test", @@ -183,8 +183,8 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): [ "pre_test", "pre_test", - "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", - "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + "PRE_TEST_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PRE_TEST_SUCCESS -eq 1 ]; then", " srun", "fi", ], @@ -206,8 +206,8 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): [ "pre_test", "pre_test", - "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", - "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + "PRE_TEST_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PRE_TEST_SUCCESS -eq 1 ]; then", " srun", " post_test", "fi", diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 862c59f6..2e3f910c 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -91,7 +91,7 @@ def partial_tr(slurm_system: SlurmSystem) -> partial[TestRun]: return partial(TestRun, num_nodes=1, nodes=[], output_path=slurm_system.output_path) -@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt-pre_test", "gpt-no-hook", "grok-pre_test", "grok-no-hook"]) +@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt-pre-test", "gpt-no-hook", "grok-pre-test", "grok-no-hook"]) def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -> tuple[TestRun, str, Optional[str]]: if request.param == "ucc": tr = partial_tr( @@ -159,7 +159,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict ) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - if "pre_test" in request.param: + if "pre-test" in request.param: pre_test_tr = partial_tr( name="nccl", test=Test( @@ -194,7 +194,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict 
) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - if "pre_test" in request.param: + if "pre-test" in request.param: pre_test_tr = partial_tr( name="nccl", test=Test( From 904f377274ebe6403f82e9f7f03ee140182761db Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:10:05 -0600 Subject: [PATCH 55/64] Rename plugin to hook --- conf/hook/test/nccl_test_all_reduce.toml | 30 ------------------------ 1 file changed, 30 deletions(-) delete mode 100644 conf/hook/test/nccl_test_all_reduce.toml diff --git a/conf/hook/test/nccl_test_all_reduce.toml b/conf/hook/test/nccl_test_all_reduce.toml deleted file mode 100644 index 9074b2b8..00000000 --- a/conf/hook/test/nccl_test_all_reduce.toml +++ /dev/null @@ -1,30 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -name = "nccl_test_all_reduce" -description = "all_reduce" -test_template_name = "NcclTest" - -[cmd_args] -"subtest_name" = "all_reduce_perf_mpi" -"ngpus" = "1" -"minbytes" = "128" -"maxbytes" = "16G" -"iters" = "100" -"warmup_iters" = "50" - -[extra_cmd_args] -"--stepfactor" = "2" From 2c84d43f8b7b5fbde109414402849f812e8c1893 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:12:30 -0600 Subject: [PATCH 56/64] Rename plugin to hook --- conf/hook/nccl_test.toml | 2 +- src/cloudai/_core/test_scenario_parser.py | 4 ++-- .../slurm/strategy/slurm_command_gen_strategy.py | 14 +++++++------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/conf/hook/nccl_test.toml b/conf/hook/nccl_test.toml index 346dc8e4..53349c43 100644 --- a/conf/hook/nccl_test.toml +++ b/conf/hook/nccl_test.toml @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -name = "nccl_test_epilogue" +name = "nccl_test" [[Tests]] id = "Tests.1" diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index ddfb5fa0..3dbaf133 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -185,8 +185,8 @@ def _create_test_run( Args: test_info (Dict[str, Any]): Information of the test. normalized_weight (float): Normalized weight for the test. - pre_test (Optional[TestScenario]): TestScenario object representing the pre_test sequence. - post_test (Optional[TestScenario]): TestScenario object representing the post_test sequence. + pre_test (Optional[TestScenario]): TestScenario object representing the pre-test sequence. + post_test (Optional[TestScenario]): TestScenario object representing the post-test sequence. Returns: Test: Copied and updated Test object for the section. 
diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index 8b03379d..ee8a463a 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -125,11 +125,11 @@ def job_name(self, job_name_prefix: str) -> str: def gen_pre_test(self, pre_test: TestScenario, base_output_path: Path) -> str: """ - Generate the pre_test command by running all tests defined in the pre_test test scenario. + Generate the pre-test command by running all tests defined in the pre-test test scenario. Args: - pre_test (TestScenario): The pre_test test scenario containing the tests to be run. - base_output_path (Path): The base output directory path for storing pre_test outputs. + pre_test (TestScenario): The pre-test test scenario containing the tests to be run. + base_output_path (Path): The base output directory path for storing pre-test outputs. Returns: str: A string with all the Slurm srun commands generated for the pre_test. @@ -165,14 +165,14 @@ def gen_pre_test(self, pre_test: TestScenario, base_output_path: Path) -> str: def gen_post_test(self, post_test: TestScenario, base_output_path: Path) -> str: """ - Generate the post_test command by running all tests defined in the post_test test scenario. + Generate the post-test command by running all tests defined in the post-test test scenario. Args: - post_test (TestScenario): The post_test test scenario containing the tests to be run. - base_output_path (Path): The base output directory path for storing post_test outputs. + post_test (TestScenario): The post-test test scenario containing the tests to be run. + base_output_path (Path): The base output directory path for storing post-test outputs. Returns: - str: A string with all the Slurm srun commands generated for the post_test. + str: A string with all the Slurm srun commands generated for the post-test. 
""" post_test_output_dir = base_output_path / "post_test" post_test_output_dir.mkdir(parents=True, exist_ok=True) From b598779e1b10ed17669bc785bba0e12a24238976 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:20:03 -0600 Subject: [PATCH 57/64] Rename plugin to hook --- tests/test_acceptance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 2e3f910c..d1e57782 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -173,7 +173,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, pre_test_tr.test.test_definition.cmd_args_dict ) pre_test_tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - tr.pre_test = TestScenario(name=f"{pre_test_tr.name} NCCL Prologue", test_runs=[pre_test_tr]) + tr.pre_test = TestScenario(name=f"{pre_test_tr.name} NCCL pre-test", test_runs=[pre_test_tr]) return (tr, f"{request.param}.sbatch", "gpt.run") elif request.param.startswith("grok-"): @@ -208,7 +208,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, pre_test_tr.test.test_definition.cmd_args_dict ) pre_test_tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - tr.pre_test = TestScenario(name=f"{pre_test_tr.name} NCCL Prologue", test_runs=[pre_test_tr]) + tr.pre_test = TestScenario(name=f"{pre_test_tr.name} NCCL pre-test", test_runs=[pre_test_tr]) return (tr, f"{request.param}.sbatch", "grok.run") From de1c1a6ff687e13fde970f623ea8914bdd7a3c9f Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:23:02 -0600 Subject: [PATCH 58/64] Raise an exception when hooks are not found --- src/cloudai/_core/test_scenario_parser.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index 3dbaf133..40840872 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -143,17 +143,22 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario: if ts_model.pre_test: pre_test = self.hook_mapping.get(ts_model.pre_test) if pre_test is None: - logging.warning( + msg = ( f"Pre-test hook '{ts_model.pre_test}' not found in hook mapping. " "Ensure that a proper hook directory is set under the working directory." ) + logging.error(msg) + raise TestScenarioParsingError(msg) + if ts_model.post_test: post_test = self.hook_mapping.get(ts_model.post_test) if post_test is None: - logging.warning( + msg = ( f"Post-test hook '{ts_model.post_test}' not found in hook mapping. " "Ensure that a proper hook directory is set under the working directory." 
) + logging.error(msg) + raise TestScenarioParsingError(msg) test_runs_by_id: dict[str, TestRun] = { tr.id: self._create_test_run(tr, normalized_weight, pre_test, post_test) for tr in ts_model.tests From 70e8fd77912c26299399ca8bf2cac931af9804ba Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:51:37 -0600 Subject: [PATCH 59/64] Fix verify-configs errors --- src/cloudai/cli/handlers.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index 6105bc24..86078b2e 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -212,7 +212,11 @@ def verify_test_configs(test_tomls: List[Path]) -> int: def verify_test_scenarios( - scenario_tomls: List[Path], test_tomls: list[Path], system_config: Optional[Path] = None + scenario_tomls: List[Path], + test_tomls: list[Path], + hook_tomls: List[Path], + hook_test_tomls: list[Path], + system_config: Optional[Path] = None, ) -> int: system = Mock(spec=System) if system_config: @@ -225,7 +229,9 @@ def verify_test_scenarios( logging.debug(f"Verifying Test Scenario: {scenario_file}...") try: tests = Parser.parse_tests(test_tomls, system) - Parser.parse_test_scenario(scenario_file, {t.name: t for t in tests}) + hook_tests = Parser.parse_tests(hook_test_tomls, system) + hooks = Parser.parse_hooks(hook_tomls, {t.name: t for t in hook_tests}) + Parser.parse_test_scenario(scenario_file, {t.name: t for t in tests}, hooks) except Exception: nfailed += 1 @@ -259,7 +265,9 @@ def handle_verify_all_configs(args: argparse.Namespace) -> int: if files["test"]: nfailed += verify_test_configs(files["test"]) if files["scenario"]: - nfailed += verify_test_scenarios(files["scenario"], test_tomls, args.system_config) + nfailed += verify_test_scenarios( + files["scenario"], test_tomls, files["hook"], files["hook_test"], args.system_config + ) if files["unknown"]: logging.error(f"Unknown configuration files: {[str(f) for f in files['unknown']]}") nfailed += len(files["unknown"]) @@ -273,10 +281,22 @@ def handle_verify_all_configs(args: argparse.Namespace) -> int: def load_tomls_by_type(tomls: List[Path]) -> dict[str, List[Path]]: - files: dict[str, List[Path]] = {"system": [], "test": [], "scenario": [], "unknown": []} + files: dict[str, List[Path]] = { + "system": [], + "test": [], + "scenario": [], + "hook_test": [], + "hook": [], + "unknown": [], + } for toml_file in tomls: content = toml_file.read_text() - if "scheduler =" in content: + if "conf" in toml_file.parts and "hook" in toml_file.parts: + if "test" in toml_file.parts: + files["hook_test"].append(toml_file) + else: + files["hook"].append(toml_file) + elif "scheduler =" in content: files["system"].append(toml_file) elif "test_template_name =" in content: files["test"].append(toml_file) From b852bb8967575eda306afbb2ede9ef65e443e0a0 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 5 Nov 2024 08:08:47 -0500 Subject: [PATCH 60/64] Reflect Andrei's comments --- src/cloudai/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index 950eff6c..35a0e6a1 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -82,7 +82,7 @@ def parse( exit(1) # exit right away to keep error message readable for users if not PLUGIN_ROOT.exists(): - logging.debug(f"PLUGIN_ROOT path '{PLUGIN_ROOT}' does not exist. 
Plugins will not be enabled.") + logging.debug(f"PLUGIN_ROOT path '{PLUGIN_ROOT}' does not exist.") try: hook_tests = ( From 701cf94923a2939c49866a8f7d3b267875d9ff40 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 5 Nov 2024 08:12:50 -0500 Subject: [PATCH 61/64] Reflect Andrei's comments --- src/cloudai/_core/test_scenario_parser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index 40840872..d111d002 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -145,6 +145,7 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario: if pre_test is None: msg = ( f"Pre-test hook '{ts_model.pre_test}' not found in hook mapping. " + "A corresponding hook should exist under 'conf/hook'. " "Ensure that a proper hook directory is set under the working directory." ) logging.error(msg) @@ -155,6 +156,7 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario: if post_test is None: msg = ( f"Post-test hook '{ts_model.post_test}' not found in hook mapping. " + "A corresponding hook should exist under 'conf/hook'. " "Ensure that a proper hook directory is set under the working directory." ) logging.error(msg) From 8c0cbb51ca900eaea1c9f6c5484a9d71d442f767 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 5 Nov 2024 08:15:01 -0500 Subject: [PATCH 62/64] Rename plugin to hook --- src/cloudai/parser.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index 35a0e6a1..6f59f9a3 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -34,8 +34,8 @@ format_validation_error, ) -PLUGIN_ROOT = Path("conf/hook") -PLUGIN_TEST_ROOT = PLUGIN_ROOT / "test" +HOOK_ROOT = Path("conf/hook") +HOOK_TEST_ROOT = HOOK_ROOT / "test" class Parser: @@ -81,12 +81,12 @@ def parse( except TestConfigParsingError: exit(1) # exit right away to keep error message readable for users - if not PLUGIN_ROOT.exists(): - logging.debug(f"PLUGIN_ROOT path '{PLUGIN_ROOT}' does not exist.") + if not HOOK_ROOT.exists(): + logging.debug(f"HOOK_ROOT path '{HOOK_ROOT}' does not exist.") try: hook_tests = ( - self.parse_tests(list(PLUGIN_TEST_ROOT.glob("*.toml")), system) if PLUGIN_TEST_ROOT.exists() else [] + self.parse_tests(list(HOOK_TEST_ROOT.glob("*.toml")), system) if HOOK_TEST_ROOT.exists() else [] ) except TestConfigParsingError: exit(1) # exit right away to keep error message readable for users @@ -97,10 +97,10 @@ def parse( test_mapping = {t.name: t for t in tests} hook_test_scenario_mapping = {} - if PLUGIN_ROOT.exists() and list(PLUGIN_ROOT.glob("*.toml")): + if HOOK_ROOT.exists() and list(HOOK_ROOT.glob("*.toml")): try: hook_test_scenario_mapping = self.parse_hooks( - list(PLUGIN_ROOT.glob("*.toml")), {t.name: t for t in hook_tests} + list(HOOK_ROOT.glob("*.toml")), {t.name: t for t in hook_tests} ) except TestScenarioParsingError: exit(1) # exit right away to keep error message readable for users From 526aecbba3cfd3e5e852e3f20f6c8081b3e0d7a2 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 5 Nov 2024 08:17:11 -0500 Subject: [PATCH 63/64] Fix verify-configs errors --- src/cloudai/cli/handlers.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index 86078b2e..f837e681 100644 --- 
a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -23,6 +23,8 @@ from cloudai import Installable, Parser, Registry, ReportGenerator, Runner, System +from ..parser import HOOK_ROOT + def handle_install_and_uninstall(args: argparse.Namespace) -> int: """ @@ -249,6 +251,11 @@ def handle_verify_all_configs(args: argparse.Namespace) -> int: if err: return err + err, hook_tomls = expand_file_list(HOOK_ROOT, glob="**/*.toml") + if err: + return err + tomls += hook_tomls + files = load_tomls_by_type(tomls) test_tomls = files["test"] From 04430e4d2ee3aa7008f07ad29fa26ffb8dc95279 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 5 Nov 2024 08:56:14 -0500 Subject: [PATCH 64/64] Reflect Andrei's comments --- src/cloudai/cli/handlers.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index f837e681..30fb7a90 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -252,8 +252,6 @@ def handle_verify_all_configs(args: argparse.Namespace) -> int: return err err, hook_tomls = expand_file_list(HOOK_ROOT, glob="**/*.toml") - if err: - return err tomls += hook_tomls files = load_tomls_by_type(tomls) @@ -298,12 +296,22 @@ def load_tomls_by_type(tomls: List[Path]) -> dict[str, List[Path]]: } for toml_file in tomls: content = toml_file.read_text() - if "conf" in toml_file.parts and "hook" in toml_file.parts: + + is_in_hook_root = False + try: + toml_file.relative_to(HOOK_ROOT) + is_in_hook_root = True + except ValueError: + pass + + if is_in_hook_root: if "test" in toml_file.parts: files["hook_test"].append(toml_file) else: files["hook"].append(toml_file) - elif "scheduler =" in content: + continue + + if "scheduler =" in content: files["system"].append(toml_file) elif "test_template_name =" in content: files["test"].append(toml_file)