From 2b6181b6fecd543f8914424fe9ccd7c60dff7b21 Mon Sep 17 00:00:00 2001 From: Taekyung Heo Date: Wed, 9 Oct 2024 09:55:52 -0400 Subject: [PATCH] Remove assigning null when the value is null (NeMo launcher) (#250) * Remove assigning null when the value is null * Reflect Andrei's comments --- .../slurm_command_gen_strategy.py | 16 ++----------- src/cloudai/test_definitions/nemo_launcher.py | 1 - .../test_nemo_launcher.py | 24 +++---------------- 3 files changed, 5 insertions(+), 36 deletions(-) diff --git a/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py index 80ac228d..f8167d63 100644 --- a/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py @@ -135,18 +135,9 @@ def _set_node_config(self, nodes: List[str], num_nodes: int) -> None: self.final_cmd_args["training.trainer.num_nodes"] = str(len(nodes)) if nodes else num_nodes def _validate_data_config(self) -> None: - """Validate the data directory and prefix configuration for non-mock environments.""" + """Validate the data prefix configuration for non-mock environments.""" if self.final_cmd_args.get("training.model.data.data_impl") != "mock": - data_dir = self.final_cmd_args.get("data_dir") data_prefix = self.final_cmd_args.get("training.model.data.data_prefix") - - if not data_dir or data_dir == "~": - raise ValueError( - "The 'data_dir' field of the NeMo launcher test contains an invalid placeholder '~'. " - "Please provide a valid path to the dataset in the test schema TOML file. " - "The 'data_dir' field must point to an actual dataset location." - ) - if data_prefix == "[]": raise ValueError( "The 'data_prefix' field of the NeMo launcher test is missing or empty. " @@ -198,10 +189,7 @@ def _generate_cmd_args_str(self, args: Dict[str, str], nodes: List[str]) -> str: value = f"\\'{value}\\'" env_var_str_parts.append(f"+{key}={value}") else: - if value == "~": - cmd_arg_str_parts.append(f"~{key}=null") - else: - cmd_arg_str_parts.append(f"{key}={value}") + cmd_arg_str_parts.append(f"{key}={value}") if nodes: nodes_str = ",".join(nodes) diff --git a/src/cloudai/test_definitions/nemo_launcher.py b/src/cloudai/test_definitions/nemo_launcher.py index c6214c89..86baff1b 100644 --- a/src/cloudai/test_definitions/nemo_launcher.py +++ b/src/cloudai/test_definitions/nemo_launcher.py @@ -97,7 +97,6 @@ class NeMoLauncherCmdArgs(CmdArgs): repository_commit_hash: str = "cf411a9ede3b466677df8ee672bcc6c396e71e1a" docker_image_url: str = "nvcr.io/nvidia/nemo:24.01.01" stages: str = '["training"]' - data_dir: str = "~" numa_mapping: NumaMapping = Field(default_factory=NumaMapping) cluster: Cluster = Field(default_factory=Cluster) training: Training = Field(default_factory=Training) diff --git a/tests/slurm_command_gen_strategy/test_nemo_launcher.py b/tests/slurm_command_gen_strategy/test_nemo_launcher.py index ccb07d17..a1d286b8 100644 --- a/tests/slurm_command_gen_strategy/test_nemo_launcher.py +++ b/tests/slurm_command_gen_strategy/test_nemo_launcher.py @@ -135,21 +135,13 @@ def test_gpus_per_node_value(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStra assert "cluster.gpus_per_node=null" in cmd - def test_argument_with_tilde_value(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStrategy, nemo_test_run: TestRun): - tdef: NeMoLauncherTestDefinition = cast(NeMoLauncherTestDefinition, nemo_test_run.test.test_definition) - tdef.cmd_args.training.model.micro_batch_size = "~" # type: ignore - - cmd = nemo_cmd_gen.gen_exec_command(nemo_test_run) - assert "~training.model.micro_batch_size=null" in cmd - def test_data_impl_mock_skips_checks( self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStrategy, nemo_test_run: TestRun ): tdef: NeMoLauncherTestDefinition = cast(NeMoLauncherTestDefinition, nemo_test_run.test.test_definition) - tdef.cmd_args.data_dir = "DATA_DIR" - + tdef.extra_cmd_args = {"data_dir": "DATA_DIR"} cmd = nemo_cmd_gen.gen_exec_command(nemo_test_run) - assert "data_dir" in cmd + assert "data_dir=DATA_DIR" in cmd def test_data_dir_and_data_prefix_validation( self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStrategy, nemo_test_run: TestRun @@ -157,18 +149,8 @@ def test_data_dir_and_data_prefix_validation( tdef: NeMoLauncherTestDefinition = cast(NeMoLauncherTestDefinition, nemo_test_run.test.test_definition) tdef.cmd_args.training.model.data.data_impl = "not_mock" tdef.cmd_args.training.model.data.data_prefix = "[]" + tdef.extra_cmd_args = {"data_dir": "DATA_DIR"} - with pytest.raises( - ValueError, - match=( - "The 'data_dir' field of the NeMo launcher test contains an invalid placeholder '~'. " - "Please provide a valid path to the dataset in the test schema TOML file. " - "The 'data_dir' field must point to an actual dataset location." - ), - ): - nemo_cmd_gen.gen_exec_command(nemo_test_run) - - tdef.cmd_args.data_dir = "/fake/data_dir" with pytest.raises(ValueError, match="The 'data_prefix' field of the NeMo launcher test is missing or empty."): nemo_cmd_gen.gen_exec_command(nemo_test_run)