NVIDIA · TaekyungHeo · Oct 9, 2024 · Oct 8, 2024 · Oct 8, 2024 · Oct 8, 2024
diff --git a/tests/slurm_command_gen_strategy/test_chakra_replay.py b/tests/slurm_command_gen_strategy/test_chakra_replay.py
@@ -0,0 +1,135 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List
+
+import pytest
+from cloudai.schema.test_template.chakra_replay.slurm_command_gen_strategy import ChakraReplaySlurmCommandGenStrategy
+from cloudai.systems import SlurmSystem
+
+
+class TestChakraReplaySlurmCommandGenStrategy:
+    """Unit tests for Slurm argument parsing and test-command generation of the Chakra replay strategy."""
+
+    @pytest.fixture
+    def cmd_gen_strategy(self, slurm_system: SlurmSystem) -> ChakraReplaySlurmCommandGenStrategy:
+        """Build the strategy under test from the ``slurm_system`` fixture and an empty second argument."""
+        return ChakraReplaySlurmCommandGenStrategy(slurm_system, {})
+
+    @pytest.mark.parametrize(
+        "job_name_prefix, env_vars, cmd_args, num_nodes, nodes, expected_result",
+        [
+            (
+                "chakra_replay",
+                {"NCCL_DEBUG": "INFO"},
+                {"docker_image_url": "fake_image_url", "trace_path": "/workspace/traces/"},
+                2,
+                ["node1", "node2"],
+                {
+                    "image_path": "fake_image_url",
+                    "container_mounts": "/workspace/traces/:/workspace/traces/",
+                },
+            ),
+            (
+                "chakra_replay",
+                {"NCCL_DEBUG": "INFO"},
+                {"docker_image_url": "another_image_url", "trace_path": "/another/trace_path/"},
+                1,
+                ["node1"],
+                {
+                    "image_path": "another_image_url",
+                    "container_mounts": "/another/trace_path/:/another/trace_path/",
+                },
+            ),
+        ],
+    )
+    def test_parse_slurm_args(
+        self,
+        cmd_gen_strategy: ChakraReplaySlurmCommandGenStrategy,
+        job_name_prefix: str,
+        env_vars: Dict[str, str],
+        cmd_args: Dict[str, str],
+        num_nodes: int,
+        nodes: List[str],
+        expected_result: Dict[str, Any],
+    ) -> None:
+        """Verify the parsed ``image_path`` and ``container_mounts`` for each parametrized case."""
+        slurm_args = cmd_gen_strategy._parse_slurm_args(job_name_prefix, env_vars, cmd_args, num_nodes, nodes)
+        assert slurm_args["image_path"] == expected_result["image_path"]
+        assert slurm_args["container_mounts"] == expected_result["container_mounts"]
+
+    def test_parse_slurm_args_invalid_cmd_args(self, cmd_gen_strategy: ChakraReplaySlurmCommandGenStrategy) -> None:
+        """Verify that omitting ``docker_image_url`` from ``cmd_args`` raises ``KeyError`` naming that key."""
+        job_name_prefix = "chakra_replay"
+        env_vars = {"NCCL_DEBUG": "INFO"}
+        cmd_args = {"trace_path": "/workspace/traces/"}  # Missing "docker_image_url"
+        num_nodes = 2
+        nodes = ["node1", "node2"]
+
+        with pytest.raises(KeyError) as exc_info:
+            cmd_gen_strategy._parse_slurm_args(job_name_prefix, env_vars, cmd_args, num_nodes, nodes)
+
+        assert str(exc_info.value) == "'docker_image_url'", "Expected missing docker_image_url key"
+
+    @pytest.mark.parametrize(
+        "cmd_args, extra_cmd_args, expected_result",
+        [
+            (
+                {"trace_type": "comms_trace", "trace_path": "/workspace/traces/", "backend": "nccl", "device": "gpu"},
+                "--max-steps 100",
+                [
+                    "python /workspace/param/train/comms/pt/commsTraceReplay.py",
+                    "--trace-type comms_trace",
+                    "--trace-path /workspace/traces/",
+                    "--backend nccl",
+                    "--device gpu",
+                    "--max-steps 100",
+                ],
+            ),
+            (
+                {"trace_type": "comms_trace", "trace_path": "/workspace/traces/", "backend": "nccl", "device": "gpu"},
+                "",
+                [
+                    "python /workspace/param/train/comms/pt/commsTraceReplay.py",
+                    "--trace-type comms_trace",
+                    "--trace-path /workspace/traces/",
+                    "--backend nccl",
+                    "--device gpu",
+                    "",  # an empty extra_cmd_args is expected to yield a trailing empty item
+                ],
+            ),
+        ],
+    )
+    def test_generate_test_command(
+        self,
+        cmd_gen_strategy: ChakraReplaySlurmCommandGenStrategy,
+        cmd_args: Dict[str, str],
+        extra_cmd_args: str,
+        expected_result: List[str],
+    ) -> None:
+        """Verify that the generated command parts match the expected list for each parametrized case."""
+        command = cmd_gen_strategy.generate_test_command({}, cmd_args, extra_cmd_args)
+        assert command == expected_result
+
+    def test_generate_test_command_invalid_args(self, cmd_gen_strategy: ChakraReplaySlurmCommandGenStrategy) -> None:
+        """Verify that omitting ``trace_path`` from ``cmd_args`` raises ``KeyError`` naming that key."""
+        cmd_args: Dict[str, str] = {"trace_type": "comms_trace", "backend": "nccl", "device": "gpu"}
+        extra_cmd_args: str = "--max-steps 100"
+
+        with pytest.raises(KeyError) as exc_info:
+            cmd_gen_strategy.generate_test_command({}, cmd_args, extra_cmd_args)
+
+        assert str(exc_info.value) == "'trace_path'", "Expected missing trace_path key"
diff --git a/tests/slurm_command_gen_strategy/test_nccl.py b/tests/slurm_command_gen_strategy/test_nccl.py
@@ -14,38 +14,91 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import Any, Dict, List
 
+import pytest
 from cloudai.schema.test_template.nccl_test.slurm_command_gen_strategy import NcclTestSlurmCommandGenStrategy
 from cloudai.systems import SlurmSystem
 
 
-class TestNCCLSlurmCommandGen:
-    def get_cmd(self, slurm_system: SlurmSystem, slurm_args: dict, cmd_args: dict) -> str:
-        return NcclTestSlurmCommandGenStrategy(slurm_system, {}).generate_srun_command(slurm_args, {}, cmd_args, "")
+class TestNcclTestSlurmCommandGenStrategy:
+    """Unit tests for Slurm argument parsing and test-command generation of the NCCL test strategy."""
+
+    @pytest.fixture
+    def cmd_gen_strategy(self, slurm_system: SlurmSystem) -> NcclTestSlurmCommandGenStrategy:
+        """Build the strategy under test from the ``slurm_system`` fixture and an empty second argument."""
+        return NcclTestSlurmCommandGenStrategy(slurm_system, {})
 
-    def test_only_mandatory(self, slurm_system: SlurmSystem) -> None:
-        slurm_args = {"image_path": "fake_image_path"}
-        cmd_args = {"subtest_name": "fake_subtest_name"}
-        cmd = self.get_cmd(slurm_system, slurm_args, cmd_args)
-        assert cmd == " \\\n".join(
-            [
-                "srun",
-                f"--mpi={slurm_system.mpi}",
-                f"--container-image={slurm_args['image_path']}",
-                f"/usr/local/bin/{cmd_args['subtest_name']}",
-            ]
-        )
+    @pytest.mark.parametrize(
+        "job_name_prefix, env_vars, cmd_args, num_nodes, nodes, expected_result",
+        [
+            (
+                "nccl_test",
+                {"NCCL_TOPO_FILE": "/path/to/topo", "DOCKER_NCCL_TOPO_FILE": "/docker/topo"},
+                {"subtest_name": "all_reduce_perf", "docker_image_url": "fake_image_url"},
+                2,
+                ["node1", "node2"],
+                {
+                    "container_mounts": "/path/to/topo:/docker/topo",
+                },
+            ),
+            (
+                "nccl_test",
+                {"NCCL_TOPO_FILE": "/path/to/topo"},
+                {"subtest_name": "all_reduce_perf", "docker_image_url": "another_image_url"},
+                1,
+                ["node1"],
+                {
+                    "container_mounts": "",  # no DOCKER_NCCL_TOPO_FILE given, so no mount is expected
+                },
+            ),
+        ],
+    )
+    def test_parse_slurm_args(
+        self,
+        cmd_gen_strategy: NcclTestSlurmCommandGenStrategy,
+        job_name_prefix: str,
+        env_vars: Dict[str, str],
+        cmd_args: Dict[str, str],
+        num_nodes: int,
+        nodes: List[str],
+        expected_result: Dict[str, Any],
+    ) -> None:
+        """Verify the ``container_mounts`` value produced for each parametrized case."""
+        parsed_args = cmd_gen_strategy._parse_slurm_args(job_name_prefix, env_vars, cmd_args, num_nodes, nodes)
+        assert parsed_args["container_mounts"] == expected_result["container_mounts"]
 
-    def test_with_container_mounts(self, slurm_system: SlurmSystem) -> None:
-        slurm_args = {"image_path": "fake_image_path", "container_mounts": "fake_mounts"}
-        cmd_args = {"subtest_name": "fake_subtest_name"}
-        cmd = self.get_cmd(slurm_system, slurm_args, cmd_args)
-        assert cmd == " \\\n".join(
-            [
-                "srun",
-                f"--mpi={slurm_system.mpi}",
-                f"--container-image={slurm_args['image_path']}",
-                f"--container-mounts={slurm_args['container_mounts']}",
-                f"/usr/local/bin/{cmd_args['subtest_name']}",
-            ]
-        )
+    @pytest.mark.parametrize(
+        "cmd_args, extra_cmd_args, expected_command",
+        [
+            (
+                {"subtest_name": "all_reduce_perf", "nthreads": "4", "ngpus": "2"},
+                "--max-steps 100",
+                [
+                    "/usr/local/bin/all_reduce_perf",
+                    "--nthreads 4",
+                    "--ngpus 2",
+                    "--max-steps 100",
+                ],
+            ),
+            (
+                {"subtest_name": "all_reduce_perf", "op": "sum", "datatype": "float"},
+                "",
+                [
+                    "/usr/local/bin/all_reduce_perf",
+                    "--op sum",
+                    "--datatype float",
+                ],
+            ),
+        ],
+    )
+    def test_generate_test_command(
+        self,
+        cmd_gen_strategy: NcclTestSlurmCommandGenStrategy,
+        cmd_args: Dict[str, str],
+        extra_cmd_args: str,
+        expected_command: List[str],
+    ) -> None:
+        """Verify the generated command parts, given no environment variables, for each parametrized case."""
+        generated = cmd_gen_strategy.generate_test_command({}, cmd_args, extra_cmd_args)
+        assert generated == expected_command
diff --git a/tests/slurm_command_gen_strategy/test_sleep.py b/tests/slurm_command_gen_strategy/test_sleep.py
@@ -0,0 +1,48 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List
+
+import pytest
+from cloudai.schema.test_template.sleep.slurm_command_gen_strategy import SleepSlurmCommandGenStrategy
+from cloudai.systems import SlurmSystem
+
+
+class TestSleepSlurmCommandGenStrategy:
+    """Unit tests for test-command generation of the sleep strategy."""
+
+    @pytest.fixture
+    def cmd_gen_strategy(self, slurm_system: SlurmSystem) -> SleepSlurmCommandGenStrategy:
+        """Build the strategy under test from the ``slurm_system`` fixture and an empty second argument."""
+        return SleepSlurmCommandGenStrategy(slurm_system, {})
+
+    @pytest.mark.parametrize(
+        "cmd_args, expected_command",
+        [
+            ({"seconds": "60"}, ["sleep 60"]),
+            ({"seconds": "120"}, ["sleep 120"]),
+        ],
+    )
+    def test_generate_test_command(
+        self,
+        cmd_gen_strategy: SleepSlurmCommandGenStrategy,
+        cmd_args: Dict[str, str],
+        expected_command: List[str],
+    ) -> None:
+        """Verify that the ``seconds`` argument yields the expected single ``sleep`` command item."""
+        # No environment variables and no extra command-line arguments are supplied.
+        generated = cmd_gen_strategy.generate_test_command({}, cmd_args, "")
+        assert generated == expected_command
diff --git a/tests/slurm_command_gen_strategy/test_ucc.py b/tests/slurm_command_gen_strategy/test_ucc.py
@@ -0,0 +1,70 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List
+
+import pytest
+from cloudai.schema.test_template.ucc_test.slurm_command_gen_strategy import UCCTestSlurmCommandGenStrategy
+from cloudai.systems import SlurmSystem
+
+
+class TestUCCTestSlurmCommandGenStrategy:
+    """Unit tests for test-command generation of the UCC test strategy."""
+
+    @pytest.fixture
+    def cmd_gen_strategy(self, slurm_system: SlurmSystem) -> UCCTestSlurmCommandGenStrategy:
+        """Build the strategy under test from the ``slurm_system`` fixture and an empty second argument."""
+        return UCCTestSlurmCommandGenStrategy(slurm_system, {})
+
+    @pytest.mark.parametrize(
+        "cmd_args, extra_cmd_args, expected_command",
+        [
+            (
+                {"collective": "allgather", "b": "8", "e": "256M"},
+                "--max-steps 100",
+                [
+                    "/opt/hpcx/ucc/bin/ucc_perftest",
+                    "-c allgather",
+                    "-b 8",
+                    "-e 256M",
+                    "-m cuda",  # expected even though cmd_args carries no such entry
+                    "-F",  # expected even though cmd_args carries no such entry
+                    "--max-steps 100",
+                ],
+            ),
+            (
+                {"collective": "allreduce", "b": "4"},
+                "",
+                [
+                    "/opt/hpcx/ucc/bin/ucc_perftest",
+                    "-c allreduce",
+                    "-b 4",
+                    "-m cuda",
+                    "-F",
+                ],
+            ),
+        ],
+    )
+    def test_generate_test_command(
+        self,
+        cmd_gen_strategy: UCCTestSlurmCommandGenStrategy,
+        cmd_args: Dict[str, str],
+        extra_cmd_args: str,
+        expected_command: List[str],
+    ) -> None:
+        """Verify the generated ``ucc_perftest`` command parts, given no environment variables, for each case."""
+        generated = cmd_gen_strategy.generate_test_command({}, cmd_args, extra_cmd_args)
+        assert generated == expected_command