From bf1d9fb497286a504e45968290f14b49b982f250 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 22 Oct 2024 12:48:01 -0400 Subject: [PATCH 01/64] Reorder SlurmCommandGenStrategy methods --- .../strategy/slurm_command_gen_strategy.py | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index 506e83f1..71562496 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -57,22 +57,14 @@ def __init__(self, system: SlurmSystem, cmd_args: Dict[str, Any]) -> None: ) self.docker_image_url = self.cmd_args.get("docker_image_url", "") - def _format_env_vars(self, env_vars: Dict[str, Any]) -> str: - """ - Format environment variables for inclusion in a batch script. - - Args: - env_vars (Dict[str, Any]): Environment variables to format. - - Returns: - str: A string representation of the formatted environment variables. - """ - formatted_vars = [] - for key in sorted(env_vars.keys()): - value = env_vars[key] - formatted_value = str(value["default"]) if isinstance(value, dict) and "default" in value else str(value) - formatted_vars.append(f"export {key}={formatted_value}") - return "\n".join(formatted_vars) + def gen_exec_command(self, tr: TestRun) -> str: + env_vars = self._override_env_vars(self.system.global_env_vars, tr.test.extra_env_vars) + cmd_args = self._override_cmd_args(self.default_cmd_args, tr.test.cmd_args) + slurm_args = self._parse_slurm_args( + tr.test.test_template.__class__.__name__, env_vars, cmd_args, tr.num_nodes, tr.nodes + ) + srun_command = self.generate_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) + return self._write_sbatch_script(slurm_args, env_vars, srun_command, tr.output_path) def _parse_slurm_args( self, @@ -139,15 +131,6 @@ def generate_srun_prefix(self, slurm_args: Dict[str, Any]) -> List[str]: return srun_command_parts - def gen_exec_command(self, tr: TestRun) -> str: - env_vars = self._override_env_vars(self.system.global_env_vars, tr.test.extra_env_vars) - cmd_args = self._override_cmd_args(self.default_cmd_args, tr.test.cmd_args) - slurm_args = self._parse_slurm_args( - tr.test.test_template.__class__.__name__, env_vars, cmd_args, tr.num_nodes, tr.nodes - ) - srun_command = self.generate_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) - return self._write_sbatch_script(slurm_args, env_vars, srun_command, tr.output_path) - def generate_test_command( self, env_vars: Dict[str, str], cmd_args: Dict[str, str], extra_cmd_args: str ) -> List[str]: @@ -237,3 +220,20 @@ def _append_sbatch_directives( batch_script_content.append( "\nexport SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)" ) + + def _format_env_vars(self, env_vars: Dict[str, Any]) -> str: + """ + Format environment variables for inclusion in a batch script. + + Args: + env_vars (Dict[str, Any]): Environment variables to format. + + Returns: + str: A string representation of the formatted environment variables. 
+ """ + formatted_vars = [] + for key in sorted(env_vars.keys()): + value = env_vars[key] + formatted_value = str(value["default"]) if isinstance(value, dict) and "default" in value else str(value) + formatted_vars.append(f"export {key}={formatted_value}") + return "\n".join(formatted_vars) From 38bb8a7b5a23431aace8a3803a63ce8889565293 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 22 Oct 2024 13:45:01 -0400 Subject: [PATCH 02/64] Rename generate_srun_command to _gen_srun_command --- .../jax_toolbox/slurm_command_gen_strategy.py | 2 +- .../systems/slurm/strategy/slurm_command_gen_strategy.py | 8 ++++---- .../test_common_slurm_command_gen_strategy.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py index fc21554e..7a2616e9 100644 --- a/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py @@ -152,7 +152,7 @@ def _parse_slurm_args( return base_args - def generate_srun_command( + def _gen_srun_command( self, slurm_args: Dict[str, Any], env_vars: Dict[str, str], cmd_args: Dict[str, Any], extra_cmd_args: str ) -> str: self._create_run_script(env_vars, cmd_args, extra_cmd_args) diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index 71562496..3b7a0649 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -63,7 +63,7 @@ def gen_exec_command(self, tr: TestRun) -> str: slurm_args = self._parse_slurm_args( tr.test.test_template.__class__.__name__, env_vars, cmd_args, tr.num_nodes, tr.nodes ) - srun_command = self.generate_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) + srun_command = self._gen_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) return self._write_sbatch_script(slurm_args, env_vars, srun_command, tr.output_path) def _parse_slurm_args( @@ -112,14 +112,14 @@ def job_name(self, job_name_prefix: str) -> str: job_name = f"{self.system.account}-{job_name_prefix}.{datetime.now().strftime('%Y%m%d_%H%M%S')}" return job_name - def generate_srun_command( + def _gen_srun_command( self, slurm_args: Dict[str, Any], env_vars: Dict[str, str], cmd_args: Dict[str, str], extra_cmd_args: str ) -> str: - srun_command_parts = self.generate_srun_prefix(slurm_args) + srun_command_parts = self.gen_srun_prefix(slurm_args) test_command_parts = self.generate_test_command(env_vars, cmd_args, extra_cmd_args) return " \\\n".join(srun_command_parts + test_command_parts) - def generate_srun_prefix(self, slurm_args: Dict[str, Any]) -> List[str]: + def gen_srun_prefix(self, slurm_args: Dict[str, Any]) -> List[str]: srun_command_parts = ["srun", f"--mpi={self.system.mpi}"] if slurm_args.get("image_path"): srun_command_parts.append(f'--container-image={slurm_args["image_path"]}') diff --git a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py index 0ea7fc38..36db4473 100644 --- a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py @@ -56,7 +56,7 @@ def 
test_filename_generation(strategy_fixture: SlurmCommandGenStrategy, testrun_ slurm_args = strategy_fixture._parse_slurm_args( job_name_prefix, env_vars, cmd_args, testrun_fixture.num_nodes, testrun_fixture.nodes ) - srun_command = strategy_fixture.generate_srun_command(slurm_args, env_vars, cmd_args, "") + srun_command = strategy_fixture._gen_srun_command(slurm_args, env_vars, cmd_args, "") sbatch_command = strategy_fixture._write_sbatch_script( slurm_args, env_vars, srun_command, testrun_fixture.output_path From 57192301307b4155e54b94ab83ee023b695a3f25 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 23 Oct 2024 12:05:42 -0400 Subject: [PATCH 03/64] Remove pre-test implementation from JaxToolbox --- .../jax_toolbox/slurm_command_gen_strategy.py | 93 ---------------- src/cloudai/test_definitions/gpt.py | 5 +- src/cloudai/test_definitions/grok.py | 5 +- src/cloudai/test_definitions/jax_toolbox.py | 34 +----- tests/ref_data/gpt-pretest.sbatch | 55 --------- .../{gpt-no-pretest.sbatch => gpt.sbatch} | 2 +- tests/ref_data/grok-pretest.sbatch | 55 --------- .../{grok-no-pretest.sbatch => grok.sbatch} | 2 +- ..._jax_toolbox_slurm_command_gen_strategy.py | 105 +----------------- tests/test_acceptance.py | 18 +-- 10 files changed, 16 insertions(+), 358 deletions(-) delete mode 100644 tests/ref_data/gpt-pretest.sbatch rename tests/ref_data/{gpt-no-pretest.sbatch => gpt.sbatch} (96%) delete mode 100644 tests/ref_data/grok-pretest.sbatch rename tests/ref_data/{grok-no-pretest.sbatch => grok.sbatch} (98%) diff --git a/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py index 7a2616e9..49f4e772 100644 --- a/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py @@ -158,25 +158,11 @@ def _gen_srun_command( self._create_run_script(env_vars, cmd_args, extra_cmd_args) commands = [] - - run_pre_test = cmd_args.get("pre_test.enable", False) - - if run_pre_test: - output_path = Path(cmd_args["output_path"]).resolve() / "output_pretest-%j-%n-%t.txt" - error_path = Path(cmd_args["output_path"]).resolve() / "error_pretest-%j-%n-%t.txt" - commands.append(self._generate_pre_test_command(cmd_args, output_path, error_path)) - commands.append(self._generate_pre_test_check_command(cmd_args, output_path)) - commands.append('if [ "$PRE_TEST_SUCCESS" = true ]; then') - load_container = cmd_args.get("load_container", False) if load_container: commands += self._generate_container_load_command(slurm_args) - commands += self._generate_run_command(slurm_args) - if run_pre_test: - commands.append("fi") - return "\n".join(commands) def _create_run_script( @@ -347,85 +333,6 @@ def _create_pgo_nsys_converter_command(self, stage: str, cmd_args: Dict[str, str ["", 'if [ "$SLURM_NODEID" -eq 0 ] && [ "$SLURM_PROCID" -eq 0 ]; then', f" {command}", "fi"] ) - def _generate_pre_test_command(self, cmd_args: Dict[str, Any], output_path: Path, error_path: Path) -> str: - """ - Generate the pre-test command for running a test. - - This method constructs the pre-test command based on the command-line - arguments provided. - - Args: - cmd_args (Dict[str, Any]): A dictionary containing command arguments. - output_path (Path): The path to the output file. - error_path (Path): The path to the error file. - - Returns: - str: The generated pre-test command. 
- """ - nccl_test_prefix = "pre_test.nccl_test." - nccl_test = {} - - for key, value in cmd_args.items(): - if key.startswith(nccl_test_prefix): - flag_name = key[len(nccl_test_prefix) :] - nccl_test[flag_name] = value - pre_test_command_parts = [ - "srun", - "--mpi=pmix", - f"-N {nccl_test.get('num_nodes', 2)}", - f"-o {output_path}", - f"-e {error_path}", - f"--container-image={nccl_test.get('docker_image_url', 'nvcr.io/nvidia/pytorch:24.02-py3')}", - f"/usr/local/bin/{nccl_test.get('subtest_name', 'all_gather_perf_mpi')}", - f"--nthreads {nccl_test.get('nthreads', 1)}", - f"--ngpus {nccl_test.get('ngpus', 1)}", - f"--minbytes {nccl_test.get('minbytes', '32M')}", - f"--maxbytes {nccl_test.get('maxbytes', '16G')}", - f"--stepbytes {nccl_test.get('stepbytes', '1M')}", - f"--op {nccl_test.get('op', 'sum')}", - f"--datatype {nccl_test.get('datatype', 'float')}", - f"--root {nccl_test.get('root', 0)}", - f"--iters {nccl_test.get('iters', 20)}", - f"--warmup_iters {nccl_test.get('warmup_iters', 5)}", - f"--agg_iters {nccl_test.get('agg_iters', 1)}", - f"--average {nccl_test.get('average', 1)}", - f"--parallel_init {nccl_test.get('parallel_init', 0)}", - f"--check {nccl_test.get('check', 1)}", - f"--blocking {nccl_test.get('blocking', 0)}", - f"--cudagraph {nccl_test.get('cudagraph', 0)}", - f"--stepfactor {nccl_test.get('stepfactor', 2)}", - ] - return " \\\n".join(pre_test_command_parts) - - def _generate_pre_test_check_command(self, cmd_args: Dict[str, str], output_path: Path) -> str: - """ - Generate the command for pre-test check. - - This method generates the command that checks the output of the pre-test to determine if the main test should - be run. - - Args: - cmd_args (Dict[str, str]): Command-line arguments for the job. - output_path (str): The path to the output file. - - Returns: - str: The generated command for pre-test check. 
- """ - pretest_output_files = str(Path(output_path).parent / "output_pretest-*.txt") - keyword = cmd_args.get("keyword", "Avg bus bandwidth") - - return "\n".join( - [ - f'PRETEST_OUTPUT_FILES="{pretest_output_files}"', - f'keyword="{keyword}"', - "", - "# Use grep to search for the keyword in the files", - 'if grep -q "$keyword" $PRETEST_OUTPUT_FILES; then', - " PRE_TEST_SUCCESS=true", - "fi", - ] - ) - def _generate_container_load_command(self, slurm_args: Dict[str, Any]) -> List[str]: """Generate the command for loading a container.""" container_image = slurm_args.get("image_path") diff --git a/src/cloudai/test_definitions/gpt.py b/src/cloudai/test_definitions/gpt.py index 35736ebb..5b003d55 100644 --- a/src/cloudai/test_definitions/gpt.py +++ b/src/cloudai/test_definitions/gpt.py @@ -16,7 +16,7 @@ from pydantic import Field -from .jax_toolbox import JaxFdl, JaxToolboxCmdArgs, JaxToolboxTestDefinition, PreTest, SetupFlags, XLAFlags +from .jax_toolbox import JaxFdl, JaxToolboxCmdArgs, JaxToolboxTestDefinition, SetupFlags, XLAFlags class GPTFdl(JaxFdl): @@ -43,7 +43,6 @@ class GPTCmdArgs(JaxToolboxCmdArgs): fdl_config: str fdl: GPTFdl = Field(default_factory=GPTFdl) - pre_test: PreTest = Field(default_factory=PreTest) xla_flags: GPTXLAFlags = Field(default_factory=GPTXLAFlags) setup_flags: GPTSetupFlags = Field(default_factory=GPTSetupFlags) @@ -58,7 +57,7 @@ def cmd_args_dict(self): d = self.cmd_args.model_dump() res = {} for k, v in d.items(): - if k in {"pre_test", "docker_image_url", "load_container", "output_path"}: + if k in {"docker_image_url", "load_container", "output_path"}: res[k] = v else: if k == "xla_flags": diff --git a/src/cloudai/test_definitions/grok.py b/src/cloudai/test_definitions/grok.py index f5e7f19c..9e42d7a2 100644 --- a/src/cloudai/test_definitions/grok.py +++ b/src/cloudai/test_definitions/grok.py @@ -16,7 +16,7 @@ from pydantic import ConfigDict, Field -from .jax_toolbox import JaxFdl, JaxToolboxCmdArgs, JaxToolboxTestDefinition, PreTest, SetupFlags, XLAFlags +from .jax_toolbox import JaxFdl, JaxToolboxCmdArgs, JaxToolboxTestDefinition, SetupFlags, XLAFlags class GrokFdl(JaxFdl): @@ -72,7 +72,6 @@ class GrokCmdArgs(JaxToolboxCmdArgs): setup_flags: SetupFlags = Field(default_factory=SetupFlags) profile: GrokProfileXLAFlags = Field(default_factory=GrokProfileXLAFlags) perf: GrokPerfXLAFlags = Field(default_factory=GrokPerfXLAFlags) - pre_test: PreTest = Field(default_factory=PreTest) class GrokTestDefinition(JaxToolboxTestDefinition): @@ -91,7 +90,7 @@ def cmd_args_dict(self): if k in {"profile", "perf"}: res.setdefault(f"Grok.{k}", {}) res[f"Grok.{k}"]["XLA_FLAGS"] = v - elif k in {"pre_test", "docker_image_url", "load_container", "output_path"}: + elif k in {"docker_image_url", "load_container", "output_path"}: res[k] = v else: res[f"Grok.{k}"] = v diff --git a/src/cloudai/test_definitions/jax_toolbox.py b/src/cloudai/test_definitions/jax_toolbox.py index 079e5b4e..4593028a 100644 --- a/src/cloudai/test_definitions/jax_toolbox.py +++ b/src/cloudai/test_definitions/jax_toolbox.py @@ -14,12 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any, Optional +from typing import Optional -from pydantic import BaseModel, ConfigDict, Field, field_serializer +from pydantic import BaseModel, ConfigDict, field_serializer from cloudai import CmdArgs, TestDefinition -from cloudai.test_definitions.nccl import NCCLCmdArgs class JaxFdl(BaseModel): @@ -54,35 +53,6 @@ def checkpoint_policy_serializer(self, value: str) -> str: return f'\\"{value}\\"' -class NCCLCmdAgrsPreTest(NCCLCmdArgs): - """NCCL pre-test command arguments.""" - - num_nodes: int = 8 - stepfactor: int = 2 - minbytes: str = "8M" - maxbytes: str = "16G" - blocking: int = 1 - - def model_post_init(self, _: Any) -> None: - self.subtest_name = "all_gather_perf_mpi" - self.docker_image_url = "nvcr.io/nvidia/pytorch:24.02-py3" - - -class PreTest(BaseModel): - """Pre-test configuration.""" - - model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) - enable: bool = True - nccl_test: NCCLCmdAgrsPreTest = Field(default_factory=NCCLCmdAgrsPreTest) - - -class NCCLPreTest(BaseModel): - """Pre-test configuration.""" - - model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) - nccl_test: Optional[NCCLCmdAgrsPreTest] = None - - class JaxToolboxCmdArgs(CmdArgs): """JAX Toolbox test command arguments.""" diff --git a/tests/ref_data/gpt-pretest.sbatch b/tests/ref_data/gpt-pretest.sbatch deleted file mode 100644 index 17c2f53b..00000000 --- a/tests/ref_data/gpt-pretest.sbatch +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=__JOB_NAME__ -#SBATCH -N 1 -#SBATCH --partition=main - -export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) -export COMBINE_THRESHOLD=1 -export PER_GPU_COMBINE_THRESHOLD=0 -export XLA_FLAGS="--xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD" - -srun \ ---mpi=pmix \ --N 8 \ --o __OUTPUT_DIR__/output_pretest-%j-%n-%t.txt \ --e __OUTPUT_DIR__/error_pretest-%j-%n-%t.txt \ ---container-image=nvcr.io/nvidia/pytorch:24.02-py3 \ -/usr/local/bin/all_gather_perf_mpi \ ---nthreads 1 \ ---ngpus 1 \ ---minbytes 8M \ ---maxbytes 16G \ ---stepbytes 1M \ ---op sum \ ---datatype float \ ---root 0 \ ---iters 20 \ ---warmup_iters 5 \ ---agg_iters 1 \ ---average 1 \ ---parallel_init 0 \ ---check 1 \ ---blocking 1 \ ---cudagraph 0 \ ---stepfactor 2 -PRETEST_OUTPUT_FILES="__OUTPUT_DIR__/output_pretest-*.txt" -keyword="Avg bus bandwidth" - -# Use grep to search for the keyword in the files -if grep -q "$keyword" $PRETEST_OUTPUT_FILES; then - PRE_TEST_SUCCESS=true -fi -if [ "$PRE_TEST_SUCCESS" = true ]; then - echo "Loading container with srun command" - srun --mpi=none --container-image=https:/docker/url --container-name=cont true - echo "Running srun command" - srun \ - --mpi=none \ - \ - --export=ALL \ - -o __OUTPUT_DIR__/output-%j-%n-%t.txt \ - -e __OUTPUT_DIR__/error-%j-%n-%t.txt \ - --container-name=cont \ - --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ - /opt/paxml/workspace/run.sh -fi \ No newline at end of file diff --git a/tests/ref_data/gpt-no-pretest.sbatch b/tests/ref_data/gpt.sbatch similarity index 96% rename from tests/ref_data/gpt-no-pretest.sbatch rename to tests/ref_data/gpt.sbatch index 30b48294..d8789804 100644 --- a/tests/ref_data/gpt-no-pretest.sbatch +++ b/tests/ref_data/gpt.sbatch @@ -19,4 +19,4 @@ export XLA_FLAGS="--xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOL -e 
__OUTPUT_DIR__/error-%j-%n-%t.txt \ --container-name=cont \ --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ - /opt/paxml/workspace/run.sh \ No newline at end of file + /opt/paxml/workspace/run.sh diff --git a/tests/ref_data/grok-pretest.sbatch b/tests/ref_data/grok-pretest.sbatch deleted file mode 100644 index 661ddd0b..00000000 --- a/tests/ref_data/grok-pretest.sbatch +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=__JOB_NAME__ -#SBATCH -N 1 -#SBATCH --partition=main - -export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) -export COMBINE_THRESHOLD=1 -export PER_GPU_COMBINE_THRESHOLD=0 -export XLA_FLAGS="--xla_disable_hlo_passes=rematerialization --xla_dump_hlo_pass_re=.* --xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_pipelined_all_gather=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_gpu_enable_triton_gemm=false --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_graph_level=0 --xla_gpu_pgle_profile_file_or_directory_path=/opt/paxml/workspace/pgle_output_profile.pbtxt --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD --xla_gpu_run_post_layout_collective_pipeliner=false --xla_gpu_use_memcpy_local_p2p=false" - -srun \ ---mpi=pmix \ --N 8 \ --o __OUTPUT_DIR__/output_pretest-%j-%n-%t.txt \ --e __OUTPUT_DIR__/error_pretest-%j-%n-%t.txt \ ---container-image=nvcr.io/nvidia/pytorch:24.02-py3 \ -/usr/local/bin/all_gather_perf_mpi \ ---nthreads 1 \ ---ngpus 1 \ ---minbytes 8M \ ---maxbytes 16G \ ---stepbytes 1M \ ---op sum \ ---datatype float \ ---root 0 \ ---iters 20 \ ---warmup_iters 5 \ ---agg_iters 1 \ ---average 1 \ ---parallel_init 0 \ ---check 1 \ ---blocking 1 \ ---cudagraph 0 \ ---stepfactor 2 -PRETEST_OUTPUT_FILES="__OUTPUT_DIR__/output_pretest-*.txt" -keyword="Avg bus bandwidth" - -# Use grep to search for the keyword in the files -if grep -q "$keyword" $PRETEST_OUTPUT_FILES; then - PRE_TEST_SUCCESS=true -fi -if [ "$PRE_TEST_SUCCESS" = true ]; then - echo "Loading container with srun command" - srun --mpi=none --container-image=https:/docker/url --container-name=cont true - echo "Running srun command" - srun \ - --mpi=none \ - \ - --export=ALL \ - -o __OUTPUT_DIR__/output-%j-%n-%t.txt \ - -e __OUTPUT_DIR__/error-%j-%n-%t.txt \ - --container-name=cont \ - --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ - /opt/paxml/workspace/run.sh -fi \ No newline at end of file diff --git a/tests/ref_data/grok-no-pretest.sbatch b/tests/ref_data/grok.sbatch similarity index 98% rename from tests/ref_data/grok-no-pretest.sbatch rename to tests/ref_data/grok.sbatch index 725d29fa..808973bb 100644 --- a/tests/ref_data/grok-no-pretest.sbatch +++ b/tests/ref_data/grok.sbatch @@ -19,4 +19,4 @@ export XLA_FLAGS="--xla_disable_hlo_passes=rematerialization --xla_dump_hlo_pass -e __OUTPUT_DIR__/error-%j-%n-%t.txt \ --container-name=cont \ --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ - /opt/paxml/workspace/run.sh \ No newline at end of file + /opt/paxml/workspace/run.sh diff --git a/tests/slurm_command_gen_strategy/test_jax_toolbox_slurm_command_gen_strategy.py 
b/tests/slurm_command_gen_strategy/test_jax_toolbox_slurm_command_gen_strategy.py index 9e134c26..99935121 100644 --- a/tests/slurm_command_gen_strategy/test_jax_toolbox_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_jax_toolbox_slurm_command_gen_strategy.py @@ -25,7 +25,7 @@ from cloudai.systems import SlurmSystem from cloudai.test_definitions.gpt import GPTCmdArgs, GPTTestDefinition from cloudai.test_definitions.grok import GrokCmdArgs, GrokTestDefinition -from cloudai.test_definitions.jax_toolbox import JaxFdl, PreTest +from cloudai.test_definitions.jax_toolbox import JaxFdl class TestJaxToolboxSlurmCommandGenStrategy: @@ -63,7 +63,6 @@ def test_gen_exec_command( test_fixture, ) -> None: test_def = request.getfixturevalue(test_fixture) - test_def.cmd_args.pre_test = PreTest(enable=True) test = Test(test_definition=test_def, test_template=JaxToolbox(slurm_system, "name")) test_run = TestRun( @@ -74,14 +73,10 @@ def test_gen_exec_command( name="test-job", ) - cmd_gen_strategy._generate_pre_test_command = MagicMock(return_value="pre_test_command") cmd = cmd_gen_strategy.gen_exec_command(test_run) assert cmd == f"sbatch {test_run.output_path}/cloudai_sbatch_script.sh" assert (test_run.output_path / "run.sh").exists() - content = Path(f"{test_run.output_path}/cloudai_sbatch_script.sh").read_text() - assert "pre_test_command" in content - @pytest.mark.parametrize( "cmd_args, expected", [ @@ -215,100 +210,6 @@ def test_generate_python_command( "fi", ] - def test_generate_pre_test_command( - self, cmd_gen_strategy: JaxToolboxSlurmCommandGenStrategy, grok_test: GrokTestDefinition, tmp_path: Path - ) -> None: - grok_test.cmd_args.pre_test = PreTest(enable=True) - - nccl_test = grok_test.cmd_args.pre_test.nccl_test - nccl_test.num_nodes = 2 - nccl_test.minbytes = "32M" - nccl_test.blocking = 0 - - cargs = {"output_path": str(tmp_path), **grok_test.cmd_args_dict} - - pre_test_cli = cmd_gen_strategy._generate_pre_test_command(cargs, tmp_path, tmp_path).splitlines() - - expected_pre_test_cli = [ - "srun \\", - "--mpi=pmix \\", - f"-N {nccl_test.num_nodes} \\", - f"-o {tmp_path} \\", - f"-e {tmp_path} \\", - f"--container-image={nccl_test.docker_image_url} \\", - f"/usr/local/bin/{nccl_test.subtest_name} \\", - f"--nthreads {nccl_test.nthreads} \\", - f"--ngpus {nccl_test.ngpus} \\", - f"--minbytes {nccl_test.minbytes} \\", - f"--maxbytes {nccl_test.maxbytes} \\", - f"--stepbytes {nccl_test.stepbytes} \\", - f"--op {nccl_test.op} \\", - f"--datatype {nccl_test.datatype} \\", - f"--root {nccl_test.root} \\", - f"--iters {nccl_test.iters} \\", - f"--warmup_iters {nccl_test.warmup_iters} \\", - f"--agg_iters {nccl_test.agg_iters} \\", - f"--average {nccl_test.average} \\", - f"--parallel_init {nccl_test.parallel_init} \\", - f"--check {nccl_test.check} \\", - f"--blocking {nccl_test.blocking} \\", - f"--cudagraph {nccl_test.cudagraph} \\", - f"--stepfactor {nccl_test.stepfactor}", - ] - - assert pre_test_cli == expected_pre_test_cli, ( - "The generated pre-test command did not match the expected command.\n" - f"Expected: {expected_pre_test_cli}\n" - f"Actual: {pre_test_cli}" - ) - - def test_generate_srun_command(self, slurm_system, cmd_gen_strategy, grok_test): - cmd_gen_strategy.test_name = grok_test.name - Path("/tmp/output").mkdir(parents=True, exist_ok=True) - - output_path = Path("/tmp/output/output") - output_path.mkdir(parents=True, exist_ok=True) - - # Use the existing setup for mocking internal methods - cmd_gen_strategy._generate_pre_test_command = 
MagicMock(return_value="srun --mpi=none pre_test_command") - cmd_gen_strategy._generate_run_command = MagicMock(return_value="srun --mpi=none run_command") - cmd_gen_strategy._generate_container_load_command = MagicMock( - return_value="srun --mpi=none container_load_command" - ) - - slurm_args = { - "output": "/tmp/output/output-%j.txt", - "error": "/tmp/output/error-%j.txt", - "image_path": "fake_image_url", - "container_mounts": "/tmp/output:/workspace", - } - cmd_args = { - "output_path": "/tmp/output", - "pre_test": {"enable": True}, - f"{grok_test.name}.setup_flags.docker_workspace_dir": "/workspace/docker", - f"{grok_test.name}.setup_flags.tfds_data_dir": "/workspace/tfds", - f"{grok_test.name}.setup_flags.enable_checkpoint_saving": True, - } - - pre_test_command = cmd_gen_strategy._generate_pre_test_command( - cmd_args, Path("/tmp/output"), Path("/tmp/output") - ) - run_command = cmd_gen_strategy._generate_run_command(slurm_args) - container_load_command = cmd_gen_strategy._generate_container_load_command(slurm_args) - - result_command = f"{pre_test_command}\n{container_load_command}\n{run_command}" - - # Assert expected parts of the command are in the generated result - assert "pre_test_command" in result_command - assert "container_load_command" in result_command - assert "run_command" in result_command - assert "srun" in result_command - assert "--mpi=none" in result_command - - cmd_gen_strategy._generate_pre_test_command.assert_called_once() - cmd_gen_strategy._generate_run_command.assert_called_once() - cmd_gen_strategy._generate_container_load_command.assert_called_once() - def test_gpt_test_definition_cmd_args_dict(): gpt = GPTTestDefinition( @@ -324,7 +225,7 @@ def test_gpt_test_definition_cmd_args_dict(): assert "GPT.setup_flags" in cargs assert "GPT.XLA_FLAGS" in cargs - for k in {"pre_test", "docker_image_url", "load_container"}: + for k in {"docker_image_url", "load_container"}: assert k in cargs assert f"GPT.{k}" not in cargs @@ -348,7 +249,7 @@ def test_grok_test_definition_cmd_args_dict(): assert "Grok.perf" in cargs assert "XLA_FLAGS" in cargs["Grok.perf"] - for k in {"pre_test", "docker_image_url", "load_container"}: + for k in {"docker_image_url", "load_container"}: assert k in cargs assert f"Grok.{k}" not in cargs diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 36f5e6cd..384c22c5 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -142,7 +142,7 @@ def partial_tr(slurm_system: SlurmSystem) -> partial[TestRun]: return partial(TestRun, num_nodes=1, nodes=[], output_path=slurm_system.output_path) -@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt-pretest", "gpt-no-pretest", "grok-pretest", "grok-no-pretest"]) +@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt", "grok"]) def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -> tuple[TestRun, str, Optional[str]]: if request.param == "ucc": tr = partial_tr( @@ -192,7 +192,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") return (tr, "sleep.sbatch", None) - elif request.param.startswith("gpt-"): + elif request.param.startswith("gpt"): tr = partial_tr( name="gpt", test=Test( @@ -210,13 +210,9 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict ) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - if "no-pretest" in 
request.param: - tr.test.test_definition.cmd_args.pre_test.enable = False - else: - tr.test.test_definition.cmd_args.pre_test.enable = True return (tr, f"{request.param}.sbatch", "gpt.run") - elif request.param.startswith("grok-"): + elif request.param.startswith("grok"): tr = partial_tr( name="grok", test=Test( @@ -234,10 +230,6 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict ) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - if "no-pretest" in request.param: - tr.test.test_definition.cmd_args.pre_test.enable = False - else: - tr.test.test_definition.cmd_args.pre_test.enable = True return (tr, f"{request.param}.sbatch", "grok.run") @@ -251,8 +243,8 @@ def test_sbatch_generation(slurm_system: SlurmSystem, test_req: tuple[TestRun, s sbatch_script = tr.test.test_template.gen_exec_command(tr).split()[-1] - curr = Path(sbatch_script).read_text() - ref = (Path(__file__).parent / "ref_data" / test_req[1]).read_text() + curr = Path(sbatch_script).read_text().strip() + ref = (Path(__file__).parent / "ref_data" / test_req[1]).read_text().strip() ref = ref.replace("__OUTPUT_DIR__", str(slurm_system.output_path)).replace("__JOB_NAME__", "job_name") assert curr == ref From 8e8ee3ec33d5ec1001dfbe4225c2cf0c72adcdbf Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 23 Oct 2024 07:24:23 -0400 Subject: [PATCH 04/64] Add prologue and epilogue to _TestScenarioTOML --- src/cloudai/_core/test_scenario_parser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index 08526dca..16302a2a 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -54,6 +54,8 @@ class _TestScenarioTOML(BaseModel): name: str job_status_check: bool = True tests: list[_TestRunTOML] = Field(alias="Tests", min_length=1) + prologue: str = "" + epilogue: str = "" @model_validator(mode="after") def check_no_self_dependency(self): From 32d7d93a30ec2c1139de646af0dc66ec12b79847 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 22 Oct 2024 14:50:35 -0400 Subject: [PATCH 05/64] Add example plugin files --- conf/common/plugin/nccl_test_epilogue.toml | 22 ++++++++++++++++++++++ conf/common/plugin/nccl_test_prologue.toml | 22 ++++++++++++++++++++++ conf/common/test_scenario/nccl_test.toml | 4 ++++ 3 files changed, 48 insertions(+) create mode 100644 conf/common/plugin/nccl_test_epilogue.toml create mode 100644 conf/common/plugin/nccl_test_prologue.toml diff --git a/conf/common/plugin/nccl_test_epilogue.toml b/conf/common/plugin/nccl_test_epilogue.toml new file mode 100644 index 00000000..346dc8e4 --- /dev/null +++ b/conf/common/plugin/nccl_test_epilogue.toml @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +name = "nccl_test_epilogue" + +[[Tests]] +id = "Tests.1" +test_name = "nccl_test_all_gather" +time_limit = "00:20:00" diff --git a/conf/common/plugin/nccl_test_prologue.toml b/conf/common/plugin/nccl_test_prologue.toml new file mode 100644 index 00000000..e5c1a1e4 --- /dev/null +++ b/conf/common/plugin/nccl_test_prologue.toml @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "nccl_test_prologue" + +[[Tests]] +id = "Tests.1" +test_name = "nccl_test_all_reduce" +time_limit = "00:20:00" diff --git a/conf/common/test_scenario/nccl_test.toml b/conf/common/test_scenario/nccl_test.toml index f6ccf02c..9b731e96 100644 --- a/conf/common/test_scenario/nccl_test.toml +++ b/conf/common/test_scenario/nccl_test.toml @@ -15,6 +15,10 @@ # limitations under the License. name = "nccl-test" + +prologue = "nccl_test_prologue" +epilogue = "nccl_test_epilogue" + [[Tests]] id = "Tests.1" test_name = "nccl_test_all_reduce" From 28a38b841b4cf3473c585fc95e11f09f73a9ff23 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 23 Oct 2024 07:35:01 -0400 Subject: [PATCH 06/64] Add plugin option to CLI --- src/cloudai/cli/cli.py | 4 ++ tests/test_cli.py | 92 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 94 insertions(+), 2 deletions(-) diff --git a/src/cloudai/cli/cli.py b/src/cloudai/cli/cli.py index 53059799..f962a2d5 100644 --- a/src/cloudai/cli/cli.py +++ b/src/cloudai/cli/cli.py @@ -60,6 +60,7 @@ def add_command( handler: Callable[[argparse.Namespace], int], system_config: Optional[bool] = None, tests_dir: Optional[bool] = None, + plugin_dir: Optional[bool] = None, test_scenario: Optional[bool] = None, output_dir: Optional[bool] = None, result_dir: Optional[bool] = None, @@ -74,6 +75,8 @@ def add_command( p.add_argument( "--tests-dir", help="Path to the test configuration directory.", required=tests_dir, type=Path ) + if plugin_dir is not None: + p.add_argument("--plugin-dir", help="Path to the plugin directory.", required=plugin_dir, type=Path) if test_scenario is not None: p.add_argument("--test-scenario", help="Path to the test scenario file.", required=test_scenario, type=Path) if output_dir is not None: @@ -127,6 +130,7 @@ def add_run_and_dry_run(self): handle_dry_run_and_run, system_config=True, tests_dir=True, + plugin_dir=False, test_scenario=True, output_dir=False, ) diff --git a/tests/test_cli.py b/tests/test_cli.py index 538e497f..bb6c1a5d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -20,8 +20,8 @@ import pytest -from cloudai.cli import CloudAICLI, handle_generate_report, handle_install_and_uninstall -from cloudai.cli.handlers import handle_verify_all_configs +from cloudai.cli import CloudAICLI +from cloudai.cli.handlers import 
handle_generate_report, handle_install_and_uninstall, handle_verify_all_configs def test_help_message(capsys: pytest.CaptureFixture[str]) -> None: @@ -108,6 +108,7 @@ def test_add_command_all_optional(): lambda _: 0, system_config=False, tests_dir=False, + plugin_dir=False, test_scenario=False, output_dir=False, ) @@ -118,6 +119,7 @@ def test_add_command_all_optional(): mode="test", system_config=None, tests_dir=None, + plugin_dir=None, test_scenario=None, output_dir=None, ) @@ -132,6 +134,7 @@ def test_add_command_all_required(): lambda _: 0, system_config=True, tests_dir=True, + plugin_dir=True, test_scenario=True, output_dir=True, ) @@ -142,6 +145,8 @@ def test_add_command_all_required(): "system_config", "--tests-dir", "tests_dir", + "--plugin-dir", + "plugin_dir", "--test-scenario", "test_scenario", "--output-dir", @@ -154,11 +159,91 @@ def test_add_command_all_required(): mode="test", system_config=Path("system_config"), tests_dir=Path("tests_dir"), + plugin_dir=Path("plugin_dir"), test_scenario=Path("test_scenario"), output_dir=Path("output_dir"), ) +@pytest.mark.parametrize( + "mode,args,expected_plugin_dir", + [ + ( + "run", + [ + "run", + "--system-config", + "system_config", + "--tests-dir", + "tests_dir", + "--plugin-dir", + "plugin_dir", + "--test-scenario", + "test_scenario", + ], + Path("plugin_dir"), + ), + ( + "run", + [ + "run", + "--system-config", + "system_config", + "--tests-dir", + "tests_dir", + "--test-scenario", + "test_scenario", + ], + None, + ), + ( + "dry-run", + [ + "dry-run", + "--system-config", + "system_config", + "--tests-dir", + "tests_dir", + "--plugin-dir", + "plugin_dir", + "--test-scenario", + "test_scenario", + ], + Path("plugin_dir"), + ), + ( + "dry-run", + [ + "dry-run", + "--system-config", + "system_config", + "--tests-dir", + "tests_dir", + "--test-scenario", + "test_scenario", + ], + None, + ), + ], +) +def test_modes_with_or_without_plugin_dir(mode, args, expected_plugin_dir): + cli = CloudAICLI() + + cli.add_command( + mode, + f"{mode} command", + lambda _: 0, + system_config=True, + tests_dir=True, + plugin_dir=False, + test_scenario=True, + output_dir=False, + ) + + parsed_args = cli.parser.parse_args(args) + assert parsed_args.plugin_dir == expected_plugin_dir + + def test_real_uninstall(): cli = CloudAICLI() cli.init_default_args() @@ -277,6 +362,8 @@ def test_run_dry_run_modes(self, cli: CloudAICLI): "tests_dir", "--test-scenario", "test_scenario", + "--plugin-dir", + "plugin_dir", ] ) @@ -286,6 +373,7 @@ def test_run_dry_run_modes(self, cli: CloudAICLI): mode=mode, system_config=Path("system_config"), tests_dir=Path("tests_dir"), + plugin_dir=Path("plugin_dir"), test_scenario=Path("test_scenario"), output_dir=None, ) From bb3275fc6ea511f0a723add1909c46db1f3fa8a0 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 23 Oct 2024 07:43:01 -0400 Subject: [PATCH 07/64] Parse plugins and pass them to TestRun --- src/cloudai/_core/test_scenario.py | 2 + src/cloudai/_core/test_scenario_parser.py | 23 +++++++- src/cloudai/cli/handlers.py | 2 +- src/cloudai/parser.py | 47 ++++++++++++--- tests/test_acceptance.py | 1 + tests/test_parser.py | 70 ++++++++++++++++++++++- tests/test_test_scenario.py | 2 +- 7 files changed, 131 insertions(+), 16 deletions(-) diff --git a/src/cloudai/_core/test_scenario.py b/src/cloudai/_core/test_scenario.py index 3a60c036..97c89994 100644 --- a/src/cloudai/_core/test_scenario.py +++ b/src/cloudai/_core/test_scenario.py @@ -58,6 +58,8 @@ class TestRun: weight: float = 
0.0 ideal_perf: float = 1.0 dependencies: dict[str, TestDependency] = field(default_factory=dict) + prologue: Optional["TestScenario"] = None + epilogue: Optional["TestScenario"] = None def __hash__(self) -> int: return hash(self.name + self.test.name + str(self.iterations) + str(self.current_iteration)) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index 16302a2a..c59adeba 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -101,9 +101,10 @@ class TestScenarioParser: __test__ = False - def __init__(self, file_path: Path, test_mapping: Dict[str, Test]) -> None: + def __init__(self, file_path: Path, test_mapping: Dict[str, Test], plugin_mapping: Dict[str, TestScenario]) -> None: self.file_path = file_path self.test_mapping = test_mapping + self.plugin_mapping = plugin_mapping def parse(self) -> TestScenario: """ @@ -138,8 +139,14 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario: total_weight = sum(tr.weight for tr in ts_model.tests) normalized_weight = 0 if total_weight == 0 else 100 / total_weight + prologue_name = data.get("prologue", "") + epilogue_name = data.get("epilogue", "") + + prologue = self.plugin_mapping.get(prologue_name, None) if prologue_name else None + epilogue = self.plugin_mapping.get(epilogue_name, None) if epilogue_name else None + testruns_by_id: dict[str, TestRun] = { - tr.id: self._create_section_test_run(tr, normalized_weight) for tr in ts_model.tests + tr.id: self._create_section_test_run(tr, normalized_weight, prologue, epilogue) for tr in ts_model.tests } tests_data: dict[str, _TestRunTOML] = {tr.id: tr for tr in ts_model.tests} @@ -155,13 +162,21 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario: job_status_check=ts_model.job_status_check, ) - def _create_section_test_run(self, test_info: _TestRunTOML, normalized_weight: float) -> TestRun: + def _create_section_test_run( + self, + test_info: _TestRunTOML, + normalized_weight: float, + prologue: Optional[TestScenario], + epilogue: Optional[TestScenario], + ) -> TestRun: """ Create a section-specific Test object by copying from the test mapping. Args: test_info (Dict[str, Any]): Information of the test. normalized_weight (float): Normalized weight for the test. + prologue (Optional[TestScenario]): TestScenario object representing the prologue sequence. + epilogue (Optional[TestScenario]): TestScenario object representing the epilogue sequence. Returns: Test: Copied and updated Test object for the section. @@ -194,5 +209,7 @@ def _create_section_test_run(self, test_info: _TestRunTOML, normalized_weight: f sol=test_info.sol, weight=test_info.weight * normalized_weight, ideal_perf=test_info.ideal_perf, + prologue=prologue if prologue is not None else TestScenario(name="default_prologue", test_runs=[]), + epilogue=epilogue if epilogue is not None else TestScenario(name="default_epilogue", test_runs=[]), ) return tr diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index 34fa9b0b..a204f70e 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -114,7 +114,7 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int: args (argparse.Namespace): The parsed command-line arguments. 
""" parser = Parser(args.system_config) - system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario) + system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario, args.plugin_dir) assert test_scenario is not None if args.output_dir: diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index a627b312..5b21ab3f 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -16,7 +16,7 @@ import logging from pathlib import Path -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Set, Tuple import toml from pydantic import ValidationError @@ -49,7 +49,7 @@ def __init__(self, system_config_path: Path) -> None: self.system_config_path = system_config_path def parse( - self, test_path: Path, test_scenario_path: Optional[Path] = None + self, test_path: Path, test_scenario_path: Optional[Path] = None, plugin_path: Optional[Path] = None ) -> Tuple[System, List[Test], Optional[TestScenario]]: """ Parse configurations for system, test templates, and test scenarios. @@ -74,21 +74,50 @@ def parse( logging.debug(f"Parsed {len(tests)} tests: {[t.name for t in tests]}") test_mapping = {t.name: t for t in tests} - filtered_tests = tests test_scenario: Optional[TestScenario] = None + scenario_test_names: Set[str] = set() if test_scenario_path: + plugin_mapping: Dict[str, TestScenario] = {} + plugin_test_names: Set[str] = set() + if plugin_path and plugin_path.exists(): + try: + plugin_mapping = self.parse_plugins(list(plugin_path.glob("*.toml")), test_mapping) + for plugin_scenario in plugin_mapping.values(): + plugin_test_names.update(tr.test.name for tr in plugin_scenario.test_runs) + except TestScenarioParsingError: + exit(1) + try: - test_scenario = self.parse_test_scenario(test_scenario_path, test_mapping) + test_scenario = self.parse_test_scenario(test_scenario_path, test_mapping, plugin_mapping) + scenario_test_names = set(tr.test.name for tr in test_scenario.test_runs) except TestScenarioParsingError: - exit(1) # exit right away to keep error message readable for users - scenario_tests = set(tr.test.name for tr in test_scenario.test_runs) - filtered_tests = [t for t in tests if t.name in scenario_tests] + exit(1) + + all_used_test_names = plugin_test_names.union(scenario_test_names) + filtered_tests = [t for t in tests if t.name in all_used_test_names] + else: + filtered_tests = tests return system, filtered_tests, test_scenario @staticmethod - def parse_test_scenario(test_scenario_path: Path, test_mapping: Dict[str, Test]) -> TestScenario: - test_scenario_parser = TestScenarioParser(test_scenario_path, test_mapping) + def parse_plugins(plugin_tomls: List[Path], test_mapping: Dict[str, Test]) -> Dict[str, TestScenario]: + plugin_mapping = {} + for plugin_path in plugin_tomls: + plugin_scenario = Parser.parse_test_scenario(plugin_path, test_mapping) + plugin_mapping[plugin_scenario.name] = plugin_scenario + return plugin_mapping + + @staticmethod + def parse_test_scenario( + test_scenario_path: Path, + test_mapping: Dict[str, Test], + plugin_mapping: Optional[Dict[str, TestScenario]] = None, + ) -> TestScenario: + if plugin_mapping is None: + plugin_mapping = {} + + test_scenario_parser = TestScenarioParser(test_scenario_path, test_mapping, plugin_mapping) test_scenario = test_scenario_parser.parse() return test_scenario diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 384c22c5..7c017ce5 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -60,6 +60,7 @@ def 
test_slurm(tmp_path: Path, scenario: Dict): system_config=Path("conf/common/system/example_slurm_cluster.toml"), test_templates_dir=Path("conf/common/test_template"), tests_dir=Path("conf/common/test"), + plugin_dir=Path("conf/common/plugin"), test_scenario=test_scenario_path, output_dir=tmp_path, ) diff --git a/tests/test_parser.py b/tests/test_parser.py index d35896a9..cb809d36 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -34,7 +34,7 @@ def parser(self, tmp_path: Path) -> Parser: def test_no_tests_dir(self, parser: Parser): tests_dir = parser.system_config_path.parent / "tests" with pytest.raises(FileNotFoundError) as exc_info: - parser.parse(tests_dir, None) + parser.parse(tests_dir, None, None) assert "Test path" in str(exc_info.value) @patch("cloudai._core.test_parser.TestParser.parse_all") @@ -50,19 +50,85 @@ def test_no_scenario(self, test_parser: Mock, parser: Parser): @patch("cloudai._core.test_parser.TestParser.parse_all") @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") - def test_scenario_filters_tests(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): + def test_scenario_without_plugin(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): tests_dir = parser.system_config_path.parent.parent / "test" + fake_tests = [] for i in range(3): fake_tests.append(Mock()) fake_tests[-1].name = f"test-{i}" test_parser.return_value = fake_tests + fake_scenario = Mock() fake_scenario.test_runs = [Mock()] fake_scenario.test_runs[0].test.name = "test-1" test_scenario_parser.return_value = fake_scenario + _, tests, _ = parser.parse(tests_dir, Path()) + + assert len(tests) == 1 + assert tests[0].name == "test-1" + + @patch("cloudai._core.test_parser.TestParser.parse_all") + @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") + @patch("cloudai.parser.Parser.parse_plugins") + def test_scenario_with_plugin_common_tests( + self, parse_plugins: Mock, test_scenario_parser: Mock, test_parser: Mock, parser: Parser + ): + tests_dir = parser.system_config_path.parent.parent / "test" + + fake_tests = [] + for i in range(3): + fake_tests.append(Mock()) + fake_tests[-1].name = f"test-{i}" + test_parser.return_value = fake_tests + + fake_scenario = Mock() + fake_scenario.test_runs = [Mock()] + fake_scenario.test_runs[0].test.name = "test-1" + test_scenario_parser.return_value = fake_scenario + + fake_plugin = Mock() + fake_plugin.test_runs = [Mock()] + fake_plugin.test_runs[0].test.name = "test-1" + parse_plugins.return_value = {"plugin-1": fake_plugin} + + _, tests, _ = parser.parse(tests_dir, Path(), Path()) + assert len(tests) == 1 + assert tests[0].name == "test-1" + + @patch("cloudai._core.test_parser.TestParser.parse_all") + @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") + @patch("cloudai.parser.Parser.parse_plugins") + def test_scenario_with_plugin_exclusive_tests( + self, parse_plugins: Mock, test_scenario_parser: Mock, test_parser: Mock, parser: Parser + ): + tests_dir = parser.system_config_path.parent.parent / "test" + + fake_tests = [] + for i in range(4): + fake_tests.append(Mock()) + fake_tests[-1].name = f"test-{i}" + test_parser.return_value = fake_tests + + fake_scenario = Mock() + fake_scenario.test_runs = [Mock()] + fake_scenario.test_runs[0].test.name = "test-1" + test_scenario_parser.return_value = fake_scenario + + fake_plugin = Mock() + fake_plugin.test_runs = [Mock()] + fake_plugin.test_runs[0].test.name = "test-2" + parse_plugins.return_value = {"plugin-1": fake_plugin} 
+ + _, tests, _ = parser.parse(tests_dir, Path(), Path()) + + assert len(tests) == 2 + assert "test-1" in [t.name for t in tests] + assert "test-2" in [t.name for t in tests] + assert "test-0" not in [t.name for t in tests] + assert "test-3" not in [t.name for t in tests] def test_parse_system(self, parser: Parser): parser.system_config_path = Path("conf/common/system/example_slurm_cluster.toml") diff --git a/tests/test_test_scenario.py b/tests/test_test_scenario.py index 6e2e1504..87a96aaf 100644 --- a/tests/test_test_scenario.py +++ b/tests/test_test_scenario.py @@ -27,7 +27,7 @@ @pytest.fixture def test_scenario_parser(tmp_path: Path) -> TestScenarioParser: - tsp = TestScenarioParser(Path(""), {}) + tsp = TestScenarioParser(Path(""), {}, {}) return tsp From 3bc3822fad704bea61615d5bcd2e998ac9b3e924 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 25 Oct 2024 06:28:41 -0400 Subject: [PATCH 08/64] Generate plugin commands --- src/cloudai/_core/command_gen_strategy.py | 26 ++++ src/cloudai/_core/test_template.py | 34 +++++ .../nccl_test/slurm_command_gen_strategy.py | 5 + .../strategy/slurm_command_gen_strategy.py | 110 +++++++++++++- tests/ref_data/gpt.sbatch | 2 +- tests/ref_data/grok.sbatch | 2 +- tests/ref_data/nccl.sbatch | 21 +-- tests/ref_data/sleep.sbatch | 4 +- tests/ref_data/ucc.sbatch | 10 +- .../test_common_slurm_command_gen_strategy.py | 137 +++++++++++++++++- 10 files changed, 312 insertions(+), 39 deletions(-) diff --git a/src/cloudai/_core/command_gen_strategy.py b/src/cloudai/_core/command_gen_strategy.py index 16bd04f9..9c8bb389 100644 --- a/src/cloudai/_core/command_gen_strategy.py +++ b/src/cloudai/_core/command_gen_strategy.py @@ -39,3 +39,29 @@ def gen_exec_command(self, tr: TestRun) -> str: str: The generated execution command. """ pass + + @abstractmethod + def gen_srun_command(self, tr: TestRun) -> str: + """ + Generate the Slurm srun command for a test based on the given parameters. + + Args: + tr (TestRun): Contains the test and its run-specific configurations. + + Returns: + str: The generated Slurm srun command. + """ + pass + + @abstractmethod + def gen_srun_success_check(self, tr: TestRun) -> str: + """ + Generate the Slurm success check command to verify if a test run was successful. + + Args: + tr (TestRun): Contains the test and its run-specific configurations. + + Returns: + str: The generated command to check the success of the test run. + """ + pass diff --git a/src/cloudai/_core/test_template.py b/src/cloudai/_core/test_template.py index c0227d3b..0b90b737 100644 --- a/src/cloudai/_core/test_template.py +++ b/src/cloudai/_core/test_template.py @@ -133,6 +133,40 @@ def gen_exec_command(self, tr: TestRun) -> str: ) return self.command_gen_strategy.gen_exec_command(tr) + def gen_srun_command(self, tr: TestRun) -> str: + """ + Generate an Slurm srun command for a test using the provided command generation strategy. + + Args: + tr (TestRun): Contains the test and its run-specific configurations. + + Returns: + str: The generated Slurm srun command. + """ + if self.command_gen_strategy is None: + raise ValueError( + "command_gen_strategy is missing. Ensure the strategy is registered in the Registry " + "by calling the appropriate registration function for the system type." + ) + return self.command_gen_strategy.gen_srun_command(tr) + + def gen_srun_success_check(self, tr: TestRun) -> str: + """ + Generate a Slurm success check command for a test using the provided command generation strategy. 
+ + Args: + tr (TestRun): Contains the test and its run-specific configurations. + + Returns: + str: The generated command to check the success of the test run. + """ + if self.command_gen_strategy is None: + raise ValueError( + "command_gen_strategy is missing. Ensure the strategy is registered in the Registry " + "by calling the appropriate registration function for the system type." + ) + return self.command_gen_strategy.gen_srun_success_check(tr) + def gen_json(self, tr: TestRun) -> Dict[Any, Any]: """ Generate a JSON string representing the Kubernetes job specification for this test using this template. diff --git a/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py index e982c28a..b63ab35c 100644 --- a/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py @@ -17,6 +17,7 @@ from pathlib import Path from typing import Any, Dict, List +from cloudai import TestRun from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy from .slurm_install_strategy import NcclTestSlurmInstallStrategy @@ -83,3 +84,7 @@ def generate_test_command( srun_command_parts.append(extra_cmd_args) return srun_command_parts + + def gen_srun_success_check(self, tr: TestRun) -> str: + output_file = Path(tr.output_path) / "stdout.txt" + return f'grep -q "Avg bus bandwidth" {output_file} && echo 1 || echo 0' diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index 3b7a0649..d60a16ef 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -18,7 +18,7 @@ from pathlib import Path from typing import Any, Dict, List -from cloudai import CommandGenStrategy, TestRun +from cloudai import CommandGenStrategy, TestRun, TestScenario from cloudai.systems import SlurmSystem from cloudai.util.docker_image_cache_manager import DockerImageCacheManager @@ -63,8 +63,35 @@ def gen_exec_command(self, tr: TestRun) -> str: slurm_args = self._parse_slurm_args( tr.test.test_template.__class__.__name__, env_vars, cmd_args, tr.num_nodes, tr.nodes ) - srun_command = self._gen_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) - return self._write_sbatch_script(slurm_args, env_vars, srun_command, tr.output_path) + + if tr.prologue: + prologue_command = self.gen_prologue(tr.prologue, tr.output_path) + srun_command = self._gen_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) + command_list = [prologue_command, "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", f" {srun_command}"] + + if tr.epilogue: + epilogue_command = self.gen_epilogue(tr.epilogue, tr.output_path) + command_list.append(f" {epilogue_command}") + + command_list.append("fi") + else: + srun_command = self._gen_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) + command_list = [srun_command] + + if tr.epilogue: + epilogue_command = self.gen_epilogue(tr.epilogue, tr.output_path) + command_list.append(epilogue_command) + + full_command = "\n".join(command_list).strip() + return self._write_sbatch_script(slurm_args, env_vars, full_command, tr.output_path) + + def gen_srun_command(self, tr: TestRun) -> str: + env_vars = self._override_env_vars(self.system.global_env_vars, tr.test.extra_env_vars) + cmd_args = self._override_cmd_args(self.default_cmd_args, 
tr.test.cmd_args) + slurm_args = self._parse_slurm_args( + tr.test.test_template.__class__.__name__, env_vars, cmd_args, tr.num_nodes, tr.nodes + ) + return self._gen_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) def _parse_slurm_args( self, @@ -112,12 +139,87 @@ def job_name(self, job_name_prefix: str) -> str: job_name = f"{self.system.account}-{job_name_prefix}.{datetime.now().strftime('%Y%m%d_%H%M%S')}" return job_name + def gen_prologue(self, prologue: TestScenario, base_output_path: Path) -> str: + """ + Generate the prologue command by running all tests defined in the prologue test scenario. + + Args: + prologue (TestScenario): The prologue test scenario containing the tests to be run. + base_output_path (Path): The base output directory path for storing prologue outputs. + + Returns: + str: A string with all the Slurm srun commands generated for the prologue. + """ + if not prologue.test_runs: + return "PROLOGUE_SUCCESS=1\n" + + prologue_output_dir = base_output_path / "prologue" + prologue_output_dir.mkdir(parents=True, exist_ok=True) + + prologue_commands = [] + success_vars = [] + + for idx, tr in enumerate(prologue.test_runs): + plugin_dir = prologue_output_dir / tr.test.name + plugin_dir.mkdir(parents=True, exist_ok=True) + tr.output_path = plugin_dir + + srun_command = tr.test.test_template.gen_srun_command(tr) + srun_command_with_output = srun_command.replace( + "srun ", f"srun --output={plugin_dir / 'stdout.txt'} --error={plugin_dir / 'stderr.txt'} " + ) + prologue_commands.append(srun_command_with_output) + + success_var = f"SUCCESS_{idx}" + success_vars.append(success_var) + + success_check_command = tr.test.test_template.gen_srun_success_check(tr) + prologue_commands.append(f"{success_var}=$({success_check_command})") + + combined_success_var = " && ".join([f"[ ${var} -eq 1 ]" for var in success_vars]) + + prologue_commands.append(f"PROLOGUE_SUCCESS=$( {combined_success_var} && echo 1 || echo 0 )") + + return "\n".join(prologue_commands) + + def gen_epilogue(self, epilogue: TestScenario, base_output_path: Path) -> str: + """ + Generate the epilogue command by running all tests defined in the epilogue test scenario. + + Args: + epilogue (TestScenario): The epilogue test scenario containing the tests to be run. + base_output_path (Path): The base output directory path for storing epilogue outputs. + + Returns: + str: A string with all the Slurm srun commands generated for the epilogue. 
+ """ + if not epilogue.test_runs: + return "" + + epilogue_output_dir = base_output_path / "epilogue" + epilogue_output_dir.mkdir(parents=True, exist_ok=True) + + epilogue_commands = [] + + for tr in epilogue.test_runs: + plugin_dir = epilogue_output_dir / tr.test.name + plugin_dir.mkdir(parents=True, exist_ok=True) + tr.output_path = plugin_dir + + srun_command = tr.test.test_template.gen_srun_command(tr) + srun_command_with_output = srun_command.replace( + "srun ", f"srun --output={plugin_dir / 'stdout.txt'} --error={plugin_dir / 'stderr.txt'} " + ) + epilogue_commands.append(srun_command_with_output) + + return "\n".join(epilogue_commands) + def _gen_srun_command( self, slurm_args: Dict[str, Any], env_vars: Dict[str, str], cmd_args: Dict[str, str], extra_cmd_args: str ) -> str: srun_command_parts = self.gen_srun_prefix(slurm_args) test_command_parts = self.generate_test_command(env_vars, cmd_args, extra_cmd_args) - return " \\\n".join(srun_command_parts + test_command_parts) + return " ".join(srun_command_parts + test_command_parts) def gen_srun_prefix(self, slurm_args: Dict[str, Any]) -> List[str]: srun_command_parts = ["srun", f"--mpi={self.system.mpi}"] diff --git a/tests/ref_data/gpt.sbatch b/tests/ref_data/gpt.sbatch index d8789804..ec00c0d1 100644 --- a/tests/ref_data/gpt.sbatch +++ b/tests/ref_data/gpt.sbatch @@ -8,7 +8,7 @@ export COMBINE_THRESHOLD=1 export PER_GPU_COMBINE_THRESHOLD=0 export XLA_FLAGS="--xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD" - echo "Loading container with srun command" +echo "Loading container with srun command" srun --mpi=none --container-image=https:/docker/url --container-name=cont true echo "Running srun command" srun \ diff --git a/tests/ref_data/grok.sbatch b/tests/ref_data/grok.sbatch index 808973bb..8ca5ebbe 100644 --- a/tests/ref_data/grok.sbatch +++ b/tests/ref_data/grok.sbatch @@ -8,7 +8,7 @@ export COMBINE_THRESHOLD=1 export PER_GPU_COMBINE_THRESHOLD=0 export XLA_FLAGS="--xla_disable_hlo_passes=rematerialization --xla_dump_hlo_pass_re=.* --xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_pipelined_all_gather=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_gpu_enable_triton_gemm=false --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_graph_level=0 --xla_gpu_pgle_profile_file_or_directory_path=/opt/paxml/workspace/pgle_output_profile.pbtxt --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD --xla_gpu_run_post_layout_collective_pipeliner=false --xla_gpu_use_memcpy_local_p2p=false" - echo "Loading container with srun command" +echo "Loading container with srun command" srun --mpi=none --container-image=https:/docker/url --container-name=cont true echo "Running srun command" srun \ diff --git a/tests/ref_data/nccl.sbatch b/tests/ref_data/nccl.sbatch index 3ac39077..dc179ba9 100644 --- a/tests/ref_data/nccl.sbatch +++ b/tests/ref_data/nccl.sbatch @@ -8,23 +8,4 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) -srun \ ---mpi=pmix \ 
---container-image=nvcr.io/nvidia/pytorch:24.02-py3 \ -/usr/local/bin/all_reduce_perf_mpi \ ---nthreads 1 \ ---ngpus 1 \ ---minbytes 32M \ ---maxbytes 32M \ ---stepbytes 1M \ ---op sum \ ---datatype float \ ---root 0 \ ---iters 20 \ ---warmup_iters 5 \ ---agg_iters 1 \ ---average 1 \ ---parallel_init 0 \ ---check 1 \ ---blocking 0 \ ---cudagraph 0 \ No newline at end of file +srun --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 diff --git a/tests/ref_data/sleep.sbatch b/tests/ref_data/sleep.sbatch index 7c24ec14..9262001b 100644 --- a/tests/ref_data/sleep.sbatch +++ b/tests/ref_data/sleep.sbatch @@ -8,6 +8,4 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) -srun \ ---mpi=pmix \ -sleep 5 \ No newline at end of file +srun --mpi=pmix sleep 5 diff --git a/tests/ref_data/ucc.sbatch b/tests/ref_data/ucc.sbatch index 74fa7799..a9f9e686 100644 --- a/tests/ref_data/ucc.sbatch +++ b/tests/ref_data/ucc.sbatch @@ -8,12 +8,4 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) -srun \ ---mpi=pmix \ ---container-image=nvcr.io/nvidia/pytorch:24.02-py3 \ -/opt/hpcx/ucc/bin/ucc_perftest \ --c alltoall \ --b 1 \ --e 8M \ --m cuda \ --F \ No newline at end of file +srun --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /opt/hpcx/ucc/bin/ucc_perftest -c alltoall -b 1 -e 8M -m cuda -F diff --git a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py index 36db4473..fb59cc38 100644 --- a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py @@ -19,7 +19,7 @@ import pytest -from cloudai import Test, TestDefinition, TestRun, TestTemplate +from cloudai import Test, TestDefinition, TestRun, TestScenario, TestTemplate from cloudai.systems import SlurmSystem from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy @@ -120,3 +120,138 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): "system configuration. Please ensure that 'default_partition' is set correctly " "in the corresponding system configuration (e.g., system.toml)." 
) in str(exc_info.value) + + +@pytest.mark.parametrize( + "prologue,epilogue,expected_script_lines", + [ + # No prologue, no epilogue + (None, None, ["srun"]), + # One prologue, no epilogue + ( + [Mock(test=Mock(name="test1", test_template=Mock()))], + None, + [ + "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + " srun", + "fi", + ], + ), + # No prologue, one epilogue + ( + None, + [Mock(test=Mock(name="test2", test_template=Mock()))], + [ + "srun", + "epilogue", + ], + ), + # One prologue, one epilogue + ( + [Mock(test=Mock(name="test1", test_template=Mock()))], + [Mock(test=Mock(name="test2", test_template=Mock()))], + [ + "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + " srun", + " epilogue", + "fi", + ], + ), + # Multiple prologues, multiple epilogues + ( + [Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], + [Mock(test=Mock(name="test3", test_template=Mock())), Mock(test=Mock(name="test4", test_template=Mock()))], + [ + "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "SUCCESS_1=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + " srun", + " epilogue", + " epilogue", + "fi", + ], + ), + # Multiple prologues, no epilogue + ( + [Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], + None, + [ + "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "SUCCESS_1=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + " srun", + "fi", + ], + ), + # No prologue, multiple epilogues + ( + None, + [Mock(test=Mock(name="test3", test_template=Mock())), Mock(test=Mock(name="test4", test_template=Mock()))], + [ + "srun", + "epilogue", + "epilogue", + ], + ), + # Multiple prologues, single epilogue + ( + [Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], + [Mock(test=Mock(name="test3", test_template=Mock()))], + [ + "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "SUCCESS_1=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + " srun", + " epilogue", + "fi", + ], + ), + ], +) +def test_prologue_epilogue_combinations( + strategy_fixture: SlurmCommandGenStrategy, + testrun_fixture: TestRun, + prologue, + epilogue, + expected_script_lines, + tmp_path, +): + testrun_fixture.prologue = Mock(spec=TestScenario) if prologue else None + testrun_fixture.epilogue = Mock(spec=TestScenario) if epilogue else None + + if prologue is not None: + testrun_fixture.prologue = Mock(spec=TestScenario) + testrun_fixture.prologue.test_runs = prologue + for idx, run in enumerate(prologue): + run.test.test_template.gen_srun_success_check.return_value = ( + "grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0" + ) + run.test.test_template.gen_srun_command.return_value = 
"srun" + run.test.name = f"test{idx+1}" + else: + testrun_fixture.prologue = None + + if epilogue is not None: + testrun_fixture.epilogue = Mock(spec=TestScenario) + testrun_fixture.epilogue.test_runs = epilogue + for idx, run in enumerate(epilogue): + run.test.test_template.gen_srun_command.return_value = "epilogue" + run.test.name = f"test{idx+1}" + else: + testrun_fixture.epilogue = None + + sbatch_command = strategy_fixture.gen_exec_command(testrun_fixture) + script_file_path = sbatch_command.split()[-1] + + with open(script_file_path, "r") as script_file: + script_content = script_file.read() + + for expected_line in expected_script_lines: + assert expected_line in script_content, f"Expected '{expected_line}' in generated script but it was missing." From 06d4b7dc3d0bcfcf19ac5578e295fddb6bc9a45d Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 25 Oct 2024 11:33:12 -0400 Subject: [PATCH 09/64] Remove plugin option from CLI --- src/cloudai/cli/cli.py | 4 -- tests/test_cli.py | 92 +----------------------------------------- 2 files changed, 2 insertions(+), 94 deletions(-) diff --git a/src/cloudai/cli/cli.py b/src/cloudai/cli/cli.py index f962a2d5..53059799 100644 --- a/src/cloudai/cli/cli.py +++ b/src/cloudai/cli/cli.py @@ -60,7 +60,6 @@ def add_command( handler: Callable[[argparse.Namespace], int], system_config: Optional[bool] = None, tests_dir: Optional[bool] = None, - plugin_dir: Optional[bool] = None, test_scenario: Optional[bool] = None, output_dir: Optional[bool] = None, result_dir: Optional[bool] = None, @@ -75,8 +74,6 @@ def add_command( p.add_argument( "--tests-dir", help="Path to the test configuration directory.", required=tests_dir, type=Path ) - if plugin_dir is not None: - p.add_argument("--plugin-dir", help="Path to the plugin directory.", required=plugin_dir, type=Path) if test_scenario is not None: p.add_argument("--test-scenario", help="Path to the test scenario file.", required=test_scenario, type=Path) if output_dir is not None: @@ -130,7 +127,6 @@ def add_run_and_dry_run(self): handle_dry_run_and_run, system_config=True, tests_dir=True, - plugin_dir=False, test_scenario=True, output_dir=False, ) diff --git a/tests/test_cli.py b/tests/test_cli.py index bb6c1a5d..538e497f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -20,8 +20,8 @@ import pytest -from cloudai.cli import CloudAICLI -from cloudai.cli.handlers import handle_generate_report, handle_install_and_uninstall, handle_verify_all_configs +from cloudai.cli import CloudAICLI, handle_generate_report, handle_install_and_uninstall +from cloudai.cli.handlers import handle_verify_all_configs def test_help_message(capsys: pytest.CaptureFixture[str]) -> None: @@ -108,7 +108,6 @@ def test_add_command_all_optional(): lambda _: 0, system_config=False, tests_dir=False, - plugin_dir=False, test_scenario=False, output_dir=False, ) @@ -119,7 +118,6 @@ def test_add_command_all_optional(): mode="test", system_config=None, tests_dir=None, - plugin_dir=None, test_scenario=None, output_dir=None, ) @@ -134,7 +132,6 @@ def test_add_command_all_required(): lambda _: 0, system_config=True, tests_dir=True, - plugin_dir=True, test_scenario=True, output_dir=True, ) @@ -145,8 +142,6 @@ def test_add_command_all_required(): "system_config", "--tests-dir", "tests_dir", - "--plugin-dir", - "plugin_dir", "--test-scenario", "test_scenario", "--output-dir", @@ -159,91 +154,11 @@ def test_add_command_all_required(): mode="test", system_config=Path("system_config"), 
tests_dir=Path("tests_dir"), - plugin_dir=Path("plugin_dir"), test_scenario=Path("test_scenario"), output_dir=Path("output_dir"), ) -@pytest.mark.parametrize( - "mode,args,expected_plugin_dir", - [ - ( - "run", - [ - "run", - "--system-config", - "system_config", - "--tests-dir", - "tests_dir", - "--plugin-dir", - "plugin_dir", - "--test-scenario", - "test_scenario", - ], - Path("plugin_dir"), - ), - ( - "run", - [ - "run", - "--system-config", - "system_config", - "--tests-dir", - "tests_dir", - "--test-scenario", - "test_scenario", - ], - None, - ), - ( - "dry-run", - [ - "dry-run", - "--system-config", - "system_config", - "--tests-dir", - "tests_dir", - "--plugin-dir", - "plugin_dir", - "--test-scenario", - "test_scenario", - ], - Path("plugin_dir"), - ), - ( - "dry-run", - [ - "dry-run", - "--system-config", - "system_config", - "--tests-dir", - "tests_dir", - "--test-scenario", - "test_scenario", - ], - None, - ), - ], -) -def test_modes_with_or_without_plugin_dir(mode, args, expected_plugin_dir): - cli = CloudAICLI() - - cli.add_command( - mode, - f"{mode} command", - lambda _: 0, - system_config=True, - tests_dir=True, - plugin_dir=False, - test_scenario=True, - output_dir=False, - ) - - parsed_args = cli.parser.parse_args(args) - assert parsed_args.plugin_dir == expected_plugin_dir - - def test_real_uninstall(): cli = CloudAICLI() cli.init_default_args() @@ -362,8 +277,6 @@ def test_run_dry_run_modes(self, cli: CloudAICLI): "tests_dir", "--test-scenario", "test_scenario", - "--plugin-dir", - "plugin_dir", ] ) @@ -373,7 +286,6 @@ def test_run_dry_run_modes(self, cli: CloudAICLI): mode=mode, system_config=Path("system_config"), tests_dir=Path("tests_dir"), - plugin_dir=Path("plugin_dir"), test_scenario=Path("test_scenario"), output_dir=None, ) From 9174f0027709dfc92dde69ab5e85dcfc8eb5c2cb Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 25 Oct 2024 11:38:04 -0400 Subject: [PATCH 10/64] Make plugin directory self-contained --- .../plugin/test/nccl_test_all_gather.toml | 33 +++++++++++++++++++ .../plugin/test/nccl_test_all_reduce.toml | 30 +++++++++++++++++ .../nccl_test_epilogue.toml | 0 .../nccl_test_prologue.toml | 0 4 files changed, 63 insertions(+) create mode 100644 conf/common/plugin/test/nccl_test_all_gather.toml create mode 100644 conf/common/plugin/test/nccl_test_all_reduce.toml rename conf/common/plugin/{ => test_scenario}/nccl_test_epilogue.toml (100%) rename conf/common/plugin/{ => test_scenario}/nccl_test_prologue.toml (100%) diff --git a/conf/common/plugin/test/nccl_test_all_gather.toml b/conf/common/plugin/test/nccl_test_all_gather.toml new file mode 100644 index 00000000..4fec288a --- /dev/null +++ b/conf/common/plugin/test/nccl_test_all_gather.toml @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name = "nccl_test_all_gather" +description = "all_gather" +test_template_name = "NcclTest" + +[cmd_args] +"subtest_name" = "all_gather_perf_mpi" +"ngpus" = "1" +"minbytes" = "128" +"maxbytes" = "4G" +"iters" = "100" +"warmup_iters" = "50" + +[extra_cmd_args] +"--stepfactor" = "2" + +[extra_env_vars] +"NCCL_TEST_SPLIT_MASK" = "0x7" diff --git a/conf/common/plugin/test/nccl_test_all_reduce.toml b/conf/common/plugin/test/nccl_test_all_reduce.toml new file mode 100644 index 00000000..9074b2b8 --- /dev/null +++ b/conf/common/plugin/test/nccl_test_all_reduce.toml @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "nccl_test_all_reduce" +description = "all_reduce" +test_template_name = "NcclTest" + +[cmd_args] +"subtest_name" = "all_reduce_perf_mpi" +"ngpus" = "1" +"minbytes" = "128" +"maxbytes" = "16G" +"iters" = "100" +"warmup_iters" = "50" + +[extra_cmd_args] +"--stepfactor" = "2" diff --git a/conf/common/plugin/nccl_test_epilogue.toml b/conf/common/plugin/test_scenario/nccl_test_epilogue.toml similarity index 100% rename from conf/common/plugin/nccl_test_epilogue.toml rename to conf/common/plugin/test_scenario/nccl_test_epilogue.toml diff --git a/conf/common/plugin/nccl_test_prologue.toml b/conf/common/plugin/test_scenario/nccl_test_prologue.toml similarity index 100% rename from conf/common/plugin/nccl_test_prologue.toml rename to conf/common/plugin/test_scenario/nccl_test_prologue.toml From 7afa73fa20d7eeecc8259d266545f426685c3f19 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 25 Oct 2024 11:36:06 -0400 Subject: [PATCH 11/64] Update Parser to support self-contained plugin directory --- src/cloudai/cli/handlers.py | 4 +- src/cloudai/parser.py | 105 ++++++++++++++++++++++++++---------- tests/test_parser.py | 58 +++++++++++++------- 3 files changed, 117 insertions(+), 50 deletions(-) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index a204f70e..ac42363f 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -114,7 +114,9 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int: args (argparse.Namespace): The parsed command-line arguments. 
""" parser = Parser(args.system_config) - system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario, args.plugin_dir) + system, tests, test_scenario = parser.parse( + args.tests_dir, args.test_scenario, Path("conf/common/plugin/test"), Path("conf/common/plugin/test_scenario") + ) assert test_scenario is not None if args.output_dir: diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index 5b21ab3f..73ab8717 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -49,14 +49,25 @@ def __init__(self, system_config_path: Path) -> None: self.system_config_path = system_config_path def parse( - self, test_path: Path, test_scenario_path: Optional[Path] = None, plugin_path: Optional[Path] = None + self, + test_path: Path, + test_scenario_path: Optional[Path] = None, + plugin_test_path: Optional[Path] = None, + plugin_test_scenario_path: Optional[Path] = None, ) -> Tuple[System, List[Test], Optional[TestScenario]]: """ Parse configurations for system, test templates, and test scenarios. - Returns - Tuple[System, List[TestTemplate], TestScenario]: A tuple containing the system object, a list of test - template objects, and the test scenario object. + Args: + test_path (Path): The file path for tests. + test_scenario_path (Optional[Path]): The file path for the main test scenario. + If None, all tests are included. + plugin_test_path (Optional[Path]): The file path for plugin-specific tests. + plugin_test_scenario_path (Optional[Path]): The file path for plugin-specific test scenarios. + + Returns: + Tuple[System, List[Test], Optional[TestScenario]]: A tuple containing the system object, a list of filtered + test template objects, and the main test scenario object if provided. """ if not test_path.exists(): raise FileNotFoundError(f"Test path '{test_path}' not found.") @@ -64,47 +75,83 @@ def parse( try: system = self.parse_system(self.system_config_path) except SystemConfigParsingError: - exit(1) # exit right away to keep error message readable for users + exit(1) try: tests = self.parse_tests(list(test_path.glob("*.toml")), system) except TestConfigParsingError: - exit(1) # exit right away to keep error message readable for users + exit(1) - logging.debug(f"Parsed {len(tests)} tests: {[t.name for t in tests]}") - test_mapping = {t.name: t for t in tests} + plugin_tests = ( + self.parse_tests(list(plugin_test_path.glob("*.toml")), system) + if plugin_test_path and plugin_test_path.exists() + else [] + ) - test_scenario: Optional[TestScenario] = None - scenario_test_names: Set[str] = set() if test_scenario_path: - plugin_mapping: Dict[str, TestScenario] = {} - plugin_test_names: Set[str] = set() - if plugin_path and plugin_path.exists(): - try: - plugin_mapping = self.parse_plugins(list(plugin_path.glob("*.toml")), test_mapping) - for plugin_scenario in plugin_mapping.values(): - plugin_test_names.update(tr.test.name for tr in plugin_scenario.test_runs) - except TestScenarioParsingError: - exit(1) + return self._parse_with_scenario(system, tests, test_scenario_path, plugin_tests, plugin_test_scenario_path) + + return system, tests + plugin_tests, None + + def _parse_with_scenario( + self, + system: System, + tests: List[Test], + test_scenario_path: Path, + plugin_tests: List[Test], + plugin_test_scenario_path: Optional[Path], + ) -> Tuple[System, List[Test], Optional[TestScenario]]: + """Parse tests and scenarios with a main test scenario path specified.""" + test_mapping = {t.name: t for t in tests} + plugin_test_mapping = {t.name: t for t in plugin_tests} + + 
plugin_test_scenario_mapping = self._load_plugin_scenarios(plugin_test_scenario_path, plugin_test_mapping) + test_scenario = self._load_main_scenario(test_scenario_path, test_mapping, plugin_test_scenario_mapping) + all_used_test_names = self._collect_used_test_names(plugin_test_scenario_mapping, test_scenario) + filtered_tests = [t for t in tests if t.name in all_used_test_names] + + return system, filtered_tests, test_scenario + + def _load_plugin_scenarios( + self, plugin_test_scenario_path: Optional[Path], plugin_test_mapping: Dict[str, Test] + ) -> Dict[str, TestScenario]: + """Load plugin-specific test scenarios from the specified path.""" + if plugin_test_scenario_path and plugin_test_scenario_path.exists(): try: - test_scenario = self.parse_test_scenario(test_scenario_path, test_mapping, plugin_mapping) - scenario_test_names = set(tr.test.name for tr in test_scenario.test_runs) + return self.parse_plugins(list(plugin_test_scenario_path.glob("*.toml")), plugin_test_mapping) except TestScenarioParsingError: exit(1) + return {} - all_used_test_names = plugin_test_names.union(scenario_test_names) - filtered_tests = [t for t in tests if t.name in all_used_test_names] - else: - filtered_tests = tests - - return system, filtered_tests, test_scenario + def _load_main_scenario( + self, + test_scenario_path: Path, + test_mapping: Dict[str, Test], + plugin_test_scenario_mapping: Dict[str, TestScenario], + ) -> Optional[TestScenario]: + """Load the main test scenario using provided mappings.""" + try: + return self.parse_test_scenario(test_scenario_path, test_mapping, plugin_test_scenario_mapping) + except TestScenarioParsingError: + exit(1) + + def _collect_used_test_names( + self, plugin_test_scenario_mapping: Dict[str, TestScenario], test_scenario: Optional[TestScenario] + ) -> Set[str]: + """Collect test names used in both plugin and main test scenarios.""" + # TODO: collect test names in the plugin test scenarios only + plugin_test_names = { + tr.test.name for scenario in plugin_test_scenario_mapping.values() for tr in scenario.test_runs + } + scenario_test_names = {tr.test.name for tr in test_scenario.test_runs} if test_scenario else set() + return plugin_test_names.union(scenario_test_names) @staticmethod def parse_plugins(plugin_tomls: List[Path], test_mapping: Dict[str, Test]) -> Dict[str, TestScenario]: plugin_mapping = {} - for plugin_path in plugin_tomls: - plugin_scenario = Parser.parse_test_scenario(plugin_path, test_mapping) + for plugin_test_scenario_path in plugin_tomls: + plugin_scenario = Parser.parse_test_scenario(plugin_test_scenario_path, test_mapping) plugin_mapping[plugin_scenario.name] = plugin_scenario return plugin_mapping diff --git a/tests/test_parser.py b/tests/test_parser.py index cb809d36..12372755 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -15,13 +15,13 @@ # limitations under the License. 
from pathlib import Path -from typing import cast +from typing import Dict, cast from unittest.mock import Mock, patch import pytest from pydantic_core import ErrorDetails -from cloudai import Parser, format_validation_error +from cloudai import Parser, TestScenario, format_validation_error from cloudai.systems.slurm.slurm_system import SlurmSystem @@ -100,16 +100,14 @@ def test_scenario_with_plugin_common_tests( @patch("cloudai._core.test_parser.TestParser.parse_all") @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") - @patch("cloudai.parser.Parser.parse_plugins") - def test_scenario_with_plugin_exclusive_tests( - self, parse_plugins: Mock, test_scenario_parser: Mock, test_parser: Mock, parser: Parser - ): + def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): tests_dir = parser.system_config_path.parent.parent / "test" + test_scenario_path = Path("/mock/test_scenario.toml") + plugin_test_scenario_path = Path("/mock/plugin_scenarios") - fake_tests = [] - for i in range(4): - fake_tests.append(Mock()) - fake_tests[-1].name = f"test-{i}" + fake_tests = [Mock() for _ in range(4)] + for i, test in enumerate(fake_tests): + test.name = f"test-{i}" test_parser.return_value = fake_tests fake_scenario = Mock() @@ -117,18 +115,38 @@ def test_scenario_with_plugin_exclusive_tests( fake_scenario.test_runs[0].test.name = "test-1" test_scenario_parser.return_value = fake_scenario - fake_plugin = Mock() - fake_plugin.test_runs = [Mock()] - fake_plugin.test_runs[0].test.name = "test-2" - parse_plugins.return_value = {"plugin-1": fake_plugin} + fake_plugin_scenarios = {"plugin-1": Mock(test_runs=[Mock()])} + fake_plugin_scenarios["plugin-1"].test_runs[0].test.name = "test-2" - _, tests, _ = parser.parse(tests_dir, Path(), Path()) + with patch.object(parser, "_load_plugin_scenarios", return_value=fake_plugin_scenarios): + _, filtered_tests, _ = parser.parse(tests_dir, test_scenario_path, tests_dir, plugin_test_scenario_path) + + filtered_test_names = {t.name for t in filtered_tests} + assert len(filtered_tests) == 2 + assert "test-1" in filtered_test_names + assert "test-2" in filtered_test_names + assert "test-0" not in filtered_test_names + assert "test-3" not in filtered_test_names + + def test_collect_used_test_names(self, parser: Parser): + fake_scenario = Mock() + fake_scenario.test_runs = [Mock()] + fake_scenario.test_runs[0].test.name = "test-1" + + fake_plugin_scenario_1 = Mock(spec=TestScenario) + fake_plugin_scenario_1.test_runs = [Mock()] + fake_plugin_scenario_1.test_runs[0].test.name = "test-2" + + fake_plugin_scenario_2 = Mock(spec=TestScenario) + fake_plugin_scenario_2.test_runs = [Mock()] + fake_plugin_scenario_2.test_runs[0].test.name = "test-3" + + fake_plugin_scenarios = cast( + Dict[str, TestScenario], {"plugin-1": fake_plugin_scenario_1, "plugin-2": fake_plugin_scenario_2} + ) - assert len(tests) == 2 - assert "test-1" in [t.name for t in tests] - assert "test-2" in [t.name for t in tests] - assert "test-0" not in [t.name for t in tests] - assert "test-3" not in [t.name for t in tests] + used_test_names = parser._collect_used_test_names(fake_plugin_scenarios, fake_scenario) + assert used_test_names == {"test-1", "test-2", "test-3"} def test_parse_system(self, parser: Parser): parser.system_config_path = Path("conf/common/system/example_slurm_cluster.toml") From 3e45b25b7d6cb2047cf9ad37dc787696630af925 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 28 
Oct 2024 13:35:01 -0400 Subject: [PATCH 12/64] Refactor plugin path handling in parse to use a single plugin_path param --- src/cloudai/cli/handlers.py | 4 +--- src/cloudai/parser.py | 9 +++++---- tests/test_parser.py | 4 ++-- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index ac42363f..3382ff4b 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -114,9 +114,7 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int: args (argparse.Namespace): The parsed command-line arguments. """ parser = Parser(args.system_config) - system, tests, test_scenario = parser.parse( - args.tests_dir, args.test_scenario, Path("conf/common/plugin/test"), Path("conf/common/plugin/test_scenario") - ) + system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario, Path("conf/common/plugin")) assert test_scenario is not None if args.output_dir: diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index 73ab8717..a9227f88 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -52,8 +52,7 @@ def parse( self, test_path: Path, test_scenario_path: Optional[Path] = None, - plugin_test_path: Optional[Path] = None, - plugin_test_scenario_path: Optional[Path] = None, + plugin_path: Optional[Path] = None, ) -> Tuple[System, List[Test], Optional[TestScenario]]: """ Parse configurations for system, test templates, and test scenarios. @@ -62,8 +61,7 @@ def parse( test_path (Path): The file path for tests. test_scenario_path (Optional[Path]): The file path for the main test scenario. If None, all tests are included. - plugin_test_path (Optional[Path]): The file path for plugin-specific tests. - plugin_test_scenario_path (Optional[Path]): The file path for plugin-specific test scenarios. + plugin_path (Optional[Path]): The base file path for plugin-specific tests and scenarios. 
Returns: Tuple[System, List[Test], Optional[TestScenario]]: A tuple containing the system object, a list of filtered @@ -82,6 +80,9 @@ def parse( except TestConfigParsingError: exit(1) + plugin_test_scenario_path = plugin_path + plugin_test_path = plugin_path / "test" if plugin_path else None + plugin_tests = ( self.parse_tests(list(plugin_test_path.glob("*.toml")), system) if plugin_test_path and plugin_test_path.exists() diff --git a/tests/test_parser.py b/tests/test_parser.py index 12372755..bcfd63a3 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -103,7 +103,7 @@ def test_scenario_with_plugin_common_tests( def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): tests_dir = parser.system_config_path.parent.parent / "test" test_scenario_path = Path("/mock/test_scenario.toml") - plugin_test_scenario_path = Path("/mock/plugin_scenarios") + plugin_path = Path("/mock/plugin_scenarios") fake_tests = [Mock() for _ in range(4)] for i, test in enumerate(fake_tests): @@ -119,7 +119,7 @@ def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, fake_plugin_scenarios["plugin-1"].test_runs[0].test.name = "test-2" with patch.object(parser, "_load_plugin_scenarios", return_value=fake_plugin_scenarios): - _, filtered_tests, _ = parser.parse(tests_dir, test_scenario_path, tests_dir, plugin_test_scenario_path) + _, filtered_tests, _ = parser.parse(tests_dir, test_scenario_path, plugin_path) filtered_test_names = {t.name for t in filtered_tests} assert len(filtered_tests) == 2 From 5634f743463bb157bb4b843941f4b7fe4d1c0311 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 28 Oct 2024 13:29:04 -0400 Subject: [PATCH 13/64] Remove test_scenario directory from conf/common/plugin/ --- conf/common/plugin/{test_scenario => }/nccl_test_epilogue.toml | 0 conf/common/plugin/{test_scenario => }/nccl_test_prologue.toml | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename conf/common/plugin/{test_scenario => }/nccl_test_epilogue.toml (100%) rename conf/common/plugin/{test_scenario => }/nccl_test_prologue.toml (100%) diff --git a/conf/common/plugin/test_scenario/nccl_test_epilogue.toml b/conf/common/plugin/nccl_test_epilogue.toml similarity index 100% rename from conf/common/plugin/test_scenario/nccl_test_epilogue.toml rename to conf/common/plugin/nccl_test_epilogue.toml diff --git a/conf/common/plugin/test_scenario/nccl_test_prologue.toml b/conf/common/plugin/nccl_test_prologue.toml similarity index 100% rename from conf/common/plugin/test_scenario/nccl_test_prologue.toml rename to conf/common/plugin/nccl_test_prologue.toml From b7989812201c1c94141b9621dd542d18141377a4 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 09:26:27 -0400 Subject: [PATCH 14/64] Restore comments in src/cloudai/parser.py --- src/cloudai/parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index a9227f88..00f494aa 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -73,12 +73,12 @@ def parse( try: system = self.parse_system(self.system_config_path) except SystemConfigParsingError: - exit(1) + exit(1) # exit right away to keep error message readable for users try: tests = self.parse_tests(list(test_path.glob("*.toml")), system) except TestConfigParsingError: - exit(1) + exit(1) # exit right away to keep error message readable for users 
plugin_test_scenario_path = plugin_path plugin_test_path = plugin_path / "test" if plugin_path else None From da9abbfddc46f308785f3f5852849ffd3bc413a4 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 09:38:47 -0400 Subject: [PATCH 15/64] Remove unused tmp_path from unit tests --- .../test_common_slurm_command_gen_strategy.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py index fb59cc38..dac84a1d 100644 --- a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py @@ -221,7 +221,6 @@ def test_prologue_epilogue_combinations( prologue, epilogue, expected_script_lines, - tmp_path, ): testrun_fixture.prologue = Mock(spec=TestScenario) if prologue else None testrun_fixture.epilogue = Mock(spec=TestScenario) if epilogue else None From 6b11f54c8b033a4801d182c1403950da69a4a09c Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 09:39:19 -0400 Subject: [PATCH 16/64] Set prologue and epilogue to None by default --- src/cloudai/_core/test_scenario_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index c59adeba..c3cbca99 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -166,8 +166,8 @@ def _create_section_test_run( self, test_info: _TestRunTOML, normalized_weight: float, - prologue: Optional[TestScenario], - epilogue: Optional[TestScenario], + prologue: Optional[TestScenario] = None, + epilogue: Optional[TestScenario] = None, ) -> TestRun: """ Create a section-specific Test object by copying from the test mapping. 
From b84c16fef9665de9c0fd4eed91f9de070f06674b Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 09:45:08 -0400 Subject: [PATCH 17/64] Add validation to ensure 'prologue' and 'epilogue' are not empty strings --- src/cloudai/_core/test_scenario_parser.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index c3cbca99..03ac1288 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -54,8 +54,8 @@ class _TestScenarioTOML(BaseModel): name: str job_status_check: bool = True tests: list[_TestRunTOML] = Field(alias="Tests", min_length=1) - prologue: str = "" - epilogue: str = "" + prologue: Optional[str] = None + epilogue: Optional[str] = None @model_validator(mode="after") def check_no_self_dependency(self): @@ -89,6 +89,20 @@ def check_all_dependencies_are_known(self): return self + @model_validator(mode="after") + def check_prologue_not_empty(self): + """Ensure that prologue is not an empty string if provided.""" + if self.prologue == "": + raise ValueError("The 'prologue' field should not be an empty string.") + return self + + @model_validator(mode="after") + def check_epilogue_not_empty(self): + """Ensure that epilogue is not an empty string if provided.""" + if self.epilogue == "": + raise ValueError("The 'epilogue' field should not be an empty string.") + return self + class TestScenarioParser: """ From 8d840cf4b36398d9767b6be1916d563d32e16ae4 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 22 Oct 2024 12:48:01 -0400 Subject: [PATCH 18/64] Reorder SlurmCommandGenStrategy methods --- .../strategy/slurm_command_gen_strategy.py | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index 4a052a47..7dc786f9 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -51,22 +51,12 @@ def __init__(self, system: SlurmSystem, cmd_args: Dict[str, Any]) -> None: self.docker_image_url = self.cmd_args.get("docker_image_url", "") - def _format_env_vars(self, env_vars: Dict[str, Any]) -> str: - """ - Format environment variables for inclusion in a batch script. - - Args: - env_vars (Dict[str, Any]): Environment variables to format. - - Returns: - str: A string representation of the formatted environment variables. 
- """ - formatted_vars = [] - for key in sorted(env_vars.keys()): - value = env_vars[key] - formatted_value = str(value["default"]) if isinstance(value, dict) and "default" in value else str(value) - formatted_vars.append(f"export {key}={formatted_value}") - return "\n".join(formatted_vars) + def gen_exec_command(self, tr: TestRun) -> str: + env_vars = self._override_env_vars(self.system.global_env_vars, tr.test.extra_env_vars) + cmd_args = self._override_cmd_args(self.default_cmd_args, tr.test.cmd_args) + slurm_args = self._parse_slurm_args(tr.test.test_template.__class__.__name__, env_vars, cmd_args, tr) + srun_command = self.generate_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) + return self._write_sbatch_script(slurm_args, env_vars, srun_command, tr.output_path) def _parse_slurm_args( self, job_name_prefix: str, env_vars: Dict[str, str], cmd_args: Dict[str, str], tr: TestRun @@ -127,13 +117,6 @@ def generate_srun_prefix(self, slurm_args: Dict[str, Any]) -> List[str]: return srun_command_parts - def gen_exec_command(self, tr: TestRun) -> str: - env_vars = self._override_env_vars(self.system.global_env_vars, tr.test.extra_env_vars) - cmd_args = self._override_cmd_args(self.default_cmd_args, tr.test.cmd_args) - slurm_args = self._parse_slurm_args(tr.test.test_template.__class__.__name__, env_vars, cmd_args, tr) - srun_command = self.generate_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) - return self._write_sbatch_script(slurm_args, env_vars, srun_command, tr.output_path) - def generate_test_command( self, env_vars: Dict[str, str], cmd_args: Dict[str, str], extra_cmd_args: str ) -> List[str]: @@ -223,3 +206,20 @@ def _append_sbatch_directives( batch_script_content.append( "\nexport SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)" ) + + def _format_env_vars(self, env_vars: Dict[str, Any]) -> str: + """ + Format environment variables for inclusion in a batch script. + + Args: + env_vars (Dict[str, Any]): Environment variables to format. + + Returns: + str: A string representation of the formatted environment variables. 
+ """ + formatted_vars = [] + for key in sorted(env_vars.keys()): + value = env_vars[key] + formatted_value = str(value["default"]) if isinstance(value, dict) and "default" in value else str(value) + formatted_vars.append(f"export {key}={formatted_value}") + return "\n".join(formatted_vars) From 9725b365141bdc905ae62949f5a4075b31a9ab19 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 22 Oct 2024 13:45:01 -0400 Subject: [PATCH 19/64] Rename generate_srun_command to _gen_srun_command --- .../test_template/jax_toolbox/slurm_command_gen_strategy.py | 2 +- .../systems/slurm/strategy/slurm_command_gen_strategy.py | 6 +++--- .../test_common_slurm_command_gen_strategy.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py index ff70b5c4..c5a98509 100644 --- a/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py @@ -146,7 +146,7 @@ def _parse_slurm_args( return base_args - def generate_srun_command( + def _gen_srun_command( self, slurm_args: Dict[str, Any], env_vars: Dict[str, str], cmd_args: Dict[str, Any], extra_cmd_args: str ) -> str: self._create_run_script(env_vars, cmd_args, extra_cmd_args) diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index 7dc786f9..5c33a141 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -98,14 +98,14 @@ def job_name(self, job_name_prefix: str) -> str: job_name = f"{self.system.account}-{job_name_prefix}.{datetime.now().strftime('%Y%m%d_%H%M%S')}" return job_name - def generate_srun_command( + def _gen_srun_command( self, slurm_args: Dict[str, Any], env_vars: Dict[str, str], cmd_args: Dict[str, str], extra_cmd_args: str ) -> str: - srun_command_parts = self.generate_srun_prefix(slurm_args) + srun_command_parts = self.gen_srun_prefix(slurm_args) test_command_parts = self.generate_test_command(env_vars, cmd_args, extra_cmd_args) return " \\\n".join(srun_command_parts + test_command_parts) - def generate_srun_prefix(self, slurm_args: Dict[str, Any]) -> List[str]: + def gen_srun_prefix(self, slurm_args: Dict[str, Any]) -> List[str]: srun_command_parts = ["srun", f"--mpi={self.system.mpi}"] if slurm_args.get("image_path"): srun_command_parts.append(f'--container-image={slurm_args["image_path"]}') diff --git a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py index 37d6a962..f2aae181 100644 --- a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py @@ -54,7 +54,7 @@ def test_filename_generation(strategy_fixture: SlurmCommandGenStrategy, testrun_ env_vars = {"TEST_VAR": "VALUE"} cmd_args = {"test_arg": "test_value"} slurm_args = strategy_fixture._parse_slurm_args(job_name_prefix, env_vars, cmd_args, testrun_fixture) - srun_command = strategy_fixture.generate_srun_command(slurm_args, env_vars, cmd_args, "") + srun_command = strategy_fixture._gen_srun_command(slurm_args, env_vars, cmd_args, "") sbatch_command = strategy_fixture._write_sbatch_script( slurm_args, 
env_vars, srun_command, testrun_fixture.output_path From 5a658c3b44890755e9690521b1756fd2e903828d Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 23 Oct 2024 12:05:42 -0400 Subject: [PATCH 20/64] Remove pre-test implementation from JaxToolbox --- .../jax_toolbox/slurm_command_gen_strategy.py | 93 ---------------- src/cloudai/test_definitions/gpt.py | 5 +- src/cloudai/test_definitions/grok.py | 5 +- src/cloudai/test_definitions/jax_toolbox.py | 34 +----- .../{gpt-no-pretest.sbatch => gpt.sbatch} | 2 +- .../{grok-no-pretest.sbatch => grok.sbatch} | 2 +- ..._jax_toolbox_slurm_command_gen_strategy.py | 105 +----------------- tests/test_acceptance.py | 18 +-- 8 files changed, 16 insertions(+), 248 deletions(-) rename tests/ref_data/{gpt-no-pretest.sbatch => gpt.sbatch} (96%) rename tests/ref_data/{grok-no-pretest.sbatch => grok.sbatch} (98%) diff --git a/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py index c5a98509..b27d878d 100644 --- a/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py @@ -152,25 +152,11 @@ def _gen_srun_command( self._create_run_script(env_vars, cmd_args, extra_cmd_args) commands = [] - - run_pre_test = cmd_args.get("pre_test.enable", False) - - if run_pre_test: - output_path = Path(cmd_args["output_path"]).resolve() / "output_pretest-%j-%n-%t.txt" - error_path = Path(cmd_args["output_path"]).resolve() / "error_pretest-%j-%n-%t.txt" - commands.append(self._generate_pre_test_command(cmd_args, output_path, error_path)) - commands.append(self._generate_pre_test_check_command(cmd_args, output_path)) - commands.append('if [ "$PRE_TEST_SUCCESS" = true ]; then') - load_container = cmd_args.get("load_container", False) if load_container: commands += self._generate_container_load_command(slurm_args) - commands += self._generate_run_command(slurm_args) - if run_pre_test: - commands.append("fi") - return "\n".join(commands) def _create_run_script( @@ -341,85 +327,6 @@ def _create_pgo_nsys_converter_command(self, stage: str, cmd_args: Dict[str, str ["", 'if [ "$SLURM_NODEID" -eq 0 ] && [ "$SLURM_PROCID" -eq 0 ]; then', f" {command}", "fi"] ) - def _generate_pre_test_command(self, cmd_args: Dict[str, Any], output_path: Path, error_path: Path) -> str: - """ - Generate the pre-test command for running a test. - - This method constructs the pre-test command based on the command-line - arguments provided. - - Args: - cmd_args (Dict[str, Any]): A dictionary containing command arguments. - output_path (Path): The path to the output file. - error_path (Path): The path to the error file. - - Returns: - str: The generated pre-test command. - """ - nccl_test_prefix = "pre_test.nccl_test." 
- nccl_test = {} - - for key, value in cmd_args.items(): - if key.startswith(nccl_test_prefix): - flag_name = key[len(nccl_test_prefix) :] - nccl_test[flag_name] = value - pre_test_command_parts = [ - "srun", - "--mpi=pmix", - f"-N {nccl_test.get('num_nodes', 2)}", - f"-o {output_path}", - f"-e {error_path}", - f"--container-image={nccl_test.get('docker_image_url', 'nvcr.io/nvidia/pytorch:24.02-py3')}", - f"/usr/local/bin/{nccl_test.get('subtest_name', 'all_gather_perf_mpi')}", - f"--nthreads {nccl_test.get('nthreads', 1)}", - f"--ngpus {nccl_test.get('ngpus', 1)}", - f"--minbytes {nccl_test.get('minbytes', '32M')}", - f"--maxbytes {nccl_test.get('maxbytes', '16G')}", - f"--stepbytes {nccl_test.get('stepbytes', '1M')}", - f"--op {nccl_test.get('op', 'sum')}", - f"--datatype {nccl_test.get('datatype', 'float')}", - f"--root {nccl_test.get('root', 0)}", - f"--iters {nccl_test.get('iters', 20)}", - f"--warmup_iters {nccl_test.get('warmup_iters', 5)}", - f"--agg_iters {nccl_test.get('agg_iters', 1)}", - f"--average {nccl_test.get('average', 1)}", - f"--parallel_init {nccl_test.get('parallel_init', 0)}", - f"--check {nccl_test.get('check', 1)}", - f"--blocking {nccl_test.get('blocking', 0)}", - f"--cudagraph {nccl_test.get('cudagraph', 0)}", - f"--stepfactor {nccl_test.get('stepfactor', 2)}", - ] - return " \\\n".join(pre_test_command_parts) - - def _generate_pre_test_check_command(self, cmd_args: Dict[str, str], output_path: Path) -> str: - """ - Generate the command for pre-test check. - - This method generates the command that checks the output of the pre-test to determine if the main test should - be run. - - Args: - cmd_args (Dict[str, str]): Command-line arguments for the job. - output_path (str): The path to the output file. - - Returns: - str: The generated command for pre-test check. 
- """ - pretest_output_files = str(Path(output_path).parent / "output_pretest-*.txt") - keyword = cmd_args.get("keyword", "Avg bus bandwidth") - - return "\n".join( - [ - f'PRETEST_OUTPUT_FILES="{pretest_output_files}"', - f'keyword="{keyword}"', - "", - "# Use grep to search for the keyword in the files", - 'if grep -q "$keyword" $PRETEST_OUTPUT_FILES; then', - " PRE_TEST_SUCCESS=true", - "fi", - ] - ) - def _generate_container_load_command(self, slurm_args: Dict[str, Any]) -> List[str]: """Generate the command for loading a container.""" container_image = slurm_args.get("image_path") diff --git a/src/cloudai/test_definitions/gpt.py b/src/cloudai/test_definitions/gpt.py index ff1e8f1e..353d97fe 100644 --- a/src/cloudai/test_definitions/gpt.py +++ b/src/cloudai/test_definitions/gpt.py @@ -21,7 +21,7 @@ from cloudai import Installable from cloudai.installer.installables import DockerImage -from .jax_toolbox import JaxFdl, JaxToolboxCmdArgs, JaxToolboxTestDefinition, PreTest, SetupFlags, XLAFlags +from .jax_toolbox import JaxFdl, JaxToolboxCmdArgs, JaxToolboxTestDefinition, SetupFlags, XLAFlags class GPTFdl(JaxFdl): @@ -48,7 +48,6 @@ class GPTCmdArgs(JaxToolboxCmdArgs): fdl_config: str fdl: GPTFdl = Field(default_factory=GPTFdl) - pre_test: PreTest = Field(default_factory=PreTest) xla_flags: GPTXLAFlags = Field(default_factory=GPTXLAFlags) setup_flags: GPTSetupFlags = Field(default_factory=GPTSetupFlags) @@ -64,7 +63,7 @@ def cmd_args_dict(self): d = self.cmd_args.model_dump() res = {} for k, v in d.items(): - if k in {"pre_test", "docker_image_url", "load_container", "output_path"}: + if k in {"docker_image_url", "load_container", "output_path"}: res[k] = v else: if k == "xla_flags": diff --git a/src/cloudai/test_definitions/grok.py b/src/cloudai/test_definitions/grok.py index c87c6e44..88a358be 100644 --- a/src/cloudai/test_definitions/grok.py +++ b/src/cloudai/test_definitions/grok.py @@ -21,7 +21,7 @@ from cloudai import Installable from cloudai.installer.installables import DockerImage -from .jax_toolbox import JaxFdl, JaxToolboxCmdArgs, JaxToolboxTestDefinition, PreTest, SetupFlags, XLAFlags +from .jax_toolbox import JaxFdl, JaxToolboxCmdArgs, JaxToolboxTestDefinition, SetupFlags, XLAFlags class GrokFdl(JaxFdl): @@ -77,7 +77,6 @@ class GrokCmdArgs(JaxToolboxCmdArgs): setup_flags: SetupFlags = Field(default_factory=SetupFlags) profile: GrokProfileXLAFlags = Field(default_factory=GrokProfileXLAFlags) perf: GrokPerfXLAFlags = Field(default_factory=GrokPerfXLAFlags) - pre_test: PreTest = Field(default_factory=PreTest) class GrokTestDefinition(JaxToolboxTestDefinition): @@ -97,7 +96,7 @@ def cmd_args_dict(self): if k in {"profile", "perf"}: res.setdefault(f"Grok.{k}", {}) res[f"Grok.{k}"]["XLA_FLAGS"] = v - elif k in {"pre_test", "docker_image_url", "load_container", "output_path"}: + elif k in {"docker_image_url", "load_container", "output_path"}: res[k] = v else: res[f"Grok.{k}"] = v diff --git a/src/cloudai/test_definitions/jax_toolbox.py b/src/cloudai/test_definitions/jax_toolbox.py index 079e5b4e..4593028a 100644 --- a/src/cloudai/test_definitions/jax_toolbox.py +++ b/src/cloudai/test_definitions/jax_toolbox.py @@ -14,12 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any, Optional +from typing import Optional -from pydantic import BaseModel, ConfigDict, Field, field_serializer +from pydantic import BaseModel, ConfigDict, field_serializer from cloudai import CmdArgs, TestDefinition -from cloudai.test_definitions.nccl import NCCLCmdArgs class JaxFdl(BaseModel): @@ -54,35 +53,6 @@ def checkpoint_policy_serializer(self, value: str) -> str: return f'\\"{value}\\"' -class NCCLCmdAgrsPreTest(NCCLCmdArgs): - """NCCL pre-test command arguments.""" - - num_nodes: int = 8 - stepfactor: int = 2 - minbytes: str = "8M" - maxbytes: str = "16G" - blocking: int = 1 - - def model_post_init(self, _: Any) -> None: - self.subtest_name = "all_gather_perf_mpi" - self.docker_image_url = "nvcr.io/nvidia/pytorch:24.02-py3" - - -class PreTest(BaseModel): - """Pre-test configuration.""" - - model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) - enable: bool = True - nccl_test: NCCLCmdAgrsPreTest = Field(default_factory=NCCLCmdAgrsPreTest) - - -class NCCLPreTest(BaseModel): - """Pre-test configuration.""" - - model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) - nccl_test: Optional[NCCLCmdAgrsPreTest] = None - - class JaxToolboxCmdArgs(CmdArgs): """JAX Toolbox test command arguments.""" diff --git a/tests/ref_data/gpt-no-pretest.sbatch b/tests/ref_data/gpt.sbatch similarity index 96% rename from tests/ref_data/gpt-no-pretest.sbatch rename to tests/ref_data/gpt.sbatch index edc4d19c..3cd84b21 100644 --- a/tests/ref_data/gpt-no-pretest.sbatch +++ b/tests/ref_data/gpt.sbatch @@ -19,4 +19,4 @@ export XLA_FLAGS="--xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOL -e __OUTPUT_DIR__/error-%j-%n-%t.txt \ --container-name=cont \ --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ - /opt/paxml/workspace/run.sh \ No newline at end of file + /opt/paxml/workspace/run.sh diff --git a/tests/ref_data/grok-no-pretest.sbatch b/tests/ref_data/grok.sbatch similarity index 98% rename from tests/ref_data/grok-no-pretest.sbatch rename to tests/ref_data/grok.sbatch index a8274477..f5d32243 100644 --- a/tests/ref_data/grok-no-pretest.sbatch +++ b/tests/ref_data/grok.sbatch @@ -19,4 +19,4 @@ export XLA_FLAGS="--xla_disable_hlo_passes=rematerialization --xla_dump_hlo_pass -e __OUTPUT_DIR__/error-%j-%n-%t.txt \ --container-name=cont \ --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ - /opt/paxml/workspace/run.sh \ No newline at end of file + /opt/paxml/workspace/run.sh diff --git a/tests/slurm_command_gen_strategy/test_jax_toolbox_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_jax_toolbox_slurm_command_gen_strategy.py index 5db0d1bd..131e4a55 100644 --- a/tests/slurm_command_gen_strategy/test_jax_toolbox_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_jax_toolbox_slurm_command_gen_strategy.py @@ -25,7 +25,7 @@ from cloudai.systems import SlurmSystem from cloudai.test_definitions.gpt import GPTCmdArgs, GPTTestDefinition from cloudai.test_definitions.grok import GrokCmdArgs, GrokTestDefinition -from cloudai.test_definitions.jax_toolbox import JaxFdl, PreTest +from cloudai.test_definitions.jax_toolbox import JaxFdl class TestJaxToolboxSlurmCommandGenStrategy: @@ -63,7 +63,6 @@ def test_gen_exec_command( test_fixture, ) -> None: test_def = request.getfixturevalue(test_fixture) - test_def.cmd_args.pre_test = PreTest(enable=True) test = Test(test_definition=test_def, test_template=JaxToolbox(slurm_system, "name")) test_run = TestRun( @@ -74,14 +73,10 @@ def test_gen_exec_command( 
name="test-job", ) - cmd_gen_strategy._generate_pre_test_command = MagicMock(return_value="pre_test_command") cmd = cmd_gen_strategy.gen_exec_command(test_run) assert cmd == f"sbatch {test_run.output_path}/cloudai_sbatch_script.sh" assert (test_run.output_path / "run.sh").exists() - content = Path(f"{test_run.output_path}/cloudai_sbatch_script.sh").read_text() - assert "pre_test_command" in content - @pytest.mark.parametrize( "cmd_args, expected", [ @@ -215,100 +210,6 @@ def test_generate_python_command( "fi", ] - def test_generate_pre_test_command( - self, cmd_gen_strategy: JaxToolboxSlurmCommandGenStrategy, grok_test: GrokTestDefinition, tmp_path: Path - ) -> None: - grok_test.cmd_args.pre_test = PreTest(enable=True) - - nccl_test = grok_test.cmd_args.pre_test.nccl_test - nccl_test.num_nodes = 2 - nccl_test.minbytes = "32M" - nccl_test.blocking = 0 - - cargs = {"output_path": str(tmp_path), **grok_test.cmd_args_dict} - - pre_test_cli = cmd_gen_strategy._generate_pre_test_command(cargs, tmp_path, tmp_path).splitlines() - - expected_pre_test_cli = [ - "srun \\", - "--mpi=pmix \\", - f"-N {nccl_test.num_nodes} \\", - f"-o {tmp_path} \\", - f"-e {tmp_path} \\", - f"--container-image={nccl_test.docker_image_url} \\", - f"/usr/local/bin/{nccl_test.subtest_name} \\", - f"--nthreads {nccl_test.nthreads} \\", - f"--ngpus {nccl_test.ngpus} \\", - f"--minbytes {nccl_test.minbytes} \\", - f"--maxbytes {nccl_test.maxbytes} \\", - f"--stepbytes {nccl_test.stepbytes} \\", - f"--op {nccl_test.op} \\", - f"--datatype {nccl_test.datatype} \\", - f"--root {nccl_test.root} \\", - f"--iters {nccl_test.iters} \\", - f"--warmup_iters {nccl_test.warmup_iters} \\", - f"--agg_iters {nccl_test.agg_iters} \\", - f"--average {nccl_test.average} \\", - f"--parallel_init {nccl_test.parallel_init} \\", - f"--check {nccl_test.check} \\", - f"--blocking {nccl_test.blocking} \\", - f"--cudagraph {nccl_test.cudagraph} \\", - f"--stepfactor {nccl_test.stepfactor}", - ] - - assert pre_test_cli == expected_pre_test_cli, ( - "The generated pre-test command did not match the expected command.\n" - f"Expected: {expected_pre_test_cli}\n" - f"Actual: {pre_test_cli}" - ) - - def test_generate_srun_command(self, slurm_system, cmd_gen_strategy, grok_test): - cmd_gen_strategy.test_name = grok_test.name - Path("/tmp/output").mkdir(parents=True, exist_ok=True) - - output_path = Path("/tmp/output/output") - output_path.mkdir(parents=True, exist_ok=True) - - # Use the existing setup for mocking internal methods - cmd_gen_strategy._generate_pre_test_command = MagicMock(return_value="srun --mpi=none pre_test_command") - cmd_gen_strategy._generate_run_command = MagicMock(return_value="srun --mpi=none run_command") - cmd_gen_strategy._generate_container_load_command = MagicMock( - return_value="srun --mpi=none container_load_command" - ) - - slurm_args = { - "output": "/tmp/output/output-%j.txt", - "error": "/tmp/output/error-%j.txt", - "image_path": "fake_image_url", - "container_mounts": "/tmp/output:/workspace", - } - cmd_args = { - "output_path": "/tmp/output", - "pre_test": {"enable": True}, - f"{grok_test.name}.setup_flags.docker_workspace_dir": "/workspace/docker", - f"{grok_test.name}.setup_flags.tfds_data_dir": "/workspace/tfds", - f"{grok_test.name}.setup_flags.enable_checkpoint_saving": True, - } - - pre_test_command = cmd_gen_strategy._generate_pre_test_command( - cmd_args, Path("/tmp/output"), Path("/tmp/output") - ) - run_command = cmd_gen_strategy._generate_run_command(slurm_args) - container_load_command = 
cmd_gen_strategy._generate_container_load_command(slurm_args) - - result_command = f"{pre_test_command}\n{container_load_command}\n{run_command}" - - # Assert expected parts of the command are in the generated result - assert "pre_test_command" in result_command - assert "container_load_command" in result_command - assert "run_command" in result_command - assert "srun" in result_command - assert "--mpi=none" in result_command - - cmd_gen_strategy._generate_pre_test_command.assert_called_once() - cmd_gen_strategy._generate_run_command.assert_called_once() - cmd_gen_strategy._generate_container_load_command.assert_called_once() - def test_gpt_test_definition_cmd_args_dict(): gpt = GPTTestDefinition( @@ -324,7 +225,7 @@ def test_gpt_test_definition_cmd_args_dict(): assert "GPT.setup_flags" in cargs assert "GPT.XLA_FLAGS" in cargs - for k in {"pre_test", "docker_image_url", "load_container"}: + for k in {"docker_image_url", "load_container"}: assert k in cargs assert f"GPT.{k}" not in cargs @@ -348,7 +249,7 @@ def test_grok_test_definition_cmd_args_dict(): assert "Grok.perf" in cargs assert "XLA_FLAGS" in cargs["Grok.perf"] - for k in {"pre_test", "docker_image_url", "load_container"}: + for k in {"docker_image_url", "load_container"}: assert k in cargs assert f"Grok.{k}" not in cargs diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index be5f1299..e18d5d60 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -90,7 +90,7 @@ def partial_tr(slurm_system: SlurmSystem) -> partial[TestRun]: return partial(TestRun, num_nodes=1, nodes=[], output_path=slurm_system.output_path) -@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt-pretest", "gpt-no-pretest", "grok-pretest", "grok-no-pretest"]) +@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt", "grok"]) def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -> tuple[TestRun, str, Optional[str]]: if request.param == "ucc": tr = partial_tr( @@ -140,7 +140,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") return (tr, "sleep.sbatch", None) - elif request.param.startswith("gpt-"): + elif request.param.startswith("gpt"): tr = partial_tr( name="gpt", test=Test( @@ -158,13 +158,9 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict ) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - if "no-pretest" in request.param: - tr.test.test_definition.cmd_args.pre_test.enable = False - else: - tr.test.test_definition.cmd_args.pre_test.enable = True return (tr, f"{request.param}.sbatch", "gpt.run") - elif request.param.startswith("grok-"): + elif request.param.startswith("grok"): tr = partial_tr( name="grok", test=Test( @@ -182,10 +178,6 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict ) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - if "no-pretest" in request.param: - tr.test.test_definition.cmd_args.pre_test.enable = False - else: - tr.test.test_definition.cmd_args.pre_test.enable = True return (tr, f"{request.param}.sbatch", "grok.run") @@ -199,8 +191,8 @@ def test_sbatch_generation(slurm_system: SlurmSystem, test_req: tuple[TestRun, s sbatch_script = tr.test.test_template.gen_exec_command(tr).split()[-1] - curr = Path(sbatch_script).read_text() - 
ref = (Path(__file__).parent / "ref_data" / test_req[1]).read_text() + curr = Path(sbatch_script).read_text().strip() + ref = (Path(__file__).parent / "ref_data" / test_req[1]).read_text().strip() ref = ref.replace("__OUTPUT_DIR__", str(slurm_system.output_path)).replace("__JOB_NAME__", "job_name") assert curr == ref From cac548409e262164f74fdd8b7c336bc6f038c2f6 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 23 Oct 2024 07:24:23 -0400 Subject: [PATCH 21/64] Add prologue and epilogue to _TestScenarioTOML --- src/cloudai/_core/test_scenario_parser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index 08526dca..16302a2a 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -54,6 +54,8 @@ class _TestScenarioTOML(BaseModel): name: str job_status_check: bool = True tests: list[_TestRunTOML] = Field(alias="Tests", min_length=1) + prologue: str = "" + epilogue: str = "" @model_validator(mode="after") def check_no_self_dependency(self): From aab165bb1e31f75d84dce21815947e4828e56b08 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 22 Oct 2024 14:50:35 -0400 Subject: [PATCH 22/64] Add example plugin files --- conf/common/plugin/nccl_test_epilogue.toml | 22 ++++++++++++++++++++++ conf/common/plugin/nccl_test_prologue.toml | 22 ++++++++++++++++++++++ conf/common/test_scenario/nccl_test.toml | 4 ++++ 3 files changed, 48 insertions(+) create mode 100644 conf/common/plugin/nccl_test_epilogue.toml create mode 100644 conf/common/plugin/nccl_test_prologue.toml diff --git a/conf/common/plugin/nccl_test_epilogue.toml b/conf/common/plugin/nccl_test_epilogue.toml new file mode 100644 index 00000000..346dc8e4 --- /dev/null +++ b/conf/common/plugin/nccl_test_epilogue.toml @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "nccl_test_epilogue" + +[[Tests]] +id = "Tests.1" +test_name = "nccl_test_all_gather" +time_limit = "00:20:00" diff --git a/conf/common/plugin/nccl_test_prologue.toml b/conf/common/plugin/nccl_test_prologue.toml new file mode 100644 index 00000000..e5c1a1e4 --- /dev/null +++ b/conf/common/plugin/nccl_test_prologue.toml @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "nccl_test_prologue" + +[[Tests]] +id = "Tests.1" +test_name = "nccl_test_all_reduce" +time_limit = "00:20:00" diff --git a/conf/common/test_scenario/nccl_test.toml b/conf/common/test_scenario/nccl_test.toml index f6ccf02c..9b731e96 100644 --- a/conf/common/test_scenario/nccl_test.toml +++ b/conf/common/test_scenario/nccl_test.toml @@ -15,6 +15,10 @@ # limitations under the License. name = "nccl-test" + +prologue = "nccl_test_prologue" +epilogue = "nccl_test_epilogue" + [[Tests]] id = "Tests.1" test_name = "nccl_test_all_reduce" From 265e42e651197452e9040e26086471218bcfaab9 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 23 Oct 2024 07:35:01 -0400 Subject: [PATCH 23/64] Add plugin option to CLI --- src/cloudai/cli/cli.py | 4 ++ tests/test_cli.py | 92 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 94 insertions(+), 2 deletions(-) diff --git a/src/cloudai/cli/cli.py b/src/cloudai/cli/cli.py index 53059799..f962a2d5 100644 --- a/src/cloudai/cli/cli.py +++ b/src/cloudai/cli/cli.py @@ -60,6 +60,7 @@ def add_command( handler: Callable[[argparse.Namespace], int], system_config: Optional[bool] = None, tests_dir: Optional[bool] = None, + plugin_dir: Optional[bool] = None, test_scenario: Optional[bool] = None, output_dir: Optional[bool] = None, result_dir: Optional[bool] = None, @@ -74,6 +75,8 @@ def add_command( p.add_argument( "--tests-dir", help="Path to the test configuration directory.", required=tests_dir, type=Path ) + if plugin_dir is not None: + p.add_argument("--plugin-dir", help="Path to the plugin directory.", required=plugin_dir, type=Path) if test_scenario is not None: p.add_argument("--test-scenario", help="Path to the test scenario file.", required=test_scenario, type=Path) if output_dir is not None: @@ -127,6 +130,7 @@ def add_run_and_dry_run(self): handle_dry_run_and_run, system_config=True, tests_dir=True, + plugin_dir=False, test_scenario=True, output_dir=False, ) diff --git a/tests/test_cli.py b/tests/test_cli.py index 538e497f..bb6c1a5d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -20,8 +20,8 @@ import pytest -from cloudai.cli import CloudAICLI, handle_generate_report, handle_install_and_uninstall -from cloudai.cli.handlers import handle_verify_all_configs +from cloudai.cli import CloudAICLI +from cloudai.cli.handlers import handle_generate_report, handle_install_and_uninstall, handle_verify_all_configs def test_help_message(capsys: pytest.CaptureFixture[str]) -> None: @@ -108,6 +108,7 @@ def test_add_command_all_optional(): lambda _: 0, system_config=False, tests_dir=False, + plugin_dir=False, test_scenario=False, output_dir=False, ) @@ -118,6 +119,7 @@ def test_add_command_all_optional(): mode="test", system_config=None, tests_dir=None, + plugin_dir=None, test_scenario=None, output_dir=None, ) @@ -132,6 +134,7 @@ def test_add_command_all_required(): lambda _: 0, system_config=True, tests_dir=True, + plugin_dir=True, test_scenario=True, output_dir=True, ) @@ -142,6 +145,8 @@ def test_add_command_all_required(): "system_config", "--tests-dir", "tests_dir", + "--plugin-dir", + 
"plugin_dir", "--test-scenario", "test_scenario", "--output-dir", @@ -154,11 +159,91 @@ def test_add_command_all_required(): mode="test", system_config=Path("system_config"), tests_dir=Path("tests_dir"), + plugin_dir=Path("plugin_dir"), test_scenario=Path("test_scenario"), output_dir=Path("output_dir"), ) +@pytest.mark.parametrize( + "mode,args,expected_plugin_dir", + [ + ( + "run", + [ + "run", + "--system-config", + "system_config", + "--tests-dir", + "tests_dir", + "--plugin-dir", + "plugin_dir", + "--test-scenario", + "test_scenario", + ], + Path("plugin_dir"), + ), + ( + "run", + [ + "run", + "--system-config", + "system_config", + "--tests-dir", + "tests_dir", + "--test-scenario", + "test_scenario", + ], + None, + ), + ( + "dry-run", + [ + "dry-run", + "--system-config", + "system_config", + "--tests-dir", + "tests_dir", + "--plugin-dir", + "plugin_dir", + "--test-scenario", + "test_scenario", + ], + Path("plugin_dir"), + ), + ( + "dry-run", + [ + "dry-run", + "--system-config", + "system_config", + "--tests-dir", + "tests_dir", + "--test-scenario", + "test_scenario", + ], + None, + ), + ], +) +def test_modes_with_or_without_plugin_dir(mode, args, expected_plugin_dir): + cli = CloudAICLI() + + cli.add_command( + mode, + f"{mode} command", + lambda _: 0, + system_config=True, + tests_dir=True, + plugin_dir=False, + test_scenario=True, + output_dir=False, + ) + + parsed_args = cli.parser.parse_args(args) + assert parsed_args.plugin_dir == expected_plugin_dir + + def test_real_uninstall(): cli = CloudAICLI() cli.init_default_args() @@ -277,6 +362,8 @@ def test_run_dry_run_modes(self, cli: CloudAICLI): "tests_dir", "--test-scenario", "test_scenario", + "--plugin-dir", + "plugin_dir", ] ) @@ -286,6 +373,7 @@ def test_run_dry_run_modes(self, cli: CloudAICLI): mode=mode, system_config=Path("system_config"), tests_dir=Path("tests_dir"), + plugin_dir=Path("plugin_dir"), test_scenario=Path("test_scenario"), output_dir=None, ) From d9b2e83f8ee4e5dab3e22618533411eba2a16081 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 23 Oct 2024 07:43:01 -0400 Subject: [PATCH 24/64] Parse plugins and pass them to TestRun --- src/cloudai/_core/test_scenario.py | 2 + src/cloudai/_core/test_scenario_parser.py | 23 +++++++- src/cloudai/cli/handlers.py | 2 +- src/cloudai/parser.py | 47 ++++++++++++--- tests/test_acceptance.py | 1 + tests/test_parser.py | 70 ++++++++++++++++++++++- tests/test_test_scenario.py | 2 +- 7 files changed, 131 insertions(+), 16 deletions(-) diff --git a/src/cloudai/_core/test_scenario.py b/src/cloudai/_core/test_scenario.py index 3a60c036..97c89994 100644 --- a/src/cloudai/_core/test_scenario.py +++ b/src/cloudai/_core/test_scenario.py @@ -58,6 +58,8 @@ class TestRun: weight: float = 0.0 ideal_perf: float = 1.0 dependencies: dict[str, TestDependency] = field(default_factory=dict) + prologue: Optional["TestScenario"] = None + epilogue: Optional["TestScenario"] = None def __hash__(self) -> int: return hash(self.name + self.test.name + str(self.iterations) + str(self.current_iteration)) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index 16302a2a..c59adeba 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -101,9 +101,10 @@ class TestScenarioParser: __test__ = False - def __init__(self, file_path: Path, test_mapping: Dict[str, Test]) -> None: + def __init__(self, file_path: Path, test_mapping: Dict[str, Test], plugin_mapping: 
Dict[str, TestScenario]) -> None: self.file_path = file_path self.test_mapping = test_mapping + self.plugin_mapping = plugin_mapping def parse(self) -> TestScenario: """ @@ -138,8 +139,14 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario: total_weight = sum(tr.weight for tr in ts_model.tests) normalized_weight = 0 if total_weight == 0 else 100 / total_weight + prologue_name = data.get("prologue", "") + epilogue_name = data.get("epilogue", "") + + prologue = self.plugin_mapping.get(prologue_name, None) if prologue_name else None + epilogue = self.plugin_mapping.get(epilogue_name, None) if epilogue_name else None + testruns_by_id: dict[str, TestRun] = { - tr.id: self._create_section_test_run(tr, normalized_weight) for tr in ts_model.tests + tr.id: self._create_section_test_run(tr, normalized_weight, prologue, epilogue) for tr in ts_model.tests } tests_data: dict[str, _TestRunTOML] = {tr.id: tr for tr in ts_model.tests} @@ -155,13 +162,21 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario: job_status_check=ts_model.job_status_check, ) - def _create_section_test_run(self, test_info: _TestRunTOML, normalized_weight: float) -> TestRun: + def _create_section_test_run( + self, + test_info: _TestRunTOML, + normalized_weight: float, + prologue: Optional[TestScenario], + epilogue: Optional[TestScenario], + ) -> TestRun: """ Create a section-specific Test object by copying from the test mapping. Args: test_info (Dict[str, Any]): Information of the test. normalized_weight (float): Normalized weight for the test. + prologue (Optional[TestScenario]): TestScenario object representing the prologue sequence. + epilogue (Optional[TestScenario]): TestScenario object representing the epilogue sequence. Returns: Test: Copied and updated Test object for the section. @@ -194,5 +209,7 @@ def _create_section_test_run(self, test_info: _TestRunTOML, normalized_weight: f sol=test_info.sol, weight=test_info.weight * normalized_weight, ideal_perf=test_info.ideal_perf, + prologue=prologue if prologue is not None else TestScenario(name="default_prologue", test_runs=[]), + epilogue=epilogue if epilogue is not None else TestScenario(name="default_epilogue", test_runs=[]), ) return tr diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index 6105bc24..e654cf03 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -90,7 +90,7 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int: args (argparse.Namespace): The parsed command-line arguments. 
""" parser = Parser(args.system_config) - system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario) + system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario, args.plugin_dir) assert test_scenario is not None if args.output_dir: diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index a627b312..5b21ab3f 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -16,7 +16,7 @@ import logging from pathlib import Path -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Set, Tuple import toml from pydantic import ValidationError @@ -49,7 +49,7 @@ def __init__(self, system_config_path: Path) -> None: self.system_config_path = system_config_path def parse( - self, test_path: Path, test_scenario_path: Optional[Path] = None + self, test_path: Path, test_scenario_path: Optional[Path] = None, plugin_path: Optional[Path] = None ) -> Tuple[System, List[Test], Optional[TestScenario]]: """ Parse configurations for system, test templates, and test scenarios. @@ -74,21 +74,50 @@ def parse( logging.debug(f"Parsed {len(tests)} tests: {[t.name for t in tests]}") test_mapping = {t.name: t for t in tests} - filtered_tests = tests test_scenario: Optional[TestScenario] = None + scenario_test_names: Set[str] = set() if test_scenario_path: + plugin_mapping: Dict[str, TestScenario] = {} + plugin_test_names: Set[str] = set() + if plugin_path and plugin_path.exists(): + try: + plugin_mapping = self.parse_plugins(list(plugin_path.glob("*.toml")), test_mapping) + for plugin_scenario in plugin_mapping.values(): + plugin_test_names.update(tr.test.name for tr in plugin_scenario.test_runs) + except TestScenarioParsingError: + exit(1) + try: - test_scenario = self.parse_test_scenario(test_scenario_path, test_mapping) + test_scenario = self.parse_test_scenario(test_scenario_path, test_mapping, plugin_mapping) + scenario_test_names = set(tr.test.name for tr in test_scenario.test_runs) except TestScenarioParsingError: - exit(1) # exit right away to keep error message readable for users - scenario_tests = set(tr.test.name for tr in test_scenario.test_runs) - filtered_tests = [t for t in tests if t.name in scenario_tests] + exit(1) + + all_used_test_names = plugin_test_names.union(scenario_test_names) + filtered_tests = [t for t in tests if t.name in all_used_test_names] + else: + filtered_tests = tests return system, filtered_tests, test_scenario @staticmethod - def parse_test_scenario(test_scenario_path: Path, test_mapping: Dict[str, Test]) -> TestScenario: - test_scenario_parser = TestScenarioParser(test_scenario_path, test_mapping) + def parse_plugins(plugin_tomls: List[Path], test_mapping: Dict[str, Test]) -> Dict[str, TestScenario]: + plugin_mapping = {} + for plugin_path in plugin_tomls: + plugin_scenario = Parser.parse_test_scenario(plugin_path, test_mapping) + plugin_mapping[plugin_scenario.name] = plugin_scenario + return plugin_mapping + + @staticmethod + def parse_test_scenario( + test_scenario_path: Path, + test_mapping: Dict[str, Test], + plugin_mapping: Optional[Dict[str, TestScenario]] = None, + ) -> TestScenario: + if plugin_mapping is None: + plugin_mapping = {} + + test_scenario_parser = TestScenarioParser(test_scenario_path, test_mapping, plugin_mapping) test_scenario = test_scenario_parser.parse() return test_scenario diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index e18d5d60..e11ff50b 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -60,6 +60,7 @@ def 
test_slurm(tmp_path: Path, scenario: Dict): system_config=Path("conf/common/system/example_slurm_cluster.toml"), test_templates_dir=Path("conf/common/test_template"), tests_dir=Path("conf/common/test"), + plugin_dir=Path("conf/common/plugin"), test_scenario=test_scenario_path, output_dir=tmp_path, ) diff --git a/tests/test_parser.py b/tests/test_parser.py index d35896a9..cb809d36 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -34,7 +34,7 @@ def parser(self, tmp_path: Path) -> Parser: def test_no_tests_dir(self, parser: Parser): tests_dir = parser.system_config_path.parent / "tests" with pytest.raises(FileNotFoundError) as exc_info: - parser.parse(tests_dir, None) + parser.parse(tests_dir, None, None) assert "Test path" in str(exc_info.value) @patch("cloudai._core.test_parser.TestParser.parse_all") @@ -50,19 +50,85 @@ def test_no_scenario(self, test_parser: Mock, parser: Parser): @patch("cloudai._core.test_parser.TestParser.parse_all") @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") - def test_scenario_filters_tests(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): + def test_scenario_without_plugin(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): tests_dir = parser.system_config_path.parent.parent / "test" + fake_tests = [] for i in range(3): fake_tests.append(Mock()) fake_tests[-1].name = f"test-{i}" test_parser.return_value = fake_tests + fake_scenario = Mock() fake_scenario.test_runs = [Mock()] fake_scenario.test_runs[0].test.name = "test-1" test_scenario_parser.return_value = fake_scenario + _, tests, _ = parser.parse(tests_dir, Path()) + + assert len(tests) == 1 + assert tests[0].name == "test-1" + + @patch("cloudai._core.test_parser.TestParser.parse_all") + @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") + @patch("cloudai.parser.Parser.parse_plugins") + def test_scenario_with_plugin_common_tests( + self, parse_plugins: Mock, test_scenario_parser: Mock, test_parser: Mock, parser: Parser + ): + tests_dir = parser.system_config_path.parent.parent / "test" + + fake_tests = [] + for i in range(3): + fake_tests.append(Mock()) + fake_tests[-1].name = f"test-{i}" + test_parser.return_value = fake_tests + + fake_scenario = Mock() + fake_scenario.test_runs = [Mock()] + fake_scenario.test_runs[0].test.name = "test-1" + test_scenario_parser.return_value = fake_scenario + + fake_plugin = Mock() + fake_plugin.test_runs = [Mock()] + fake_plugin.test_runs[0].test.name = "test-1" + parse_plugins.return_value = {"plugin-1": fake_plugin} + + _, tests, _ = parser.parse(tests_dir, Path(), Path()) + assert len(tests) == 1 + assert tests[0].name == "test-1" + + @patch("cloudai._core.test_parser.TestParser.parse_all") + @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") + @patch("cloudai.parser.Parser.parse_plugins") + def test_scenario_with_plugin_exclusive_tests( + self, parse_plugins: Mock, test_scenario_parser: Mock, test_parser: Mock, parser: Parser + ): + tests_dir = parser.system_config_path.parent.parent / "test" + + fake_tests = [] + for i in range(4): + fake_tests.append(Mock()) + fake_tests[-1].name = f"test-{i}" + test_parser.return_value = fake_tests + + fake_scenario = Mock() + fake_scenario.test_runs = [Mock()] + fake_scenario.test_runs[0].test.name = "test-1" + test_scenario_parser.return_value = fake_scenario + + fake_plugin = Mock() + fake_plugin.test_runs = [Mock()] + fake_plugin.test_runs[0].test.name = "test-2" + parse_plugins.return_value = {"plugin-1": fake_plugin} 
+ + _, tests, _ = parser.parse(tests_dir, Path(), Path()) + + assert len(tests) == 2 + assert "test-1" in [t.name for t in tests] + assert "test-2" in [t.name for t in tests] + assert "test-0" not in [t.name for t in tests] + assert "test-3" not in [t.name for t in tests] def test_parse_system(self, parser: Parser): parser.system_config_path = Path("conf/common/system/example_slurm_cluster.toml") diff --git a/tests/test_test_scenario.py b/tests/test_test_scenario.py index ab81bdbd..72639068 100644 --- a/tests/test_test_scenario.py +++ b/tests/test_test_scenario.py @@ -27,7 +27,7 @@ @pytest.fixture def test_scenario_parser(tmp_path: Path) -> TestScenarioParser: - tsp = TestScenarioParser(Path(""), {}) + tsp = TestScenarioParser(Path(""), {}, {}) return tsp From bfb653f935910fb266da508dcf0bd1fe3722c101 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 25 Oct 2024 06:28:41 -0400 Subject: [PATCH 25/64] Generate plugin commands --- src/cloudai/_core/command_gen_strategy.py | 26 ++++ src/cloudai/_core/test_template.py | 34 +++++ .../nccl_test/slurm_command_gen_strategy.py | 4 + .../strategy/slurm_command_gen_strategy.py | 108 +++++++++++++- tests/ref_data/gpt.sbatch | 2 +- tests/ref_data/grok.sbatch | 2 +- tests/ref_data/nccl.sbatch | 21 +-- tests/ref_data/sleep.sbatch | 4 +- tests/ref_data/ucc.sbatch | 10 +- .../test_common_slurm_command_gen_strategy.py | 137 +++++++++++++++++- 10 files changed, 309 insertions(+), 39 deletions(-) diff --git a/src/cloudai/_core/command_gen_strategy.py b/src/cloudai/_core/command_gen_strategy.py index 16bd04f9..9c8bb389 100644 --- a/src/cloudai/_core/command_gen_strategy.py +++ b/src/cloudai/_core/command_gen_strategy.py @@ -39,3 +39,29 @@ def gen_exec_command(self, tr: TestRun) -> str: str: The generated execution command. """ pass + + @abstractmethod + def gen_srun_command(self, tr: TestRun) -> str: + """ + Generate the Slurm srun command for a test based on the given parameters. + + Args: + tr (TestRun): Contains the test and its run-specific configurations. + + Returns: + str: The generated Slurm srun command. + """ + pass + + @abstractmethod + def gen_srun_success_check(self, tr: TestRun) -> str: + """ + Generate the Slurm success check command to verify if a test run was successful. + + Args: + tr (TestRun): Contains the test and its run-specific configurations. + + Returns: + str: The generated command to check the success of the test run. + """ + pass diff --git a/src/cloudai/_core/test_template.py b/src/cloudai/_core/test_template.py index de98b1a8..2ccd872a 100644 --- a/src/cloudai/_core/test_template.py +++ b/src/cloudai/_core/test_template.py @@ -94,6 +94,40 @@ def gen_exec_command(self, tr: TestRun) -> str: ) return self.command_gen_strategy.gen_exec_command(tr) + def gen_srun_command(self, tr: TestRun) -> str: + """ + Generate an Slurm srun command for a test using the provided command generation strategy. + + Args: + tr (TestRun): Contains the test and its run-specific configurations. + + Returns: + str: The generated Slurm srun command. + """ + if self.command_gen_strategy is None: + raise ValueError( + "command_gen_strategy is missing. Ensure the strategy is registered in the Registry " + "by calling the appropriate registration function for the system type." + ) + return self.command_gen_strategy.gen_srun_command(tr) + + def gen_srun_success_check(self, tr: TestRun) -> str: + """ + Generate a Slurm success check command for a test using the provided command generation strategy. 
+ + Args: + tr (TestRun): Contains the test and its run-specific configurations. + + Returns: + str: The generated command to check the success of the test run. + """ + if self.command_gen_strategy is None: + raise ValueError( + "command_gen_strategy is missing. Ensure the strategy is registered in the Registry " + "by calling the appropriate registration function for the system type." + ) + return self.command_gen_strategy.gen_srun_success_check(tr) + def gen_json(self, tr: TestRun) -> Dict[Any, Any]: """ Generate a JSON string representing the Kubernetes job specification for this test using this template. diff --git a/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py index 8805202c..28281841 100644 --- a/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py @@ -73,3 +73,7 @@ def generate_test_command( srun_command_parts.append(extra_cmd_args) return srun_command_parts + + def gen_srun_success_check(self, tr: TestRun) -> str: + output_file = Path(tr.output_path) / "stdout.txt" + return f'grep -q "Avg bus bandwidth" {output_file} && echo 1 || echo 0' diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index 5c33a141..57c37812 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -18,7 +18,7 @@ from pathlib import Path from typing import Any, Dict, List -from cloudai import CommandGenStrategy, TestRun +from cloudai import CommandGenStrategy, TestRun, TestScenario from cloudai.systems import SlurmSystem @@ -55,8 +55,33 @@ def gen_exec_command(self, tr: TestRun) -> str: env_vars = self._override_env_vars(self.system.global_env_vars, tr.test.extra_env_vars) cmd_args = self._override_cmd_args(self.default_cmd_args, tr.test.cmd_args) slurm_args = self._parse_slurm_args(tr.test.test_template.__class__.__name__, env_vars, cmd_args, tr) - srun_command = self.generate_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) - return self._write_sbatch_script(slurm_args, env_vars, srun_command, tr.output_path) + + if tr.prologue: + prologue_command = self.gen_prologue(tr.prologue, tr.output_path) + srun_command = self._gen_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) + command_list = [prologue_command, "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", f" {srun_command}"] + + if tr.epilogue: + epilogue_command = self.gen_epilogue(tr.epilogue, tr.output_path) + command_list.append(f" {epilogue_command}") + + command_list.append("fi") + else: + srun_command = self._gen_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) + command_list = [srun_command] + + if tr.epilogue: + epilogue_command = self.gen_epilogue(tr.epilogue, tr.output_path) + command_list.append(epilogue_command) + + full_command = "\n".join(command_list).strip() + return self._write_sbatch_script(slurm_args, env_vars, full_command, tr.output_path) + + def gen_srun_command(self, tr: TestRun) -> str: + env_vars = self._override_env_vars(self.system.global_env_vars, tr.test.extra_env_vars) + cmd_args = self._override_cmd_args(self.default_cmd_args, tr.test.cmd_args) + slurm_args = self._parse_slurm_args(tr.test.test_template.__class__.__name__, env_vars, cmd_args, tr) + return self._gen_srun_command(slurm_args, 
env_vars, cmd_args, tr.test.extra_cmd_args) def _parse_slurm_args( self, job_name_prefix: str, env_vars: Dict[str, str], cmd_args: Dict[str, str], tr: TestRun @@ -98,12 +123,87 @@ def job_name(self, job_name_prefix: str) -> str: job_name = f"{self.system.account}-{job_name_prefix}.{datetime.now().strftime('%Y%m%d_%H%M%S')}" return job_name + def gen_prologue(self, prologue: TestScenario, base_output_path: Path) -> str: + """ + Generate the prologue command by running all tests defined in the prologue test scenario. + + Args: + prologue (TestScenario): The prologue test scenario containing the tests to be run. + base_output_path (Path): The base output directory path for storing prologue outputs. + + Returns: + str: A string with all the Slurm srun commands generated for the prologue. + """ + if not prologue.test_runs: + return "PROLOGUE_SUCCESS=1\n" + + prologue_output_dir = base_output_path / "prologue" + prologue_output_dir.mkdir(parents=True, exist_ok=True) + + prologue_commands = [] + success_vars = [] + + for idx, tr in enumerate(prologue.test_runs): + plugin_dir = prologue_output_dir / tr.test.name + plugin_dir.mkdir(parents=True, exist_ok=True) + tr.output_path = plugin_dir + + srun_command = tr.test.test_template.gen_srun_command(tr) + srun_command_with_output = srun_command.replace( + "srun ", f"srun --output={plugin_dir / 'stdout.txt'} --error={plugin_dir / 'stderr.txt'} " + ) + prologue_commands.append(srun_command_with_output) + + success_var = f"SUCCESS_{idx}" + success_vars.append(success_var) + + success_check_command = tr.test.test_template.gen_srun_success_check(tr) + prologue_commands.append(f"{success_var}=$({success_check_command})") + + combined_success_var = " && ".join([f"[ ${var} -eq 1 ]" for var in success_vars]) + + prologue_commands.append(f"PROLOGUE_SUCCESS=$( {combined_success_var} && echo 1 || echo 0 )") + + return "\n".join(prologue_commands) + + def gen_epilogue(self, epilogue: TestScenario, base_output_path: Path) -> str: + """ + Generate the epilogue command by running all tests defined in the epilogue test scenario. + + Args: + epilogue (TestScenario): The epilogue test scenario containing the tests to be run. + base_output_path (Path): The base output directory path for storing epilogue outputs. + + Returns: + str: A string with all the Slurm srun commands generated for the epilogue. 
+ """ + if not epilogue.test_runs: + return "" + + epilogue_output_dir = base_output_path / "epilogue" + epilogue_output_dir.mkdir(parents=True, exist_ok=True) + + epilogue_commands = [] + + for tr in epilogue.test_runs: + plugin_dir = epilogue_output_dir / tr.test.name + plugin_dir.mkdir(parents=True, exist_ok=True) + tr.output_path = plugin_dir + + srun_command = tr.test.test_template.gen_srun_command(tr) + srun_command_with_output = srun_command.replace( + "srun ", f"srun --output={plugin_dir / 'stdout.txt'} --error={plugin_dir / 'stderr.txt'} " + ) + epilogue_commands.append(srun_command_with_output) + + return "\n".join(epilogue_commands) + def _gen_srun_command( self, slurm_args: Dict[str, Any], env_vars: Dict[str, str], cmd_args: Dict[str, str], extra_cmd_args: str ) -> str: srun_command_parts = self.gen_srun_prefix(slurm_args) test_command_parts = self.generate_test_command(env_vars, cmd_args, extra_cmd_args) - return " \\\n".join(srun_command_parts + test_command_parts) + return " ".join(srun_command_parts + test_command_parts) def gen_srun_prefix(self, slurm_args: Dict[str, Any]) -> List[str]: srun_command_parts = ["srun", f"--mpi={self.system.mpi}"] diff --git a/tests/ref_data/gpt.sbatch b/tests/ref_data/gpt.sbatch index 3cd84b21..f01e9222 100644 --- a/tests/ref_data/gpt.sbatch +++ b/tests/ref_data/gpt.sbatch @@ -8,7 +8,7 @@ export COMBINE_THRESHOLD=1 export PER_GPU_COMBINE_THRESHOLD=0 export XLA_FLAGS="--xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD" - echo "Loading container with srun command" +echo "Loading container with srun command" srun --mpi=none --container-image=https://docker/url --container-name=cont true echo "Running srun command" srun \ diff --git a/tests/ref_data/grok.sbatch b/tests/ref_data/grok.sbatch index f5d32243..7e7adfc2 100644 --- a/tests/ref_data/grok.sbatch +++ b/tests/ref_data/grok.sbatch @@ -8,7 +8,7 @@ export COMBINE_THRESHOLD=1 export PER_GPU_COMBINE_THRESHOLD=0 export XLA_FLAGS="--xla_disable_hlo_passes=rematerialization --xla_dump_hlo_pass_re=.* --xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_pipelined_all_gather=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_gpu_enable_triton_gemm=false --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_graph_level=0 --xla_gpu_pgle_profile_file_or_directory_path=/opt/paxml/workspace/pgle_output_profile.pbtxt --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD --xla_gpu_run_post_layout_collective_pipeliner=false --xla_gpu_use_memcpy_local_p2p=false" - echo "Loading container with srun command" +echo "Loading container with srun command" srun --mpi=none --container-image=https://docker/url --container-name=cont true echo "Running srun command" srun \ diff --git a/tests/ref_data/nccl.sbatch b/tests/ref_data/nccl.sbatch index 3ac39077..dc179ba9 100644 --- a/tests/ref_data/nccl.sbatch +++ b/tests/ref_data/nccl.sbatch @@ -8,23 +8,4 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) -srun \ ---mpi=pmix \ 
---container-image=nvcr.io/nvidia/pytorch:24.02-py3 \ -/usr/local/bin/all_reduce_perf_mpi \ ---nthreads 1 \ ---ngpus 1 \ ---minbytes 32M \ ---maxbytes 32M \ ---stepbytes 1M \ ---op sum \ ---datatype float \ ---root 0 \ ---iters 20 \ ---warmup_iters 5 \ ---agg_iters 1 \ ---average 1 \ ---parallel_init 0 \ ---check 1 \ ---blocking 0 \ ---cudagraph 0 \ No newline at end of file +srun --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 diff --git a/tests/ref_data/sleep.sbatch b/tests/ref_data/sleep.sbatch index 7c24ec14..9262001b 100644 --- a/tests/ref_data/sleep.sbatch +++ b/tests/ref_data/sleep.sbatch @@ -8,6 +8,4 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) -srun \ ---mpi=pmix \ -sleep 5 \ No newline at end of file +srun --mpi=pmix sleep 5 diff --git a/tests/ref_data/ucc.sbatch b/tests/ref_data/ucc.sbatch index 74fa7799..a9f9e686 100644 --- a/tests/ref_data/ucc.sbatch +++ b/tests/ref_data/ucc.sbatch @@ -8,12 +8,4 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) -srun \ ---mpi=pmix \ ---container-image=nvcr.io/nvidia/pytorch:24.02-py3 \ -/opt/hpcx/ucc/bin/ucc_perftest \ --c alltoall \ --b 1 \ --e 8M \ --m cuda \ --F \ No newline at end of file +srun --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /opt/hpcx/ucc/bin/ucc_perftest -c alltoall -b 1 -e 8M -m cuda -F diff --git a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py index f2aae181..4484b6e1 100644 --- a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py @@ -19,7 +19,7 @@ import pytest -from cloudai import Test, TestDefinition, TestRun, TestTemplate +from cloudai import Test, TestDefinition, TestRun, TestScenario, TestTemplate from cloudai.systems import SlurmSystem from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy @@ -122,3 +122,138 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): "system configuration. Please ensure that 'default_partition' is set correctly " "in the corresponding system configuration (e.g., system.toml)." 
) in str(exc_info.value) + + +@pytest.mark.parametrize( + "prologue,epilogue,expected_script_lines", + [ + # No prologue, no epilogue + (None, None, ["srun"]), + # One prologue, no epilogue + ( + [Mock(test=Mock(name="test1", test_template=Mock()))], + None, + [ + "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + " srun", + "fi", + ], + ), + # No prologue, one epilogue + ( + None, + [Mock(test=Mock(name="test2", test_template=Mock()))], + [ + "srun", + "epilogue", + ], + ), + # One prologue, one epilogue + ( + [Mock(test=Mock(name="test1", test_template=Mock()))], + [Mock(test=Mock(name="test2", test_template=Mock()))], + [ + "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + " srun", + " epilogue", + "fi", + ], + ), + # Multiple prologues, multiple epilogues + ( + [Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], + [Mock(test=Mock(name="test3", test_template=Mock())), Mock(test=Mock(name="test4", test_template=Mock()))], + [ + "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "SUCCESS_1=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + " srun", + " epilogue", + " epilogue", + "fi", + ], + ), + # Multiple prologues, no epilogue + ( + [Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], + None, + [ + "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "SUCCESS_1=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + " srun", + "fi", + ], + ), + # No prologue, multiple epilogues + ( + None, + [Mock(test=Mock(name="test3", test_template=Mock())), Mock(test=Mock(name="test4", test_template=Mock()))], + [ + "srun", + "epilogue", + "epilogue", + ], + ), + # Multiple prologues, single epilogue + ( + [Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], + [Mock(test=Mock(name="test3", test_template=Mock()))], + [ + "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "SUCCESS_1=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + " srun", + " epilogue", + "fi", + ], + ), + ], +) +def test_prologue_epilogue_combinations( + strategy_fixture: SlurmCommandGenStrategy, + testrun_fixture: TestRun, + prologue, + epilogue, + expected_script_lines, + tmp_path, +): + testrun_fixture.prologue = Mock(spec=TestScenario) if prologue else None + testrun_fixture.epilogue = Mock(spec=TestScenario) if epilogue else None + + if prologue is not None: + testrun_fixture.prologue = Mock(spec=TestScenario) + testrun_fixture.prologue.test_runs = prologue + for idx, run in enumerate(prologue): + run.test.test_template.gen_srun_success_check.return_value = ( + "grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0" + ) + run.test.test_template.gen_srun_command.return_value = 
"srun" + run.test.name = f"test{idx+1}" + else: + testrun_fixture.prologue = None + + if epilogue is not None: + testrun_fixture.epilogue = Mock(spec=TestScenario) + testrun_fixture.epilogue.test_runs = epilogue + for idx, run in enumerate(epilogue): + run.test.test_template.gen_srun_command.return_value = "epilogue" + run.test.name = f"test{idx+1}" + else: + testrun_fixture.epilogue = None + + sbatch_command = strategy_fixture.gen_exec_command(testrun_fixture) + script_file_path = sbatch_command.split()[-1] + + with open(script_file_path, "r") as script_file: + script_content = script_file.read() + + for expected_line in expected_script_lines: + assert expected_line in script_content, f"Expected '{expected_line}' in generated script but it was missing." From 9f83cd58d70fc9011f8871e44ff65ff3782ae52c Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 25 Oct 2024 11:33:12 -0400 Subject: [PATCH 26/64] Remove plugin option from CLI --- src/cloudai/cli/cli.py | 4 -- tests/test_cli.py | 92 +----------------------------------------- 2 files changed, 2 insertions(+), 94 deletions(-) diff --git a/src/cloudai/cli/cli.py b/src/cloudai/cli/cli.py index f962a2d5..53059799 100644 --- a/src/cloudai/cli/cli.py +++ b/src/cloudai/cli/cli.py @@ -60,7 +60,6 @@ def add_command( handler: Callable[[argparse.Namespace], int], system_config: Optional[bool] = None, tests_dir: Optional[bool] = None, - plugin_dir: Optional[bool] = None, test_scenario: Optional[bool] = None, output_dir: Optional[bool] = None, result_dir: Optional[bool] = None, @@ -75,8 +74,6 @@ def add_command( p.add_argument( "--tests-dir", help="Path to the test configuration directory.", required=tests_dir, type=Path ) - if plugin_dir is not None: - p.add_argument("--plugin-dir", help="Path to the plugin directory.", required=plugin_dir, type=Path) if test_scenario is not None: p.add_argument("--test-scenario", help="Path to the test scenario file.", required=test_scenario, type=Path) if output_dir is not None: @@ -130,7 +127,6 @@ def add_run_and_dry_run(self): handle_dry_run_and_run, system_config=True, tests_dir=True, - plugin_dir=False, test_scenario=True, output_dir=False, ) diff --git a/tests/test_cli.py b/tests/test_cli.py index bb6c1a5d..538e497f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -20,8 +20,8 @@ import pytest -from cloudai.cli import CloudAICLI -from cloudai.cli.handlers import handle_generate_report, handle_install_and_uninstall, handle_verify_all_configs +from cloudai.cli import CloudAICLI, handle_generate_report, handle_install_and_uninstall +from cloudai.cli.handlers import handle_verify_all_configs def test_help_message(capsys: pytest.CaptureFixture[str]) -> None: @@ -108,7 +108,6 @@ def test_add_command_all_optional(): lambda _: 0, system_config=False, tests_dir=False, - plugin_dir=False, test_scenario=False, output_dir=False, ) @@ -119,7 +118,6 @@ def test_add_command_all_optional(): mode="test", system_config=None, tests_dir=None, - plugin_dir=None, test_scenario=None, output_dir=None, ) @@ -134,7 +132,6 @@ def test_add_command_all_required(): lambda _: 0, system_config=True, tests_dir=True, - plugin_dir=True, test_scenario=True, output_dir=True, ) @@ -145,8 +142,6 @@ def test_add_command_all_required(): "system_config", "--tests-dir", "tests_dir", - "--plugin-dir", - "plugin_dir", "--test-scenario", "test_scenario", "--output-dir", @@ -159,91 +154,11 @@ def test_add_command_all_required(): mode="test", system_config=Path("system_config"), 
tests_dir=Path("tests_dir"), - plugin_dir=Path("plugin_dir"), test_scenario=Path("test_scenario"), output_dir=Path("output_dir"), ) -@pytest.mark.parametrize( - "mode,args,expected_plugin_dir", - [ - ( - "run", - [ - "run", - "--system-config", - "system_config", - "--tests-dir", - "tests_dir", - "--plugin-dir", - "plugin_dir", - "--test-scenario", - "test_scenario", - ], - Path("plugin_dir"), - ), - ( - "run", - [ - "run", - "--system-config", - "system_config", - "--tests-dir", - "tests_dir", - "--test-scenario", - "test_scenario", - ], - None, - ), - ( - "dry-run", - [ - "dry-run", - "--system-config", - "system_config", - "--tests-dir", - "tests_dir", - "--plugin-dir", - "plugin_dir", - "--test-scenario", - "test_scenario", - ], - Path("plugin_dir"), - ), - ( - "dry-run", - [ - "dry-run", - "--system-config", - "system_config", - "--tests-dir", - "tests_dir", - "--test-scenario", - "test_scenario", - ], - None, - ), - ], -) -def test_modes_with_or_without_plugin_dir(mode, args, expected_plugin_dir): - cli = CloudAICLI() - - cli.add_command( - mode, - f"{mode} command", - lambda _: 0, - system_config=True, - tests_dir=True, - plugin_dir=False, - test_scenario=True, - output_dir=False, - ) - - parsed_args = cli.parser.parse_args(args) - assert parsed_args.plugin_dir == expected_plugin_dir - - def test_real_uninstall(): cli = CloudAICLI() cli.init_default_args() @@ -362,8 +277,6 @@ def test_run_dry_run_modes(self, cli: CloudAICLI): "tests_dir", "--test-scenario", "test_scenario", - "--plugin-dir", - "plugin_dir", ] ) @@ -373,7 +286,6 @@ def test_run_dry_run_modes(self, cli: CloudAICLI): mode=mode, system_config=Path("system_config"), tests_dir=Path("tests_dir"), - plugin_dir=Path("plugin_dir"), test_scenario=Path("test_scenario"), output_dir=None, ) From f656eee58c81ed56a36db44c968a4b0e1626fa6a Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 25 Oct 2024 11:38:04 -0400 Subject: [PATCH 27/64] Make plugin directory self-contained --- .../plugin/test/nccl_test_all_gather.toml | 33 +++++++++++++++++++ .../plugin/test/nccl_test_all_reduce.toml | 30 +++++++++++++++++ .../nccl_test_epilogue.toml | 0 .../nccl_test_prologue.toml | 0 4 files changed, 63 insertions(+) create mode 100644 conf/common/plugin/test/nccl_test_all_gather.toml create mode 100644 conf/common/plugin/test/nccl_test_all_reduce.toml rename conf/common/plugin/{ => test_scenario}/nccl_test_epilogue.toml (100%) rename conf/common/plugin/{ => test_scenario}/nccl_test_prologue.toml (100%) diff --git a/conf/common/plugin/test/nccl_test_all_gather.toml b/conf/common/plugin/test/nccl_test_all_gather.toml new file mode 100644 index 00000000..4fec288a --- /dev/null +++ b/conf/common/plugin/test/nccl_test_all_gather.toml @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name = "nccl_test_all_gather" +description = "all_gather" +test_template_name = "NcclTest" + +[cmd_args] +"subtest_name" = "all_gather_perf_mpi" +"ngpus" = "1" +"minbytes" = "128" +"maxbytes" = "4G" +"iters" = "100" +"warmup_iters" = "50" + +[extra_cmd_args] +"--stepfactor" = "2" + +[extra_env_vars] +"NCCL_TEST_SPLIT_MASK" = "0x7" diff --git a/conf/common/plugin/test/nccl_test_all_reduce.toml b/conf/common/plugin/test/nccl_test_all_reduce.toml new file mode 100644 index 00000000..9074b2b8 --- /dev/null +++ b/conf/common/plugin/test/nccl_test_all_reduce.toml @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "nccl_test_all_reduce" +description = "all_reduce" +test_template_name = "NcclTest" + +[cmd_args] +"subtest_name" = "all_reduce_perf_mpi" +"ngpus" = "1" +"minbytes" = "128" +"maxbytes" = "16G" +"iters" = "100" +"warmup_iters" = "50" + +[extra_cmd_args] +"--stepfactor" = "2" diff --git a/conf/common/plugin/nccl_test_epilogue.toml b/conf/common/plugin/test_scenario/nccl_test_epilogue.toml similarity index 100% rename from conf/common/plugin/nccl_test_epilogue.toml rename to conf/common/plugin/test_scenario/nccl_test_epilogue.toml diff --git a/conf/common/plugin/nccl_test_prologue.toml b/conf/common/plugin/test_scenario/nccl_test_prologue.toml similarity index 100% rename from conf/common/plugin/nccl_test_prologue.toml rename to conf/common/plugin/test_scenario/nccl_test_prologue.toml From 5af7113c9f22848f48f287adaa10516417392a04 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 25 Oct 2024 11:36:06 -0400 Subject: [PATCH 28/64] Update Parser to support self-contained plugin directory --- src/cloudai/cli/handlers.py | 4 +- src/cloudai/parser.py | 105 ++++++++++++++++++++++++++---------- tests/test_parser.py | 58 +++++++++++++------- 3 files changed, 117 insertions(+), 50 deletions(-) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index e654cf03..b76609ef 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -90,7 +90,9 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int: args (argparse.Namespace): The parsed command-line arguments. 
""" parser = Parser(args.system_config) - system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario, args.plugin_dir) + system, tests, test_scenario = parser.parse( + args.tests_dir, args.test_scenario, Path("conf/common/plugin/test"), Path("conf/common/plugin/test_scenario") + ) assert test_scenario is not None if args.output_dir: diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index 5b21ab3f..73ab8717 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -49,14 +49,25 @@ def __init__(self, system_config_path: Path) -> None: self.system_config_path = system_config_path def parse( - self, test_path: Path, test_scenario_path: Optional[Path] = None, plugin_path: Optional[Path] = None + self, + test_path: Path, + test_scenario_path: Optional[Path] = None, + plugin_test_path: Optional[Path] = None, + plugin_test_scenario_path: Optional[Path] = None, ) -> Tuple[System, List[Test], Optional[TestScenario]]: """ Parse configurations for system, test templates, and test scenarios. - Returns - Tuple[System, List[TestTemplate], TestScenario]: A tuple containing the system object, a list of test - template objects, and the test scenario object. + Args: + test_path (Path): The file path for tests. + test_scenario_path (Optional[Path]): The file path for the main test scenario. + If None, all tests are included. + plugin_test_path (Optional[Path]): The file path for plugin-specific tests. + plugin_test_scenario_path (Optional[Path]): The file path for plugin-specific test scenarios. + + Returns: + Tuple[System, List[Test], Optional[TestScenario]]: A tuple containing the system object, a list of filtered + test template objects, and the main test scenario object if provided. """ if not test_path.exists(): raise FileNotFoundError(f"Test path '{test_path}' not found.") @@ -64,47 +75,83 @@ def parse( try: system = self.parse_system(self.system_config_path) except SystemConfigParsingError: - exit(1) # exit right away to keep error message readable for users + exit(1) try: tests = self.parse_tests(list(test_path.glob("*.toml")), system) except TestConfigParsingError: - exit(1) # exit right away to keep error message readable for users + exit(1) - logging.debug(f"Parsed {len(tests)} tests: {[t.name for t in tests]}") - test_mapping = {t.name: t for t in tests} + plugin_tests = ( + self.parse_tests(list(plugin_test_path.glob("*.toml")), system) + if plugin_test_path and plugin_test_path.exists() + else [] + ) - test_scenario: Optional[TestScenario] = None - scenario_test_names: Set[str] = set() if test_scenario_path: - plugin_mapping: Dict[str, TestScenario] = {} - plugin_test_names: Set[str] = set() - if plugin_path and plugin_path.exists(): - try: - plugin_mapping = self.parse_plugins(list(plugin_path.glob("*.toml")), test_mapping) - for plugin_scenario in plugin_mapping.values(): - plugin_test_names.update(tr.test.name for tr in plugin_scenario.test_runs) - except TestScenarioParsingError: - exit(1) + return self._parse_with_scenario(system, tests, test_scenario_path, plugin_tests, plugin_test_scenario_path) + + return system, tests + plugin_tests, None + + def _parse_with_scenario( + self, + system: System, + tests: List[Test], + test_scenario_path: Path, + plugin_tests: List[Test], + plugin_test_scenario_path: Optional[Path], + ) -> Tuple[System, List[Test], Optional[TestScenario]]: + """Parse tests and scenarios with a main test scenario path specified.""" + test_mapping = {t.name: t for t in tests} + plugin_test_mapping = {t.name: t for t in plugin_tests} + + 
plugin_test_scenario_mapping = self._load_plugin_scenarios(plugin_test_scenario_path, plugin_test_mapping) + test_scenario = self._load_main_scenario(test_scenario_path, test_mapping, plugin_test_scenario_mapping) + all_used_test_names = self._collect_used_test_names(plugin_test_scenario_mapping, test_scenario) + filtered_tests = [t for t in tests if t.name in all_used_test_names] + + return system, filtered_tests, test_scenario + + def _load_plugin_scenarios( + self, plugin_test_scenario_path: Optional[Path], plugin_test_mapping: Dict[str, Test] + ) -> Dict[str, TestScenario]: + """Load plugin-specific test scenarios from the specified path.""" + if plugin_test_scenario_path and plugin_test_scenario_path.exists(): try: - test_scenario = self.parse_test_scenario(test_scenario_path, test_mapping, plugin_mapping) - scenario_test_names = set(tr.test.name for tr in test_scenario.test_runs) + return self.parse_plugins(list(plugin_test_scenario_path.glob("*.toml")), plugin_test_mapping) except TestScenarioParsingError: exit(1) + return {} - all_used_test_names = plugin_test_names.union(scenario_test_names) - filtered_tests = [t for t in tests if t.name in all_used_test_names] - else: - filtered_tests = tests - - return system, filtered_tests, test_scenario + def _load_main_scenario( + self, + test_scenario_path: Path, + test_mapping: Dict[str, Test], + plugin_test_scenario_mapping: Dict[str, TestScenario], + ) -> Optional[TestScenario]: + """Load the main test scenario using provided mappings.""" + try: + return self.parse_test_scenario(test_scenario_path, test_mapping, plugin_test_scenario_mapping) + except TestScenarioParsingError: + exit(1) + + def _collect_used_test_names( + self, plugin_test_scenario_mapping: Dict[str, TestScenario], test_scenario: Optional[TestScenario] + ) -> Set[str]: + """Collect test names used in both plugin and main test scenarios.""" + # TODO: collect test names in the plugin test scenarios only + plugin_test_names = { + tr.test.name for scenario in plugin_test_scenario_mapping.values() for tr in scenario.test_runs + } + scenario_test_names = {tr.test.name for tr in test_scenario.test_runs} if test_scenario else set() + return plugin_test_names.union(scenario_test_names) @staticmethod def parse_plugins(plugin_tomls: List[Path], test_mapping: Dict[str, Test]) -> Dict[str, TestScenario]: plugin_mapping = {} - for plugin_path in plugin_tomls: - plugin_scenario = Parser.parse_test_scenario(plugin_path, test_mapping) + for plugin_test_scenario_path in plugin_tomls: + plugin_scenario = Parser.parse_test_scenario(plugin_test_scenario_path, test_mapping) plugin_mapping[plugin_scenario.name] = plugin_scenario return plugin_mapping diff --git a/tests/test_parser.py b/tests/test_parser.py index cb809d36..12372755 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -15,13 +15,13 @@ # limitations under the License. 
from pathlib import Path -from typing import cast +from typing import Dict, cast from unittest.mock import Mock, patch import pytest from pydantic_core import ErrorDetails -from cloudai import Parser, format_validation_error +from cloudai import Parser, TestScenario, format_validation_error from cloudai.systems.slurm.slurm_system import SlurmSystem @@ -100,16 +100,14 @@ def test_scenario_with_plugin_common_tests( @patch("cloudai._core.test_parser.TestParser.parse_all") @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") - @patch("cloudai.parser.Parser.parse_plugins") - def test_scenario_with_plugin_exclusive_tests( - self, parse_plugins: Mock, test_scenario_parser: Mock, test_parser: Mock, parser: Parser - ): + def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): tests_dir = parser.system_config_path.parent.parent / "test" + test_scenario_path = Path("/mock/test_scenario.toml") + plugin_test_scenario_path = Path("/mock/plugin_scenarios") - fake_tests = [] - for i in range(4): - fake_tests.append(Mock()) - fake_tests[-1].name = f"test-{i}" + fake_tests = [Mock() for _ in range(4)] + for i, test in enumerate(fake_tests): + test.name = f"test-{i}" test_parser.return_value = fake_tests fake_scenario = Mock() @@ -117,18 +115,38 @@ def test_scenario_with_plugin_exclusive_tests( fake_scenario.test_runs[0].test.name = "test-1" test_scenario_parser.return_value = fake_scenario - fake_plugin = Mock() - fake_plugin.test_runs = [Mock()] - fake_plugin.test_runs[0].test.name = "test-2" - parse_plugins.return_value = {"plugin-1": fake_plugin} + fake_plugin_scenarios = {"plugin-1": Mock(test_runs=[Mock()])} + fake_plugin_scenarios["plugin-1"].test_runs[0].test.name = "test-2" - _, tests, _ = parser.parse(tests_dir, Path(), Path()) + with patch.object(parser, "_load_plugin_scenarios", return_value=fake_plugin_scenarios): + _, filtered_tests, _ = parser.parse(tests_dir, test_scenario_path, tests_dir, plugin_test_scenario_path) + + filtered_test_names = {t.name for t in filtered_tests} + assert len(filtered_tests) == 2 + assert "test-1" in filtered_test_names + assert "test-2" in filtered_test_names + assert "test-0" not in filtered_test_names + assert "test-3" not in filtered_test_names + + def test_collect_used_test_names(self, parser: Parser): + fake_scenario = Mock() + fake_scenario.test_runs = [Mock()] + fake_scenario.test_runs[0].test.name = "test-1" + + fake_plugin_scenario_1 = Mock(spec=TestScenario) + fake_plugin_scenario_1.test_runs = [Mock()] + fake_plugin_scenario_1.test_runs[0].test.name = "test-2" + + fake_plugin_scenario_2 = Mock(spec=TestScenario) + fake_plugin_scenario_2.test_runs = [Mock()] + fake_plugin_scenario_2.test_runs[0].test.name = "test-3" + + fake_plugin_scenarios = cast( + Dict[str, TestScenario], {"plugin-1": fake_plugin_scenario_1, "plugin-2": fake_plugin_scenario_2} + ) - assert len(tests) == 2 - assert "test-1" in [t.name for t in tests] - assert "test-2" in [t.name for t in tests] - assert "test-0" not in [t.name for t in tests] - assert "test-3" not in [t.name for t in tests] + used_test_names = parser._collect_used_test_names(fake_plugin_scenarios, fake_scenario) + assert used_test_names == {"test-1", "test-2", "test-3"} def test_parse_system(self, parser: Parser): parser.system_config_path = Path("conf/common/system/example_slurm_cluster.toml") From b22c2f2d47d0bda68de6c9cebbc0e1453d1c5432 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 28 
Oct 2024 13:35:01 -0400 Subject: [PATCH 29/64] Refactor plugin path handling in parse to use a single plugin_path param --- src/cloudai/cli/handlers.py | 4 +--- src/cloudai/parser.py | 9 +++++---- tests/test_parser.py | 4 ++-- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index b76609ef..1085440f 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -90,9 +90,7 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int: args (argparse.Namespace): The parsed command-line arguments. """ parser = Parser(args.system_config) - system, tests, test_scenario = parser.parse( - args.tests_dir, args.test_scenario, Path("conf/common/plugin/test"), Path("conf/common/plugin/test_scenario") - ) + system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario, Path("conf/common/plugin")) assert test_scenario is not None if args.output_dir: diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index 73ab8717..a9227f88 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -52,8 +52,7 @@ def parse( self, test_path: Path, test_scenario_path: Optional[Path] = None, - plugin_test_path: Optional[Path] = None, - plugin_test_scenario_path: Optional[Path] = None, + plugin_path: Optional[Path] = None, ) -> Tuple[System, List[Test], Optional[TestScenario]]: """ Parse configurations for system, test templates, and test scenarios. @@ -62,8 +61,7 @@ def parse( test_path (Path): The file path for tests. test_scenario_path (Optional[Path]): The file path for the main test scenario. If None, all tests are included. - plugin_test_path (Optional[Path]): The file path for plugin-specific tests. - plugin_test_scenario_path (Optional[Path]): The file path for plugin-specific test scenarios. + plugin_path (Optional[Path]): The base file path for plugin-specific tests and scenarios. 
Returns: Tuple[System, List[Test], Optional[TestScenario]]: A tuple containing the system object, a list of filtered @@ -82,6 +80,9 @@ def parse( except TestConfigParsingError: exit(1) + plugin_test_scenario_path = plugin_path + plugin_test_path = plugin_path / "test" if plugin_path else None + plugin_tests = ( self.parse_tests(list(plugin_test_path.glob("*.toml")), system) if plugin_test_path and plugin_test_path.exists() diff --git a/tests/test_parser.py b/tests/test_parser.py index 12372755..bcfd63a3 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -103,7 +103,7 @@ def test_scenario_with_plugin_common_tests( def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): tests_dir = parser.system_config_path.parent.parent / "test" test_scenario_path = Path("/mock/test_scenario.toml") - plugin_test_scenario_path = Path("/mock/plugin_scenarios") + plugin_path = Path("/mock/plugin_scenarios") fake_tests = [Mock() for _ in range(4)] for i, test in enumerate(fake_tests): @@ -119,7 +119,7 @@ def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, fake_plugin_scenarios["plugin-1"].test_runs[0].test.name = "test-2" with patch.object(parser, "_load_plugin_scenarios", return_value=fake_plugin_scenarios): - _, filtered_tests, _ = parser.parse(tests_dir, test_scenario_path, tests_dir, plugin_test_scenario_path) + _, filtered_tests, _ = parser.parse(tests_dir, test_scenario_path, plugin_path) filtered_test_names = {t.name for t in filtered_tests} assert len(filtered_tests) == 2 From c88fe2ef42986e0c5fd502de9d45e5b7c2c42844 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 28 Oct 2024 13:29:04 -0400 Subject: [PATCH 30/64] Remove test_scenario directory from conf/common/plugin/ --- conf/common/plugin/{test_scenario => }/nccl_test_epilogue.toml | 0 conf/common/plugin/{test_scenario => }/nccl_test_prologue.toml | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename conf/common/plugin/{test_scenario => }/nccl_test_epilogue.toml (100%) rename conf/common/plugin/{test_scenario => }/nccl_test_prologue.toml (100%) diff --git a/conf/common/plugin/test_scenario/nccl_test_epilogue.toml b/conf/common/plugin/nccl_test_epilogue.toml similarity index 100% rename from conf/common/plugin/test_scenario/nccl_test_epilogue.toml rename to conf/common/plugin/nccl_test_epilogue.toml diff --git a/conf/common/plugin/test_scenario/nccl_test_prologue.toml b/conf/common/plugin/nccl_test_prologue.toml similarity index 100% rename from conf/common/plugin/test_scenario/nccl_test_prologue.toml rename to conf/common/plugin/nccl_test_prologue.toml From c814ccb65258a5b77c4a7e1ddb8d4a3c9d4624ac Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 10:11:34 -0400 Subject: [PATCH 31/64] Use Pydantic model to load prologue and epilogue --- src/cloudai/_core/test_scenario_parser.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index c59adeba..4d192a7a 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -139,11 +139,11 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario: total_weight = sum(tr.weight for tr in ts_model.tests) normalized_weight = 0 if total_weight == 0 else 100 / total_weight - prologue_name = data.get("prologue", "") - epilogue_name = 
data.get("epilogue", "") - - prologue = self.plugin_mapping.get(prologue_name, None) if prologue_name else None - epilogue = self.plugin_mapping.get(epilogue_name, None) if epilogue_name else None + prologue, epilogue = None, None + if ts_model.prologue: + prologue = self.plugin_mapping.get(ts_model.prologue) + if ts_model.epilogue: + epilogue = self.plugin_mapping.get(ts_model.epilogue) testruns_by_id: dict[str, TestRun] = { tr.id: self._create_section_test_run(tr, normalized_weight, prologue, epilogue) for tr in ts_model.tests From a6d3efc631368bd6314d14efc00a3cf71d1a416e Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 10:48:42 -0400 Subject: [PATCH 32/64] Recover acceptance tests with plugin --- .../{gpt.sbatch => gpt-no-plugin.sbatch} | 0 .../{gpt-pretest.sbatch => gpt-plugin.sbatch} | 40 +++---------------- .../{grok.sbatch => grok-no-plugin.sbatch} | 0 ...grok-pretest.sbatch => grok-plugin.sbatch} | 40 +++---------------- tests/test_acceptance.py | 38 ++++++++++++++++-- 5 files changed, 46 insertions(+), 72 deletions(-) rename tests/ref_data/{gpt.sbatch => gpt-no-plugin.sbatch} (100%) rename tests/ref_data/{gpt-pretest.sbatch => gpt-plugin.sbatch} (52%) rename tests/ref_data/{grok.sbatch => grok-no-plugin.sbatch} (100%) rename tests/ref_data/{grok-pretest.sbatch => grok-plugin.sbatch} (67%) diff --git a/tests/ref_data/gpt.sbatch b/tests/ref_data/gpt-no-plugin.sbatch similarity index 100% rename from tests/ref_data/gpt.sbatch rename to tests/ref_data/gpt-no-plugin.sbatch diff --git a/tests/ref_data/gpt-pretest.sbatch b/tests/ref_data/gpt-plugin.sbatch similarity index 52% rename from tests/ref_data/gpt-pretest.sbatch rename to tests/ref_data/gpt-plugin.sbatch index 3a64823c..08a3a87a 100644 --- a/tests/ref_data/gpt-pretest.sbatch +++ b/tests/ref_data/gpt-plugin.sbatch @@ -8,39 +8,11 @@ export COMBINE_THRESHOLD=1 export PER_GPU_COMBINE_THRESHOLD=0 export XLA_FLAGS="--xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD" -srun \ ---mpi=pmix \ --N 8 \ --o __OUTPUT_DIR__/output_pretest-%j-%n-%t.txt \ --e __OUTPUT_DIR__/error_pretest-%j-%n-%t.txt \ ---container-image=nvcr.io/nvidia/pytorch:24.02-py3 \ -/usr/local/bin/all_gather_perf_mpi \ ---nthreads 1 \ ---ngpus 1 \ ---minbytes 8M \ ---maxbytes 16G \ ---stepbytes 1M \ ---op sum \ ---datatype float \ ---root 0 \ ---iters 20 \ ---warmup_iters 5 \ ---agg_iters 1 \ ---average 1 \ ---parallel_init 0 \ ---check 1 \ ---blocking 1 \ ---cudagraph 0 \ ---stepfactor 2 -PRETEST_OUTPUT_FILES="__OUTPUT_DIR__/output_pretest-*.txt" -keyword="Avg bus bandwidth" - -# Use grep to search for the keyword in the files -if grep -q "$keyword" $PRETEST_OUTPUT_FILES; then - PRE_TEST_SUCCESS=true -fi -if [ "$PRE_TEST_SUCCESS" = true ]; then - echo "Loading container with srun command" +srun --output=__OUTPUT_DIR__/prologue/nccl/stdout.txt --error=__OUTPUT_DIR__/prologue/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 +SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/prologue/nccl/stdout.txt && echo 1 || echo 0) +PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 ) 
+if [ $PROLOGUE_SUCCESS -eq 1 ]; then + echo "Loading container with srun command" srun --mpi=none --container-image=https://docker/url --container-name=cont true echo "Running srun command" srun \ @@ -52,4 +24,4 @@ if [ "$PRE_TEST_SUCCESS" = true ]; then --container-name=cont \ --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ /opt/paxml/workspace/run.sh -fi \ No newline at end of file +fi diff --git a/tests/ref_data/grok.sbatch b/tests/ref_data/grok-no-plugin.sbatch similarity index 100% rename from tests/ref_data/grok.sbatch rename to tests/ref_data/grok-no-plugin.sbatch diff --git a/tests/ref_data/grok-pretest.sbatch b/tests/ref_data/grok-plugin.sbatch similarity index 67% rename from tests/ref_data/grok-pretest.sbatch rename to tests/ref_data/grok-plugin.sbatch index 0e2672d5..e75d3d77 100644 --- a/tests/ref_data/grok-pretest.sbatch +++ b/tests/ref_data/grok-plugin.sbatch @@ -8,39 +8,11 @@ export COMBINE_THRESHOLD=1 export PER_GPU_COMBINE_THRESHOLD=0 export XLA_FLAGS="--xla_disable_hlo_passes=rematerialization --xla_dump_hlo_pass_re=.* --xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_pipelined_all_gather=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_gpu_enable_triton_gemm=false --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_graph_level=0 --xla_gpu_pgle_profile_file_or_directory_path=/opt/paxml/workspace/pgle_output_profile.pbtxt --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD --xla_gpu_run_post_layout_collective_pipeliner=false --xla_gpu_use_memcpy_local_p2p=false" -srun \ ---mpi=pmix \ --N 8 \ --o __OUTPUT_DIR__/output_pretest-%j-%n-%t.txt \ --e __OUTPUT_DIR__/error_pretest-%j-%n-%t.txt \ ---container-image=nvcr.io/nvidia/pytorch:24.02-py3 \ -/usr/local/bin/all_gather_perf_mpi \ ---nthreads 1 \ ---ngpus 1 \ ---minbytes 8M \ ---maxbytes 16G \ ---stepbytes 1M \ ---op sum \ ---datatype float \ ---root 0 \ ---iters 20 \ ---warmup_iters 5 \ ---agg_iters 1 \ ---average 1 \ ---parallel_init 0 \ ---check 1 \ ---blocking 1 \ ---cudagraph 0 \ ---stepfactor 2 -PRETEST_OUTPUT_FILES="__OUTPUT_DIR__/output_pretest-*.txt" -keyword="Avg bus bandwidth" - -# Use grep to search for the keyword in the files -if grep -q "$keyword" $PRETEST_OUTPUT_FILES; then - PRE_TEST_SUCCESS=true -fi -if [ "$PRE_TEST_SUCCESS" = true ]; then - echo "Loading container with srun command" +srun --output=__OUTPUT_DIR__/prologue/nccl/stdout.txt --error=__OUTPUT_DIR__/prologue/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 +SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/prologue/nccl/stdout.txt && echo 1 || echo 0) +PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 ) +if [ $PROLOGUE_SUCCESS -eq 1 ]; then + echo "Loading container with srun command" srun --mpi=none --container-image=https://docker/url --container-name=cont true echo "Running srun command" srun \ @@ -52,4 +24,4 @@ if [ 
"$PRE_TEST_SUCCESS" = true ]; then --container-name=cont \ --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ /opt/paxml/workspace/run.sh -fi \ No newline at end of file +fi diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index e11ff50b..19e7acb9 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -22,7 +22,7 @@ import pytest -from cloudai import NcclTest, Test, TestRun, UCCTest +from cloudai import NcclTest, Test, TestRun, TestScenario, UCCTest from cloudai.cli import handle_dry_run_and_run, setup_logging from cloudai.schema.test_template.jax_toolbox.slurm_command_gen_strategy import JaxToolboxSlurmCommandGenStrategy from cloudai.schema.test_template.jax_toolbox.template import JaxToolbox @@ -91,7 +91,7 @@ def partial_tr(slurm_system: SlurmSystem) -> partial[TestRun]: return partial(TestRun, num_nodes=1, nodes=[], output_path=slurm_system.output_path) -@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt", "grok"]) +@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt-plugin", "gpt-no-plugin", "grok-plugin", "grok-no-plugin"]) def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -> tuple[TestRun, str, Optional[str]]: if request.param == "ucc": tr = partial_tr( @@ -141,7 +141,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") return (tr, "sleep.sbatch", None) - elif request.param.startswith("gpt"): + elif request.param.startswith("gpt-"): tr = partial_tr( name="gpt", test=Test( @@ -159,9 +159,24 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict ) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") + if "no-plugin" not in request.param: + prologue_tr = partial_tr( + name="nccl", + test=Test( + test_definition=NCCLTestDefinition( + name="nccl", description="nccl", test_template_name="nccl", cmd_args=NCCLCmdArgs() + ), + test_template=NcclTest(slurm_system, name="nccl"), + ), + ) + prologue_tr.test.test_template.command_gen_strategy = NcclTestSlurmCommandGenStrategy( + slurm_system, prologue_tr.test.test_definition.cmd_args_dict + ) + prologue_tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") + tr.prologue = TestScenario(name=f"{prologue_tr.name} NCCL Prologue", test_runs=[prologue_tr]) return (tr, f"{request.param}.sbatch", "gpt.run") - elif request.param.startswith("grok"): + elif request.param.startswith("grok-"): tr = partial_tr( name="grok", test=Test( @@ -179,6 +194,21 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict ) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") + if "no-plugin" not in request.param: + prologue_tr = partial_tr( + name="nccl", + test=Test( + test_definition=NCCLTestDefinition( + name="nccl", description="nccl", test_template_name="nccl", cmd_args=NCCLCmdArgs() + ), + test_template=NcclTest(slurm_system, name="nccl"), + ), + ) + prologue_tr.test.test_template.command_gen_strategy = NcclTestSlurmCommandGenStrategy( + slurm_system, prologue_tr.test.test_definition.cmd_args_dict + ) + prologue_tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") + tr.prologue = TestScenario(name=f"{prologue_tr.name} NCCL Prologue", test_runs=[prologue_tr]) return (tr, f"{request.param}.sbatch", 
"grok.run") From 46cabe912bfd22d39330b75df7665e48919432ed Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 10:50:38 -0400 Subject: [PATCH 33/64] Clean up unit tests --- .../test_common_slurm_command_gen_strategy.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py index 4484b6e1..0f7821a3 100644 --- a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py @@ -237,8 +237,6 @@ def test_prologue_epilogue_combinations( ) run.test.test_template.gen_srun_command.return_value = "srun" run.test.name = f"test{idx+1}" - else: - testrun_fixture.prologue = None if epilogue is not None: testrun_fixture.epilogue = Mock(spec=TestScenario) @@ -246,8 +244,6 @@ def test_prologue_epilogue_combinations( for idx, run in enumerate(epilogue): run.test.test_template.gen_srun_command.return_value = "epilogue" run.test.name = f"test{idx+1}" - else: - testrun_fixture.epilogue = None sbatch_command = strategy_fixture.gen_exec_command(testrun_fixture) script_file_path = sbatch_command.split()[-1] From 764e18150e5b00f3ff610b42e845853f35e765c6 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 11:08:11 -0400 Subject: [PATCH 34/64] Refactor parser to remove explicit plugin_path argument, use default Path --- src/cloudai/cli/handlers.py | 2 +- src/cloudai/parser.py | 8 +++----- tests/test_parser.py | 7 +++---- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index 1085440f..6105bc24 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -90,7 +90,7 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int: args (argparse.Namespace): The parsed command-line arguments. """ parser = Parser(args.system_config) - system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario, Path("conf/common/plugin")) + system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario) assert test_scenario is not None if args.output_dir: diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index a9227f88..9e9a6766 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -52,7 +52,6 @@ def parse( self, test_path: Path, test_scenario_path: Optional[Path] = None, - plugin_path: Optional[Path] = None, ) -> Tuple[System, List[Test], Optional[TestScenario]]: """ Parse configurations for system, test templates, and test scenarios. @@ -61,7 +60,6 @@ def parse( test_path (Path): The file path for tests. test_scenario_path (Optional[Path]): The file path for the main test scenario. If None, all tests are included. - plugin_path (Optional[Path]): The base file path for plugin-specific tests and scenarios. 
Returns: Tuple[System, List[Test], Optional[TestScenario]]: A tuple containing the system object, a list of filtered @@ -80,8 +78,8 @@ def parse( except TestConfigParsingError: exit(1) - plugin_test_scenario_path = plugin_path - plugin_test_path = plugin_path / "test" if plugin_path else None + plugin_test_scenario_path = Path("conf/common/plugin") + plugin_test_path = Path("conf/common/plugin/test") plugin_tests = ( self.parse_tests(list(plugin_test_path.glob("*.toml")), system) @@ -92,7 +90,7 @@ def parse( if test_scenario_path: return self._parse_with_scenario(system, tests, test_scenario_path, plugin_tests, plugin_test_scenario_path) - return system, tests + plugin_tests, None + return system, list(set(tests + plugin_tests)), None def _parse_with_scenario( self, diff --git a/tests/test_parser.py b/tests/test_parser.py index bcfd63a3..f347c7ee 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -34,7 +34,7 @@ def parser(self, tmp_path: Path) -> Parser: def test_no_tests_dir(self, parser: Parser): tests_dir = parser.system_config_path.parent / "tests" with pytest.raises(FileNotFoundError) as exc_info: - parser.parse(tests_dir, None, None) + parser.parse(tests_dir, None) assert "Test path" in str(exc_info.value) @patch("cloudai._core.test_parser.TestParser.parse_all") @@ -93,7 +93,7 @@ def test_scenario_with_plugin_common_tests( fake_plugin.test_runs[0].test.name = "test-1" parse_plugins.return_value = {"plugin-1": fake_plugin} - _, tests, _ = parser.parse(tests_dir, Path(), Path()) + _, tests, _ = parser.parse(tests_dir, Path()) assert len(tests) == 1 assert tests[0].name == "test-1" @@ -103,7 +103,6 @@ def test_scenario_with_plugin_common_tests( def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): tests_dir = parser.system_config_path.parent.parent / "test" test_scenario_path = Path("/mock/test_scenario.toml") - plugin_path = Path("/mock/plugin_scenarios") fake_tests = [Mock() for _ in range(4)] for i, test in enumerate(fake_tests): @@ -119,7 +118,7 @@ def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, fake_plugin_scenarios["plugin-1"].test_runs[0].test.name = "test-2" with patch.object(parser, "_load_plugin_scenarios", return_value=fake_plugin_scenarios): - _, filtered_tests, _ = parser.parse(tests_dir, test_scenario_path, plugin_path) + _, filtered_tests, _ = parser.parse(tests_dir, test_scenario_path) filtered_test_names = {t.name for t in filtered_tests} assert len(filtered_tests) == 2 From d9e8c1fb84c0aff19eccaee5de99d674d6cd9f66 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 11:21:40 -0400 Subject: [PATCH 35/64] Refactor gen_exec_command to simplify indentation logic for readability --- .../strategy/slurm_command_gen_strategy.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index 57c37812..1d58e4bf 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -56,23 +56,23 @@ def gen_exec_command(self, tr: TestRun) -> str: cmd_args = self._override_cmd_args(self.default_cmd_args, tr.test.cmd_args) slurm_args = self._parse_slurm_args(tr.test.test_template.__class__.__name__, env_vars, cmd_args, tr) + srun_command = self._gen_srun_command(slurm_args, 
env_vars, cmd_args, tr.test.extra_cmd_args) + command_list = [] + indent = "" + if tr.prologue: prologue_command = self.gen_prologue(tr.prologue, tr.output_path) - srun_command = self._gen_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) - command_list = [prologue_command, "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", f" {srun_command}"] + command_list = [prologue_command, "if [ $PROLOGUE_SUCCESS -eq 1 ]; then"] + indent = " " - if tr.epilogue: - epilogue_command = self.gen_epilogue(tr.epilogue, tr.output_path) - command_list.append(f" {epilogue_command}") + command_list.append(f"{indent}{srun_command}") - command_list.append("fi") - else: - srun_command = self._gen_srun_command(slurm_args, env_vars, cmd_args, tr.test.extra_cmd_args) - command_list = [srun_command] + if tr.epilogue: + epilogue_command = self.gen_epilogue(tr.epilogue, tr.output_path) + command_list.append(f"{indent}{epilogue_command}") - if tr.epilogue: - epilogue_command = self.gen_epilogue(tr.epilogue, tr.output_path) - command_list.append(epilogue_command) + if tr.prologue: + command_list.append("fi") full_command = "\n".join(command_list).strip() return self._write_sbatch_script(slurm_args, env_vars, full_command, tr.output_path) From 897a7da60ef7db0d7e6ead059d619076199b2e93 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 15:29:52 -0400 Subject: [PATCH 36/64] Make prologue and epilogue fields optional --- src/cloudai/_core/test_scenario_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index 4d192a7a..67f52eac 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -54,8 +54,8 @@ class _TestScenarioTOML(BaseModel): name: str job_status_check: bool = True tests: list[_TestRunTOML] = Field(alias="Tests", min_length=1) - prologue: str = "" - epilogue: str = "" + prologue: Optional[str] = None + epilogue: Optional[str] = None @model_validator(mode="after") def check_no_self_dependency(self): From d44023bf9923ec6f5a1edd879029d423acfac753 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 15:31:29 -0400 Subject: [PATCH 37/64] Set prologue and epilogue to None by default --- src/cloudai/_core/test_scenario_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index 67f52eac..6ff81dc9 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -166,8 +166,8 @@ def _create_section_test_run( self, test_info: _TestRunTOML, normalized_weight: float, - prologue: Optional[TestScenario], - epilogue: Optional[TestScenario], + prologue: Optional[TestScenario] = None, + epilogue: Optional[TestScenario] = None, ) -> TestRun: """ Create a section-specific Test object by copying from the test mapping. 
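
With both fields now optional, a test scenario TOML may reference a prologue or epilogue plugin by name or leave the keys out entirely. A minimal sketch of such a scenario, reusing the bundled nccl_test_prologue and nccl_test_epilogue plugin scenarios (the scenario name here is illustrative, and the remaining per-test fields are omitted):

    name = "example_scenario"
    prologue = "nccl_test_prologue"   # optional; omit to run without a prologue
    epilogue = "nccl_test_epilogue"   # optional; the parsed field is None when absent

    [[Tests]]
    id = "Tests.1"
    # further per-test fields as in conf/common/test_scenario/nccl_test.toml
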
From 12022de61f5655eb303c22157a4d4aec10b5fb9e Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 15:33:58 -0400 Subject: [PATCH 38/64] Recover comments --- src/cloudai/parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index 9e9a6766..cbb21d78 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -71,12 +71,12 @@ def parse( try: system = self.parse_system(self.system_config_path) except SystemConfigParsingError: - exit(1) + exit(1) # exit right away to keep error message readable for users try: tests = self.parse_tests(list(test_path.glob("*.toml")), system) except TestConfigParsingError: - exit(1) + exit(1) # exit right away to keep error message readable for users plugin_test_scenario_path = Path("conf/common/plugin") plugin_test_path = Path("conf/common/plugin/test") From 3cf27df886de2fff6a40de1cdeda406647e8ab11 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 15:34:32 -0400 Subject: [PATCH 39/64] Remove unused tmp_path from unit tests --- .../test_common_slurm_command_gen_strategy.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py index 0f7821a3..c5a6d0f0 100644 --- a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py @@ -223,7 +223,6 @@ def test_prologue_epilogue_combinations( prologue, epilogue, expected_script_lines, - tmp_path, ): testrun_fixture.prologue = Mock(spec=TestScenario) if prologue else None testrun_fixture.epilogue = Mock(spec=TestScenario) if epilogue else None From 9244bd6e510e6807f246b0d2c4c0a6ec71eac5f1 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 30 Oct 2024 10:52:54 -0400 Subject: [PATCH 40/64] Do not allow empty test runs in plugins --- .../systems/slurm/strategy/slurm_command_gen_strategy.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index a06dd33f..e4b2f0c3 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -134,9 +134,6 @@ def gen_prologue(self, prologue: TestScenario, base_output_path: Path) -> str: Returns: str: A string with all the Slurm srun commands generated for the prologue. """ - if not prologue.test_runs: - return "PROLOGUE_SUCCESS=1\n" - prologue_output_dir = base_output_path / "prologue" prologue_output_dir.mkdir(parents=True, exist_ok=True) @@ -177,9 +174,6 @@ def gen_epilogue(self, epilogue: TestScenario, base_output_path: Path) -> str: Returns: str: A string with all the Slurm srun commands generated for the epilogue. 
""" - if not epilogue.test_runs: - return "" - epilogue_output_dir = base_output_path / "epilogue" epilogue_output_dir.mkdir(parents=True, exist_ok=True) From 00f34f2bdd349e802d7d053619610fdad677af33 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 30 Oct 2024 12:15:17 -0400 Subject: [PATCH 41/64] Simplify prologue unit tests --- .../test_common_slurm_command_gen_strategy.py | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py index 25efdf16..e388f0d9 100644 --- a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py @@ -132,7 +132,7 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): [Mock(test=Mock(name="test1", test_template=Mock()))], None, [ - "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "prologue", "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", " srun", @@ -153,7 +153,7 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): [Mock(test=Mock(name="test1", test_template=Mock()))], [Mock(test=Mock(name="test2", test_template=Mock()))], [ - "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "prologue", "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", " srun", @@ -166,8 +166,8 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): [Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], [Mock(test=Mock(name="test3", test_template=Mock())), Mock(test=Mock(name="test4", test_template=Mock()))], [ - "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", - "SUCCESS_1=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "prologue", + "prologue", "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", " srun", @@ -181,8 +181,8 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): [Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], None, [ - "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", - "SUCCESS_1=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "prologue", + "prologue", "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", " srun", @@ -204,8 +204,8 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): [Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], [Mock(test=Mock(name="test3", test_template=Mock()))], [ - "SUCCESS_0=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", - "SUCCESS_1=$(grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0)", + "prologue", + "prologue", "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", " srun", @@ -229,9 +229,7 @@ def test_prologue_epilogue_combinations( testrun_fixture.prologue = Mock(spec=TestScenario) testrun_fixture.prologue.test_runs = prologue for idx, run in enumerate(prologue): - 
run.test.test_template.gen_srun_success_check.return_value = ( - "grep -q 'Avg bus bandwidth' stdout.txt && echo 1 || echo 0" - ) + run.test.test_template.gen_srun_success_check.return_value = "prologue" run.test.test_template.gen_srun_command.return_value = "srun" run.test.name = f"test{idx+1}" From 7de11857a2b815f2ad06fa7851f50673ea1e53a4 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 30 Oct 2024 12:16:31 -0400 Subject: [PATCH 42/64] Move plugin directory to conf --- conf/{common => }/plugin/nccl_test_epilogue.toml | 0 conf/{common => }/plugin/nccl_test_prologue.toml | 0 conf/{common => }/plugin/test/nccl_test_all_gather.toml | 0 conf/{common => }/plugin/test/nccl_test_all_reduce.toml | 0 src/cloudai/parser.py | 4 ++-- 5 files changed, 2 insertions(+), 2 deletions(-) rename conf/{common => }/plugin/nccl_test_epilogue.toml (100%) rename conf/{common => }/plugin/nccl_test_prologue.toml (100%) rename conf/{common => }/plugin/test/nccl_test_all_gather.toml (100%) rename conf/{common => }/plugin/test/nccl_test_all_reduce.toml (100%) diff --git a/conf/common/plugin/nccl_test_epilogue.toml b/conf/plugin/nccl_test_epilogue.toml similarity index 100% rename from conf/common/plugin/nccl_test_epilogue.toml rename to conf/plugin/nccl_test_epilogue.toml diff --git a/conf/common/plugin/nccl_test_prologue.toml b/conf/plugin/nccl_test_prologue.toml similarity index 100% rename from conf/common/plugin/nccl_test_prologue.toml rename to conf/plugin/nccl_test_prologue.toml diff --git a/conf/common/plugin/test/nccl_test_all_gather.toml b/conf/plugin/test/nccl_test_all_gather.toml similarity index 100% rename from conf/common/plugin/test/nccl_test_all_gather.toml rename to conf/plugin/test/nccl_test_all_gather.toml diff --git a/conf/common/plugin/test/nccl_test_all_reduce.toml b/conf/plugin/test/nccl_test_all_reduce.toml similarity index 100% rename from conf/common/plugin/test/nccl_test_all_reduce.toml rename to conf/plugin/test/nccl_test_all_reduce.toml diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index cbb21d78..0db9147e 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -78,8 +78,8 @@ def parse( except TestConfigParsingError: exit(1) # exit right away to keep error message readable for users - plugin_test_scenario_path = Path("conf/common/plugin") - plugin_test_path = Path("conf/common/plugin/test") + plugin_test_scenario_path = Path("conf/plugin") + plugin_test_path = Path("conf/plugin/test") plugin_tests = ( self.parse_tests(list(plugin_test_path.glob("*.toml")), system) From c2b8d834fd0b3fe5ffe31dd9fe79eb7d88f4ddde Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Wed, 30 Oct 2024 12:31:21 -0400 Subject: [PATCH 43/64] Reflect Andrei's comments --- src/cloudai/parser.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index 0db9147e..42186ad8 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -78,19 +78,20 @@ def parse( except TestConfigParsingError: exit(1) # exit right away to keep error message readable for users - plugin_test_scenario_path = Path("conf/plugin") plugin_test_path = Path("conf/plugin/test") + try: + plugin_tests = ( + self.parse_tests(list(plugin_test_path.glob("*.toml")), system) if plugin_test_path.exists() else [] + ) + except TestConfigParsingError: + exit(1) # exit right away to keep error message readable for users - plugin_tests = ( - 
self.parse_tests(list(plugin_test_path.glob("*.toml")), system) - if plugin_test_path and plugin_test_path.exists() - else [] - ) - + plugin_test_scenario_path = Path("conf/plugin") if test_scenario_path: return self._parse_with_scenario(system, tests, test_scenario_path, plugin_tests, plugin_test_scenario_path) - return system, list(set(tests + plugin_tests)), None + combined_tests = list(set(tests + plugin_tests)) + return system, combined_tests, None def _parse_with_scenario( self, From e1534d14963b6248c7edfa42a3ce570514bd45e0 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Thu, 31 Oct 2024 11:02:29 -0400 Subject: [PATCH 44/64] Reflect Andrei's comments --- src/cloudai/_core/test_scenario_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index 19f7e4c1..a3ea8d9a 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -209,7 +209,7 @@ def _create_test_run( sol=test_info.sol, weight=test_info.weight * normalized_weight, ideal_perf=test_info.ideal_perf, - prologue=prologue if prologue is not None else TestScenario(name="default_prologue", test_runs=[]), - epilogue=epilogue if epilogue is not None else TestScenario(name="default_epilogue", test_runs=[]), + prologue=prologue, + epilogue=epilogue, ) return tr From 42080e25f2cbf5d9fac7a0d2272784d934b8b5fd Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Thu, 31 Oct 2024 11:06:45 -0400 Subject: [PATCH 45/64] Print out warning when plugins are missing --- src/cloudai/_core/test_scenario_parser.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index a3ea8d9a..8db8622d 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -142,8 +142,18 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario: prologue, epilogue = None, None if ts_model.prologue: prologue = self.plugin_mapping.get(ts_model.prologue) + if prologue is None: + logging.warning( + f"Prologue '{ts_model.prologue}' not found in plugin mapping. " + "Ensure that a proper plugin directory is set under the working directory." + ) if ts_model.epilogue: epilogue = self.plugin_mapping.get(ts_model.epilogue) + if epilogue is None: + logging.warning( + f"Epilogue '{ts_model.epilogue}' not found in plugin mapping. " + "Ensure that a proper plugin directory is set under the working directory." 
+ ) test_runs_by_id: dict[str, TestRun] = { tr.id: self._create_test_run(tr, normalized_weight, prologue, epilogue) for tr in ts_model.tests From 3dbb4d3d2893e2b6faca2c1543580035d3455cfc Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Thu, 31 Oct 2024 11:09:06 -0400 Subject: [PATCH 46/64] Update acceptance test sbatch script names --- tests/ref_data/{gpt-plugin.sbatch => gpt-prologue.sbatch} | 0 tests/ref_data/{grok-plugin.sbatch => grok-prologue.sbatch} | 0 tests/test_acceptance.py | 6 +++--- 3 files changed, 3 insertions(+), 3 deletions(-) rename tests/ref_data/{gpt-plugin.sbatch => gpt-prologue.sbatch} (100%) rename tests/ref_data/{grok-plugin.sbatch => grok-prologue.sbatch} (100%) diff --git a/tests/ref_data/gpt-plugin.sbatch b/tests/ref_data/gpt-prologue.sbatch similarity index 100% rename from tests/ref_data/gpt-plugin.sbatch rename to tests/ref_data/gpt-prologue.sbatch diff --git a/tests/ref_data/grok-plugin.sbatch b/tests/ref_data/grok-prologue.sbatch similarity index 100% rename from tests/ref_data/grok-plugin.sbatch rename to tests/ref_data/grok-prologue.sbatch diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 19e7acb9..bb6f7897 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -91,7 +91,7 @@ def partial_tr(slurm_system: SlurmSystem) -> partial[TestRun]: return partial(TestRun, num_nodes=1, nodes=[], output_path=slurm_system.output_path) -@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt-plugin", "gpt-no-plugin", "grok-plugin", "grok-no-plugin"]) +@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt-prologue", "gpt-no-plugin", "grok-prologue", "grok-no-plugin"]) def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -> tuple[TestRun, str, Optional[str]]: if request.param == "ucc": tr = partial_tr( @@ -159,7 +159,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict ) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - if "no-plugin" not in request.param: + if "prologue" in request.param: prologue_tr = partial_tr( name="nccl", test=Test( @@ -194,7 +194,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict ) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - if "no-plugin" not in request.param: + if "prologue" in request.param: prologue_tr = partial_tr( name="nccl", test=Test( From a9f5c979a7835aa8b5aab2eae8664951447dd13d Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Thu, 31 Oct 2024 11:59:36 -0400 Subject: [PATCH 47/64] Reflect Andrei's comments --- src/cloudai/parser.py | 83 +++++++++++++++---------------------------- tests/test_parser.py | 73 +++++++++++++++---------------------- 2 files changed, 57 insertions(+), 99 deletions(-) diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index 42186ad8..71435e90 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -16,7 +16,7 @@ import logging from pathlib import Path -from typing import Dict, List, Optional, Set, Tuple +from typing import Dict, List, Optional, Tuple import toml from pydantic import ValidationError @@ -34,6 +34,9 @@ format_validation_error, ) +PLUGIN_ROOT = Path("conf/plugin") +PLUGIN_TEST_ROOT = PLUGIN_ROOT / "test" + class Parser: """Main parser for parsing all types of configurations.""" @@ -78,74 
+81,44 @@ def parse( except TestConfigParsingError: exit(1) # exit right away to keep error message readable for users - plugin_test_path = Path("conf/plugin/test") try: plugin_tests = ( - self.parse_tests(list(plugin_test_path.glob("*.toml")), system) if plugin_test_path.exists() else [] + self.parse_tests(list(PLUGIN_TEST_ROOT.glob("*.toml")), system) if PLUGIN_TEST_ROOT.exists() else [] ) except TestConfigParsingError: exit(1) # exit right away to keep error message readable for users - plugin_test_scenario_path = Path("conf/plugin") - if test_scenario_path: - return self._parse_with_scenario(system, tests, test_scenario_path, plugin_tests, plugin_test_scenario_path) - - combined_tests = list(set(tests + plugin_tests)) - return system, combined_tests, None + if not test_scenario_path: + all_tests = list({test.name: test for test in tests + plugin_tests}.values()) + return system, all_tests, None - def _parse_with_scenario( - self, - system: System, - tests: List[Test], - test_scenario_path: Path, - plugin_tests: List[Test], - plugin_test_scenario_path: Optional[Path], - ) -> Tuple[System, List[Test], Optional[TestScenario]]: - """Parse tests and scenarios with a main test scenario path specified.""" test_mapping = {t.name: t for t in tests} - plugin_test_mapping = {t.name: t for t in plugin_tests} - - plugin_test_scenario_mapping = self._load_plugin_scenarios(plugin_test_scenario_path, plugin_test_mapping) - test_scenario = self._load_main_scenario(test_scenario_path, test_mapping, plugin_test_scenario_mapping) - - all_used_test_names = self._collect_used_test_names(plugin_test_scenario_mapping, test_scenario) - filtered_tests = [t for t in tests if t.name in all_used_test_names] - - return system, filtered_tests, test_scenario - - def _load_plugin_scenarios( - self, plugin_test_scenario_path: Optional[Path], plugin_test_mapping: Dict[str, Test] - ) -> Dict[str, TestScenario]: - """Load plugin-specific test scenarios from the specified path.""" - if plugin_test_scenario_path and plugin_test_scenario_path.exists(): + plugin_test_scenario_mapping = {} + if PLUGIN_ROOT.exists() and list(PLUGIN_ROOT.glob("*.toml")): try: - return self.parse_plugins(list(plugin_test_scenario_path.glob("*.toml")), plugin_test_mapping) + plugin_test_scenario_mapping = self.parse_plugins( + list(PLUGIN_ROOT.glob("*.toml")), {t.name: t for t in plugin_tests} + ) except TestScenarioParsingError: - exit(1) - return {} + exit(1) # exit right away to keep error message readable for users - def _load_main_scenario( - self, - test_scenario_path: Path, - test_mapping: Dict[str, Test], - plugin_test_scenario_mapping: Dict[str, TestScenario], - ) -> Optional[TestScenario]: - """Load the main test scenario using provided mappings.""" try: - return self.parse_test_scenario(test_scenario_path, test_mapping, plugin_test_scenario_mapping) + test_scenario = self.parse_test_scenario(test_scenario_path, test_mapping, plugin_test_scenario_mapping) except TestScenarioParsingError: - exit(1) - - def _collect_used_test_names( - self, plugin_test_scenario_mapping: Dict[str, TestScenario], test_scenario: Optional[TestScenario] - ) -> Set[str]: - """Collect test names used in both plugin and main test scenarios.""" - # TODO: collect test names in the plugin test scenarios only - plugin_test_names = { - tr.test.name for scenario in plugin_test_scenario_mapping.values() for tr in scenario.test_runs + exit(1) # exit right away to keep error message readable for users + + scenario_tests = {tr.test.name for tr in test_scenario.test_runs} + 
plugin_scenario_tests = { + tr.test.name + for plugin_scenario in plugin_test_scenario_mapping.values() + for tr in plugin_scenario.test_runs } - scenario_test_names = {tr.test.name for tr in test_scenario.test_runs} if test_scenario else set() - return plugin_test_names.union(scenario_test_names) + + relevant_test_names = scenario_tests.union(plugin_scenario_tests) + filtered_tests = [t for t in tests if t.name in relevant_test_names] + plugin_tests + filtered_tests = list({test.name: test for test in filtered_tests}.values()) + + return system, filtered_tests, test_scenario @staticmethod def parse_plugins(plugin_tomls: List[Path], test_mapping: Dict[str, Test]) -> Dict[str, TestScenario]: diff --git a/tests/test_parser.py b/tests/test_parser.py index f347c7ee..2b709938 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -15,13 +15,13 @@ # limitations under the License. from pathlib import Path -from typing import Dict, cast +from typing import cast from unittest.mock import Mock, patch import pytest from pydantic_core import ErrorDetails -from cloudai import Parser, TestScenario, format_validation_error +from cloudai import Parser, format_validation_error from cloudai.systems.slurm.slurm_system import SlurmSystem @@ -53,11 +53,11 @@ def test_no_scenario(self, test_parser: Mock, parser: Parser): def test_scenario_without_plugin(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): tests_dir = parser.system_config_path.parent.parent / "test" - fake_tests = [] - for i in range(3): - fake_tests.append(Mock()) - fake_tests[-1].name = f"test-{i}" - test_parser.return_value = fake_tests + fake_tests = [Mock(name=f"test-{i}") for i in range(3)] + for i, test in enumerate(fake_tests): + test.name = f"test-{i}" + + test_parser.side_effect = [fake_tests, []] fake_scenario = Mock() fake_scenario.test_runs = [Mock()] @@ -77,11 +77,13 @@ def test_scenario_with_plugin_common_tests( ): tests_dir = parser.system_config_path.parent.parent / "test" - fake_tests = [] - for i in range(3): - fake_tests.append(Mock()) - fake_tests[-1].name = f"test-{i}" - test_parser.return_value = fake_tests + main_tests = [Mock() for _ in range(3)] + for i, test in enumerate(main_tests): + test.name = f"test-{i}" + plugin_tests = [Mock()] + plugin_tests[0].name = "test-1" + + test_parser.side_effect = [main_tests, plugin_tests] fake_scenario = Mock() fake_scenario.test_runs = [Mock()] @@ -95,57 +97,40 @@ def test_scenario_with_plugin_common_tests( _, tests, _ = parser.parse(tests_dir, Path()) + filtered_test_names = {t.name for t in tests} assert len(tests) == 1 - assert tests[0].name == "test-1" + assert "test-1" in filtered_test_names @patch("cloudai._core.test_parser.TestParser.parse_all") @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") - def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): + @patch("pathlib.Path.exists", return_value=True) + def test_scenario_with_plugin_exclusive_tests( + self, path_exists_mock: Mock, test_scenario_parser: Mock, test_parser: Mock, parser: Parser + ): tests_dir = parser.system_config_path.parent.parent / "test" test_scenario_path = Path("/mock/test_scenario.toml") - fake_tests = [Mock() for _ in range(4)] - for i, test in enumerate(fake_tests): + main_tests = [Mock() for _ in range(3)] + plugin_tests = [Mock()] + for i, test in enumerate(main_tests): test.name = f"test-{i}" - test_parser.return_value = fake_tests + plugin_tests[0].name = "plugin-test-1" + + 
test_parser.side_effect = [main_tests, plugin_tests] fake_scenario = Mock() fake_scenario.test_runs = [Mock()] fake_scenario.test_runs[0].test.name = "test-1" test_scenario_parser.return_value = fake_scenario - fake_plugin_scenarios = {"plugin-1": Mock(test_runs=[Mock()])} - fake_plugin_scenarios["plugin-1"].test_runs[0].test.name = "test-2" - - with patch.object(parser, "_load_plugin_scenarios", return_value=fake_plugin_scenarios): - _, filtered_tests, _ = parser.parse(tests_dir, test_scenario_path) + _, filtered_tests, _ = parser.parse(tests_dir, test_scenario_path) filtered_test_names = {t.name for t in filtered_tests} assert len(filtered_tests) == 2 assert "test-1" in filtered_test_names - assert "test-2" in filtered_test_names + assert "plugin-test-1" in filtered_test_names assert "test-0" not in filtered_test_names - assert "test-3" not in filtered_test_names - - def test_collect_used_test_names(self, parser: Parser): - fake_scenario = Mock() - fake_scenario.test_runs = [Mock()] - fake_scenario.test_runs[0].test.name = "test-1" - - fake_plugin_scenario_1 = Mock(spec=TestScenario) - fake_plugin_scenario_1.test_runs = [Mock()] - fake_plugin_scenario_1.test_runs[0].test.name = "test-2" - - fake_plugin_scenario_2 = Mock(spec=TestScenario) - fake_plugin_scenario_2.test_runs = [Mock()] - fake_plugin_scenario_2.test_runs[0].test.name = "test-3" - - fake_plugin_scenarios = cast( - Dict[str, TestScenario], {"plugin-1": fake_plugin_scenario_1, "plugin-2": fake_plugin_scenario_2} - ) - - used_test_names = parser._collect_used_test_names(fake_plugin_scenarios, fake_scenario) - assert used_test_names == {"test-1", "test-2", "test-3"} + assert "test-2" not in filtered_test_names def test_parse_system(self, parser: Parser): parser.system_config_path = Path("conf/common/system/example_slurm_cluster.toml") From d3c7cfd224f6fd80b019b31d55069fd5e90893b3 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Thu, 31 Oct 2024 12:02:35 -0400 Subject: [PATCH 48/64] Make vulture happy --- tests/test_parser.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/test_parser.py b/tests/test_parser.py index 2b709938..c7d4f873 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -103,10 +103,7 @@ def test_scenario_with_plugin_common_tests( @patch("cloudai._core.test_parser.TestParser.parse_all") @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") - @patch("pathlib.Path.exists", return_value=True) - def test_scenario_with_plugin_exclusive_tests( - self, path_exists_mock: Mock, test_scenario_parser: Mock, test_parser: Mock, parser: Parser - ): + def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): tests_dir = parser.system_config_path.parent.parent / "test" test_scenario_path = Path("/mock/test_scenario.toml") From f12309976c154adb7041cbfd884c371483651f9c Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 1 Nov 2024 07:03:26 -0400 Subject: [PATCH 49/64] Add logging messages to parser.parse --- src/cloudai/parser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index 71435e90..a4c59f19 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -95,10 +95,12 @@ def parse( test_mapping = {t.name: t for t in tests} plugin_test_scenario_mapping = {} if PLUGIN_ROOT.exists() and list(PLUGIN_ROOT.glob("*.toml")): + logging.debug("PLUGIN_ROOT exists and 
contains .toml files. Proceeding with plugin test scenario parsing.") try: plugin_test_scenario_mapping = self.parse_plugins( list(PLUGIN_ROOT.glob("*.toml")), {t.name: t for t in plugin_tests} ) + logging.debug("Plugin test scenarios successfully parsed from PLUGIN_ROOT.") except TestScenarioParsingError: exit(1) # exit right away to keep error message readable for users From e886bf69e4daaccc99c35cad1db85cd9882d11ca Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 1 Nov 2024 07:09:36 -0400 Subject: [PATCH 50/64] Simplify unit test for readability --- tests/test_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_parser.py b/tests/test_parser.py index c7d4f873..e662e9f7 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -97,7 +97,7 @@ def test_scenario_with_plugin_common_tests( _, tests, _ = parser.parse(tests_dir, Path()) - filtered_test_names = {t.name for t in tests} + filtered_test_names = {"test-1"} assert len(tests) == 1 assert "test-1" in filtered_test_names From 59d5cb36b02f799b360c6ea282d3a9231e9874d9 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 1 Nov 2024 08:56:35 -0400 Subject: [PATCH 51/64] Reflect Andrei's comments --- src/cloudai/parser.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index a4c59f19..b87518e3 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -81,6 +81,9 @@ def parse( except TestConfigParsingError: exit(1) # exit right away to keep error message readable for users + if not PLUGIN_ROOT.exists(): + logger.debug(f"PLUGIN_ROOT path '{PLUGIN_ROOT}' does not exist. Plugins will not be enabled.") + try: plugin_tests = ( self.parse_tests(list(PLUGIN_TEST_ROOT.glob("*.toml")), system) if PLUGIN_TEST_ROOT.exists() else [] @@ -95,12 +98,10 @@ def parse( test_mapping = {t.name: t for t in tests} plugin_test_scenario_mapping = {} if PLUGIN_ROOT.exists() and list(PLUGIN_ROOT.glob("*.toml")): - logging.debug("PLUGIN_ROOT exists and contains .toml files. Proceeding with plugin test scenario parsing.") try: plugin_test_scenario_mapping = self.parse_plugins( list(PLUGIN_ROOT.glob("*.toml")), {t.name: t for t in plugin_tests} ) - logging.debug("Plugin test scenarios successfully parsed from PLUGIN_ROOT.") except TestScenarioParsingError: exit(1) # exit right away to keep error message readable for users From 4894972f6c439366c293a6d6b6e11dd13acf4b24 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Fri, 1 Nov 2024 08:57:30 -0400 Subject: [PATCH 52/64] Reflect Andrei's comments --- src/cloudai/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index b87518e3..eb3270c3 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -82,7 +82,7 @@ def parse( exit(1) # exit right away to keep error message readable for users if not PLUGIN_ROOT.exists(): - logger.debug(f"PLUGIN_ROOT path '{PLUGIN_ROOT}' does not exist. Plugins will not be enabled.") + logging.debug(f"PLUGIN_ROOT path '{PLUGIN_ROOT}' does not exist. 
Plugins will not be enabled.") try: plugin_tests = ( From c19f24b07784ba890b0230cd0c141a32683c92c4 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:03:03 -0600 Subject: [PATCH 53/64] Rename plugin to hook --- conf/common/test_scenario/nccl_test.toml | 4 +- .../nccl_test.toml} | 0 .../test/nccl_test_all_gather.toml | 0 .../test/nccl_test_all_reduce.toml | 0 conf/plugin/nccl_test_prologue.toml | 22 ----- src/cloudai/_core/test_scenario.py | 4 +- src/cloudai/_core/test_scenario_parser.py | 44 +++++----- src/cloudai/parser.py | 44 +++++----- .../strategy/slurm_command_gen_strategy.py | 78 +++++++++--------- ...pt-no-plugin.sbatch => gpt-no-hook.sbatch} | 0 ...pt-prologue.sbatch => gpt-pre-test.sbatch} | 0 ...k-no-plugin.sbatch => grok-no-hook.sbatch} | 0 ...k-prologue.sbatch => grok-pre-test.sbatch} | 0 .../test_common_slurm_command_gen_strategy.py | 80 +++++++++---------- tests/test_acceptance.py | 28 +++---- tests/test_parser.py | 32 ++++---- 16 files changed, 156 insertions(+), 180 deletions(-) rename conf/{plugin/nccl_test_epilogue.toml => hook/nccl_test.toml} (100%) rename conf/{plugin => hook}/test/nccl_test_all_gather.toml (100%) rename conf/{plugin => hook}/test/nccl_test_all_reduce.toml (100%) delete mode 100644 conf/plugin/nccl_test_prologue.toml rename tests/ref_data/{gpt-no-plugin.sbatch => gpt-no-hook.sbatch} (100%) rename tests/ref_data/{gpt-prologue.sbatch => gpt-pre-test.sbatch} (100%) rename tests/ref_data/{grok-no-plugin.sbatch => grok-no-hook.sbatch} (100%) rename tests/ref_data/{grok-prologue.sbatch => grok-pre-test.sbatch} (100%) diff --git a/conf/common/test_scenario/nccl_test.toml b/conf/common/test_scenario/nccl_test.toml index 9b731e96..15064561 100644 --- a/conf/common/test_scenario/nccl_test.toml +++ b/conf/common/test_scenario/nccl_test.toml @@ -16,8 +16,8 @@ name = "nccl-test" -prologue = "nccl_test_prologue" -epilogue = "nccl_test_epilogue" +pre_test = "nccl_test" +post_test = "nccl_test" [[Tests]] id = "Tests.1" diff --git a/conf/plugin/nccl_test_epilogue.toml b/conf/hook/nccl_test.toml similarity index 100% rename from conf/plugin/nccl_test_epilogue.toml rename to conf/hook/nccl_test.toml diff --git a/conf/plugin/test/nccl_test_all_gather.toml b/conf/hook/test/nccl_test_all_gather.toml similarity index 100% rename from conf/plugin/test/nccl_test_all_gather.toml rename to conf/hook/test/nccl_test_all_gather.toml diff --git a/conf/plugin/test/nccl_test_all_reduce.toml b/conf/hook/test/nccl_test_all_reduce.toml similarity index 100% rename from conf/plugin/test/nccl_test_all_reduce.toml rename to conf/hook/test/nccl_test_all_reduce.toml diff --git a/conf/plugin/nccl_test_prologue.toml b/conf/plugin/nccl_test_prologue.toml deleted file mode 100644 index e5c1a1e4..00000000 --- a/conf/plugin/nccl_test_prologue.toml +++ /dev/null @@ -1,22 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -name = "nccl_test_prologue" - -[[Tests]] -id = "Tests.1" -test_name = "nccl_test_all_reduce" -time_limit = "00:20:00" diff --git a/src/cloudai/_core/test_scenario.py b/src/cloudai/_core/test_scenario.py index 97c89994..39f1bd21 100644 --- a/src/cloudai/_core/test_scenario.py +++ b/src/cloudai/_core/test_scenario.py @@ -58,8 +58,8 @@ class TestRun: weight: float = 0.0 ideal_perf: float = 1.0 dependencies: dict[str, TestDependency] = field(default_factory=dict) - prologue: Optional["TestScenario"] = None - epilogue: Optional["TestScenario"] = None + pre_test: Optional["TestScenario"] = None + post_test: Optional["TestScenario"] = None def __hash__(self) -> int: return hash(self.name + self.test.name + str(self.iterations) + str(self.current_iteration)) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index 8db8622d..93047d29 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -54,8 +54,8 @@ class _TestScenarioTOML(BaseModel): name: str job_status_check: bool = True tests: list[_TestRunTOML] = Field(alias="Tests", min_length=1) - prologue: Optional[str] = None - epilogue: Optional[str] = None + pre_test: Optional[str] = None + post_test: Optional[str] = None @model_validator(mode="after") def check_no_self_dependency(self): @@ -101,10 +101,10 @@ class TestScenarioParser: __test__ = False - def __init__(self, file_path: Path, test_mapping: Dict[str, Test], plugin_mapping: Dict[str, TestScenario]) -> None: + def __init__(self, file_path: Path, test_mapping: Dict[str, Test], hook_mapping: Dict[str, TestScenario]) -> None: self.file_path = file_path self.test_mapping = test_mapping - self.plugin_mapping = plugin_mapping + self.hook_mapping = hook_mapping def parse(self) -> TestScenario: """ @@ -139,24 +139,24 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario: total_weight = sum(tr.weight for tr in ts_model.tests) normalized_weight = 0 if total_weight == 0 else 100 / total_weight - prologue, epilogue = None, None - if ts_model.prologue: - prologue = self.plugin_mapping.get(ts_model.prologue) - if prologue is None: + pre_test, post_test = None, None + if ts_model.pre_test: + pre_test = self.hook_mapping.get(ts_model.pre_test) + if pre_test is None: logging.warning( - f"Prologue '{ts_model.prologue}' not found in plugin mapping. " - "Ensure that a proper plugin directory is set under the working directory." + f"Prologue '{ts_model.pre_test}' not found in hook mapping. " + "Ensure that a proper hook directory is set under the working directory." ) - if ts_model.epilogue: - epilogue = self.plugin_mapping.get(ts_model.epilogue) - if epilogue is None: + if ts_model.post_test: + post_test = self.hook_mapping.get(ts_model.post_test) + if post_test is None: logging.warning( - f"Epilogue '{ts_model.epilogue}' not found in plugin mapping. " - "Ensure that a proper plugin directory is set under the working directory." + f"Epilogue '{ts_model.post_test}' not found in hook mapping. " + "Ensure that a proper hook directory is set under the working directory." 
) test_runs_by_id: dict[str, TestRun] = { - tr.id: self._create_test_run(tr, normalized_weight, prologue, epilogue) for tr in ts_model.tests + tr.id: self._create_test_run(tr, normalized_weight, pre_test, post_test) for tr in ts_model.tests } tests_data: dict[str, _TestRunTOML] = {tr.id: tr for tr in ts_model.tests} @@ -176,8 +176,8 @@ def _create_test_run( self, test_info: _TestRunTOML, normalized_weight: float, - prologue: Optional[TestScenario] = None, - epilogue: Optional[TestScenario] = None, + pre_test: Optional[TestScenario] = None, + post_test: Optional[TestScenario] = None, ) -> TestRun: """ Create a section-specific Test object by copying from the test mapping. @@ -185,8 +185,8 @@ def _create_test_run( Args: test_info (Dict[str, Any]): Information of the test. normalized_weight (float): Normalized weight for the test. - prologue (Optional[TestScenario]): TestScenario object representing the prologue sequence. - epilogue (Optional[TestScenario]): TestScenario object representing the epilogue sequence. + pre_test (Optional[TestScenario]): TestScenario object representing the pre_test sequence. + post_test (Optional[TestScenario]): TestScenario object representing the post_test sequence. Returns: Test: Copied and updated Test object for the section. @@ -219,7 +219,7 @@ def _create_test_run( sol=test_info.sol, weight=test_info.weight * normalized_weight, ideal_perf=test_info.ideal_perf, - prologue=prologue, - epilogue=epilogue, + pre_test=pre_test, + post_test=post_test, ) return tr diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index eb3270c3..950eff6c 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -34,7 +34,7 @@ format_validation_error, ) -PLUGIN_ROOT = Path("conf/plugin") +PLUGIN_ROOT = Path("conf/hook") PLUGIN_TEST_ROOT = PLUGIN_ROOT / "test" @@ -85,62 +85,60 @@ def parse( logging.debug(f"PLUGIN_ROOT path '{PLUGIN_ROOT}' does not exist. 
Plugins will not be enabled.") try: - plugin_tests = ( + hook_tests = ( self.parse_tests(list(PLUGIN_TEST_ROOT.glob("*.toml")), system) if PLUGIN_TEST_ROOT.exists() else [] ) except TestConfigParsingError: exit(1) # exit right away to keep error message readable for users if not test_scenario_path: - all_tests = list({test.name: test for test in tests + plugin_tests}.values()) + all_tests = list({test.name: test for test in tests + hook_tests}.values()) return system, all_tests, None test_mapping = {t.name: t for t in tests} - plugin_test_scenario_mapping = {} + hook_test_scenario_mapping = {} if PLUGIN_ROOT.exists() and list(PLUGIN_ROOT.glob("*.toml")): try: - plugin_test_scenario_mapping = self.parse_plugins( - list(PLUGIN_ROOT.glob("*.toml")), {t.name: t for t in plugin_tests} + hook_test_scenario_mapping = self.parse_hooks( + list(PLUGIN_ROOT.glob("*.toml")), {t.name: t for t in hook_tests} ) except TestScenarioParsingError: exit(1) # exit right away to keep error message readable for users try: - test_scenario = self.parse_test_scenario(test_scenario_path, test_mapping, plugin_test_scenario_mapping) + test_scenario = self.parse_test_scenario(test_scenario_path, test_mapping, hook_test_scenario_mapping) except TestScenarioParsingError: exit(1) # exit right away to keep error message readable for users scenario_tests = {tr.test.name for tr in test_scenario.test_runs} - plugin_scenario_tests = { - tr.test.name - for plugin_scenario in plugin_test_scenario_mapping.values() - for tr in plugin_scenario.test_runs + hook_scenario_tests = { + tr.test.name for hook_scenario in hook_test_scenario_mapping.values() for tr in hook_scenario.test_runs } - relevant_test_names = scenario_tests.union(plugin_scenario_tests) - filtered_tests = [t for t in tests if t.name in relevant_test_names] + plugin_tests + relevant_test_names = scenario_tests.union(hook_scenario_tests) + filtered_tests = [t for t in tests if t.name in relevant_test_names] + hook_tests filtered_tests = list({test.name: test for test in filtered_tests}.values()) return system, filtered_tests, test_scenario @staticmethod - def parse_plugins(plugin_tomls: List[Path], test_mapping: Dict[str, Test]) -> Dict[str, TestScenario]: - plugin_mapping = {} - for plugin_test_scenario_path in plugin_tomls: - plugin_scenario = Parser.parse_test_scenario(plugin_test_scenario_path, test_mapping) - plugin_mapping[plugin_scenario.name] = plugin_scenario - return plugin_mapping + def parse_hooks(hook_tomls: List[Path], test_mapping: Dict[str, Test]) -> Dict[str, TestScenario]: + hook_mapping = {} + for hook_test_scenario_path in hook_tomls: + hook_scenario = Parser.parse_test_scenario(hook_test_scenario_path, test_mapping) + hook_mapping[hook_scenario.name] = hook_scenario + return hook_mapping @staticmethod def parse_test_scenario( test_scenario_path: Path, test_mapping: Dict[str, Test], - plugin_mapping: Optional[Dict[str, TestScenario]] = None, + hook_mapping: Optional[Dict[str, TestScenario]] = None, ) -> TestScenario: - if plugin_mapping is None: - plugin_mapping = {} + if hook_mapping is None: + hook_mapping = {} - test_scenario_parser = TestScenarioParser(test_scenario_path, test_mapping, plugin_mapping) + test_scenario_parser = TestScenarioParser(test_scenario_path, test_mapping, hook_mapping) test_scenario = test_scenario_parser.parse() return test_scenario diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index e4b2f0c3..10e5ef3e 100644 --- 
a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -60,18 +60,18 @@ def gen_exec_command(self, tr: TestRun) -> str: command_list = [] indent = "" - if tr.prologue: - prologue_command = self.gen_prologue(tr.prologue, tr.output_path) - command_list = [prologue_command, "if [ $PROLOGUE_SUCCESS -eq 1 ]; then"] + if tr.pre_test: + pre_test_command = self.gen_pre_test(tr.pre_test, tr.output_path) + command_list = [pre_test_command, "if [ $PROLOGUE_SUCCESS -eq 1 ]; then"] indent = " " command_list.append(f"{indent}{srun_command}") - if tr.epilogue: - epilogue_command = self.gen_epilogue(tr.epilogue, tr.output_path) - command_list.append(f"{indent}{epilogue_command}") + if tr.post_test: + post_test_command = self.gen_post_test(tr.post_test, tr.output_path) + command_list.append(f"{indent}{post_test_command}") - if tr.prologue: + if tr.pre_test: command_list.append("fi") full_command = "\n".join(command_list).strip() @@ -123,74 +123,74 @@ def job_name(self, job_name_prefix: str) -> str: job_name = f"{self.system.account}-{job_name_prefix}.{datetime.now().strftime('%Y%m%d_%H%M%S')}" return job_name - def gen_prologue(self, prologue: TestScenario, base_output_path: Path) -> str: + def gen_pre_test(self, pre_test: TestScenario, base_output_path: Path) -> str: """ - Generate the prologue command by running all tests defined in the prologue test scenario. + Generate the pre_test command by running all tests defined in the pre_test test scenario. Args: - prologue (TestScenario): The prologue test scenario containing the tests to be run. - base_output_path (Path): The base output directory path for storing prologue outputs. + pre_test (TestScenario): The pre_test test scenario containing the tests to be run. + base_output_path (Path): The base output directory path for storing pre_test outputs. Returns: - str: A string with all the Slurm srun commands generated for the prologue. + str: A string with all the Slurm srun commands generated for the pre_test. 
""" - prologue_output_dir = base_output_path / "prologue" - prologue_output_dir.mkdir(parents=True, exist_ok=True) + pre_test_output_dir = base_output_path / "pre_test" + pre_test_output_dir.mkdir(parents=True, exist_ok=True) - prologue_commands = [] + pre_test_commands = [] success_vars = [] - for idx, tr in enumerate(prologue.test_runs): - plugin_dir = prologue_output_dir / tr.test.name - plugin_dir.mkdir(parents=True, exist_ok=True) - tr.output_path = plugin_dir + for idx, tr in enumerate(pre_test.test_runs): + hook_dir = pre_test_output_dir / tr.test.name + hook_dir.mkdir(parents=True, exist_ok=True) + tr.output_path = hook_dir srun_command = tr.test.test_template.gen_srun_command(tr) srun_command_with_output = srun_command.replace( - "srun ", f"srun --output={plugin_dir / 'stdout.txt'} --error={plugin_dir / 'stderr.txt'} " + "srun ", f"srun --output={hook_dir / 'stdout.txt'} --error={hook_dir / 'stderr.txt'} " ) - prologue_commands.append(srun_command_with_output) + pre_test_commands.append(srun_command_with_output) success_var = f"SUCCESS_{idx}" success_vars.append(success_var) success_check_command = tr.test.test_template.gen_srun_success_check(tr) - prologue_commands.append(f"{success_var}=$({success_check_command})") + pre_test_commands.append(f"{success_var}=$({success_check_command})") combined_success_var = " && ".join([f"[ ${var} -eq 1 ]" for var in success_vars]) - prologue_commands.append(f"PROLOGUE_SUCCESS=$( {combined_success_var} && echo 1 || echo 0 )") + pre_test_commands.append(f"PROLOGUE_SUCCESS=$( {combined_success_var} && echo 1 || echo 0 )") - return "\n".join(prologue_commands) + return "\n".join(pre_test_commands) - def gen_epilogue(self, epilogue: TestScenario, base_output_path: Path) -> str: + def gen_post_test(self, post_test: TestScenario, base_output_path: Path) -> str: """ - Generate the epilogue command by running all tests defined in the epilogue test scenario. + Generate the post_test command by running all tests defined in the post_test test scenario. Args: - epilogue (TestScenario): The epilogue test scenario containing the tests to be run. - base_output_path (Path): The base output directory path for storing epilogue outputs. + post_test (TestScenario): The post_test test scenario containing the tests to be run. + base_output_path (Path): The base output directory path for storing post_test outputs. Returns: - str: A string with all the Slurm srun commands generated for the epilogue. + str: A string with all the Slurm srun commands generated for the post_test. 
""" - epilogue_output_dir = base_output_path / "epilogue" - epilogue_output_dir.mkdir(parents=True, exist_ok=True) + post_test_output_dir = base_output_path / "post_test" + post_test_output_dir.mkdir(parents=True, exist_ok=True) - epilogue_commands = [] + post_test_commands = [] - for tr in epilogue.test_runs: - plugin_dir = epilogue_output_dir / tr.test.name - plugin_dir.mkdir(parents=True, exist_ok=True) - tr.output_path = plugin_dir + for tr in post_test.test_runs: + hook_dir = post_test_output_dir / tr.test.name + hook_dir.mkdir(parents=True, exist_ok=True) + tr.output_path = hook_dir srun_command = tr.test.test_template.gen_srun_command(tr) srun_command_with_output = srun_command.replace( - "srun ", f"srun --output={plugin_dir / 'stdout.txt'} --error={plugin_dir / 'stderr.txt'} " + "srun ", f"srun --output={hook_dir / 'stdout.txt'} --error={hook_dir / 'stderr.txt'} " ) - epilogue_commands.append(srun_command_with_output) + post_test_commands.append(srun_command_with_output) - return "\n".join(epilogue_commands) + return "\n".join(post_test_commands) def _gen_srun_command( self, slurm_args: Dict[str, Any], env_vars: Dict[str, str], cmd_args: Dict[str, str], tr: TestRun diff --git a/tests/ref_data/gpt-no-plugin.sbatch b/tests/ref_data/gpt-no-hook.sbatch similarity index 100% rename from tests/ref_data/gpt-no-plugin.sbatch rename to tests/ref_data/gpt-no-hook.sbatch diff --git a/tests/ref_data/gpt-prologue.sbatch b/tests/ref_data/gpt-pre-test.sbatch similarity index 100% rename from tests/ref_data/gpt-prologue.sbatch rename to tests/ref_data/gpt-pre-test.sbatch diff --git a/tests/ref_data/grok-no-plugin.sbatch b/tests/ref_data/grok-no-hook.sbatch similarity index 100% rename from tests/ref_data/grok-no-plugin.sbatch rename to tests/ref_data/grok-no-hook.sbatch diff --git a/tests/ref_data/grok-prologue.sbatch b/tests/ref_data/grok-pre-test.sbatch similarity index 100% rename from tests/ref_data/grok-prologue.sbatch rename to tests/ref_data/grok-pre-test.sbatch diff --git a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py index e388f0d9..07b8f2e4 100644 --- a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py @@ -123,121 +123,121 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): @pytest.mark.parametrize( - "prologue,epilogue,expected_script_lines", + "pre_test,post_test,expected_script_lines", [ - # No prologue, no epilogue + # No pre_test, no post_test (None, None, ["srun"]), - # One prologue, no epilogue + # One pre_test, no post_test ( [Mock(test=Mock(name="test1", test_template=Mock()))], None, [ - "prologue", + "pre_test", "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", " srun", "fi", ], ), - # No prologue, one epilogue + # No pre_test, one post_test ( None, [Mock(test=Mock(name="test2", test_template=Mock()))], [ "srun", - "epilogue", + "post_test", ], ), - # One prologue, one epilogue + # One pre_test, one post_test ( [Mock(test=Mock(name="test1", test_template=Mock()))], [Mock(test=Mock(name="test2", test_template=Mock()))], [ - "prologue", + "pre_test", "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", " srun", - " epilogue", + " post_test", "fi", ], ), - # Multiple prologues, multiple epilogues + # Multiple pre_tests, multiple post_tests ( 
[Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], [Mock(test=Mock(name="test3", test_template=Mock())), Mock(test=Mock(name="test4", test_template=Mock()))], [ - "prologue", - "prologue", + "pre_test", + "pre_test", "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", " srun", - " epilogue", - " epilogue", + " post_test", + " post_test", "fi", ], ), - # Multiple prologues, no epilogue + # Multiple pre_tests, no post_test ( [Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], None, [ - "prologue", - "prologue", + "pre_test", + "pre_test", "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", " srun", "fi", ], ), - # No prologue, multiple epilogues + # No pre_test, multiple post_tests ( None, [Mock(test=Mock(name="test3", test_template=Mock())), Mock(test=Mock(name="test4", test_template=Mock()))], [ "srun", - "epilogue", - "epilogue", + "post_test", + "post_test", ], ), - # Multiple prologues, single epilogue + # Multiple pre_tests, single post_test ( [Mock(test=Mock(name="test1", test_template=Mock())), Mock(test=Mock(name="test2", test_template=Mock()))], [Mock(test=Mock(name="test3", test_template=Mock()))], [ - "prologue", - "prologue", + "pre_test", + "pre_test", "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", " srun", - " epilogue", + " post_test", "fi", ], ), ], ) -def test_prologue_epilogue_combinations( +def test_pre_test_post_test_combinations( strategy_fixture: SlurmCommandGenStrategy, testrun_fixture: TestRun, - prologue, - epilogue, + pre_test, + post_test, expected_script_lines, ): - testrun_fixture.prologue = Mock(spec=TestScenario) if prologue else None - testrun_fixture.epilogue = Mock(spec=TestScenario) if epilogue else None - - if prologue is not None: - testrun_fixture.prologue = Mock(spec=TestScenario) - testrun_fixture.prologue.test_runs = prologue - for idx, run in enumerate(prologue): - run.test.test_template.gen_srun_success_check.return_value = "prologue" + testrun_fixture.pre_test = Mock(spec=TestScenario) if pre_test else None + testrun_fixture.post_test = Mock(spec=TestScenario) if post_test else None + + if pre_test is not None: + testrun_fixture.pre_test = Mock(spec=TestScenario) + testrun_fixture.pre_test.test_runs = pre_test + for idx, run in enumerate(pre_test): + run.test.test_template.gen_srun_success_check.return_value = "pre_test" run.test.test_template.gen_srun_command.return_value = "srun" run.test.name = f"test{idx+1}" - if epilogue is not None: - testrun_fixture.epilogue = Mock(spec=TestScenario) - testrun_fixture.epilogue.test_runs = epilogue - for idx, run in enumerate(epilogue): - run.test.test_template.gen_srun_command.return_value = "epilogue" + if post_test is not None: + testrun_fixture.post_test = Mock(spec=TestScenario) + testrun_fixture.post_test.test_runs = post_test + for idx, run in enumerate(post_test): + run.test.test_template.gen_srun_command.return_value = "post_test" run.test.name = f"test{idx+1}" sbatch_command = strategy_fixture.gen_exec_command(testrun_fixture) diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index bb6f7897..862c59f6 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -60,7 +60,7 @@ def test_slurm(tmp_path: Path, scenario: Dict): 
system_config=Path("conf/common/system/example_slurm_cluster.toml"), test_templates_dir=Path("conf/common/test_template"), tests_dir=Path("conf/common/test"), - plugin_dir=Path("conf/common/plugin"), + hook_dir=Path("conf/common/hook"), test_scenario=test_scenario_path, output_dir=tmp_path, ) @@ -91,7 +91,7 @@ def partial_tr(slurm_system: SlurmSystem) -> partial[TestRun]: return partial(TestRun, num_nodes=1, nodes=[], output_path=slurm_system.output_path) -@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt-prologue", "gpt-no-plugin", "grok-prologue", "grok-no-plugin"]) +@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt-pre_test", "gpt-no-hook", "grok-pre_test", "grok-no-hook"]) def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -> tuple[TestRun, str, Optional[str]]: if request.param == "ucc": tr = partial_tr( @@ -159,8 +159,8 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict ) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - if "prologue" in request.param: - prologue_tr = partial_tr( + if "pre_test" in request.param: + pre_test_tr = partial_tr( name="nccl", test=Test( test_definition=NCCLTestDefinition( @@ -169,11 +169,11 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - test_template=NcclTest(slurm_system, name="nccl"), ), ) - prologue_tr.test.test_template.command_gen_strategy = NcclTestSlurmCommandGenStrategy( - slurm_system, prologue_tr.test.test_definition.cmd_args_dict + pre_test_tr.test.test_template.command_gen_strategy = NcclTestSlurmCommandGenStrategy( + slurm_system, pre_test_tr.test.test_definition.cmd_args_dict ) - prologue_tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - tr.prologue = TestScenario(name=f"{prologue_tr.name} NCCL Prologue", test_runs=[prologue_tr]) + pre_test_tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") + tr.pre_test = TestScenario(name=f"{pre_test_tr.name} NCCL Prologue", test_runs=[pre_test_tr]) return (tr, f"{request.param}.sbatch", "gpt.run") elif request.param.startswith("grok-"): @@ -194,8 +194,8 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict ) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - if "prologue" in request.param: - prologue_tr = partial_tr( + if "pre_test" in request.param: + pre_test_tr = partial_tr( name="nccl", test=Test( test_definition=NCCLTestDefinition( @@ -204,11 +204,11 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - test_template=NcclTest(slurm_system, name="nccl"), ), ) - prologue_tr.test.test_template.command_gen_strategy = NcclTestSlurmCommandGenStrategy( - slurm_system, prologue_tr.test.test_definition.cmd_args_dict + pre_test_tr.test.test_template.command_gen_strategy = NcclTestSlurmCommandGenStrategy( + slurm_system, pre_test_tr.test.test_definition.cmd_args_dict ) - prologue_tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - tr.prologue = TestScenario(name=f"{prologue_tr.name} NCCL Prologue", test_runs=[prologue_tr]) + pre_test_tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") + tr.pre_test = TestScenario(name=f"{pre_test_tr.name} NCCL Prologue", test_runs=[pre_test_tr]) return (tr, f"{request.param}.sbatch", "grok.run") diff --git 
a/tests/test_parser.py b/tests/test_parser.py index e662e9f7..3f901e0d 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -50,7 +50,7 @@ def test_no_scenario(self, test_parser: Mock, parser: Parser): @patch("cloudai._core.test_parser.TestParser.parse_all") @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") - def test_scenario_without_plugin(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): + def test_scenario_without_hook(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): tests_dir = parser.system_config_path.parent.parent / "test" fake_tests = [Mock(name=f"test-{i}") for i in range(3)] @@ -71,29 +71,29 @@ def test_scenario_without_plugin(self, test_scenario_parser: Mock, test_parser: @patch("cloudai._core.test_parser.TestParser.parse_all") @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") - @patch("cloudai.parser.Parser.parse_plugins") - def test_scenario_with_plugin_common_tests( - self, parse_plugins: Mock, test_scenario_parser: Mock, test_parser: Mock, parser: Parser + @patch("cloudai.parser.Parser.parse_hooks") + def test_scenario_with_hook_common_tests( + self, parse_hooks: Mock, test_scenario_parser: Mock, test_parser: Mock, parser: Parser ): tests_dir = parser.system_config_path.parent.parent / "test" main_tests = [Mock() for _ in range(3)] for i, test in enumerate(main_tests): test.name = f"test-{i}" - plugin_tests = [Mock()] - plugin_tests[0].name = "test-1" + hook_tests = [Mock()] + hook_tests[0].name = "test-1" - test_parser.side_effect = [main_tests, plugin_tests] + test_parser.side_effect = [main_tests, hook_tests] fake_scenario = Mock() fake_scenario.test_runs = [Mock()] fake_scenario.test_runs[0].test.name = "test-1" test_scenario_parser.return_value = fake_scenario - fake_plugin = Mock() - fake_plugin.test_runs = [Mock()] - fake_plugin.test_runs[0].test.name = "test-1" - parse_plugins.return_value = {"plugin-1": fake_plugin} + fake_hook = Mock() + fake_hook.test_runs = [Mock()] + fake_hook.test_runs[0].test.name = "test-1" + parse_hooks.return_value = {"hook-1": fake_hook} _, tests, _ = parser.parse(tests_dir, Path()) @@ -103,17 +103,17 @@ def test_scenario_with_plugin_common_tests( @patch("cloudai._core.test_parser.TestParser.parse_all") @patch("cloudai._core.test_scenario_parser.TestScenarioParser.parse") - def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): + def test_scenario_with_hook_exclusive_tests(self, test_scenario_parser: Mock, test_parser: Mock, parser: Parser): tests_dir = parser.system_config_path.parent.parent / "test" test_scenario_path = Path("/mock/test_scenario.toml") main_tests = [Mock() for _ in range(3)] - plugin_tests = [Mock()] + hook_tests = [Mock()] for i, test in enumerate(main_tests): test.name = f"test-{i}" - plugin_tests[0].name = "plugin-test-1" + hook_tests[0].name = "hook-test-1" - test_parser.side_effect = [main_tests, plugin_tests] + test_parser.side_effect = [main_tests, hook_tests] fake_scenario = Mock() fake_scenario.test_runs = [Mock()] @@ -125,7 +125,7 @@ def test_scenario_with_plugin_exclusive_tests(self, test_scenario_parser: Mock, filtered_test_names = {t.name for t in filtered_tests} assert len(filtered_tests) == 2 assert "test-1" in filtered_test_names - assert "plugin-test-1" in filtered_test_names + assert "hook-test-1" in filtered_test_names assert "test-0" not in filtered_test_names assert "test-2" not in filtered_test_names From f53420cbc3c4d1bb7ee1bdabbd60292adf5091fc 
Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:08:11 -0600 Subject: [PATCH 54/64] Rename plugin to hook --- src/cloudai/_core/test_scenario_parser.py | 4 ++-- .../strategy/slurm_command_gen_strategy.py | 4 ++-- tests/ref_data/gpt-pre-test.sbatch | 8 ++++---- tests/ref_data/grok-pre-test.sbatch | 8 ++++---- .../test_common_slurm_command_gen_strategy.py | 20 +++++++++---------- tests/test_acceptance.py | 6 +++--- 6 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index 93047d29..ddfb5fa0 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -144,14 +144,14 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario: pre_test = self.hook_mapping.get(ts_model.pre_test) if pre_test is None: logging.warning( - f"Prologue '{ts_model.pre_test}' not found in hook mapping. " + f"Pre-test hook '{ts_model.pre_test}' not found in hook mapping. " "Ensure that a proper hook directory is set under the working directory." ) if ts_model.post_test: post_test = self.hook_mapping.get(ts_model.post_test) if post_test is None: logging.warning( - f"Epilogue '{ts_model.post_test}' not found in hook mapping. " + f"Post-test hook '{ts_model.post_test}' not found in hook mapping. " "Ensure that a proper hook directory is set under the working directory." ) diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index 10e5ef3e..8b03379d 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -62,7 +62,7 @@ def gen_exec_command(self, tr: TestRun) -> str: if tr.pre_test: pre_test_command = self.gen_pre_test(tr.pre_test, tr.output_path) - command_list = [pre_test_command, "if [ $PROLOGUE_SUCCESS -eq 1 ]; then"] + command_list = [pre_test_command, "if [ $PRE_TEST_SUCCESS -eq 1 ]; then"] indent = " " command_list.append(f"{indent}{srun_command}") @@ -159,7 +159,7 @@ def gen_pre_test(self, pre_test: TestScenario, base_output_path: Path) -> str: combined_success_var = " && ".join([f"[ ${var} -eq 1 ]" for var in success_vars]) - pre_test_commands.append(f"PROLOGUE_SUCCESS=$( {combined_success_var} && echo 1 || echo 0 )") + pre_test_commands.append(f"PRE_TEST_SUCCESS=$( {combined_success_var} && echo 1 || echo 0 )") return "\n".join(pre_test_commands) diff --git a/tests/ref_data/gpt-pre-test.sbatch b/tests/ref_data/gpt-pre-test.sbatch index 08a3a87a..c0f6114f 100644 --- a/tests/ref_data/gpt-pre-test.sbatch +++ b/tests/ref_data/gpt-pre-test.sbatch @@ -8,10 +8,10 @@ export COMBINE_THRESHOLD=1 export PER_GPU_COMBINE_THRESHOLD=0 export XLA_FLAGS="--xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD" -srun --output=__OUTPUT_DIR__/prologue/nccl/stdout.txt --error=__OUTPUT_DIR__/prologue/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 -SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/prologue/nccl/stdout.txt 
&& echo 1 || echo 0) -PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 ) -if [ $PROLOGUE_SUCCESS -eq 1 ]; then +srun --output=__OUTPUT_DIR__/pre_test/nccl/stdout.txt --error=__OUTPUT_DIR__/pre_test/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 +SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/pre_test/nccl/stdout.txt && echo 1 || echo 0) +PRE_TEST_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 ) +if [ $PRE_TEST_SUCCESS -eq 1 ]; then echo "Loading container with srun command" srun --mpi=none --container-image=https://docker/url --container-name=cont true echo "Running srun command" diff --git a/tests/ref_data/grok-pre-test.sbatch b/tests/ref_data/grok-pre-test.sbatch index e75d3d77..51730bd7 100644 --- a/tests/ref_data/grok-pre-test.sbatch +++ b/tests/ref_data/grok-pre-test.sbatch @@ -8,10 +8,10 @@ export COMBINE_THRESHOLD=1 export PER_GPU_COMBINE_THRESHOLD=0 export XLA_FLAGS="--xla_disable_hlo_passes=rematerialization --xla_dump_hlo_pass_re=.* --xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_pipelined_all_gather=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_gpu_enable_triton_gemm=false --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_graph_level=0 --xla_gpu_pgle_profile_file_or_directory_path=/opt/paxml/workspace/pgle_output_profile.pbtxt --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD --xla_gpu_run_post_layout_collective_pipeliner=false --xla_gpu_use_memcpy_local_p2p=false" -srun --output=__OUTPUT_DIR__/prologue/nccl/stdout.txt --error=__OUTPUT_DIR__/prologue/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 -SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/prologue/nccl/stdout.txt && echo 1 || echo 0) -PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 ) -if [ $PROLOGUE_SUCCESS -eq 1 ]; then +srun --output=__OUTPUT_DIR__/pre_test/nccl/stdout.txt --error=__OUTPUT_DIR__/pre_test/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 +SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/pre_test/nccl/stdout.txt && echo 1 || echo 0) +PRE_TEST_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 ) +if [ $PRE_TEST_SUCCESS -eq 1 ]; then echo "Loading container with srun command" srun --mpi=none --container-image=https://docker/url --container-name=cont true echo "Running srun command" diff --git 
a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py index 07b8f2e4..534d9cd1 100644 --- a/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py @@ -133,8 +133,8 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): None, [ "pre_test", - "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", - "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + "PRE_TEST_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PRE_TEST_SUCCESS -eq 1 ]; then", " srun", "fi", ], @@ -154,8 +154,8 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): [Mock(test=Mock(name="test2", test_template=Mock()))], [ "pre_test", - "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", - "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + "PRE_TEST_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PRE_TEST_SUCCESS -eq 1 ]; then", " srun", " post_test", "fi", @@ -168,8 +168,8 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): [ "pre_test", "pre_test", - "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", - "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + "PRE_TEST_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PRE_TEST_SUCCESS -eq 1 ]; then", " srun", " post_test", " post_test", @@ -183,8 +183,8 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): [ "pre_test", "pre_test", - "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", - "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + "PRE_TEST_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PRE_TEST_SUCCESS -eq 1 ]; then", " srun", "fi", ], @@ -206,8 +206,8 @@ def test_raises_if_no_default_partition(slurm_system: SlurmSystem): [ "pre_test", "pre_test", - "PROLOGUE_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", - "if [ $PROLOGUE_SUCCESS -eq 1 ]; then", + "PRE_TEST_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && [ $SUCCESS_1 -eq 1 ] && echo 1 || echo 0 )", + "if [ $PRE_TEST_SUCCESS -eq 1 ]; then", " srun", " post_test", "fi", diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 862c59f6..2e3f910c 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -91,7 +91,7 @@ def partial_tr(slurm_system: SlurmSystem) -> partial[TestRun]: return partial(TestRun, num_nodes=1, nodes=[], output_path=slurm_system.output_path) -@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt-pre_test", "gpt-no-hook", "grok-pre_test", "grok-no-hook"]) +@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt-pre-test", "gpt-no-hook", "grok-pre-test", "grok-no-hook"]) def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -> tuple[TestRun, str, Optional[str]]: if request.param == "ucc": tr = partial_tr( @@ -159,7 +159,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict ) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - if "pre_test" in request.param: + if "pre-test" in request.param: pre_test_tr = partial_tr( name="nccl", test=Test( @@ -194,7 +194,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, tr.test.test_definition.cmd_args_dict 
) tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - if "pre_test" in request.param: + if "pre-test" in request.param: pre_test_tr = partial_tr( name="nccl", test=Test( From 904f377274ebe6403f82e9f7f03ee140182761db Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:10:05 -0600 Subject: [PATCH 55/64] Rename plugin to hook --- conf/hook/test/nccl_test_all_reduce.toml | 30 ------------------------ 1 file changed, 30 deletions(-) delete mode 100644 conf/hook/test/nccl_test_all_reduce.toml diff --git a/conf/hook/test/nccl_test_all_reduce.toml b/conf/hook/test/nccl_test_all_reduce.toml deleted file mode 100644 index 9074b2b8..00000000 --- a/conf/hook/test/nccl_test_all_reduce.toml +++ /dev/null @@ -1,30 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -name = "nccl_test_all_reduce" -description = "all_reduce" -test_template_name = "NcclTest" - -[cmd_args] -"subtest_name" = "all_reduce_perf_mpi" -"ngpus" = "1" -"minbytes" = "128" -"maxbytes" = "16G" -"iters" = "100" -"warmup_iters" = "50" - -[extra_cmd_args] -"--stepfactor" = "2" From 2c84d43f8b7b5fbde109414402849f812e8c1893 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:12:30 -0600 Subject: [PATCH 56/64] Rename plugin to hook --- conf/hook/nccl_test.toml | 2 +- src/cloudai/_core/test_scenario_parser.py | 4 ++-- .../slurm/strategy/slurm_command_gen_strategy.py | 14 +++++++------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/conf/hook/nccl_test.toml b/conf/hook/nccl_test.toml index 346dc8e4..53349c43 100644 --- a/conf/hook/nccl_test.toml +++ b/conf/hook/nccl_test.toml @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -name = "nccl_test_epilogue" +name = "nccl_test" [[Tests]] id = "Tests.1" diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index ddfb5fa0..3dbaf133 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -185,8 +185,8 @@ def _create_test_run( Args: test_info (Dict[str, Any]): Information of the test. normalized_weight (float): Normalized weight for the test. - pre_test (Optional[TestScenario]): TestScenario object representing the pre_test sequence. - post_test (Optional[TestScenario]): TestScenario object representing the post_test sequence. + pre_test (Optional[TestScenario]): TestScenario object representing the pre-test sequence. + post_test (Optional[TestScenario]): TestScenario object representing the post-test sequence. Returns: Test: Copied and updated Test object for the section. 
diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index 8b03379d..ee8a463a 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -125,11 +125,11 @@ def job_name(self, job_name_prefix: str) -> str: def gen_pre_test(self, pre_test: TestScenario, base_output_path: Path) -> str: """ - Generate the pre_test command by running all tests defined in the pre_test test scenario. + Generate the pre-test command by running all tests defined in the pre-test test scenario. Args: - pre_test (TestScenario): The pre_test test scenario containing the tests to be run. - base_output_path (Path): The base output directory path for storing pre_test outputs. + pre_test (TestScenario): The pre-test test scenario containing the tests to be run. + base_output_path (Path): The base output directory path for storing pre-test outputs. Returns: str: A string with all the Slurm srun commands generated for the pre_test. @@ -165,14 +165,14 @@ def gen_pre_test(self, pre_test: TestScenario, base_output_path: Path) -> str: def gen_post_test(self, post_test: TestScenario, base_output_path: Path) -> str: """ - Generate the post_test command by running all tests defined in the post_test test scenario. + Generate the post-test command by running all tests defined in the post-test test scenario. Args: - post_test (TestScenario): The post_test test scenario containing the tests to be run. - base_output_path (Path): The base output directory path for storing post_test outputs. + post_test (TestScenario): The post-test test scenario containing the tests to be run. + base_output_path (Path): The base output directory path for storing post-test outputs. Returns: - str: A string with all the Slurm srun commands generated for the post_test. + str: A string with all the Slurm srun commands generated for the post-test. 
""" post_test_output_dir = base_output_path / "post_test" post_test_output_dir.mkdir(parents=True, exist_ok=True) From b598779e1b10ed17669bc785bba0e12a24238976 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:20:03 -0600 Subject: [PATCH 57/64] Rename plugin to hook --- tests/test_acceptance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 2e3f910c..d1e57782 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -173,7 +173,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, pre_test_tr.test.test_definition.cmd_args_dict ) pre_test_tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - tr.pre_test = TestScenario(name=f"{pre_test_tr.name} NCCL Prologue", test_runs=[pre_test_tr]) + tr.pre_test = TestScenario(name=f"{pre_test_tr.name} NCCL pre-test", test_runs=[pre_test_tr]) return (tr, f"{request.param}.sbatch", "gpt.run") elif request.param.startswith("grok-"): @@ -208,7 +208,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - slurm_system, pre_test_tr.test.test_definition.cmd_args_dict ) pre_test_tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") - tr.pre_test = TestScenario(name=f"{pre_test_tr.name} NCCL Prologue", test_runs=[pre_test_tr]) + tr.pre_test = TestScenario(name=f"{pre_test_tr.name} NCCL pre-test", test_runs=[pre_test_tr]) return (tr, f"{request.param}.sbatch", "grok.run") From de1c1a6ff687e13fde970f623ea8914bdd7a3c9f Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:23:02 -0600 Subject: [PATCH 58/64] Raise an exception when hooks are not found --- src/cloudai/_core/test_scenario_parser.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index 3dbaf133..40840872 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -143,17 +143,22 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario: if ts_model.pre_test: pre_test = self.hook_mapping.get(ts_model.pre_test) if pre_test is None: - logging.warning( + msg = ( f"Pre-test hook '{ts_model.pre_test}' not found in hook mapping. " "Ensure that a proper hook directory is set under the working directory." ) + logging.error(msg) + raise TestScenarioParsingError(msg) + if ts_model.post_test: post_test = self.hook_mapping.get(ts_model.post_test) if post_test is None: - logging.warning( + msg = ( f"Post-test hook '{ts_model.post_test}' not found in hook mapping. " "Ensure that a proper hook directory is set under the working directory." 
) + logging.error(msg) + raise TestScenarioParsingError(msg) test_runs_by_id: dict[str, TestRun] = { tr.id: self._create_test_run(tr, normalized_weight, pre_test, post_test) for tr in ts_model.tests From 70e8fd77912c26299399ca8bf2cac931af9804ba Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:51:37 -0600 Subject: [PATCH 59/64] Fix verify-configs errors --- src/cloudai/cli/handlers.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index 6105bc24..86078b2e 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -212,7 +212,11 @@ def verify_test_configs(test_tomls: List[Path]) -> int: def verify_test_scenarios( - scenario_tomls: List[Path], test_tomls: list[Path], system_config: Optional[Path] = None + scenario_tomls: List[Path], + test_tomls: list[Path], + hook_tomls: List[Path], + hook_test_tomls: list[Path], + system_config: Optional[Path] = None, ) -> int: system = Mock(spec=System) if system_config: @@ -225,7 +229,9 @@ def verify_test_scenarios( logging.debug(f"Verifying Test Scenario: {scenario_file}...") try: tests = Parser.parse_tests(test_tomls, system) - Parser.parse_test_scenario(scenario_file, {t.name: t for t in tests}) + hook_tests = Parser.parse_tests(hook_test_tomls, system) + hooks = Parser.parse_hooks(hook_tomls, {t.name: t for t in hook_tests}) + Parser.parse_test_scenario(scenario_file, {t.name: t for t in tests}, hooks) except Exception: nfailed += 1 @@ -259,7 +265,9 @@ def handle_verify_all_configs(args: argparse.Namespace) -> int: if files["test"]: nfailed += verify_test_configs(files["test"]) if files["scenario"]: - nfailed += verify_test_scenarios(files["scenario"], test_tomls, args.system_config) + nfailed += verify_test_scenarios( + files["scenario"], test_tomls, files["hook"], files["hook_test"], args.system_config + ) if files["unknown"]: logging.error(f"Unknown configuration files: {[str(f) for f in files['unknown']]}") nfailed += len(files["unknown"]) @@ -273,10 +281,22 @@ def handle_verify_all_configs(args: argparse.Namespace) -> int: def load_tomls_by_type(tomls: List[Path]) -> dict[str, List[Path]]: - files: dict[str, List[Path]] = {"system": [], "test": [], "scenario": [], "unknown": []} + files: dict[str, List[Path]] = { + "system": [], + "test": [], + "scenario": [], + "hook_test": [], + "hook": [], + "unknown": [], + } for toml_file in tomls: content = toml_file.read_text() - if "scheduler =" in content: + if "conf" in toml_file.parts and "hook" in toml_file.parts: + if "test" in toml_file.parts: + files["hook_test"].append(toml_file) + else: + files["hook"].append(toml_file) + elif "scheduler =" in content: files["system"].append(toml_file) elif "test_template_name =" in content: files["test"].append(toml_file) From b852bb8967575eda306afbb2ede9ef65e443e0a0 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 5 Nov 2024 08:08:47 -0500 Subject: [PATCH 60/64] Reflect Andrei's comments --- src/cloudai/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index 950eff6c..35a0e6a1 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -82,7 +82,7 @@ def parse( exit(1) # exit right away to keep error message readable for users if not PLUGIN_ROOT.exists(): - logging.debug(f"PLUGIN_ROOT path '{PLUGIN_ROOT}' does not exist. 
Plugins will not be enabled.") + logging.debug(f"PLUGIN_ROOT path '{PLUGIN_ROOT}' does not exist.") try: hook_tests = ( From 701cf94923a2939c49866a8f7d3b267875d9ff40 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 5 Nov 2024 08:12:50 -0500 Subject: [PATCH 61/64] Reflect Andrei's comments --- src/cloudai/_core/test_scenario_parser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cloudai/_core/test_scenario_parser.py b/src/cloudai/_core/test_scenario_parser.py index 40840872..d111d002 100644 --- a/src/cloudai/_core/test_scenario_parser.py +++ b/src/cloudai/_core/test_scenario_parser.py @@ -145,6 +145,7 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario: if pre_test is None: msg = ( f"Pre-test hook '{ts_model.pre_test}' not found in hook mapping. " + "A corresponding hook should exist under 'conf/hook'. " "Ensure that a proper hook directory is set under the working directory." ) logging.error(msg) @@ -155,6 +156,7 @@ def _parse_data(self, data: Dict[str, Any]) -> TestScenario: if post_test is None: msg = ( f"Post-test hook '{ts_model.post_test}' not found in hook mapping. " + "A corresponding hook should exist under 'conf/hook'. " "Ensure that a proper hook directory is set under the working directory." ) logging.error(msg) From 8c0cbb51ca900eaea1c9f6c5484a9d71d442f767 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 5 Nov 2024 08:15:01 -0500 Subject: [PATCH 62/64] Rename plugin to hook --- src/cloudai/parser.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py index 35a0e6a1..6f59f9a3 100644 --- a/src/cloudai/parser.py +++ b/src/cloudai/parser.py @@ -34,8 +34,8 @@ format_validation_error, ) -PLUGIN_ROOT = Path("conf/hook") -PLUGIN_TEST_ROOT = PLUGIN_ROOT / "test" +HOOK_ROOT = Path("conf/hook") +HOOK_TEST_ROOT = HOOK_ROOT / "test" class Parser: @@ -81,12 +81,12 @@ def parse( except TestConfigParsingError: exit(1) # exit right away to keep error message readable for users - if not PLUGIN_ROOT.exists(): - logging.debug(f"PLUGIN_ROOT path '{PLUGIN_ROOT}' does not exist.") + if not HOOK_ROOT.exists(): + logging.debug(f"HOOK_ROOT path '{HOOK_ROOT}' does not exist.") try: hook_tests = ( - self.parse_tests(list(PLUGIN_TEST_ROOT.glob("*.toml")), system) if PLUGIN_TEST_ROOT.exists() else [] + self.parse_tests(list(HOOK_TEST_ROOT.glob("*.toml")), system) if HOOK_TEST_ROOT.exists() else [] ) except TestConfigParsingError: exit(1) # exit right away to keep error message readable for users @@ -97,10 +97,10 @@ def parse( test_mapping = {t.name: t for t in tests} hook_test_scenario_mapping = {} - if PLUGIN_ROOT.exists() and list(PLUGIN_ROOT.glob("*.toml")): + if HOOK_ROOT.exists() and list(HOOK_ROOT.glob("*.toml")): try: hook_test_scenario_mapping = self.parse_hooks( - list(PLUGIN_ROOT.glob("*.toml")), {t.name: t for t in hook_tests} + list(HOOK_ROOT.glob("*.toml")), {t.name: t for t in hook_tests} ) except TestScenarioParsingError: exit(1) # exit right away to keep error message readable for users From 526aecbba3cfd3e5e852e3f20f6c8081b3e0d7a2 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 5 Nov 2024 08:17:11 -0500 Subject: [PATCH 63/64] Fix verify-configs errors --- src/cloudai/cli/handlers.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index 86078b2e..f837e681 100644 --- 
a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -23,6 +23,8 @@ from cloudai import Installable, Parser, Registry, ReportGenerator, Runner, System +from ..parser import HOOK_ROOT + def handle_install_and_uninstall(args: argparse.Namespace) -> int: """ @@ -249,6 +251,11 @@ def handle_verify_all_configs(args: argparse.Namespace) -> int: if err: return err + err, hook_tomls = expand_file_list(HOOK_ROOT, glob="**/*.toml") + if err: + return err + tomls += hook_tomls + files = load_tomls_by_type(tomls) test_tomls = files["test"] From 04430e4d2ee3aa7008f07ad29fa26ffb8dc95279 Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 5 Nov 2024 08:56:14 -0500 Subject: [PATCH 64/64] Reflect Andrei's comments --- src/cloudai/cli/handlers.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index f837e681..30fb7a90 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -252,8 +252,6 @@ def handle_verify_all_configs(args: argparse.Namespace) -> int: return err err, hook_tomls = expand_file_list(HOOK_ROOT, glob="**/*.toml") - if err: - return err tomls += hook_tomls files = load_tomls_by_type(tomls) @@ -298,12 +296,22 @@ def load_tomls_by_type(tomls: List[Path]) -> dict[str, List[Path]]: } for toml_file in tomls: content = toml_file.read_text() - if "conf" in toml_file.parts and "hook" in toml_file.parts: + + is_in_hook_root = False + try: + toml_file.relative_to(HOOK_ROOT) + is_in_hook_root = True + except ValueError: + pass + + if is_in_hook_root: if "test" in toml_file.parts: files["hook_test"].append(toml_file) else: files["hook"].append(toml_file) - elif "scheduler =" in content: + continue + + if "scheduler =" in content: files["system"].append(toml_file) elif "test_template_name =" in content: files["test"].append(toml_file)