Am/verify configs (#268)

* WIP: add verify-configs option * Finalize verify-configs * Cleanup and polishing
NVIDIA · Oct 16, 2024 · e3f4d92 · e3f4d92
1 parent da08ea9
commit e3f4d92
Show file tree

Hide file tree

Showing 6 changed files with 157 additions and 34 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -82,7 +82,10 @@ jobs:
           set -o pipefail
 
           cloudai --help
-          cloudai verify-systems conf/common/system
-          cloudai verify-tests conf/common/test
-          cloudai verify-tests conf/release/spcx/l40s/test
-          cloudai verify-test-scenarios --system-config conf/common/system/example_slurm_cluster.toml --tests-dir conf/common/test conf/common/test_scenario
+
+          # this checks that all TOMLs are valid, Test Scenarios are checked against _all_ tests
+          cloudai verify-configs conf/
+
+          # this checks that all TOMLs are valid, Test Scenarios are checked against _only_ the tests in the specified directory
+          cloudai verify-configs --tests-dir conf/common/test conf/common
+          cloudai verify-configs --tests-dir conf/release/spcx/l40s/test conf/release/spcx/l40s
diff --git a/src/cloudai/_core/test_parser.py b/src/cloudai/_core/test_parser.py
@@ -141,7 +141,7 @@ def _fetch_strategy(  # noqa: D417
             else:
                 return strategy_type()
 
-        logging.warning(
+        logging.debug(
             f"No {strategy_interface.__name__} found for " f"{type(self).__name__} and " f"{type(self.system).__name__}"
         )
         return None

diff --git a/src/cloudai/cli/cli.py b/src/cloudai/cli/cli.py
@@ -22,6 +22,7 @@
     handle_dry_run_and_run,
     handle_generate_report,
     handle_install_and_uninstall,
+    handle_verify_all_configs,
     handle_verify_systems,
     handle_verify_test_scenarios,
     handle_verify_tests,
@@ -41,6 +42,7 @@ def __init__(self):
             "verify-systems",
             "verify-tests",
             "verify-test-scenarios",
+            "verify-configs",
         }
 
         self.parser = argparse.ArgumentParser(description="Cloud AI")
@@ -100,23 +102,42 @@ def init_default_args(self) -> argparse.ArgumentParser:
             )
 
         if "verify-systems" in self.DEFAULT_MODES:
-            p = self.add_command("verify-systems", "Verify the system configurations.", handle_verify_systems)
+            p = self.add_command(
+                "verify-systems",
+                "[DEPRECATED: use verify-configs] Verify the system configurations.",
+                handle_verify_systems,
+            )
             p.add_argument("system_configs", help="Path to the system configuration file or directory.", type=Path)
 
         if "verify-tests" in self.DEFAULT_MODES:
-            p = self.add_command("verify-tests", "Verify the test configurations.", handle_verify_tests)
+            p = self.add_command(
+                "verify-tests", "[DEPRECATED: use verify-configs] Verify the test configurations.", handle_verify_tests
+            )
             p.add_argument("test_configs", help="Path to the test configuration file or directory.", type=Path)
 
         if "verify-test-scenarios" in self.DEFAULT_MODES:
             p = self.add_command(
                 "verify-test-scenarios",
-                "Verify the test scenario configurations.",
+                "[DEPRECATED: use verify-configs] Verify the test scenario configurations.",
                 handle_verify_test_scenarios,
-                system_config=True,
+                system_config=False,
                 tests_dir=True,
             )
             p.add_argument("test_scenarios", help="Path to the test scenario file or directory.", type=Path)
 
+        if "verify-configs" in self.DEFAULT_MODES:
+            p = self.add_command(
+                "verify-configs",
+                (
+                    "Verify all found TOML files in the given directory. Test Scenarios are verified against all found "
+                    "Test TOML files or all Test TOML files in the given directory."
+                ),
+                handle_verify_all_configs,
+                system_config=False,
+                tests_dir=False,
+            )
+            p.add_argument("configs_dir", help="Path to a file or the directory containing the TOML files.", type=Path)
+
         return self.parser
 
     def add_run_and_dry_run(self):

diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py
@@ -18,9 +18,10 @@
 import asyncio
 import logging
 from pathlib import Path
-from typing import List, Set
+from typing import List, Optional, Set
+from unittest.mock import Mock
 
-from cloudai import Parser, Registry, ReportGenerator, Runner, Test, TestTemplate
+from cloudai import Parser, Registry, ReportGenerator, Runner, System, Test, TestTemplate
 
 
 def identify_unique_test_templates(tests: List[Test]) -> List[TestTemplate]:
@@ -182,14 +183,14 @@ def handle_generate_report(args: argparse.Namespace) -> int:
     return 0
 
 
-def expand_file_list(root: Path) -> tuple[int, List[Path]]:
+def expand_file_list(root: Path, glob: str = "*.toml") -> tuple[int, List[Path]]:
     if not root.exists():
         logging.error(f"{root} does not exist.")
         return (1, [])
 
     test_tomls = [root]
     if root.is_dir():
-        test_tomls = list(root.glob("*.toml"))
+        test_tomls = list(root.glob(glob))
         if not test_tomls:
             logging.error(f"No TOMLs found in {root}")
             return (1, [])
@@ -203,9 +204,13 @@ def handle_verify_systems(args: argparse.Namespace) -> int:
     if err:
         return err
 
+    return verify_system_configs(system_tomls)
+
+
+def verify_system_configs(system_tomls: List[Path]) -> int:
     nfailed = 0
     for test_toml in system_tomls:
-        logging.info(f"Verifying {test_toml}...")
+        logging.debug(f"Verifying System: {test_toml}...")
         try:
             Parser.parse_system(test_toml)
         except Exception:
@@ -225,9 +230,13 @@ def handle_verify_tests(args: argparse.Namespace) -> int:
     if err:
         return err
 
+    return verify_test_configs(test_tomls)
+
+
+def verify_test_configs(test_tomls: List[Path]) -> int:
     nfailed = 0
     for test_toml in test_tomls:
-        logging.info(f"Verifying {test_toml}...")
+        logging.debug(f"Verifying Test: {test_toml}...")
         try:
             Parser.parse_tests([test_toml], None)  # type: ignore
         except Exception:
@@ -247,18 +256,81 @@ def handle_verify_test_scenarios(args: argparse.Namespace) -> int:
     if err:
         return err
 
+    return verify_test_scenarios(test_tomls, list(args.tests_dir.glob("*.toml")), args.system_config)
+
+
+def verify_test_scenarios(
+    scenario_tomls: List[Path], test_tomls: list[Path], system_config: Optional[Path] = None
+) -> int:
+    system = Mock(spec=System)
+    if system_config:
+        system = Parser.parse_system(system_config)
+    else:
+        logging.warning("System configuration not provided, mocking it.")
+
     nfailed = 0
-    for test_toml in test_tomls:
-        logging.info(f"Verifying {test_toml}...")
+    for scenario_file in scenario_tomls:
+        logging.debug(f"Verifying Test Scenario: {scenario_file}...")
         try:
-            parser = Parser(args.system_config)
-            parser.parse(args.tests_dir, test_toml)
+            tests = Parser.parse_tests(test_tomls, system)
+            Parser.parse_test_scenario(scenario_file, {t.name: t for t in tests})
         except Exception:
             nfailed += 1
 
     if nfailed:
-        logging.error(f"{nfailed} out of {len(test_tomls)} test scenarios have issues.")
+        logging.error(f"{nfailed} out of {len(scenario_tomls)} test scenarios have issues.")
     else:
-        logging.info(f"Checked scenarios: {len(test_tomls)}, all passed")
+        logging.info(f"Checked scenarios: {len(scenario_tomls)}, all passed")
 
     return nfailed
+
+
+def handle_verify_all_configs(args: argparse.Namespace) -> int:
+    root: Path = args.configs_dir
+    err, tomls = expand_file_list(root, glob="**/*.toml")
+    if err:
+        return err
+
+    files = load_tomls_by_type(tomls)
+
+    test_tomls = files["test"]
+    if args.tests_dir:
+        test_tomls = list(args.tests_dir.glob("*.toml"))
+    elif files["scenario"]:
+        logging.warning(
+            "Test configuration directory not provided, using all found test TOMLs in the specified directory."
+        )
+
+    nfailed = 0
+    if files["system"]:
+        nfailed += verify_system_configs(files["system"])
+    if files["test"]:
+        nfailed += verify_test_configs(files["test"])
+    if files["scenario"]:
+        nfailed += verify_test_scenarios(files["scenario"], test_tomls, args.system_config)
+    if files["unknown"]:
+        logging.error(f"Unknown configuration files: {[str(f) for f in files['unknown']]}")
+        nfailed += len(files["unknown"])
+
+    if nfailed:
+        logging.error(f"{nfailed} out of {len(tomls)} configuration files have issues.")
+    else:
+        logging.info(f"Checked {len(tomls)} configuration files, all passed")
+
+    return nfailed
+
+
+def load_tomls_by_type(tomls: List[Path]) -> dict[str, List[Path]]:
+    files: dict[str, List[Path]] = {"system": [], "test": [], "scenario": [], "unknown": []}
+    for toml_file in tomls:
+        content = toml_file.read_text()
+        if "scheduler =" in content:
+            files["system"].append(toml_file)
+        elif "test_template_name =" in content:
+            files["test"].append(toml_file)
+        elif "[[Tests]]" in content:
+            files["scenario"].append(toml_file)
+        else:
+            files["unknown"].append(toml_file)
+
+    return files
diff --git a/src/cloudai/parser.py b/src/cloudai/parser.py
@@ -16,7 +16,7 @@
 
 import logging
 from pathlib import Path
-from typing import List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
 
 import toml
 from pydantic import ValidationError
@@ -64,33 +64,34 @@ def parse(
         try:
             system = self.parse_system(self.system_config_path)
         except SystemConfigParsingError:
-            # exit right away to keep error message readable for users
-            exit(1)
+            exit(1)  # exit right away to keep error message readable for users
 
         try:
             tests = self.parse_tests(list(test_path.glob("*.toml")), system)
         except TestConfigParsingError:
-            # exit right away to keep error message readable for users
-            exit(1)
-        test_mapping = {t.name: t for t in tests}
+            exit(1)  # exit right away to keep error message readable for users
+
         logging.debug(f"Parsed {len(tests)} tests: {[t.name for t in tests]}")
+        test_mapping = {t.name: t for t in tests}
 
         filtered_tests = tests
         test_scenario: Optional[TestScenario] = None
         if test_scenario_path:
-            test_scenario_parser = TestScenarioParser(str(test_scenario_path), test_mapping)
             try:
-                test_scenario = test_scenario_parser.parse()
+                test_scenario = self.parse_test_scenario(test_scenario_path, test_mapping)
             except TestScenarioParsingError:
-                # exit right away to keep error message readable for users
-                exit(1)
-            logging.debug("Parsed test scenario")
-
+                exit(1)  # exit right away to keep error message readable for users
             scenario_tests = set(tr.test.name for tr in test_scenario.test_runs)
             filtered_tests = [t for t in tests if t.name in scenario_tests]
 
         return system, filtered_tests, test_scenario
 
+    @staticmethod
+    def parse_test_scenario(test_scenario_path: Path, test_mapping: Dict[str, Test]) -> TestScenario:
+        test_scenario_parser = TestScenarioParser(str(test_scenario_path), test_mapping)
+        test_scenario = test_scenario_parser.parse()
+        return test_scenario
+
     @staticmethod
     def parse_tests(test_tomls: list[Path], system: System) -> list[Test]:
         test_parser = TestParser(test_tomls, system)

diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -26,7 +26,7 @@
     handle_verify_systems,
     handle_verify_tests,
 )
-from cloudai.cli.handlers import handle_verify_test_scenarios
+from cloudai.cli.handlers import handle_verify_all_configs, handle_verify_test_scenarios
 
 
 def test_help_message(capsys: pytest.CaptureFixture[str]) -> None:
@@ -261,6 +261,32 @@ def test_verify_test_scenarios_mode(self, cli: CloudAICLI):
             **{"test_scenarios": Path("test_scenarios")},
         )
 
+    def test_verify_all_configs_mode(self, cli: CloudAICLI):
+        assert "verify-configs" in cli.handlers
+        assert cli.handlers["verify-configs"] is handle_verify_all_configs
+
+        args = cli.parser.parse_args(
+            ["verify-configs", "--system-config", "system_config", "--tests-dir", "tests_dir", "configs_dir"]
+        )
+        assert args == argparse.Namespace(
+            log_file="debug.log",
+            log_level="INFO",
+            mode="verify-configs",
+            system_config=Path("system_config"),
+            tests_dir=Path("tests_dir"),
+            **{"configs_dir": Path("configs_dir")},
+        )
+
+        args = cli.parser.parse_args(["verify-configs", "configs_dir"])
+        assert args == argparse.Namespace(
+            log_file="debug.log",
+            log_level="INFO",
+            mode="verify-configs",
+            system_config=None,
+            tests_dir=None,
+            **{"configs_dir": Path("configs_dir")},
+        )
+
     def test_report_generation_mode(self, cli: CloudAICLI):
         assert "generate-report" in cli.handlers
         assert cli.handlers["generate-report"] is handle_generate_report