Basic error checks and exit zero on success #850

Merged (5 commits, Aug 20, 2024)
55 changes: 53 additions & 2 deletions mlos_bench/mlos_bench/run.py
@@ -12,29 +12,80 @@
"""

import logging
import sys
from typing import Dict, List, Optional, Tuple

import numpy as np

from mlos_bench.launcher import Launcher
from mlos_bench.tunables.tunable_groups import TunableGroups

_LOG = logging.getLogger(__name__)


def _sanity_check_results(launcher: Launcher) -> None:
"""Do some sanity checking on the results and throw an exception if it looks like
something went wrong.
"""
basic_err_msg = "Check configuration, scripts, and logs for details."

# Check if the scheduler has any trials.
if not launcher.scheduler.trial_count:
raise RuntimeError(f"No trials were run. {basic_err_msg}")

# Check if the scheduler ran the expected number of trials.
expected_trial_count = min(
launcher.scheduler.max_trials if launcher.scheduler.max_trials > 0 else np.inf,
launcher.scheduler.trial_config_repeat_count * launcher.optimizer.max_suggestions,
)
if launcher.scheduler.trial_count < expected_trial_count:
raise RuntimeError(
f"Expected {expected_trial_count} trials, "
f"but only {launcher.scheduler.trial_count} were run. {basic_err_msg}"
)

# Check to see if "too many" trials seem to have failed (#523).
unsuccessful_trials = [t for t in launcher.scheduler.ran_trials if not t.status.is_succeeded()]
if len(unsuccessful_trials) > 0.2 * launcher.scheduler.trial_count:
raise RuntimeWarning(
"Too many trials failed: "
f"{len(unsuccessful_trials)} out of {launcher.scheduler.trial_count}. "
f"{basic_err_msg}"
)
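
The arithmetic above caps the optimizer's budget at `max_trials` only when one is set; otherwise the cap is effectively infinite and the expected count reduces to `trial_config_repeat_count * max_suggestions`. A minimal standalone sketch of the same computation, with hypothetical numbers not taken from this PR:

import numpy as np

max_trials = -1  # non-positive means "no explicit trial limit"
trial_config_repeat_count = 3  # repeat each suggested config 3 times
max_suggestions = 10  # optimizer suggestion budget
expected_trial_count = min(
    max_trials if max_trials > 0 else np.inf,
    trial_config_repeat_count * max_suggestions,
)
assert expected_trial_count == 30  # every suggestion, repeated 3 times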


def _main(
argv: Optional[List[str]] = None,
) -> Tuple[Optional[Dict[str, float]], Optional[TunableGroups]]:

launcher = Launcher("mlos_bench", "Systems autotuning and benchmarking tool", argv=argv)

with launcher.scheduler as scheduler_context:
scheduler_context.start()
scheduler_context.teardown()

_sanity_check_results(launcher)

(score, _config) = result = launcher.scheduler.get_best_observation()
# NOTE: This log line is used in test_launch_main_app_* unit tests:
_LOG.info("Final score: %s", score)
return result


def _shell_main(
argv: Optional[List[str]] = None,
) -> int:
(best_score, best_config) = _main(argv)
# Exit zero if it looks like the overall operation was successful.
# TODO: Improve this sanity check to be more robust.
if (
best_score
and best_config
and all(isinstance(score_value, float) for score_value in best_score.values())
):
return 0
else:
raise ValueError(f"Unexpected result: {best_score=}, {best_config=}")


if __name__ == "__main__":
_main()
sys.exit(_shell_main())
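
Note that `_shell_main()` either returns 0 or raises; an uncaught exception makes the interpreter exit with code 1, so failures still surface as a non-zero exit status. A minimal sketch of driving it programmatically; the `--config` argument is an assumption about the mlos_bench CLI, not something this diff shows:

import sys

from mlos_bench.run import _shell_main

try:
    sys.exit(_shell_main(["--config", "experiment.jsonc"]))  # hypothetical CLI args
except Exception as exc:  # broad catch is acceptable for this sketch
    print(f"mlos_bench failed: {exc}", file=sys.stderr)
    sys.exit(1)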
14 changes: 13 additions & 1 deletion mlos_bench/mlos_bench/schedulers/base_scheduler.py
@@ -9,7 +9,7 @@
from abc import ABCMeta, abstractmethod
from datetime import datetime
from types import TracebackType
from typing import Any, Dict, Optional, Tuple, Type
from typing import Any, Dict, List, Optional, Tuple, Type

from pytz import UTC
from typing_extensions import Literal
@@ -87,6 +87,7 @@ def __init__( # pylint: disable=too-many-arguments
self.storage = storage
self._root_env_config = root_env_config
self._last_trial_id = -1
self._ran_trials: List[Storage.Trial] = []

_LOG.debug("Scheduler instantiated: %s :: %s", self, config)

@@ -113,6 +114,11 @@ def trial_config_repeat_count(self) -> int:
"""Gets the number of trials to run for a given config."""
return self._trial_config_repeat_count

@property
def trial_count(self) -> int:
"""Gets the current number of trials run for the experiment."""
return self._trial_count

@property
def max_trials(self) -> int:
"""Gets the maximum number of trials to run for a given experiment, or -1 for no
@@ -302,4 +308,10 @@ def run_trial(self, trial: Storage.Trial) -> None:
"""
assert self.experiment is not None
self._trial_count += 1
self._ran_trials.append(trial)
_LOG.info("QUEUE: Execute trial # %d/%d :: %s", self._trial_count, self._max_trials, trial)

@property
def ran_trials(self) -> List[Storage.Trial]:
"""Get the list of trials that were run."""
return self._ran_trials
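
Since the base `run_trial()` is where `trial_count` and `ran_trials` get updated, concrete schedulers should delegate to it via `super()` so the sanity checks in run.py see accurate numbers. A minimal sketch of a hypothetical subclass (other overrides a real scheduler needs are omitted):

from mlos_bench.schedulers.base_scheduler import Scheduler
from mlos_bench.storage.base_storage import Storage

class MySyncScheduler(Scheduler):
    def run_trial(self, trial: Storage.Trial) -> None:
        super().run_trial(trial)  # keeps trial_count and ran_trials accurate
        # ... execute the trial against the environment here ...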
7 changes: 7 additions & 0 deletions mlos_bench/mlos_bench/storage/base_storage.py
@@ -391,6 +391,7 @@ def __init__( # pylint: disable=too-many-arguments
self._tunable_config_id = tunable_config_id
self._opt_targets = opt_targets
self._config = config or {}
self._status = Status.UNKNOWN

def __repr__(self) -> str:
return f"{self._experiment_id}:{self._trial_id}:{self._tunable_config_id}"
@@ -435,6 +436,11 @@ def config(self, global_config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
config["trial_id"] = self._trial_id
return config

@property
def status(self) -> Status:
"""Get the status of the current trial."""
return self._status

@abstractmethod
def update(
self,
@@ -471,6 +477,7 @@ def update(
opt_targets.difference(metrics.keys()),
)
# raise ValueError()
self._status = status
return metrics

@abstractmethod
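
Because `update()` now records the status before returning, callers can check a trial's most recent state without another storage query. A minimal sketch, assuming a concrete `trial` instance, the `update()` call signature shown here, and a `Status` enum importable from mlos_bench.environments.status (all assumptions, not shown in this diff):

from datetime import datetime

from pytz import UTC

from mlos_bench.environments.status import Status

metrics = trial.update(Status.SUCCEEDED, datetime.now(UTC), {"score": 0.9})
assert trial.status == Status.SUCCEEDED
assert trial.status.is_succeeded()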
2 changes: 1 addition & 1 deletion mlos_bench/pyproject.toml
@@ -41,7 +41,7 @@ dynamic = [
]

[project.scripts]
mlos_bench = "mlos_bench.run:_main"
mlos_bench = "mlos_bench.run:_shell_main"

[project.urls]
Documentation = "https://microsoft.github.io/MLOS/source_tree_docs/mlos_bench/"
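
With the entry point switched to `_shell_main`, the installed `mlos_bench` command now reports success through its exit status, which makes it easy to script around. A minimal sketch of checking that status from Python; the `--config` flag is an assumption, as above:

import subprocess

proc = subprocess.run(["mlos_bench", "--config", "experiment.jsonc"], check=False)
print("ok" if proc.returncode == 0 else f"failed with exit code {proc.returncode}")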