From 09d055cf720bfb12252045612809bacd01b45965 Mon Sep 17 00:00:00 2001 From: Xiaowei Jiang Date: Mon, 2 Aug 2021 21:37:55 -0700 Subject: [PATCH] [RayTune] Output insufficient resource warning msg when autoscaler is not running. RayTune currently does not receive a definitive signal from resource management about whether a certain request is not fulfilled because of other competing requests or would never be fulfilled due to resource limitations. As a result, users complain about the repeated PENDING status of trials without making any progress. This implementation is at best a calculated investment to collect some low hanging fruits. A proper fix should involve API changes in resource management in the future. --- python/ray/tune/trial_executor.py | 24 +++++++++++++++++++++++- python/ray/tune/trial_runner.py | 4 ++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/python/ray/tune/trial_executor.py b/python/ray/tune/trial_executor.py index 7ca6c1884ca3..d317c232f380 100644 --- a/python/ray/tune/trial_executor.py +++ b/python/ray/tune/trial_executor.py @@ -7,6 +7,16 @@ logger = logging.getLogger(__name__) +# We may prompt user to check if resource is insufficient to start even one +# single trial run if we observe the following: +# 1. all trials are in pending state +# 2. autoscaler is disabled +# 3. No progress is made after this number of iterations (executor.step() +# is looped this number of times). +# Shown every this number of times as a warning msg so as not to pollute +# logging. +SHOW_MAYBE_INSUFFICIENT_RESOURCE_WARNING_ITER_DELAY = 100 + class TrialExecutor: """Module for interacting with remote trainables. 
@@ -177,7 +187,19 @@ def force_reconcilation_on_next_step_end(self): def on_no_available_trials(self, trial_runner): if self._queue_trials: return - for trial in trial_runner.get_trials(): + + all_trials = trial_runner.get_trials() + all_trials_are_pending = all( + trial.status == Trial.PENDING for trial in all_trials) + if all_trials_are_pending and not is_ray_cluster() and ( + trial_runner.iteration + + 1) % SHOW_MAYBE_INSUFFICIENT_RESOURCE_WARNING_ITER_DELAY == 0: + logger.warning( + "Autoscaler is not enabled and resource is not ready after " + "extended amount of time - please consider if the allocated " + "resource is not enough for starting even a single trial." + ) + for trial in all_trials: if trial.uses_placement_groups: return if trial.status == Trial.PENDING: diff --git a/python/ray/tune/trial_runner.py b/python/ray/tune/trial_runner.py index 722f87b17fd6..6566ef24f2ba 100644 --- a/python/ray/tune/trial_runner.py +++ b/python/ray/tune/trial_runner.py @@ -350,6 +350,10 @@ def search_alg(self): def scheduler_alg(self): return self._scheduler_alg + @property + def iteration(self): + return self._iteration + def _validate_resume(self, resume_type): """Checks whether to resume experiment.