
[tune] Output insufficent resources warning msg when trials are in pending for extended amount of time. #17533

Merged (13 commits, Aug 13, 2021)
7 changes: 7 additions & 0 deletions doc/source/tune/user-guide.rst
@@ -846,6 +846,13 @@ These are the environment variables Ray Tune currently considers:
trial startups. After the grace period, Tune will block until a result from a running trial is received. Can
be disabled by setting this to 0 or lower.
* **TUNE_WARN_THRESHOLD_S**: Threshold for logging if a Tune event loop operation takes too long. Defaults to 0.5 (seconds).
* **TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S**: Threshold for emitting a warning when no active trial has been in the
  ``RUNNING`` state for this many seconds. If the Ray Tune job stays stuck in this state (most likely because of
  insufficient resources), the warning is repeated at this interval. Defaults to 1 (seconds).
* **TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S_AUTOSCALER**: Like the above, but used when the autoscaler is enabled:
  a warning is emitted when no active trial has been in the ``RUNNING`` state for this many seconds.
  If the Ray Tune job stays stuck in this state (most likely because of insufficient resources), the warning is repeated
  at this interval. Defaults to 60 (seconds).
* **TUNE_STATE_REFRESH_PERIOD**: Frequency of updating the resource tracking from Ray. Defaults to 10 (seconds).


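For illustration, a minimal sketch of overriding these thresholds before launching Tune; the variable names come from
the diff above, while the values and the trivial trainable are placeholders chosen for the example:

import os

# Must be set in the driver process before Tune reads them (they are cached
# after the first lookup).
os.environ["TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S"] = "5"
os.environ["TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S_AUTOSCALER"] = "30"

from ray import tune

def train(config):
    pass  # placeholder trainable

tune.run(train, resources_per_trial={"cpu": 1})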
53 changes: 53 additions & 0 deletions python/ray/tune/tests/test_ray_trial_executor.py
@@ -1,4 +1,6 @@
# coding: utf-8
from freezegun import freeze_time
from mock import patch
import os
import unittest

@@ -16,6 +18,57 @@
from ray.tune.utils.placement_groups import PlacementGroupFactory


class TrialExecutorInsufficientResourcesTest(unittest.TestCase):
def setUp(self):
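        # Pin the warning threshold to one second so the test does not depend
        # on the library default.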
        os.environ["TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S"] = "1"
self.cluster = Cluster(
initialize_head=True,
connect=True,
head_node_args={
"num_cpus": 4,
"num_gpus": 2,
"_system_config": {
"num_heartbeats_timeout": 10
}
})

def tearDown(self):
ray.shutdown()
self.cluster.shutdown()

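    # freeze_time with auto_tick_seconds=15 advances the frozen clock by 15
    # seconds on every lookup, so consecutive time.monotonic() readings in the
    # executor differ by more than the 1-second threshold and the warning is
    # expected to fire.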
@freeze_time("2021-08-03", auto_tick_seconds=15)
@patch.object(ray.tune.trial_executor.logger, "warning")
def testOutputWarningMessage(self, mocked_warn):
def train(config):
pass

tune.run(
train, resources_per_trial={
"cpu": 1,
"gpu": 1,
})
msg = ("Autoscaler is disabled. No trial is running and no new trial"
" has been started within at least the last 1.0 seconds. This "
"could be due to the cluster not having enough resources "
"available to start the next trial. Please check if the "
"requested resources can be fulfilled by your cluster, or will "
"be fulfilled eventually (when using the Ray autoscaler).")
mocked_warn.assert_called_with(msg)

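    # Without auto_tick_seconds the frozen clock never advances, so the
    # threshold is never exceeded and no warning should be logged.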
@freeze_time("2021-08-03")
@patch.object(ray.tune.trial_executor.logger, "warning")
def testNotOutputWarningMessage(self, mocked_warn):
def train(config):
pass

tune.run(
train, resources_per_trial={
"cpu": 1,
"gpu": 1,
})
mocked_warn.assert_not_called()


class RayTrialExecutorTest(unittest.TestCase):
def setUp(self):
# Wait up to five seconds for placement groups when starting a trial
47 changes: 47 additions & 0 deletions python/ray/tune/trial_executor.py
@@ -1,6 +1,9 @@
# coding: utf-8
from abc import ABCMeta, abstractmethod
from functools import lru_cache
import logging
import os
import time
from typing import Dict, List, Optional

from ray.tune.resources import Resources
@@ -12,6 +15,18 @@
logger = logging.getLogger(__name__)


# Accessing environment variable could be slow.

Contributor: wait really? do you have some reference for this?

Member: it should be as fast as accessing any other dict, right? os should load it only once.

Contributor Author: I was under the impression that accessing an environment variable incurs a penalty in some scripting languages. But looking closely at os.py, it seems to be just a normal dictionary wrapped by the process. So maybe not so much in this case.
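
As a quick, non-authoritative check of the cost discussed above, a timeit sketch like the following can be run
locally (numbers vary by platform; the cached helper mirrors the _get_warn_threshold function defined just below):

import os
import timeit
from functools import lru_cache

@lru_cache()
def cached_threshold():
    # Same pattern as below: read the env var once, then serve from the cache.
    return float(
        os.environ.get("TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S", "1"))

print(timeit.timeit(
    lambda: os.environ.get("TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S", "1"),
    number=100000))
print(timeit.timeit(cached_threshold, number=100000))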

@lru_cache()
def _get_warn_threshold(autoscaler_enabled: bool) -> float:
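    # Cached via lru_cache above, so each environment variable is read at most
    # once per process for a given value of autoscaler_enabled.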
if autoscaler_enabled:
return float(
os.environ.get(
"TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S_AUTOSCALER", "60"))
else:
return float(
os.environ.get("TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S", "1"))


@DeveloperAPI
class TrialExecutor(metaclass=ABCMeta):
"""Module for interacting with remote trainables.
@@ -32,6 +47,12 @@ def __init__(self, queue_trials: bool = False):
self._queue_trials = queue_trials
self._cached_trial_state = {}
self._trials_to_cache = set()
        # The time since which all active trials have been in PENDING state,
        # or since the last time an "insufficient resources" warning was
        # emitted, whichever is later.
        # -1 means either that the TrialExecutor was just initialized and has
        # no trials yet, or that some trial is currently in RUNNING state.
self._no_running_trials_since = -1

def set_status(self, trial: Trial, status: str) -> None:
"""Sets status and checkpoints metadata if needed.
@@ -194,6 +215,31 @@ def on_step_end(self, trials: List[Trial]) -> None:
def force_reconcilation_on_next_step_end(self) -> None:
pass

def may_warn_insufficient_resources(self, all_trials):
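        # Called from on_no_available_trials(): emit a warning, and keep
        # repeating it every threshold seconds, while no trial is in the
        # RUNNING state.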
autoscaler_enabled = is_ray_cluster()
if not any(trial.status == Trial.RUNNING for trial in all_trials):
if self._no_running_trials_since == -1:
self._no_running_trials_since = time.monotonic()
            elif (time.monotonic() - self._no_running_trials_since >
                  _get_warn_threshold(autoscaler_enabled)):
warn_prefix = ("If autoscaler is still scaling up, ignore "
"this message." if autoscaler_enabled else
"Autoscaler is disabled.")
logger.warning(
f"{warn_prefix} "
f"No trial is running and no new trial has been started "
f"within at least the last "
f"{_get_warn_threshold(autoscaler_enabled)} seconds. "
f"This could be due to the cluster not having enough "
f"resources available to start the next trial. Please "
f"check if the requested resources can be fulfilled by "
f"your cluster, or will be fulfilled eventually (when "
f"using the Ray autoscaler).")

Contributor: A couple of comments here:

  1. IMO users that aren't using the Ray autoscaler should not see "Ray autoscaler".
  2. This doesn't actually provide any action for the user to take. For example, the user may not know what "requested resources" means, or even "cluster".

Instead, it would be good to say:

  1. which resource is not available, and how much is being requested
  2. what the total amount of that resource available on the cluster is

One suggestion would also be to say that they should stop their tuning job and reconfigure their resource request.

Does that make sense? In principle, we should provide 1. what went wrong on the Ray side (in terms that the end user understands), 2. what the user did wrong (if possible), and 3. what they should do instead :)

Contributor Author: This is good feedback. Practically there is no API that exposes that information. Left a TODO and filed #17799 to follow up.

self._no_running_trials_since = time.monotonic()
else:
self._no_running_trials_since = -1

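Picking up the review suggestion above, here is a purely hypothetical sketch (not part of this PR; the follow-up is
tracked in #17799) of how a more actionable message could be assembled from the public ray.cluster_resources() and
ray.available_resources() APIs. The requested argument is a stand-in for a trial's resource request, and the sketch
assumes ray.init() has already been called:

import ray

def _resource_hint(requested):
    # requested: e.g. {"CPU": 1, "GPU": 1} -- a stand-in for the trial's request.
    total = ray.cluster_resources()     # everything the cluster reports
    free = ray.available_resources()    # what is currently unclaimed
    parts = []
    for key, amount in requested.items():
        parts.append(f"{key}: requested {amount}, "
                     f"{free.get(key, 0)} free of {total.get(key, 0)} total")
    return "; ".join(parts)

# Example output (values depend on the cluster):
# "CPU: requested 1, 3.0 free of 4.0 total; GPU: requested 1, 1.0 free of 2.0 total"
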
def on_no_available_trials(self, trials: List[Trial]) -> None:
"""
Args:
Expand All @@ -203,6 +249,7 @@ def on_no_available_trials(self, trials: List[Trial]) -> None:

if self._queue_trials:
return
self.may_warn_insufficient_resources(trials)
for trial in trials:
if trial.uses_placement_groups:
return
2 changes: 2 additions & 0 deletions python/requirements/tune/requirements_tune.txt
@@ -7,6 +7,7 @@ dask[complete]==2021.03.0; python_version < '3.7'
dask[complete]==2021.06.1; python_version >= '3.7'
dragonfly-opt==0.1.6
flaml==0.5.2
freezegun==1.1.0
gluoncv==0.10.1.post0
gpy==1.10.0
gym[atari]==0.18.3
@@ -22,6 +23,7 @@ lightgbm==3.2.1
matplotlib==3.3.4; python_version < '3.7'
matplotlib==3.4.2; python_version >= '3.7'
mlflow==1.19.0
mock==4.0.3
mxnet==1.8.0.post0
nevergrad==0.4.3.post3
optuna==2.8.0