diff --git a/python/ray/autoscaler/v2/tests/test_sdk.py b/python/ray/autoscaler/v2/tests/test_sdk.py index 81e9422583e8..ca26453d9610 100644 --- a/python/ray/autoscaler/v2/tests/test_sdk.py +++ b/python/ray/autoscaler/v2/tests/test_sdk.py @@ -1,5 +1,6 @@ import os import sys +import time # coding: utf-8 from dataclasses import dataclass @@ -102,7 +103,7 @@ def verify(): def test_node_state_lifecycle_basic(ray_start_cluster): - + start_s = time.perf_counter() cluster = ray_start_cluster cluster.add_node(num_cpus=0) ray.init(address=cluster.address) @@ -174,13 +175,22 @@ def verify_cluster_busy(): # Kill the node. cluster.remove_node(node) + # Sleep for a bit so head node should be idle longer than this. + time.sleep(3) + def verify_cluster_no_node(): state = get_cluster_resource_state(stub) + now_s = time.perf_counter() + test_dur_ms = (now_s - start_s) * 1000 assert_node_states( state, [ NodeState(node_id, NodeStatus.DEAD), - NodeState(head_node_id, NodeStatus.IDLE, lambda idle_ms: idle_ms > 0), + NodeState( + head_node_id, + NodeStatus.IDLE, + lambda idle_ms: idle_ms > 3 * 1000 and idle_ms < test_dur_ms, + ), ], ) return True diff --git a/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.cc b/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.cc index 21b7c73fadf7..0d93792212a9 100644 --- a/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.cc @@ -200,10 +200,21 @@ void GcsAutoscalerStateManager::GetNodeStates( auto const &node_resource_data = cluster_resource_manager_.GetNodeResources( scheduling::NodeID(node_state_proto->node_id())); if (node_resource_data.idle_resource_duration_ms > 0) { - // The node is idle. + // The node was reported idle. 
node_state_proto->set_status(rpc::autoscaler::NodeStatus::IDLE); + + // We approximate the idle duration by the time since the last idle report + // plus the idle duration reported by the node: + // idle_dur = (idle duration reported by the node) + (time since the last idle report) + // + // This is because with lightweight resource update, we don't keep reporting + // the idle time duration when there's no resource change. We also don't want to + // use raylet reported idle timestamp since there might be clock skew. + RAY_CHECK(node_resource_data.last_resource_update_time != absl::nullopt); node_state_proto->set_idle_duration_ms( - node_resource_data.idle_resource_duration_ms); + node_resource_data.idle_resource_duration_ms + + absl::ToInt64Milliseconds( + absl::Now() - node_resource_data.last_resource_update_time.value())); } else { node_state_proto->set_status(rpc::autoscaler::NodeStatus::RUNNING); } diff --git a/src/ray/raylet/scheduling/cluster_resource_data.h b/src/ray/raylet/scheduling/cluster_resource_data.h index fb8c9e5fdc31..fbb5e0aba8c2 100644 --- a/src/ray/raylet/scheduling/cluster_resource_data.h +++ b/src/ray/raylet/scheduling/cluster_resource_data.h @@ -436,9 +436,12 @@ class NodeResources { // The key-value labels of this node. absl::flat_hash_map labels; - // The idle duration of the node from resources. + // The idle duration of the node from resources reported by raylet. int64_t idle_resource_duration_ms = 0; + // The timestamp of the last resource update if there was a resource report. + absl::optional last_resource_update_time = absl::nullopt; + /// Normal task resources could be uploaded by 1) Raylets' periodical reporters; 2) /// Rejected RequestWorkerLeaseReply. So we need the timestamps to decide whether an /// upload is latest. 
diff --git a/src/ray/raylet/scheduling/cluster_resource_manager.cc b/src/ray/raylet/scheduling/cluster_resource_manager.cc index 78bdfa18c2d6..295dde4c2c9b 100644 --- a/src/ray/raylet/scheduling/cluster_resource_manager.cc +++ b/src/ray/raylet/scheduling/cluster_resource_manager.cc @@ -256,6 +256,9 @@ bool ClusterResourceManager::UpdateNodeAvailableResourcesIfExist( // Update the idle duration for the node in terms of resources usage. node_resources->idle_resource_duration_ms = resource_data.idle_duration_ms(); + + // Last update time to the local node resources view. + node_resources->last_resource_update_time = absl::Now(); return true; }