Skip to content

Commit

Permalink
[core][autoscaler] GCS Autoscaler V2: Add node type name to ray (ray-…
Browse files Browse the repository at this point in the history
…project#36714)

Why are these changes needed?
This PR adds a way to pass the instance type (ray node type name) to ray and makes it available to the autoscaler.

I will be adding e2e tests in a separate PR (from installer -> GCS -> autoscaler). This PR only adds unit tests.
---------

Signed-off-by: rickyyx <rickyx@anyscale.com>
Signed-off-by: e428265 <arvind.chandramouli@lmco.com>
  • Loading branch information
rickyyx authored and arvind-chandra committed Aug 31, 2023
1 parent 6c841fc commit 6f25089
Show file tree
Hide file tree
Showing 7 changed files with 28 additions and 4 deletions.
2 changes: 2 additions & 0 deletions python/ray/autoscaler/v2/instance_manager/ray_installer.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,13 @@ def install_ray(self, instance: Instance, head_node_ip: str) -> bool:
# `RAY_HEAD_IP=<head_node_ip> \
# RAY_CLOUD_INSTANCE_ID=<instance_id> \
# ray start --head ...`
# See src/ray/common/constants.h for ENV name definitions.
ray_start_commands=with_envs(
ray_start_commands,
{
"RAY_HEAD_IP": head_node_ip,
"RAY_CLOUD_INSTANCE_ID": instance.instance_id,
"RAY_NODE_TYPE_NAME": instance.instance_type,
},
),
runtime_hash=self._config.runtime_hash,
Expand Down
5 changes: 4 additions & 1 deletion python/ray/tests/test_state_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2135,7 +2135,7 @@ def verify():


@pytest.mark.asyncio
async def test_node_instance_id(ray_start_cluster, monkeypatch):
async def test_cloud_envs(ray_start_cluster, monkeypatch):
cluster = ray_start_cluster
cluster.add_node(num_cpus=1, node_name="head_node")
ray.init(address=cluster.address)
Expand All @@ -2144,6 +2144,7 @@ async def test_node_instance_id(ray_start_cluster, monkeypatch):
"RAY_CLOUD_INSTANCE_ID",
"test_cloud_id",
)
m.setenv("RAY_NODE_TYPE_NAME", "test-node-type")
cluster.add_node(num_cpus=1, node_name="worker_node")
client = state_source_client(cluster.address)

Expand All @@ -2154,8 +2155,10 @@ async def verify():
for node_info in reply.node_info_list:
if node_info.node_name == "worker_node":
assert node_info.instance_id == "test_cloud_id"
assert node_info.node_type_name == "test-node-type"
else:
assert node_info.instance_id == ""
assert node_info.node_type_name == ""

return True

Expand Down
9 changes: 9 additions & 0 deletions src/ray/common/constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,18 @@ constexpr char kSetupWorkerFilename[] = "setup_worker.py";
/// The version of Ray
constexpr char kRayVersion[] = "3.0.0.dev0";

/*****************************/
/* ENV labels for autoscaler */
/*****************************/
/// Name of the env var carrying the cloud instance id. The Raylet constructor
/// reads it to populate GcsNodeInfo.instance_id (empty string when unset).
constexpr char kNodeCloudInstanceIdEnv[] = "RAY_CLOUD_INSTANCE_ID";

/// Name of the env var carrying the ray node type name. The Raylet constructor
/// reads it to populate GcsNodeInfo.node_type_name (empty string when unset).
constexpr char kNodeTypeNameEnv[] = "RAY_NODE_TYPE_NAME";

/**********************************/
/* ENV labels for autoscaler ends */
/**********************************/

/// Key for the placement group's bundle placement constraint.
/// Used by FormatPlacementGroupLabelName()
constexpr char kPlacementGroupConstraintKeyPrefix[] = "_PG_";
Expand Down
1 change: 1 addition & 0 deletions src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ void GcsAutoscalerStateManager::GetNodeStates(
auto node_state_proto = state->add_node_states();
node_state_proto->set_node_id(gcs_node_info.node_id());
node_state_proto->set_instance_id(gcs_node_info.instance_id());
node_state_proto->set_ray_node_type_name(gcs_node_info.node_type_name());
node_state_proto->set_node_state_version(last_cluster_resource_state_version_);
node_state_proto->set_status(status);

Expand Down
2 changes: 2 additions & 0 deletions src/ray/protobuf/experimental/autoscaler.proto
Original file line number Diff line number Diff line change
Expand Up @@ -96,12 +96,14 @@ message NodeState {

// The instance id that the node is running on.
// This is passed in when the node is registered.
// Set by ray from ENV at src/ray/common/constants.h::kNodeCloudInstanceIdEnv
string instance_id = 2;

// The node type name, e.g. ray-head-node, matching `available_node_types`
// in the autoscaler config. See `ray/autoscaler/ray-schema.json`
// Should be set when a ray node is starting - and this will be empty
// if it's not set when starting the node.
// Set by ray from ENV at src/ray/common/constants.h::kNodeTypeNameEnv
string ray_node_type_name = 3;

// The available resources on the node.
Expand Down
4 changes: 4 additions & 0 deletions src/ray/protobuf/gcs.proto
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,10 @@ message GcsNodeInfo {
// The instance id of the node if it's running on a cloud provider.
string instance_id = 13;

// The instance node type of the node if it's running on a cloud provider.
// Set through ENV of src/ray/common/constants.h::kNodeTypeNameEnv
string node_type_name = 14;

// The unix ms timestamp the node was started at.
uint64 start_time_ms = 23;
// The unix ms timestamp the node was ended at.
Expand Down
9 changes: 6 additions & 3 deletions src/ray/raylet/raylet.cc
Original file line number Diff line number Diff line change
Expand Up @@ -94,11 +94,14 @@ Raylet::Raylet(instrumented_io_context &main_service,
resource_map.end());
self_node_info_.set_start_time_ms(current_sys_time_ms());
self_node_info_.set_is_head_node(is_head_node);
// Read from env
auto instance_id = std::getenv(kNodeCloudInstanceIdEnv);
self_node_info_.set_instance_id(instance_id ? instance_id : "");
self_node_info_.mutable_labels()->insert(node_manager_config.labels.begin(),
node_manager_config.labels.end());

// Setting up autoscaler related fields from ENV
auto instance_id = std::getenv(kNodeCloudInstanceIdEnv);
self_node_info_.set_instance_id(instance_id ? instance_id : "");
auto cloud_node_type_name = std::getenv(kNodeTypeNameEnv);
self_node_info_.set_node_type_name(cloud_node_type_name ? cloud_node_type_name : "");
}

Raylet::~Raylet() {}
Expand Down

0 comments on commit 6f25089

Please sign in to comment.