Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[serve] Fix http proxy downscaling issues #36652

Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
385d0b6
[serve] address issue when downscale nodes with ongoing requests and …
GeneDer Jun 18, 2023
6944a4e
manual test and fixes issues
GeneDer Jun 18, 2023
4631a7c
Merge branch 'master' into fix-issue-http-proxy-downscaling-on-large-…
GeneDer Jun 18, 2023
59b3b5c
add unit test for get_running_replica_node_ids() and get_node_ids_wit…
GeneDer Jun 20, 2023
f218394
add test to test http state and proxies
GeneDer Jun 20, 2023
fc593f0
add test for expected responses on head and worker node routes
GeneDer Jun 21, 2023
098b5ef
Merge branch 'master' into fix-issue-http-proxy-downscaling-on-large-…
GeneDer Jun 21, 2023
5cf7163
optimize set active call
GeneDer Jun 21, 2023
a86ed19
add new INACTIVE to sort order and color map
GeneDer Jun 21, 2023
c04edc4
use blueGrey for inactive
GeneDer Jun 21, 2023
f37d17d
address comments
GeneDer Jun 21, 2023
d60036b
use set comprehension
GeneDer Jun 21, 2023
d8887fa
use {} syntax
GeneDer Jun 21, 2023
da718af
address comments and use long poll to pass active nodes
GeneDer Jun 23, 2023
d102ac2
Merge branch 'master' into fix-issue-http-proxy-downscaling-on-large-…
GeneDer Jun 23, 2023
6965e39
fix tests
GeneDer Jun 23, 2023
b443d22
drop the long poll client on http state and use update to set the dra…
GeneDer Jun 23, 2023
fdc68e5
linting
GeneDer Jun 23, 2023
f659da5
Merge branch 'master' into fix-issue-http-proxy-downscaling-on-large-…
GeneDer Jun 23, 2023
f1a12eb
Merge branch 'master' into fix-issue-http-proxy-downscaling-on-large-…
GeneDer Jun 23, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dashboard/client/src/components/StatusChip.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ const colorMap = {
[ServeSystemActorStatus.HEALTHY]: green,
[ServeSystemActorStatus.UNHEALTHY]: red,
[ServeSystemActorStatus.STARTING]: orange,
[ServeSystemActorStatus.DRAINING]: blueGrey,
},
serveController: {
[ServeSystemActorStatus.HEALTHY]: green,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ const SERVE_HTTP_PROXY_STATUS_SORT_ORDER: Record<
[ServeSystemActorStatus.UNHEALTHY]: 0,
[ServeSystemActorStatus.STARTING]: 1,
[ServeSystemActorStatus.HEALTHY]: 2,
[ServeSystemActorStatus.DRAINING]: 3,
};

export const useServeApplications = () => {
Expand Down
1 change: 1 addition & 0 deletions dashboard/client/src/type/serve.ts
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ export enum ServeSystemActorStatus {
STARTING = "STARTING",
HEALTHY = "HEALTHY",
UNHEALTHY = "UNHEALTHY",
DRAINING = "DRAINING",
}

export type ServeSystemActor = {
Expand Down
1 change: 1 addition & 0 deletions python/ray/serve/_private/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,7 @@ class HTTPProxyStatus(str, Enum):
STARTING = "STARTING"
HEALTHY = "HEALTHY"
UNHEALTHY = "UNHEALTHY"
DRAINING = "DRAINING"


class ServeComponentType(str, Enum):
Expand Down
27 changes: 26 additions & 1 deletion python/ray/serve/_private/deployment_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from collections import defaultdict, OrderedDict
from copy import copy
from enum import Enum
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union

import ray
from ray import ObjectRef, cloudpickle
Expand Down Expand Up @@ -1227,6 +1227,20 @@ def get_running_replica_infos(self) -> List[RunningReplicaInfo]:
for replica in self._replicas.get([ReplicaState.RUNNING])
]

def get_active_node_ids(self) -> Set[str]:
    """Collect the node ids that host an active replica of this deployment.

    A replica is considered active here when its state is one of STARTING,
    UPDATING, RECOVERING, or RUNNING. The result is used to determine which
    nodes have replicas: only nodes with replicas and the head node should
    have active proxies.

    Returns:
        The distinct ``actor_node_id`` values of the selected replicas.

    NOTE(review): ``actor_node_id`` is added as-is. If it can be unset
    (e.g. ``None``) for a replica that has not been placed on a node yet,
    that value would end up in the returned set -- confirm against the
    replica implementation.
    """
    active_replicas = self._replicas.get(
        [
            ReplicaState.STARTING,
            ReplicaState.UPDATING,
            ReplicaState.RECOVERING,
            ReplicaState.RUNNING,
        ]
    )
    node_ids = set()
    for replica in active_replicas:
        node_ids.add(replica.actor_node_id)
    return node_ids

def list_replica_details(self) -> List[ReplicaDetails]:
    """Return the ``actor_details`` of each replica in ``self._replicas``.

    ``self._replicas.get()`` is called without arguments, i.e. no state
    filter is passed, and the details are returned in the order the
    replicas are yielded.
    """
    details = []
    for replica in self._replicas.get():
        details.append(replica.actor_details)
    return details

Expand Down Expand Up @@ -2496,3 +2510,14 @@ def record_multiplexed_replica_info(self, info: MultiplexedReplicaInfo):
self._deployment_states[info.deployment_name].record_multiplexed_model_ids(
info.replica_tag, info.model_ids
)

def get_active_node_ids(self) -> Set[str]:
    """Return the union of active node ids across all managed deployments.

    Each deployment state reports its own active node ids via
    ``get_active_node_ids()``; the per-deployment results are merged into a
    single new set. This is used to determine which nodes have replicas:
    only nodes with replicas and the head node should have active proxies.

    Returns:
        A new set of node ids; empty when there are no deployment states.
    """
    per_deployment_ids = (
        state.get_active_node_ids() for state in self._deployment_states.values()
    )
    # set().union() with zero arguments yields an empty set, so the
    # no-deployments case needs no special handling.
    return set().union(*per_deployment_ids)
Loading