From e0584276e2ea1b212563aa160ecc1cbda12af289 Mon Sep 17 00:00:00 2001
From: ltbringer
Date: Fri, 8 Nov 2024 05:27:48 +0000
Subject: [PATCH] [Core]: Fix ConnectionError on Autoscaler CR lookups in K8s clusters with custom DNS for Kubernetes API. (#48541)

Signed-off-by: ltbringer
---
Review notes (not part of the commit message):
- KUBERNETES_HOST is now built with _join_host_port(), which wraps a bare
  IPv6 $KUBERNETES_SERVICE_HOST in square brackets before appending the port.
- The url_from_resource docstring now names $KUBERNETES_SERVICE_PORT_HTTPS
  (the variable that is actually read) and documents Returns/Raises.
- Hunk headers and the diffstat are recomputed for the added lines; the
  post-image blob id on the "index" line is carried over unchanged.

 .../_private/kuberay/node_provider.py         | 63 ++++++++++++++++---
 1 file changed, 54 insertions(+), 9 deletions(-)

diff --git a/python/ray/autoscaler/_private/kuberay/node_provider.py b/python/ray/autoscaler/_private/kuberay/node_provider.py
index d5057fc1db63..ac0ade30b08b 100644
--- a/python/ray/autoscaler/_private/kuberay/node_provider.py
+++ b/python/ray/autoscaler/_private/kuberay/node_provider.py
@@ -47,6 +47,32 @@
 
 RAY_HEAD_POD_NAME = os.getenv("RAY_HEAD_POD_NAME")
 
+# https://kubernetes.io/docs/tasks/run-application/access-api-from-pod
+# While running in a Pod, your container can create an HTTPS URL for the
+# Kubernetes API server by fetching the KUBERNETES_SERVICE_HOST and
+# KUBERNETES_SERVICE_PORT_HTTPS environment variables.
+KUBERNETES_SERVICE_HOST = os.getenv(
+    "KUBERNETES_SERVICE_HOST", "https://kubernetes.default"
+)
+KUBERNETES_SERVICE_PORT = os.getenv("KUBERNETES_SERVICE_PORT_HTTPS", "443")
+
+
+def _join_host_port(host: str, port: str) -> str:
+    """Join a host and a port into an authority string ("host:port").
+
+    Kubernetes sets $KUBERNETES_SERVICE_HOST to a bare IP address. On
+    IPv6 clusters that address contains colons, so it must be wrapped in
+    square brackets before the port is appended (RFC 3986, section 3.2.2).
+    An optional leading scheme ("https://") is preserved as-is.
+    """
+    scheme, separator, bare_host = host.rpartition("://")
+    if ":" in bare_host and not bare_host.startswith("["):
+        bare_host = f"[{bare_host}]"
+    return f"{scheme}{separator}{bare_host}:{port}"
+
+
+KUBERNETES_HOST = _join_host_port(KUBERNETES_SERVICE_HOST, KUBERNETES_SERVICE_PORT)
+
 # Key for GKE label that identifies which multi-host replica a pod belongs to
 REPLICA_INDEX_KEY = "replicaIndex"
 
@@ -179,7 +205,10 @@ def load_k8s_secrets() -> Tuple[Dict[str, str], str]:
 
 
 def url_from_resource(
-    namespace: str, path: str, kuberay_crd_version: str = KUBERAY_CRD_VER
+    namespace: str,
+    path: str,
+    kuberay_crd_version: str = KUBERAY_CRD_VER,
+    kubernetes_host: str = KUBERNETES_HOST,
 ) -> str:
     """Convert resource path to REST URL for Kubernetes API server.
 
@@ -187,21 +216,37 @@ def url_from_resource(
         namespace: The K8s namespace of the resource
         path: The part of the resource path that starts with the resource type.
             Supported resource types are "pods" and "rayclusters".
+        kuberay_crd_version: The API version of the KubeRay CRD.
+            Looks like "v1alpha1", "v1".
+        kubernetes_host: The host of the Kubernetes API server.
+            Uses $KUBERNETES_SERVICE_HOST and
+            $KUBERNETES_SERVICE_PORT_HTTPS to construct the kubernetes_host
+            if not provided.
+
+            When set by Kubernetes,
+            $KUBERNETES_SERVICE_HOST could be an IP address. That's why the https
+            scheme is added here.
+
+            Defaults to "https://kubernetes.default:443".
+
+    Returns:
+        The full HTTPS URL of the resource.
+
+    Raises:
+        ValueError: If kubernetes_host uses the plain "http://" scheme.
+        NotImplementedError: If path is not a supported resource type.
     """
+    if kubernetes_host.startswith("http://"):
+        raise ValueError("Kubernetes host must be accessed over HTTPS.")
+    if not kubernetes_host.startswith("https://"):
+        kubernetes_host = "https://" + kubernetes_host
     if path.startswith("pods"):
         api_group = "/api/v1"
     elif path.startswith("rayclusters"):
         api_group = "/apis/ray.io/" + kuberay_crd_version
     else:
         raise NotImplementedError("Tried to access unknown entity at {}".format(path))
-    return (
-        "https://kubernetes.default:443"
-        + api_group
-        + "/namespaces/"
-        + namespace
-        + "/"
-        + path
-    )
+    return kubernetes_host + api_group + "/namespaces/" + namespace + "/" + path
 
 
 def _worker_group_index(raycluster: Dict[str, Any], group_name: str) -> int: