forked from apache/airflow
-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[astro] [AIRFLOW-5448] Handle istio-proxy for Kubernetes Pods (#62)
Istio service mesh is not compatible by default with Kubernetes Jobs. The normal behavior is that a Job will be started, get an istio-proxy sidecar attached to it via the istio mutating webhook, run until completion, then the 'main' container in the pod stops, but istio-proxy hangs around indefinitely. This change handles cleanly exiting the Istio sidecar 'istio-proxy' when a Kubernetes Executor task completes. (cherry picked from commit 84fa48f) (cherry picked from commit 6ed59bf) (cherry picked from commit ba60ede) (cherry picked from commit 80ac218) Handle Istio containers with Kubernetes Executor Pod adoption (#1318) closes astronomer/issues#3030 >This edge case deals specifically with task instances that ended in the UP_FOR_RETRY state when a scheduler is adopting orphaned tasks. Generally, this issue does not affect OSS Airflow since the template Kubernetes worker pods it spawns don't have additional containers that would prevent the pod from going into the Succeeded pod state. Those pods in the Succeeded state are handled by the scheduler's adoption process in _adopt_completed_pods(). Since Astronomer's Kubernetes worker pods have an additional container (istio-proxy), they are in the NotReady state when tasks are not killed and they are not eligible for adoption. This can also happen for "completed" pods that have sidecars. Same process though, just a slightly different scenario: If a worker finishes while not being watched by a scheduler, it never gets adopted by another scheduler in _adopt_completed_pods() as the pod is still 'Running', but the TI also isn't in a resettable state so scheduler_job never asks the executor to adopt it! It's in limbo - "complete" in Airflow's view (based on TI state) but "Running" in k8s view (since the sidecar is still running). This commit re-uses current Istio code and handles those pods. 
(cherry picked from commit 3f309b0) (cherry picked from commit 58cfc68) (cherry picked from commit 92a8289) [astro] Fix istio sidecar shutdown on newer GKE Newer GKE versions have started to emit multiple running events for a given pod with the sidecar still being shown as running. We will put retries around shutting down the sidecar and also check the current status of the sidecar, not just the status at the time of the event. e.g: GKE > 1.18.20.901 (cherry picked from commit cbd50ef) (cherry picked from commit d1025e1) (cherry picked from commit d56ba74) (cherry picked from commit 11a80ae) (cherry picked from commit 1f0e8be) (cherry picked from commit 20b0bad) (cherry picked from commit 102efe2) (cherry picked from commit 765cc50)
- Loading branch information
1 parent
ba0e71f
commit 7f7e454
Showing
4 changed files
with
310 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, | ||
# software distributed under the License is distributed on an | ||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
# KIND, either express or implied. See the License for the | ||
# specific language governing permissions and limitations | ||
# under the License. | ||
|
||
import tenacity | ||
from kubernetes.client.rest import ApiException | ||
from kubernetes.stream import stream | ||
from packaging.version import parse as semantic_version | ||
|
||
from airflow import AirflowException | ||
from airflow.utils.log.logging_mixin import LoggingMixin | ||
|
||
|
||
class SidecarNames:
    """Container names of the sidecars this module knows how to handle."""

    # Name of the Istio sidecar container.
    ISTIO_PROXY = 'istio-proxy'
|
||
|
||
class Istio(LoggingMixin):
    """Handle all Istio-related logic.

    Detects pods whose only still-running container is the ``istio-proxy``
    sidecar and asks that sidecar to exit by POSTing to its ``/quitquitquit``
    endpoint from inside the container.
    """

    # Port used for /quitquitquit when the sidecar args carry no --statusPort.
    _DEFAULT_STATUS_PORT = "15020"
    # Oldest istio-proxy version accepted by the compatibility check.
    _MIN_ISTIO_VERSION = "1.3.0-rc.0"
    _STATUS_PORT_FLAG = "--statusPort"

    def __init__(self, kube_client):
        """Store the Kubernetes API client used for exec and pod reads.

        :param kube_client: client exposing ``connect_get_namespaced_pod_exec``
            and ``read_namespaced_pod``
        """
        super().__init__()
        self._client = kube_client

    def handle_istio_proxy(self, pod) -> bool:
        """If an istio-proxy sidecar is detected, and all other containers
        are terminated, then attempt to cleanly shutdown the sidecar.
        If we detect a version of Istio before it's compatible with Kubernetes
        Jobs, then raise an informative error message.

        :param pod: The pod which we are checking for the sidecar
        :returns: True if a running istio-proxy was detected and a shutdown was
            attempted (even if the Kubernetes API call failed), False if there
            is nothing to shut down
        :rtype: bool

        Raises:
            AirflowException: if the istio-proxy version is too old.
        """
        if not self._should_shutdown_istio_proxy(pod):
            return False

        self.log.info(
            "Detected that a task finished and needs an istio-proxy sidecar to be cleaned up. "
            "pod name: %s",
            pod.metadata.name,
        )
        try:
            self._shutdown_istio_proxy(pod)
        except ApiException:
            # Best effort: the failure must not propagate to the caller, but it
            # should be visible with its traceback rather than hidden at debug.
            self.log.warning(
                "Error handling Istio container for pod: %s",
                pod.metadata.name,
                exc_info=True,
            )
        return True

    def _should_shutdown_istio_proxy(self, pod):
        """Look for an istio-proxy, and decide if it should be shutdown.

        Args:
            pod (V1Pod): The pod which we are checking for the sidecar
        Returns:
            (bool): True if we detect a running istio-proxy, and all other
                containers are terminated, otherwise False
        """
        if pod.status.phase != "Running":
            return False

        found_istio = False
        # Treat a missing status list as "no containers reported" instead of
        # failing on iteration over None.
        for container_status in pod.status.container_statuses or []:
            if container_status.name == SidecarNames.ISTIO_PROXY and container_status.state.running:
                found_istio = True
                continue
            if not container_status.state.terminated:
                # Any state besides 'terminated' should be
                # considered still busy
                return False
        # Only shut the sidecar down if it was found in state "running".
        return found_istio

    def _shutdown_istio_proxy(self, pod):
        """Shutdown the istio-proxy on the provided pod.

        Args:
            pod (V1Pod): The pod which the container is in
        Returns:
            None
        Raises:
            AirflowException: if the istio-proxy version is older than the
                minimum supported version.
            ApiException: if the shutdown request keeps failing.
        """
        for container in pod.spec.containers:
            # Skip unless it's a sidecar named as SidecarNames.ISTIO_PROXY.
            if container.name != SidecarNames.ISTIO_PROXY:
                continue

            self._check_istio_version(container.image)
            status_port = self._find_status_port(container.args)

            self.log.info("Shutting down istio-proxy in pod %s", pod.metadata.name)
            self._post_quitquitquit(pod, container, status_port)

    @staticmethod
    def _image_tag(image):
        """Return the tag of an image reference, or None when it has no tag."""
        if not image:
            return None
        # Drop a digest suffix such as "repo:tag@sha256:...".
        name = image.split("@", 1)[0]
        # Only a ":" in the last path component separates the tag; an earlier
        # one belongs to a registry "host:port" prefix.
        _, separator, tag = name.rsplit("/", 1)[-1].rpartition(":")
        return tag if separator else None

    def _check_istio_version(self, image):
        """Raise AirflowException if the image tag is a too-old Istio version.

        If we can't tell the version (no tag, or a tag that is not a version
        such as "latest"), proceed anyways.
        """
        from packaging.version import InvalidVersion, Version

        tag = self._image_tag(image)
        if not tag:
            return
        try:
            detected = Version(tag)
        except InvalidVersion:
            return
        if detected < Version(self._MIN_ISTIO_VERSION):
            raise AirflowException(
                'Please use istio version 1.3.0+ for KubernetesExecutor compatibility.'
                + f' Detected version {tag}'
            )

    @classmethod
    def _find_status_port(cls, args):
        """Determine the istio-proxy statusPort, where /quitquitquit is implemented.

        Supports both "--statusPort VALUE" and "--statusPort=VALUE"; falls back
        to the default port when the flag is absent or has no value.
        """
        flag = cls._STATUS_PORT_FLAG
        stripped = [arg.strip() for arg in args or []]
        for index, arg in enumerate(stripped):
            if arg == flag:
                if index + 1 < len(stripped):
                    return stripped[index + 1]
                # Flag given as the last argument without a value.
                break
            if arg.startswith(flag + "="):
                return arg[len(flag) + 1:]
        return cls._DEFAULT_STATUS_PORT

    @tenacity.retry(
        stop=tenacity.stop_after_attempt(3),
        wait=tenacity.wait_fixed(0.5),
        reraise=True,
        retry=tenacity.retry_if_exception_type(ApiException),
    )
    def _post_quitquitquit(self, pod, container, status_port):
        """Send the curl to shutdown the istio-proxy container.

        Retried up to 3 times, 0.5s apart, on ApiException; the last
        ApiException is re-raised to the caller.
        """
        # Use exec to curl localhost inside of the sidecar.
        try:
            stream(
                self._client.connect_get_namespaced_pod_exec,
                pod.metadata.name,
                pod.metadata.namespace,
                tty=False,
                stderr=True,
                stdin=False,
                stdout=True,
                container=container.name,
                command=['/bin/sh', '-c', f'curl -XPOST http://127.0.0.1:{status_port}/quitquitquit'],
            )
        except ApiException:
            # Check the pod's current state, not the state at event time:
            # the istio sidecar may have already been shut down.
            current_pod = self._client.read_namespaced_pod(
                name=pod.metadata.name,
                namespace=pod.metadata.namespace,
            )
            if self._should_shutdown_istio_proxy(current_pod):
                raise
            self.log.info(
                "Istio sidecar is already shut down in %s, so continuing on",
                pod.metadata.name,
            )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, | ||
# software distributed under the License is distributed on an | ||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
# KIND, either express or implied. See the License for the | ||
# specific language governing permissions and limitations | ||
# under the License. | ||
|
||
import unittest | ||
from unittest.mock import MagicMock, patch | ||
|
||
from airflow import AirflowException | ||
from airflow.kubernetes.istio import Istio | ||
|
||
|
||
def mock_stream(func, *args, **kwargs):
    """Stand-in for ``kubernetes.stream.stream``: call ``func`` directly.

    Prints a marker line, then forwards all positional and keyword arguments
    to ``func`` and returns its result.
    """
    print('calling func')
    result = func(*args, **kwargs)
    return result
|
||
|
||
class TestIstio(unittest.TestCase):
    """Tests for ``Istio.handle_istio_proxy`` against mocked pods and client."""

    def setUp(self):
        mock_kube_client = MagicMock()
        self.istio = Istio(mock_kube_client)

    def _mock_pod(self, image="istio/proxyv2:1.3.0", args=None):
        """Build a Running pod with a running istio-proxy and a terminated base container."""
        sidecar = MagicMock()
        sidecar.name = "istio-proxy"
        sidecar.namespace = "fake-namespace"
        sidecar.image = image
        sidecar.args = args
        pod = MagicMock()
        pod.spec.containers = [sidecar]
        pod.status.phase = "Running"
        pod.metadata.name = "fake-pod-name"
        pod.metadata.namespace = "fake-namespace"
        container_status1 = MagicMock()
        container_status1.name = "istio-proxy"
        container_status1.state.running = True
        container_status1.state.terminated = False
        container_status2 = MagicMock()
        container_status2.name = "base"
        container_status2.state.running = False
        container_status2.state.terminated = True
        pod.status.container_statuses = [container_status1, container_status2]
        return pod

    def _assert_quitquitquit_posted(self, port):
        """Assert exactly one exec call curling /quitquitquit on ``port``."""
        self.istio._client.connect_get_namespaced_pod_exec.assert_called_once_with(
            'fake-pod-name',
            'fake-namespace',
            tty=False,
            container='istio-proxy',
            stderr=True,
            stdin=False,
            stdout=True,
            command=['/bin/sh', '-c', f'curl -XPOST http://127.0.0.1:{port}/quitquitquit'],
        )

    def test_handle_istio_proxy_low_version(self):
        pod = self._mock_pod(image="istio/proxyv2:1.2.9")
        self.assertRaises(AirflowException, self.istio.handle_istio_proxy, pod)

    def _handle_istio_proxy_with_sidecar_args(self, args):
        pod = self._mock_pod(args=args)
        self.istio.handle_istio_proxy(pod)

    @patch("airflow.kubernetes.istio.stream", new=mock_stream)
    def test_handle_istio_proxy(self):
        args = ["proxy", "sidecar", "--statusPort", "12345"]
        self._handle_istio_proxy_with_sidecar_args(args)
        self._assert_quitquitquit_posted("12345")

    @patch("airflow.kubernetes.istio.stream", new=mock_stream)
    def test_handle_istio_proxy_other_cli_format(self):
        args = ["proxy", "sidecar", "--statusPort=12345"]
        self._handle_istio_proxy_with_sidecar_args(args)
        self._assert_quitquitquit_posted("12345")

    @patch("airflow.kubernetes.istio.stream", new=mock_stream)
    def test_handle_istio_proxy_no_cli_argument(self):
        args = ["proxy", "sidecar"]
        self._handle_istio_proxy_with_sidecar_args(args)
        self._assert_quitquitquit_posted("15020")

    @patch("airflow.kubernetes.istio.stream", new=mock_stream)
    def test_handle_istio_with_no_sidecar(self):
        # A Running pod whose only container has terminated and which has no
        # istio-proxy: the phase check passes, so the sidecar detection itself
        # is what must decide that nothing needs shutting down.
        base = MagicMock()
        base.name = "base"
        base_status = MagicMock()
        base_status.name = "base"
        base_status.state.running = False
        base_status.state.terminated = True
        pod = MagicMock()
        pod.spec.containers = [base]
        pod.status.phase = "Running"
        pod.status.container_statuses = [base_status]
        self.assertFalse(self.istio.handle_istio_proxy(pod))
        self.istio._client.connect_get_namespaced_pod_exec.assert_not_called()
|
||
|
||
# Run the test suite when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()