From 5fdf0a29b4279fdfc89c802762adb78e82dfa9d7 Mon Sep 17 00:00:00 2001 From: SdgJlbl Date: Wed, 29 May 2024 16:32:42 +0200 Subject: [PATCH] feat: retry on timeout connections to external registry Signed-off-by: SdgJlbl --- .../builder/image_builder/image_builder.py | 16 +++++++++++--- backend/builder/tasks/tasks_build_image.py | 2 -- .../builder/tests/test_task_build_image.py | 21 ++++++++++++++++++- backend/substrapp/docker_registry.py | 2 -- changes/917.changed | 1 + 5 files changed, 34 insertions(+), 8 deletions(-) create mode 100644 changes/917.changed diff --git a/backend/builder/image_builder/image_builder.py b/backend/builder/image_builder/image_builder.py index ce625f76b..c234ec686 100644 --- a/backend/builder/image_builder/image_builder.py +++ b/backend/builder/image_builder/image_builder.py @@ -4,6 +4,7 @@ from typing import Union import kubernetes +import requests.exceptions import structlog from django.conf import settings @@ -67,7 +68,12 @@ def build_image_if_missing(channel: str, function: orchestrator.Function) -> Non datastore = ds.Datastore(channel=channel) container_image_tag = utils.container_image_tag_from_function(function) with lock_resource("image-build", container_image_tag, ttl=IMAGE_BUILD_TIMEOUT, timeout=IMAGE_BUILD_TIMEOUT): - if docker.container_image_exists(container_image_tag): + try: + image_exists = docker.container_image_exists(container_image_tag) + except requests.exceptions.ReadTimeout as e: + raise BuildRetryError(f"Timeout while checking if image exists: {e}") from e + + if image_exists: logger.info("Reusing existing image", image=container_image_tag) else: asset_content = datastore.get_function(function) @@ -136,7 +142,7 @@ def _delete_kaniko_pod(create_pod: bool, k8s_client: kubernetes.client.CoreV1Api @timeit -def _build_container_image(path: str, tag: str) -> None: +def _build_container_image(path: str, tag: str) -> None: # noqa: C901 _assert_dockerfile_exist(path) kubernetes.config.load_incluster_config() @@ -160,7 +166,11 @@ def _build_container_image(path: str, tag: str) -> None: except Exception as e: # In case of concurrent builds, it may fail. Check if the image exists. - if docker.container_image_exists(tag): + try: + image_exists = docker.container_image_exists(tag) + except Exception: + image_exists = False + if image_exists: logger.warning( f"Build of container image {tag} failed, probably because it was done by a concurrent build", exc_info=True, diff --git a/backend/builder/tasks/tasks_build_image.py b/backend/builder/tasks/tasks_build_image.py index 8bbe2c73d..a4087e9e5 100644 --- a/backend/builder/tasks/tasks_build_image.py +++ b/backend/builder/tasks/tasks_build_image.py @@ -1,5 +1,4 @@ import structlog -from django.conf import settings import orchestrator from backend.celery import app @@ -10,7 +9,6 @@ from builder.tasks.task import BuildTask logger = structlog.get_logger(__name__) -max_retries = settings.CELERY_TASK_MAX_RETRIES @app.task( diff --git a/backend/builder/tests/test_task_build_image.py b/backend/builder/tests/test_task_build_image.py index c5b86b643..cc630bc5a 100644 --- a/backend/builder/tests/test_task_build_image.py +++ b/backend/builder/tests/test_task_build_image.py @@ -2,12 +2,13 @@ import celery import pytest +from requests.exceptions import ReadTimeout import orchestrator.mock as orc_mock from builder.exceptions import BuildError from builder.exceptions import BuildRetryError +from builder.exceptions import CeleryNoRetryError from builder.tasks.tasks_build_image import build_image -from substrapp.compute_tasks.errors import CeleryNoRetryError from substrapp.models import FailedAssetKind from substrapp.utils.errors import store_failure @@ -93,3 +94,21 @@ def side_effect(*args, **kwargs): result_retry.get() assert result_retry.state == celery.states.SUCCESS assert result_other.state == "WAITING" + + +def test_ssl_connection_timeout(celery_app, celery_worker, mocker): + """ + Test that in case of a SSL connection timeout, the task is retried max_retries times, + then raise a CeleryNoRetryError + """ + function = orc_mock.FunctionFactory() + mocker.patch("builder.tasks.task.get_orchestrator_client") + api_mock = mocker.patch( + "substrapp.docker_registry.get_request_docker_api", side_effect=ReadTimeout("Read timed out. ") + ) + + with pytest.raises(CeleryNoRetryError): + build_image.apply_async( + kwargs={"function_serialized": function.model_dump_json(), "channel_name": CHANNEL} + ).get() + assert api_mock.call_count == build_image.max_retries diff --git a/backend/substrapp/docker_registry.py b/backend/substrapp/docker_registry.py index a90e0d1be..cfa1a29e6 100644 --- a/backend/substrapp/docker_registry.py +++ b/backend/substrapp/docker_registry.py @@ -75,8 +75,6 @@ def get_request_docker_api( if response.status_code != requests.status_codes.codes.ok: raise ImageNotFoundError(f"Error when querying docker-registry, status code: {response.status_code}") - response.raise_for_status() - return response.json() diff --git a/changes/917.changed b/changes/917.changed new file mode 100644 index 000000000..f50421900 --- /dev/null +++ b/changes/917.changed @@ -0,0 +1 @@ +SSL connection errors to the registry are now retried.