Skip to content

Commit

Permalink
feat: retry on timeout connections to external registry
Browse files Browse the repository at this point in the history
Signed-off-by: SdgJlbl <sarah.diot-girard@owkin.com>
  • Loading branch information
SdgJlbl committed Jun 7, 2024
1 parent fbb9179 commit 5fdf0a2
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 8 deletions.
16 changes: 13 additions & 3 deletions backend/builder/image_builder/image_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import Union

import kubernetes
import requests.exceptions
import structlog
from django.conf import settings

Expand Down Expand Up @@ -67,7 +68,12 @@ def build_image_if_missing(channel: str, function: orchestrator.Function) -> Non
datastore = ds.Datastore(channel=channel)
container_image_tag = utils.container_image_tag_from_function(function)
with lock_resource("image-build", container_image_tag, ttl=IMAGE_BUILD_TIMEOUT, timeout=IMAGE_BUILD_TIMEOUT):
if docker.container_image_exists(container_image_tag):
try:
image_exists = docker.container_image_exists(container_image_tag)
except requests.exceptions.ReadTimeout as e:
raise BuildRetryError(f"Timeout while checking if image exists: {e}") from e

if image_exists:
logger.info("Reusing existing image", image=container_image_tag)
else:
asset_content = datastore.get_function(function)
Expand Down Expand Up @@ -136,7 +142,7 @@ def _delete_kaniko_pod(create_pod: bool, k8s_client: kubernetes.client.CoreV1Api


@timeit
def _build_container_image(path: str, tag: str) -> None:
def _build_container_image(path: str, tag: str) -> None: # noqa: C901
_assert_dockerfile_exist(path)

kubernetes.config.load_incluster_config()
Expand All @@ -160,7 +166,11 @@ def _build_container_image(path: str, tag: str) -> None:

except Exception as e:
# In case of concurrent builds, it may fail. Check if the image exists.
if docker.container_image_exists(tag):
try:
image_exists = docker.container_image_exists(tag)
except Exception:
image_exists = False
if image_exists:
logger.warning(
f"Build of container image {tag} failed, probably because it was done by a concurrent build",
exc_info=True,
Expand Down
2 changes: 0 additions & 2 deletions backend/builder/tasks/tasks_build_image.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import structlog
from django.conf import settings

import orchestrator
from backend.celery import app
Expand All @@ -10,7 +9,6 @@
from builder.tasks.task import BuildTask

logger = structlog.get_logger(__name__)
max_retries = settings.CELERY_TASK_MAX_RETRIES


@app.task(
Expand Down
21 changes: 20 additions & 1 deletion backend/builder/tests/test_task_build_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@

import celery
import pytest
from requests.exceptions import ReadTimeout

import orchestrator.mock as orc_mock
from builder.exceptions import BuildError
from builder.exceptions import BuildRetryError
from builder.exceptions import CeleryNoRetryError
from builder.tasks.tasks_build_image import build_image
from substrapp.compute_tasks.errors import CeleryNoRetryError
from substrapp.models import FailedAssetKind
from substrapp.utils.errors import store_failure

Expand Down Expand Up @@ -93,3 +94,21 @@ def side_effect(*args, **kwargs):
result_retry.get()
assert result_retry.state == celery.states.SUCCESS
assert result_other.state == "WAITING"


def test_ssl_connection_timeout(celery_app, celery_worker, mocker):
"""
Test that in case of a SSL connection timeout, the task is retried max_retries times,
then raise a CeleryNoRetryError
"""
function = orc_mock.FunctionFactory()
mocker.patch("builder.tasks.task.get_orchestrator_client")
api_mock = mocker.patch(
"substrapp.docker_registry.get_request_docker_api", side_effect=ReadTimeout("Read timed out. ")
)

with pytest.raises(CeleryNoRetryError):
build_image.apply_async(
kwargs={"function_serialized": function.model_dump_json(), "channel_name": CHANNEL}
).get()
assert api_mock.call_count == build_image.max_retries
2 changes: 0 additions & 2 deletions backend/substrapp/docker_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,6 @@ def get_request_docker_api(
if response.status_code != requests.status_codes.codes.ok:
raise ImageNotFoundError(f"Error when querying docker-registry, status code: {response.status_code}")

response.raise_for_status()

return response.json()


Expand Down
1 change: 1 addition & 0 deletions changes/917.changed
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SSL connection errors to the registry are now retried.

0 comments on commit 5fdf0a2

Please sign in to comment.