Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: retry on timeout connections to external registry #917

Merged
merged 1 commit into from
Jun 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions backend/builder/image_builder/image_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import Union

import kubernetes
import requests.exceptions
import structlog
from django.conf import settings

Expand Down Expand Up @@ -67,7 +68,12 @@ def build_image_if_missing(channel: str, function: orchestrator.Function) -> Non
datastore = ds.Datastore(channel=channel)
container_image_tag = utils.container_image_tag_from_function(function)
with lock_resource("image-build", container_image_tag, ttl=IMAGE_BUILD_TIMEOUT, timeout=IMAGE_BUILD_TIMEOUT):
if docker.container_image_exists(container_image_tag):
try:
image_exists = docker.container_image_exists(container_image_tag)
except requests.exceptions.ReadTimeout as e:
raise BuildRetryError(f"Timeout while checking if image exists: {e}") from e

if image_exists:
logger.info("Reusing existing image", image=container_image_tag)
else:
asset_content = datastore.get_function(function)
Expand Down Expand Up @@ -136,7 +142,7 @@ def _delete_kaniko_pod(create_pod: bool, k8s_client: kubernetes.client.CoreV1Api


@timeit
def _build_container_image(path: str, tag: str) -> None:
def _build_container_image(path: str, tag: str) -> None: # noqa: C901
_assert_dockerfile_exist(path)

kubernetes.config.load_incluster_config()
Expand All @@ -160,7 +166,11 @@ def _build_container_image(path: str, tag: str) -> None:

except Exception as e:
# In case of concurrent builds, it may fail. Check if the image exists.
if docker.container_image_exists(tag):
try:
image_exists = docker.container_image_exists(tag)
except Exception:
image_exists = False
if image_exists:
logger.warning(
f"Build of container image {tag} failed, probably because it was done by a concurrent build",
exc_info=True,
Comment on lines +169 to 176
Copy link
Member

@ThibaultFy ThibaultFy May 31, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
try:
image_exists = docker.container_image_exists(tag)
except Exception:
image_exists = False
if image_exists:
logger.warning(
f"Build of container image {tag} failed, probably because it was done by a concurrent build",
exc_info=True,
try:
if docker.container_image_exists(tag):
logger.warning(
f"Build of container image {tag} failed, probably because it was done by a concurrent build",
exc_info=True,
)
return
except Exception:
pass

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find it more explicit that way. Feel free to ignore

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like to wrap the try except on the minimal number of lines, to mark explicitly what could potentially raise, and to avoid to mask unexpected Exceptions (even more when using a blanket except Exception).

I'm curious though, what do you find more explicit in your suggestion?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see your point. What I found more explicit is to not set image_exists = False cause it's never used afterwards, and to explicitely set the except Exception: pass to clearly indicate that we bypass all exceptions.

Expand Down
2 changes: 0 additions & 2 deletions backend/builder/tasks/tasks_build_image.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import structlog
from django.conf import settings

import orchestrator
from backend.celery import app
Expand All @@ -10,7 +9,6 @@
from builder.tasks.task import BuildTask

logger = structlog.get_logger(__name__)
max_retries = settings.CELERY_TASK_MAX_RETRIES


@app.task(
Expand Down
21 changes: 20 additions & 1 deletion backend/builder/tests/test_task_build_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@

import celery
import pytest
from requests.exceptions import ReadTimeout

import orchestrator.mock as orc_mock
from builder.exceptions import BuildError
from builder.exceptions import BuildRetryError
from builder.exceptions import CeleryNoRetryError
from builder.tasks.tasks_build_image import build_image
from substrapp.compute_tasks.errors import CeleryNoRetryError
from substrapp.models import FailedAssetKind
from substrapp.utils.errors import store_failure

Expand Down Expand Up @@ -93,3 +94,21 @@ def side_effect(*args, **kwargs):
result_retry.get()
assert result_retry.state == celery.states.SUCCESS
assert result_other.state == "WAITING"


def test_ssl_connection_timeout(celery_app, celery_worker, mocker):
"""
Test that in case of a SSL connection timeout, the task is retried max_retries times,
then raise a CeleryNoRetryError
"""
function = orc_mock.FunctionFactory()
mocker.patch("builder.tasks.task.get_orchestrator_client")
api_mock = mocker.patch(
"substrapp.docker_registry.get_request_docker_api", side_effect=ReadTimeout("Read timed out. ")
)

with pytest.raises(CeleryNoRetryError):
build_image.apply_async(
kwargs={"function_serialized": function.model_dump_json(), "channel_name": CHANNEL}
).get()
assert api_mock.call_count == build_image.max_retries
2 changes: 0 additions & 2 deletions backend/substrapp/docker_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,6 @@ def get_request_docker_api(
if response.status_code != requests.status_codes.codes.ok:
raise ImageNotFoundError(f"Error when querying docker-registry, status code: {response.status_code}")

response.raise_for_status()

return response.json()


Expand Down
1 change: 1 addition & 0 deletions changes/917.changed
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SSL connection errors to the registry are now retried.
Loading