diff --git a/backend/backend/settings/deps/image_build.py b/backend/backend/settings/deps/image_build.py index fb348c930..2f92195e9 100644 --- a/backend/backend/settings/deps/image_build.py +++ b/backend/backend/settings/deps/image_build.py @@ -1,7 +1,7 @@ import os # How long we wait before throwing errors, in seconds -IMAGE_BUILD_TIMEOUT = 3 * 60 * 60 # 3 hours +IMAGE_BUILD_TIMEOUT = int(os.getenv("IMAGE_BUILD_TIMEOUT", 60 * 60)) # 1 hour # Delay before two check IMAGE_BUILD_CHECK_DELAY = 5 BUILDER_KANIKO_STARTUP_MAX_ATTEMPTS = int(os.getenv("BUILDER_KANIKO_STARTUP_MAX_ATTEMPTS", 60)) diff --git a/backend/substrapp/compute_tasks/image_builder.py b/backend/substrapp/compute_tasks/image_builder.py index fc4d50429..318cb925e 100644 --- a/backend/substrapp/compute_tasks/image_builder.py +++ b/backend/substrapp/compute_tasks/image_builder.py @@ -12,7 +12,6 @@ logger = structlog.get_logger(__name__) -IMAGE_BUILD_TIMEOUT = settings.IMAGE_BUILD_TIMEOUT IMAGE_BUILD_CHECK_DELAY = settings.IMAGE_BUILD_CHECK_DELAY REGISTRY = settings.REGISTRY REGISTRY_SCHEME = settings.REGISTRY_SCHEME diff --git a/backend/substrapp/exceptions.py b/backend/substrapp/exceptions.py index b3b64c1c8..416849a41 100644 --- a/backend/substrapp/exceptions.py +++ b/backend/substrapp/exceptions.py @@ -32,3 +32,13 @@ class IntegrityError(Exception): class ServerMediasNoSubdirError(Exception): """A supplied servermedias path didn't contain the expected subdir""" + + +class LockError(Exception): + """Cannot get a lock file to write on""" + + lock_file: str + + def __init__(self, *args, lock_file: str, **kwargs) -> None: + self.lock_file = lock_file + super().__init__(args, kwargs) diff --git a/backend/substrapp/lock_local.py b/backend/substrapp/lock_local.py index c7683f524..ea35f41b7 100644 --- a/backend/substrapp/lock_local.py +++ b/backend/substrapp/lock_local.py @@ -7,6 +7,8 @@ import structlog +from substrapp.exceptions import LockError + logger = structlog.get_logger(__name__) LOCK_FILE_FOLDER = "/tmp" # nosec @@ -47,7 +49,7 @@ def lock_resource( did_wait = True logger.debug("Lock: Waiting for lock to be released", lock_file=lock_file) if time.time() - start > timeout: - raise Exception(f"Failed to acquire lock after {timeout} seconds", lock_file=lock_file) + raise LockError(f"Failed to acquire lock after {timeout} seconds", lock_file=lock_file) time.sleep(delay) logger.debug("Lock: Acquired", lock_file=lock_file) diff --git a/backend/substrapp/tests/test_lock_local.py b/backend/substrapp/tests/test_lock_local.py new file mode 100644 index 000000000..22d019de3 --- /dev/null +++ b/backend/substrapp/tests/test_lock_local.py @@ -0,0 +1,20 @@ +from pathlib import Path + +import pytest + +from substrapp import lock_local +from substrapp.exceptions import LockError + + +# Test that `lock_local.lock_resource() returns the correct excception +def test_lock_local_timeout(): + resource_type = "test" + unique_identifier = "1" + # Create fake lock + slug = f"{resource_type}_{unique_identifier}" + lock_path = Path(lock_local._get_lock_file_path(slug)) + lock_path.write_text("unique_id") + + with pytest.raises(LockError): + with lock_local.lock_resource(resource_type, unique_identifier, timeout=1): + pass diff --git a/changes/992.added b/changes/992.added new file mode 100644 index 000000000..f72589d6e --- /dev/null +++ b/changes/992.added @@ -0,0 +1 @@ +Add configurable builder timeout through the env var `IMAGE_BUILD_TIMEOUT` diff --git a/charts/substra-backend/CHANGELOG.md b/charts/substra-backend/CHANGELOG.md index e7e97888b..ac595b37a 100644 --- a/charts/substra-backend/CHANGELOG.md +++ b/charts/substra-backend/CHANGELOG.md @@ -1,6 +1,12 @@ # Changelog +## [26.11.0] - 2024-09-17 + +# Added + +New configurable value `.Values.builder.timeout` configuring the duration before a building task would fail. + ## [26.10.2] - 2024-09-17 # Fixed diff --git a/charts/substra-backend/Chart.yaml b/charts/substra-backend/Chart.yaml index 3b3b75a7b..e34178caf 100644 --- a/charts/substra-backend/Chart.yaml +++ b/charts/substra-backend/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 name: substra-backend home: https://github.com/Substra -version: 26.10.2 +version: 26.11.0 appVersion: 0.48.0 kubeVersion: '>= 1.19.0-0' description: Main package for Substra diff --git a/charts/substra-backend/README.md b/charts/substra-backend/README.md index 5ff5d83cb..6cbb86751 100644 --- a/charts/substra-backend/README.md +++ b/charts/substra-backend/README.md @@ -236,6 +236,7 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `builder.enabled` | Enable builder service | `true` | | `builder.replicaCount` | Replica count for the builder service | `1` | | `builder.concurrency` | Maximum amount of tasks to process in parallel | `1` | +| `builder.timeout` | Number of seconds to wait before failing a build | `3600` | | `builder.kanikoStartup.maxAttempts` | Number of checks done before considering a kaniko pod has failed to spawn | `60` | | `builder.kanikoStartup.checkDelay` | Time, in seconds, to wait when a container is in pending state before checking again its status | `2` | | `builder.image.registry` | Substra backend server image registry | `ghcr.io` | diff --git a/charts/substra-backend/templates/statefulset-builder.yaml b/charts/substra-backend/templates/statefulset-builder.yaml index f3d12cc17..3f81b6d3d 100644 --- a/charts/substra-backend/templates/statefulset-builder.yaml +++ b/charts/substra-backend/templates/statefulset-builder.yaml @@ -199,6 +199,8 @@ spec: value: {{ toYaml .Values.builder.kanikoStartup.maxAttempts | quote }} - name: BUILDER_KANIKO_STARTUP_PENDING_STATE_WAIT_SECONDS value: {{ toYaml .Values.builder.kanikoStartup.checkDelay | quote }} + - name: IMAGE_BUILD_TIMEOUT + value: {{ toYaml .Values.builder.timeout | quote }} ports: - name: http containerPort: 8000 diff --git a/charts/substra-backend/values.yaml b/charts/substra-backend/values.yaml index 5ea060618..32ba8b9db 100644 --- a/charts/substra-backend/values.yaml +++ b/charts/substra-backend/values.yaml @@ -583,6 +583,10 @@ builder: ## concurrency: 1 + ## @param builder.timeout Number of seconds to wait before failing a build + ## + timeout: 3600 + ## @param builder.kanikoStartup.maxAttempts Number of checks done before considering a kaniko pod has failed to spawn ## @param builder.kanikoStartup.checkDelay Time, in seconds, to wait when a container is in pending state before checking again its status ##