diff --git a/backend/backend/settings/common.py b/backend/backend/settings/common.py index 2686cc7a4..d10539a95 100644 --- a/backend/backend/settings/common.py +++ b/backend/backend/settings/common.py @@ -205,6 +205,7 @@ "COMPUTE_POD_STARTUP_TIMEOUT_SECONDS": int(os.environ.get("COMPUTE_POD_STARTUP_TIMEOUT_SECONDS", 300)), "PRIVATE_CA_ENABLED": to_bool(os.environ.get("PRIVATE_CA_ENABLED")), } +IMAGE_SAVING_TIMEOUT_SECONDS = int(os.getenv("IMAGE_SAVING_TIMEOUT_SECONDS", 60)) WORKER_PVC_IS_HOSTPATH = to_bool(os.environ.get("WORKER_PVC_IS_HOSTPATH")) WORKER_PVC_DOCKER_CACHE = os.environ.get("WORKER_PVC_DOCKER_CACHE") diff --git a/backend/substrapp/tasks/tasks_save_image.py b/backend/substrapp/tasks/tasks_save_image.py index fc5fd8adb..c5cda3ad6 100644 --- a/backend/substrapp/tasks/tasks_save_image.py +++ b/backend/substrapp/tasks/tasks_save_image.py @@ -6,6 +6,7 @@ from typing import Any import structlog +from celery.exceptions import SoftTimeLimitExceeded from django.conf import settings from django.core.files import File from django.urls import reverse @@ -18,6 +19,7 @@ from orchestrator import get_orchestrator_client from substrapp.compute_tasks import utils from substrapp.compute_tasks.errors import CeleryNoRetryError +from substrapp.compute_tasks.errors import CeleryRetryError from substrapp.docker_registry import USER_IMAGE_REPOSITORY from substrapp.docker_registry import RegistryPreconditionFailedException from substrapp.models import FailedAssetKind @@ -28,6 +30,7 @@ REGISTRY = settings.REGISTRY REGISTRY_SCHEME = settings.REGISTRY_SCHEME SUBTUPLE_TMP_DIR = settings.SUBTUPLE_TMP_DIR +IMAGE_SAVING_TIMEOUT_SECONDS = settings.IMAGE_SAVING_TIMEOUT_SECONDS logger = structlog.get_logger("worker") @@ -38,6 +41,7 @@ class SaveImageTask(FailableTask): retry_backoff = settings.CELERY_TASK_RETRY_BACKOFF retry_backoff_max = settings.CELERY_TASK_RETRY_BACKOFF_MAX retry_jitter = settings.CELERY_TASK_RETRY_JITTER + soft_time_limit = IMAGE_SAVING_TIMEOUT_SECONDS acks_late = True reject_on_worker_lost = True ignore_result = False @@ -78,59 +82,63 @@ def on_success(self, retval: tuple[dict, str], task_id: str, args: tuple, kwargs def save_image(function_serialized: str, channel_name: str) -> dict: logger.info("Starting save_image") logger.info(f"Parameters: function_serialized {function_serialized}, " f"channel_name {channel_name}") - - # create serialized image - function = orchestrator.Function.model_validate_json(function_serialized) - container_image_tag = utils.container_image_tag_from_function(function) - - os.makedirs(SUBTUPLE_TMP_DIR, exist_ok=True) - - logger.info("Serialising the image from the registry") - - with TemporaryDirectory(dir=SUBTUPLE_TMP_DIR) as tmp_dir: - storage_path = pathlib.Path(tmp_dir) / f"{container_image_tag}.zip" - try: - image_transfer.make_payload( - zip_file=storage_path, - docker_images_to_transfer=[f"{USER_IMAGE_REPOSITORY}:{container_image_tag}"], - registry=REGISTRY, - secure=REGISTRY_SCHEME == "https", + try: + # create serialized image + function = orchestrator.Function.model_validate_json(function_serialized) + container_image_tag = utils.container_image_tag_from_function(function) + + os.makedirs(SUBTUPLE_TMP_DIR, exist_ok=True) + + logger.info("Serialising the image from the registry") + + with TemporaryDirectory(dir=SUBTUPLE_TMP_DIR) as tmp_dir: + storage_path = pathlib.Path(tmp_dir) / f"{container_image_tag}.zip" + try: + image_transfer.make_payload( + zip_file=storage_path, + docker_images_to_transfer=[f"{USER_IMAGE_REPOSITORY}:{container_image_tag}"], + registry=REGISTRY, + secure=REGISTRY_SCHEME == "https", + ) + except RegistryPreconditionFailedException as e: + raise BuildRetryError( + f"The image associated with the function {function.key} was built successfully " + f"but did not pass the security checks; " + "please contact an Harbor administrator to ensure that the image was scanned, " + "and get more information about the CVE." + ) from e + + logger.info("Start saving the serialized image") + # save it + image = FunctionImage.objects.create( + function_id=function.key, file=File(file=storage_path.open(mode="rb"), name="image.zip") + ) + # update APIFunction image-related fields + api_function = ApiFunction.objects.get(key=function.key) + # TODO get full url cf https://github.com/Substra/substra-backend/backend/api/serializers/function.py#L66 + api_function.image_address = settings.DEFAULT_DOMAIN + reverse( + "api:function_permissions-image", args=[function.key] ) - except RegistryPreconditionFailedException as e: - raise BuildRetryError( - f"The image associated with the function {function.key} was built successfully " - f"but did not pass the security checks; " - "please contact an Harbor administrator to ensure that the image was scanned, " - "and get more information about the CVE." - ) from e - - logger.info("Start saving the serialized image") - # save it - image = FunctionImage.objects.create( - function_id=function.key, file=File(file=storage_path.open(mode="rb"), name="image.zip") - ) - # update APIFunction image-related fields - api_function = ApiFunction.objects.get(key=function.key) - # TODO get full url cf https://github.com/Substra/substra-backend/backend/api/serializers/function.py#L66 - api_function.image_address = settings.DEFAULT_DOMAIN + reverse( - "api:function_permissions-image", args=[function.key] - ) - api_function.image_checksum = image.checksum - api_function.save() - - logger.info("Serialized image saved") - - orc_update_function_param = { - "key": str(api_function.key), - # TODO find a way to propagate the name or make it optional at update - "name": api_function.name, - "image": { - "checksum": api_function.image_checksum, - "storage_address": api_function.image_address, - }, - } - - return orc_update_function_param + api_function.image_checksum = image.checksum + api_function.save() + + logger.info("Serialized image saved") + + orc_update_function_param = { + "key": str(api_function.key), + # TODO find a way to propagate the name or make it optional at update + "name": api_function.name, + "image": { + "checksum": api_function.image_checksum, + "storage_address": api_function.image_address, + }, + } + + return orc_update_function_param + except SoftTimeLimitExceeded as e: + raise CeleryRetryError from e + except Exception: + raise @app.task( diff --git a/changes/1003.added b/changes/1003.added new file mode 100644 index 000000000..452a795ca --- /dev/null +++ b/changes/1003.added @@ -0,0 +1 @@ +Celery soft timeout on `SaveImageTask` and env variable (`IMAGE_SAVING_TIMEOUT_SECONDS`) to customize the timeout \ No newline at end of file diff --git a/charts/substra-backend/CHANGELOG.md b/charts/substra-backend/CHANGELOG.md index f8d44e9b5..3f56da53a 100644 --- a/charts/substra-backend/CHANGELOG.md +++ b/charts/substra-backend/CHANGELOG.md @@ -1,6 +1,12 @@ # Changelog +## [26.13.0] - 2024-09-30 + +# Added + +`.Values.builder.saveImageTimeoutSeconds` configuring the timeout for the Celery task `SaveImageTask`. + ## [26.12.0] - 2024-09-30 # Changed diff --git a/charts/substra-backend/Chart.yaml b/charts/substra-backend/Chart.yaml index 46d06f1b7..7bd60b33b 100644 --- a/charts/substra-backend/Chart.yaml +++ b/charts/substra-backend/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 name: substra-backend home: https://github.com/Substra -version: 26.12.0 +version: 26.13.0 appVersion: 0.48.0 kubeVersion: '>= 1.19.0-0' description: Main package for Substra diff --git a/charts/substra-backend/README.md b/charts/substra-backend/README.md index 6cbb86751..0a4377b2c 100644 --- a/charts/substra-backend/README.md +++ b/charts/substra-backend/README.md @@ -255,6 +255,7 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `builder.nodeSelector` | Node labels for pod assignment | `{}` | | `builder.tolerations` | Toleration labels for pod assignment | `[]` | | `builder.affinity` | Affinity settings for pod assignment, ignored if `DataSampleStorageInServerMedia` is `true` | `{}` | +| `builder.saveImageTimeoutSeconds` | Amount of seconds before restarting the image save as a blob | `60` | | `builder.persistence.storageClass` | Specify the _StorageClass_ used to provision the volume. Or the default _StorageClass_ will be used. Set it to `-` to disable dynamic provisioning | `""` | | `builder.persistence.size` | The size of the volume. | `10Gi` | | `builder.rbac.create` | Create a role and service account for the builder | `true` | diff --git a/charts/substra-backend/templates/statefulset-worker.yaml b/charts/substra-backend/templates/statefulset-worker.yaml index 33b9ebc77..7fdcab1b6 100644 --- a/charts/substra-backend/templates/statefulset-worker.yaml +++ b/charts/substra-backend/templates/statefulset-worker.yaml @@ -145,6 +145,8 @@ spec: value: {{ toYaml .Values.worker.computePod.resources | quote }} - name: COMPUTE_POD_MAX_STARTUP_WAIT_SECONDS value: {{ .Values.worker.computePod.maxStartupWaitSeconds | quote }} + - name: IMAGE_SAVING_TIMEOUT_SECONDS + value: {{ .Values.builder.saveImageTimeoutSeconds | quote }} - name: OBJECTSTORE_URL value: {{ include "substra-backend.objectStore.url" . | quote }} - name: ENABLE_DATASAMPLE_STORAGE_IN_SERVERMEDIAS diff --git a/charts/substra-backend/values.yaml b/charts/substra-backend/values.yaml index 32ba8b9db..9a675e53f 100644 --- a/charts/substra-backend/values.yaml +++ b/charts/substra-backend/values.yaml @@ -644,6 +644,9 @@ builder: ## @param builder.affinity Affinity settings for pod assignment, ignored if `DataSampleStorageInServerMedia` is `true` ## affinity: { } + ## @param builder.saveImageTimeoutSeconds Amount of seconds before restarting the image save as a blob + # + saveImageTimeoutSeconds: 60 persistence: diff --git a/docs/settings.md b/docs/settings.md index 0beb8b5e9..d05f9cc90 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -39,6 +39,7 @@ Accepted true values for `bool` are: `1`, `ON`, `On`, `on`, `T`, `t`, `TRUE`, `T | string | `HOSTNAME` | nil | | | string | `HOST_IP` | nil | | | int | `HTTP_CLIENT_TIMEOUT_SECONDS` | `30` | | +| int | `IMAGE_SAVING_TIMEOUT_SECONDS` | `60` | | | bool | `ISOLATED` | nil | | | string | `K8S_SECRET_NAMESPACE` | `default` | | | string | `KANIKO_DOCKER_CONFIG_SECRET_NAME` | nil | |