Skip to content

Commit

Permalink
feat: add timeout on save_image (#1003)
Browse files Browse the repository at this point in the history
* feat: add timeout on `save_image`

Signed-off-by: Guilhem Barthés <guilhem.barthes@owkin.com>

* feat: add chart value in env var to configure `SaveImageTask` timeout

Signed-off-by: Guilhem Barthés <guilhem.barthes@owkin.com>

* fix: int parsing for `IMAGE_SAVING_TIMEOUT_SECONDS`

Signed-off-by: Guilhem Barthés <guilhem.barthes@owkin.com>

* chore: update doc

Signed-off-by: Guilhem Barthés <guilhem.barthes@owkin.com>

* doc: change fragment

Signed-off-by: Guilhem Barthés <guilhem.barthes@owkin.com>

---------

Signed-off-by: Guilhem Barthés <guilhem.barthes@owkin.com>
  • Loading branch information
guilhem-barthes authored Oct 1, 2024
1 parent 07d9311 commit 8b45c58
Show file tree
Hide file tree
Showing 9 changed files with 76 additions and 53 deletions.
1 change: 1 addition & 0 deletions backend/backend/settings/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@
"COMPUTE_POD_STARTUP_TIMEOUT_SECONDS": int(os.environ.get("COMPUTE_POD_STARTUP_TIMEOUT_SECONDS", 300)),
"PRIVATE_CA_ENABLED": to_bool(os.environ.get("PRIVATE_CA_ENABLED")),
}
IMAGE_SAVING_TIMEOUT_SECONDS = int(os.getenv("IMAGE_SAVING_TIMEOUT_SECONDS", 60))

WORKER_PVC_IS_HOSTPATH = to_bool(os.environ.get("WORKER_PVC_IS_HOSTPATH"))
WORKER_PVC_DOCKER_CACHE = os.environ.get("WORKER_PVC_DOCKER_CACHE")
Expand Down
112 changes: 60 additions & 52 deletions backend/substrapp/tasks/tasks_save_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import Any

import structlog
from celery.exceptions import SoftTimeLimitExceeded
from django.conf import settings
from django.core.files import File
from django.urls import reverse
Expand All @@ -18,6 +19,7 @@
from orchestrator import get_orchestrator_client
from substrapp.compute_tasks import utils
from substrapp.compute_tasks.errors import CeleryNoRetryError
from substrapp.compute_tasks.errors import CeleryRetryError
from substrapp.docker_registry import USER_IMAGE_REPOSITORY
from substrapp.docker_registry import RegistryPreconditionFailedException
from substrapp.models import FailedAssetKind
Expand All @@ -28,6 +30,7 @@
REGISTRY = settings.REGISTRY
REGISTRY_SCHEME = settings.REGISTRY_SCHEME
SUBTUPLE_TMP_DIR = settings.SUBTUPLE_TMP_DIR
IMAGE_SAVING_TIMEOUT_SECONDS = settings.IMAGE_SAVING_TIMEOUT_SECONDS

logger = structlog.get_logger("worker")

Expand All @@ -38,6 +41,7 @@ class SaveImageTask(FailableTask):
retry_backoff = settings.CELERY_TASK_RETRY_BACKOFF
retry_backoff_max = settings.CELERY_TASK_RETRY_BACKOFF_MAX
retry_jitter = settings.CELERY_TASK_RETRY_JITTER
soft_time_limit = IMAGE_SAVING_TIMEOUT_SECONDS
acks_late = True
reject_on_worker_lost = True
ignore_result = False
Expand Down Expand Up @@ -78,59 +82,63 @@ def on_success(self, retval: tuple[dict, str], task_id: str, args: tuple, kwargs
def save_image(function_serialized: str, channel_name: str) -> dict:
logger.info("Starting save_image")
logger.info(f"Parameters: function_serialized {function_serialized}, " f"channel_name {channel_name}")

# create serialized image
function = orchestrator.Function.model_validate_json(function_serialized)
container_image_tag = utils.container_image_tag_from_function(function)

os.makedirs(SUBTUPLE_TMP_DIR, exist_ok=True)

logger.info("Serialising the image from the registry")

with TemporaryDirectory(dir=SUBTUPLE_TMP_DIR) as tmp_dir:
storage_path = pathlib.Path(tmp_dir) / f"{container_image_tag}.zip"
try:
image_transfer.make_payload(
zip_file=storage_path,
docker_images_to_transfer=[f"{USER_IMAGE_REPOSITORY}:{container_image_tag}"],
registry=REGISTRY,
secure=REGISTRY_SCHEME == "https",
try:
# create serialized image
function = orchestrator.Function.model_validate_json(function_serialized)
container_image_tag = utils.container_image_tag_from_function(function)

os.makedirs(SUBTUPLE_TMP_DIR, exist_ok=True)

logger.info("Serialising the image from the registry")

with TemporaryDirectory(dir=SUBTUPLE_TMP_DIR) as tmp_dir:
storage_path = pathlib.Path(tmp_dir) / f"{container_image_tag}.zip"
try:
image_transfer.make_payload(
zip_file=storage_path,
docker_images_to_transfer=[f"{USER_IMAGE_REPOSITORY}:{container_image_tag}"],
registry=REGISTRY,
secure=REGISTRY_SCHEME == "https",
)
except RegistryPreconditionFailedException as e:
raise BuildRetryError(
f"The image associated with the function {function.key} was built successfully "
f"but did not pass the security checks; "
"please contact an Harbor administrator to ensure that the image was scanned, "
"and get more information about the CVE."
) from e

logger.info("Start saving the serialized image")
# save it
image = FunctionImage.objects.create(
function_id=function.key, file=File(file=storage_path.open(mode="rb"), name="image.zip")
)
# update APIFunction image-related fields
api_function = ApiFunction.objects.get(key=function.key)
# TODO get full url cf https://github.com/Substra/substra-backend/backend/api/serializers/function.py#L66
api_function.image_address = settings.DEFAULT_DOMAIN + reverse(
"api:function_permissions-image", args=[function.key]
)
except RegistryPreconditionFailedException as e:
raise BuildRetryError(
f"The image associated with the function {function.key} was built successfully "
f"but did not pass the security checks; "
"please contact an Harbor administrator to ensure that the image was scanned, "
"and get more information about the CVE."
) from e

logger.info("Start saving the serialized image")
# save it
image = FunctionImage.objects.create(
function_id=function.key, file=File(file=storage_path.open(mode="rb"), name="image.zip")
)
# update APIFunction image-related fields
api_function = ApiFunction.objects.get(key=function.key)
# TODO get full url cf https://github.com/Substra/substra-backend/backend/api/serializers/function.py#L66
api_function.image_address = settings.DEFAULT_DOMAIN + reverse(
"api:function_permissions-image", args=[function.key]
)
api_function.image_checksum = image.checksum
api_function.save()

logger.info("Serialized image saved")

orc_update_function_param = {
"key": str(api_function.key),
# TODO find a way to propagate the name or make it optional at update
"name": api_function.name,
"image": {
"checksum": api_function.image_checksum,
"storage_address": api_function.image_address,
},
}

return orc_update_function_param
api_function.image_checksum = image.checksum
api_function.save()

logger.info("Serialized image saved")

orc_update_function_param = {
"key": str(api_function.key),
# TODO find a way to propagate the name or make it optional at update
"name": api_function.name,
"image": {
"checksum": api_function.image_checksum,
"storage_address": api_function.image_address,
},
}

return orc_update_function_param
except SoftTimeLimitExceeded as e:
raise CeleryRetryError from e
except Exception:
raise


@app.task(
Expand Down
1 change: 1 addition & 0 deletions changes/1003.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Celery soft timeout on `SaveImageTask` and env variable (`IMAGE_SAVING_TIMEOUT_SECONDS`) to customize the timeout
6 changes: 6 additions & 0 deletions charts/substra-backend/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
# Changelog

<!-- towncrier release notes start -->
## [26.13.0] - 2024-09-30

# Added

`.Values.builder.saveImageTimeoutSeconds` configuring the timeout for the Celery task `SaveImageTask`.

## [26.12.0] - 2024-09-30

# Changed
Expand Down
2 changes: 1 addition & 1 deletion charts/substra-backend/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
apiVersion: v2
name: substra-backend
home: https://github.com/Substra
version: 26.12.0
version: 26.13.0
appVersion: 0.48.0
kubeVersion: '>= 1.19.0-0'
description: Main package for Substra
Expand Down
1 change: 1 addition & 0 deletions charts/substra-backend/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub
| `builder.nodeSelector` | Node labels for pod assignment | `{}` |
| `builder.tolerations` | Toleration labels for pod assignment | `[]` |
| `builder.affinity` | Affinity settings for pod assignment, ignored if `DataSampleStorageInServerMedia` is `true` | `{}` |
| `builder.saveImageTimeoutSeconds` | Amount of seconds before restarting the image save as a blob | `60` |
| `builder.persistence.storageClass` | Specify the _StorageClass_ used to provision the volume. Or the default _StorageClass_ will be used. Set it to `-` to disable dynamic provisioning | `""` |
| `builder.persistence.size` | The size of the volume. | `10Gi` |
| `builder.rbac.create` | Create a role and service account for the builder | `true` |
Expand Down
2 changes: 2 additions & 0 deletions charts/substra-backend/templates/statefulset-worker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ spec:
value: {{ toYaml .Values.worker.computePod.resources | quote }}
- name: COMPUTE_POD_MAX_STARTUP_WAIT_SECONDS
value: {{ .Values.worker.computePod.maxStartupWaitSeconds | quote }}
- name: IMAGE_SAVING_TIMEOUT_SECONDS
value: {{ .Values.builder.saveImageTimeoutSeconds | quote }}
- name: OBJECTSTORE_URL
value: {{ include "substra-backend.objectStore.url" . | quote }}
- name: ENABLE_DATASAMPLE_STORAGE_IN_SERVERMEDIAS
Expand Down
3 changes: 3 additions & 0 deletions charts/substra-backend/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,9 @@ builder:
## @param builder.affinity Affinity settings for pod assignment, ignored if `DataSampleStorageInServerMedia` is `true`
##
affinity: { }
## @param builder.saveImageTimeoutSeconds Amount of seconds before restarting the image save as a blob
#
saveImageTimeoutSeconds: 60


persistence:
Expand Down
1 change: 1 addition & 0 deletions docs/settings.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ Accepted true values for `bool` are: `1`, `ON`, `On`, `on`, `T`, `t`, `TRUE`, `T
| string | `HOSTNAME` | nil | |
| string | `HOST_IP` | nil | |
| int | `HTTP_CLIENT_TIMEOUT_SECONDS` | `30` | |
| int | `IMAGE_SAVING_TIMEOUT_SECONDS` | `60` | |
| bool | `ISOLATED` | nil | |
| string | `K8S_SECRET_NAMESPACE` | `default` | |
| string | `KANIKO_DOCKER_CONFIG_SECRET_NAME` | nil | |
Expand Down

0 comments on commit 8b45c58

Please sign in to comment.