From 8b9342a21a566f8288313e904958795eb3ab82a7 Mon Sep 17 00:00:00 2001
From: ThibaultFy
Date: Thu, 25 Apr 2024 15:04:29 +0200
Subject: [PATCH] chore: all resources in values

Signed-off-by: ThibaultFy
---
 backend/backend/settings/common.py | 2 +
 .../builder/image_builder/image_builder.py | 5 ++-
 .../substrapp/compute_tasks/compute_pod.py | 5 ++-
 backend/substrapp/kubernetes_utils.py | 9 ++--
 charts/substra-backend/README.md | 41 ++++++++++---------
 .../deployment-registry-prepopulate.yaml | 2 +-
 .../templates/statefulset-builder.yaml | 2 +
 .../templates/statefulset-worker.yaml | 2 +
 charts/substra-backend/values.yaml | 32 +++++++++------
 9 files changed, 61 insertions(+), 39 deletions(-)

diff --git a/backend/backend/settings/common.py b/backend/backend/settings/common.py
index 23ece165a..5cae475e4 100644
--- a/backend/backend/settings/common.py
+++ b/backend/backend/settings/common.py
@@ -197,6 +197,7 @@
     "KANIKO_MIRROR": to_bool(os.environ.get("KANIKO_MIRROR", False)),
     "KANIKO_IMAGE": os.environ.get("KANIKO_IMAGE"),
     "KANIKO_DOCKER_CONFIG_SECRET_NAME": os.environ.get("KANIKO_DOCKER_CONFIG_SECRET_NAME"),
+    "KANIKO_RESOURCES": os.environ.get("KANIKO_RESOURCES"),
     "COMPUTE_POD_STARTUP_TIMEOUT_SECONDS": int(os.environ.get("COMPUTE_POD_STARTUP_TIMEOUT_SECONDS", 300)),
     "PRIVATE_CA_ENABLED": to_bool(os.environ.get("PRIVATE_CA_ENABLED")),
 }
@@ -223,6 +224,7 @@
 COMPUTE_POD_RUN_AS_GROUP = os.environ.get("COMPUTE_POD_RUN_AS_GROUP")
 COMPUTE_POD_FS_GROUP = os.environ.get("COMPUTE_POD_FS_GROUP")
 COMPUTE_POD_GKE_GPUS_LIMITS = int(os.environ.get("COMPUTE_POD_GKE_GPUS_LIMITS", 0))
+COMPUTE_POD_RESOURCES = os.environ.get("COMPUTE_POD_RESOURCES")
 
 # Prometheus configuration
 ENABLE_METRICS = to_bool(os.environ.get("ENABLE_METRICS", False))
diff --git a/backend/builder/image_builder/image_builder.py b/backend/builder/image_builder/image_builder.py
index ed25b48ed..020ff0799 100644
--- a/backend/builder/image_builder/image_builder.py
+++ b/backend/builder/image_builder/image_builder.py
@@ -23,7 +23,7 @@
 from substrapp.compute_tasks.volumes import get_worker_subtuple_pvc_name
 from substrapp.docker_registry import USER_IMAGE_REPOSITORY
 from substrapp.kubernetes_utils import delete_pod
-from substrapp.kubernetes_utils import get_resources_requirements
+from substrapp.kubernetes_utils import get_resources_requirements_from_yaml
 from substrapp.kubernetes_utils import get_security_context
 from substrapp.lock_local import lock_resource
 from substrapp.utils import timeit
@@ -43,6 +43,7 @@
 IMAGE_BUILD_TIMEOUT = settings.IMAGE_BUILD_TIMEOUT
 KANIKO_CONTAINER_NAME = "kaniko"
 HOSTNAME = settings.HOSTNAME
+KANIKO_RESOURCES = settings.KANIKO_RESOURCES
 
 
 def container_image_tag_from_function(function: orchestrator.Function) -> str:
@@ -307,7 +308,7 @@ def _build_container(dockerfile_mount_path: str, image_tag: str) -> kubernetes.c
         args=args,
         volume_mounts=volume_mounts,
         security_context=container_security_context,
-        resources=get_resources_requirements(cpu_request="1000m", memory_request="4Gi", memory_limit="32Gi"),
+        resources=get_resources_requirements_from_yaml(yaml_resources=KANIKO_RESOURCES),
     )
 
 
diff --git a/backend/substrapp/compute_tasks/compute_pod.py b/backend/substrapp/compute_tasks/compute_pod.py
index 5e929104a..2f46df4b0 100644
--- a/backend/substrapp/compute_tasks/compute_pod.py
+++ b/backend/substrapp/compute_tasks/compute_pod.py
@@ -6,10 +6,11 @@
 from substrapp.kubernetes_utils import delete_pod
 from substrapp.kubernetes_utils import get_pod_security_context
-from substrapp.kubernetes_utils import get_resources_requirements
+from substrapp.kubernetes_utils import get_resources_requirements_from_yaml
 from substrapp.kubernetes_utils import get_security_context
 
 NAMESPACE = settings.NAMESPACE
+COMPUTE_POD_RESOURCES = settings.COMPUTE_POD_RESOURCES
 
 logger = structlog.get_logger(__name__)
 
 
@@ -113,7 +114,7 @@ def create_pod(
         args=None,
         volume_mounts=volume_mounts + gpu_volume_mounts,
         security_context=get_security_context(),
-        resources=get_resources_requirements(cpu_request="1000m", memory_request="1Gi", memory_limit="64Gi"),
+        resources=get_resources_requirements_from_yaml(yaml_resources=COMPUTE_POD_RESOURCES),
         env=[kubernetes.client.V1EnvVar(name=env_name, value=env_value) for env_name, env_value in environment.items()],
         **container_optional_kwargs,
     )
diff --git a/backend/substrapp/kubernetes_utils.py b/backend/substrapp/kubernetes_utils.py
index 1486e198a..663542fcc 100644
--- a/backend/substrapp/kubernetes_utils.py
+++ b/backend/substrapp/kubernetes_utils.py
@@ -1,4 +1,5 @@
 import kubernetes
+import yaml
 import structlog
 from django.conf import settings
 
@@ -47,11 +48,13 @@ def get_security_context(root: bool = False, capabilities: list[str] = None) ->
     return security_context
 
 
-def get_resources_requirements(
-    *, cpu_request: str = "1000m", memory_request: str = "200M", memory_limit: str = "2G"
+def get_resources_requirements_from_yaml(
+    *,
+    yaml_resources: str,
 ) -> kubernetes.client.V1ResourceRequirements:
+    resources_dict = yaml.load(yaml_resources, Loader=yaml.FullLoader)
     return kubernetes.client.V1ResourceRequirements(
-        requests={"cpu": cpu_request, "memory": memory_request}, limits={"memory": memory_limit}
+        requests=resources_dict["requests"], limits=resources_dict["limits"]
     )
 
 
diff --git a/charts/substra-backend/README.md b/charts/substra-backend/README.md
index 79ae134fc..11d1acafe 100644
--- a/charts/substra-backend/README.md
+++ b/charts/substra-backend/README.md
@@ -72,8 +72,8 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub
 | `server.ingress.extraHosts` | The list of additional hostnames to be covered with this ingress record | `[]` |
 | `server.ingress.extraTls` | The tls configuration for hostnames to be coverred by the ingress | `[]` |
 | `server.ingress.ingressClassName` | _IngressClass_ that will be used to implement the Ingress | `nil` |
-| `server.resources.requests.cpu` | Server container cpu request | `1000m` |
-| `server.resources.requests.memory` | Server container memory request | `6Gi` |
+| `server.resources.requests.cpu` | Server container cpu request | `500m` |
+| `server.resources.requests.memory` | Server container memory request | `512Mi` |
 | `server.resources.limits.cpu` | Server container cpu limit | `2000m` |
 | `server.resources.limits.memory` | Server container memory limit | `12Gi` |
 | `server.persistence.storageClass` | Specify the _StorageClass_ used to provision the volume. Or the default _StorageClass_ will be used. Set it to `-` to disable dynamic provisioning | `""` |
@@ -122,8 +122,8 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub
 | `worker.podSecurityContext.runAsUser` | User ID for the pod | `1001` |
 | `worker.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` |
 | `worker.podSecurityContext.fsGroup` | FileSystem group ID for the pod | `1001` |
-| `worker.resources.requests.cpu` | Worker container cpu request | `1000m` |
-| `worker.resources.requests.memory` | Worker container memory request | `4Gi` |
+| `worker.resources.requests.cpu` | Worker container cpu request | `500m` |
+| `worker.resources.requests.memory` | Worker container memory request | `512Mi` |
 | `worker.resources.limits.cpu` | Worker container cpu limit | `2000m` |
 | `worker.resources.limits.memory` | Worker container memory limit | `8Gi` |
 | `worker.nodeSelector` | Node labels for pod assignment | `{}` |
@@ -138,14 +138,17 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub
 | `worker.computePod.securityContext.fsGroup` | Set the filesystem group for the Compute pod | `1001` |
 | `worker.computePod.securityContext.runAsUser` | Set the user for the Compute pod | `1001` |
 | `worker.computePod.securityContext.runAsGroup` | Set the group for the Compute pod | `1001` |
+| `worker.computePod.resources.requests.cpu` | Worker compute pod container cpu request | `500m` |
+| `worker.computePod.resources.requests.memory` | Worker compute pod container memory request | `512Mi` |
+| `worker.computePod.resources.limits.memory` | Worker compute pod container memory limit | `64Gi` |
 | `worker.events.enabled` | Enable event service | `true` |
 | `worker.events.image.registry` | Substra event app image registry | `ghcr.io` |
 | `worker.events.image.repository` | Substra event app image repository | `substra/substra-backend` |
 | `worker.events.image.tag` | Substra event app image tag (defaults to AppVersion) | `nil` |
 | `worker.events.image.pullPolicy` | Substra event app image pull policy | `IfNotPresent` |
 | `worker.events.image.pullSecrets` | Specify image pull secrets | `[]` |
-| `worker.events.resources.requests.cpu` | Worker events container cpu request | `500m` |
-| `worker.events.resources.requests.memory` | Worker events container memory request | `200Mi` |
+| `worker.events.resources.requests.cpu` | Worker events container cpu request | `100m` |
+| `worker.events.resources.requests.memory` | Worker events container memory request | `50Mi` |
 | `worker.events.resources.limits.cpu` | Worker events container cpu limit | `500m` |
 | `worker.events.resources.limits.memory` | Worker events container memory limit | `400Mi` |
 | `worker.events.podSecurityContext.enabled` | Enable security context | `true` |
@@ -173,8 +176,8 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub
 | `schedulerWorker.nodeSelector` | Node labels for pod assignment | `{}` |
 | `schedulerWorker.tolerations` | Toleration labels for pod assignment | `[]` |
 | `schedulerWorker.affinity` | Affinity settings for pod assignment | `{}` |
-| `schedulerWorker.resources.requests.cpu` | Scheduler container cpu request | `250m` |
-| `schedulerWorker.resources.requests.memory` | Scheduler container memory request | `200Mi` |
+| `schedulerWorker.resources.requests.cpu` | Scheduler container cpu request | `100m` |
+| `schedulerWorker.resources.requests.memory` | Scheduler container memory request | `50Mi` |
 | `schedulerWorker.resources.limits.cpu` | Scheduler container cpu limit | `250m` |
 | `schedulerWorker.resources.limits.memory` | Scheduler container memory limit | `400Mi` |
 | `schedulerWorker.podSecurityContext.enabled` | Enable security context | `true` |
@@ -193,8 +196,8 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub
 | `scheduler.image.tag` | Substra backend tasks scheduler image tag (defaults to AppVersion) | `nil` |
 | `scheduler.image.pullPolicy` | Substra backend task scheduler image pull policy | `IfNotPresent` |
 | `scheduler.image.pullSecrets` | Specify image pull secrets | `[]` |
-| `scheduler.resources.requests.cpu` | Scheduler container cpu request | `250m` |
-| `scheduler.resources.requests.memory` | Scheduler container memory request | `200Mi` |
+| `scheduler.resources.requests.cpu` | Scheduler container cpu request | `100m` |
+| `scheduler.resources.requests.memory` | Scheduler container memory request | `50Mi` |
 | `scheduler.resources.limits.cpu` | Scheduler container cpu limit | `250m` |
 | `scheduler.resources.limits.memory` | Scheduler container memory limit | `400Mi` |
 | `scheduler.nodeSelector` | Node labels for pod assignment | `{}` |
@@ -222,8 +225,8 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub
 | `builder.podSecurityContext.runAsUser` | User ID for the pod | `1001` |
 | `builder.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` |
 | `builder.podSecurityContext.fsGroup` | FileSystem group ID for the pod | `1001` |
-| `builder.resources.requests.cpu` | Builder container cpu request | `2000m` |
-| `builder.resources.requests.memory` | Builder container memory request | `4Gi` |
+| `builder.resources.requests.cpu` | Builder container cpu request | `500m` |
+| `builder.resources.requests.memory` | Builder container memory request | `512Mi` |
 | `builder.resources.limits.cpu` | Builder container cpu limit | `2000m` |
 | `builder.resources.limits.memory` | Builder container memory limit | `8Gi` |
 | `builder.nodeSelector` | Node labels for pod assignment | `{}` |
@@ -257,8 +260,8 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub
 | `api.events.image.tag` | Substra event app image tag (defaults to AppVersion) | `nil` |
 | `api.events.image.pullPolicy` | Substra event app image pull policy | `IfNotPresent` |
 | `api.events.image.pullSecrets` | Specify image pull secrets | `[]` |
-| `api.events.resources.requests.cpu` | Api events container cpu request | `500m` |
-| `api.events.resources.requests.memory` | Api events container memory request | `200Mi` |
+| `api.events.resources.requests.cpu` | Api events container cpu request | `100m` |
+| `api.events.resources.requests.memory` | Api events container memory request | `50Mi` |
 | `api.events.resources.limits.cpu` | Api events container cpu limit | `500m` |
 | `api.events.resources.limits.memory` | Api events container memory limit | `400Mi` |
 | `api.events.podSecurityContext.enabled` | Enable security context | `true` |
@@ -293,6 +296,9 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub
 | `kaniko.image.registry` | Kaniko image registry | `gcr.io` |
 | `kaniko.image.repository` | Kaniko image repository | `kaniko-project/executor` |
 | `kaniko.image.tag` | Kaniko image tag | `v1.8.1` |
+| `kaniko.resources.requests.cpu` | Kaniko container cpu request | `500m` |
+| `kaniko.resources.requests.memory` | Kaniko container memory request | `256Mi` |
+| `kaniko.resources.limits.memory` | Kaniko container memory limit | `32Gi` |
 | `kaniko.mirror` | If set to `true` pull base images from the local registry. | `false` |
 | `kaniko.dockerConfigSecretName` | A Docker config to use for pulling base images | `nil` |
 | `kaniko.cache.warmer.image.registry` | Kaniko cache warmer registry | `gcr.io` |
@@ -314,12 +320,9 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub
 
 | Name | Description | Value |
 | ------------------------------------------------------------ | -------------------------------------- | ------- |
-| `registryPrepopulate.waitRegistry.resources.requests.cpu` | Wait registry container cpu request | `500m` |
-| `registryPrepopulate.waitRegistry.resources.requests.memory` | Wait registry container memory request | `200Mi` |
+| `registryPrepopulate.waitRegistry.resources.requests.cpu` | Wait registry container cpu request | `100m` |
+| `registryPrepopulate.waitRegistry.resources.requests.memory` | Wait registry container memory request | `50Mi` |
 | `registryPrepopulate.waitRegistry.resources.limits.memory` | Wait registry container memory limit | `400Mi` |
-| `registryPrepopulate.kaniko.resources.requests.cpu` | Kaniko container cpu request | `1000m` |
-| `registryPrepopulate.kaniko.resources.requests.memory` | Kaniko container memory request | `2Gi` |
-| `registryPrepopulate.kaniko.resources.limits.memory` | Kaniko container memory limit | `8Gi` |
 | `registryPrepopulate.pause.resources.requests.cpu` | Pause container cpu request | `50m` |
 | `registryPrepopulate.pause.resources.requests.memory` | Pause container memory request | `64Mi` |
 | `registryPrepopulate.pause.resources.limits.memory` | Pause container memory limit | `128Mi` |
diff --git a/charts/substra-backend/templates/deployment-registry-prepopulate.yaml b/charts/substra-backend/templates/deployment-registry-prepopulate.yaml
index 9ca60e39f..d1f319b1c 100644
--- a/charts/substra-backend/templates/deployment-registry-prepopulate.yaml
+++ b/charts/substra-backend/templates/deployment-registry-prepopulate.yaml
@@ -29,7 +29,7 @@ spec:
         - name: kaniko
           image: {{ include "common.images.name" $.Values.kaniko.image }}
           resources:
-            {{- toYaml $.Values.registryPrepopulate.kaniko.resources | nindent 12 }}
+            {{- toYaml $.Values.kaniko.resources | nindent 12 }}
           args:
             - "--context=/docker-context"
           {{- if .dstImage }}
diff --git a/charts/substra-backend/templates/statefulset-builder.yaml b/charts/substra-backend/templates/statefulset-builder.yaml
index 50a100afd..e636876f4 100644
--- a/charts/substra-backend/templates/statefulset-builder.yaml
+++ b/charts/substra-backend/templates/statefulset-builder.yaml
@@ -160,6 +160,8 @@ spec:
             value: {{ .Values.kaniko.dockerConfigSecretName | quote }}
           - name: OBJECTSTORE_URL
             value: {{ include "substra-backend.objectStore.url" . | quote }}
+          - name: KANIKO_RESOURCES
+            value: {{ toYaml .Values.kaniko.resources | quote }}
           ports:
             - name: http
               containerPort: 8000
diff --git a/charts/substra-backend/templates/statefulset-worker.yaml b/charts/substra-backend/templates/statefulset-worker.yaml
index e797e4661..48ce3c1b4 100644
--- a/charts/substra-backend/templates/statefulset-worker.yaml
+++ b/charts/substra-backend/templates/statefulset-worker.yaml
@@ -109,6 +109,8 @@ spec:
             valueFrom:
               fieldRef:
                 fieldPath: spec.nodeName
+          - name: COMPUTE_POD_RESOURCES
+            value: {{ toYaml .Values.worker.computePod.resources | quote }}
           - name: COMPUTE_POD_MAX_STARTUP_WAIT_SECONDS
             value: {{ .Values.worker.computePod.maxStartupWaitSeconds | quote }}
           - name: OBJECTSTORE_URL
diff --git a/charts/substra-backend/values.yaml b/charts/substra-backend/values.yaml
index 1109118a8..cf53e89ac 100644
--- a/charts/substra-backend/values.yaml
+++ b/charts/substra-backend/values.yaml
@@ -355,6 +355,16 @@ worker:
       fsGroup: 1001
       runAsUser: 1001
      runAsGroup: 1001
+    ## @param worker.computePod.resources.requests.cpu Worker compute pod container cpu request
+    ## @param worker.computePod.resources.requests.memory Worker compute pod container memory request
+    ## @param worker.computePod.resources.limits.memory Worker compute pod container memory limit
+    ##
+    resources:
+      requests:
+        cpu: "500m"
+        memory: "512Mi"
+      limits:
+        memory: "64Gi"
   events:
     ## @param worker.events.enabled Enable event service
     ##
@@ -577,7 +587,6 @@ builder:
     limits:
       cpu: "2000m"
       memory: "8Gi"
-
   ## @param builder.nodeSelector Node labels for pod assignment
   ##
   nodeSelector: { }
@@ -749,6 +758,16 @@ kaniko:
     registry: gcr.io
     repository: kaniko-project/executor
     tag: v1.8.1
+  ## @param kaniko.resources.requests.cpu Kaniko container cpu request
+  ## @param kaniko.resources.requests.memory Kaniko container memory request
+  ## @param kaniko.resources.limits.memory Kaniko container memory limit
+  ##
+  resources:
+    requests:
+      cpu: "500m"
+      memory: "256Mi"
+    limits:
+      memory: "32Gi"
   ## @param kaniko.mirror If set to `true` pull base images from the local registry.
   ##
   mirror: false
@@ -814,17 +833,6 @@ registryPrepopulate:
         cpu: "100m"
       limits:
         memory: "400Mi"
-  ## @param registryPrepopulate.kaniko.resources.requests.cpu Kaniko container cpu request
-  ## @param registryPrepopulate.kaniko.resources.requests.memory Kaniko container memory request
-  ## @param registryPrepopulate.kaniko.resources.limits.memory Kaniko container memory limit
-  ##
-  kaniko:
-    resources:
-      requests:
-        memory: "256Mi"
-        cpu: "500m"
-      limits:
-        memory: "8Gi"
   ## @param registryPrepopulate.pause.resources.requests.cpu Pause container cpu request
   ## @param registryPrepopulate.pause.resources.requests.memory Pause container memory request
   ## @param registryPrepopulate.pause.resources.limits.memory Pause container memory limit
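
Review note: below is a minimal, self-contained sketch of the resources plumbing this patch introduces. The chart serializes a `resources` block with `toYaml ... | quote` into the `KANIKO_RESOURCES` and `COMPUTE_POD_RESOURCES` environment variables, and the backend parses that string back into a `kubernetes.client.V1ResourceRequirements`. The sample YAML string is an assumption of roughly what the chart renders for the kaniko defaults above; the parsing function mirrors the helper added in `backend/substrapp/kubernetes_utils.py`.

```python
# Standalone sketch of the env-var round trip (no Django settings involved).
import kubernetes
import yaml

# Assumed to approximate what `toYaml .Values.kaniko.resources | quote` renders
# into the KANIKO_RESOURCES environment variable for the chart defaults.
KANIKO_RESOURCES = """
requests:
  cpu: "500m"
  memory: "256Mi"
limits:
  memory: "32Gi"
"""


def get_resources_requirements_from_yaml(*, yaml_resources: str) -> kubernetes.client.V1ResourceRequirements:
    # Same logic as the helper added to backend/substrapp/kubernetes_utils.py:
    # parse the YAML string and map its keys onto a V1ResourceRequirements.
    resources_dict = yaml.load(yaml_resources, Loader=yaml.FullLoader)
    return kubernetes.client.V1ResourceRequirements(
        requests=resources_dict["requests"], limits=resources_dict["limits"]
    )


resources = get_resources_requirements_from_yaml(yaml_resources=KANIKO_RESOURCES)
print(resources.requests)  # {'cpu': '500m', 'memory': '256Mi'}
print(resources.limits)    # {'memory': '32Gi'}
```

Since the helper indexes both `requests` and `limits`, the rendered YAML must contain both keys; the defaults added to `values.yaml` satisfy that.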
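
If test coverage is wanted for the new helper, here is a possible pytest sketch; the test module name and placement are hypothetical and not part of this patch, and it assumes `substrapp` is importable with Django settings configured, as in the existing backend test suite.

```python
# Hypothetical test sketch for the new helper; not included in this patch.
import kubernetes

from substrapp.kubernetes_utils import get_resources_requirements_from_yaml


def test_get_resources_requirements_from_yaml():
    # YAML shaped like what the chart passes via COMPUTE_POD_RESOURCES / KANIKO_RESOURCES.
    yaml_resources = 'requests:\n  cpu: "500m"\n  memory: "512Mi"\nlimits:\n  memory: "64Gi"\n'

    resources = get_resources_requirements_from_yaml(yaml_resources=yaml_resources)

    assert isinstance(resources, kubernetes.client.V1ResourceRequirements)
    assert resources.requests == {"cpu": "500m", "memory": "512Mi"}
    assert resources.limits == {"memory": "64Gi"}
```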