From e5370546f3413eb7d3a032c8d932a2121f1f7393 Mon Sep 17 00:00:00 2001 From: SdgJlbl Date: Thu, 18 Apr 2024 14:45:38 +0200 Subject: [PATCH 01/11] chore: add limits and requests to pods (#867) Signed-off-by: SdgJlbl --- .../builder/image_builder/image_builder.py | 2 + .../substrapp/compute_tasks/compute_pod.py | 2 + backend/substrapp/kubernetes_utils.py | 8 ++ charts/substra-backend/CHANGELOG.md | 11 -- charts/substra-backend/Chart.yaml | 2 +- charts/substra-backend/README.md | 29 +++- .../templates/deployment-api-events.yaml | 7 + .../deployment-registry-prepopulate.yaml | 18 +++ .../templates/deployment-worker-events.yaml | 7 + charts/substra-backend/values.yaml | 129 +++++++++++++----- 10 files changed, 164 insertions(+), 51 deletions(-) diff --git a/backend/builder/image_builder/image_builder.py b/backend/builder/image_builder/image_builder.py index 38810abac..ed25b48ed 100644 --- a/backend/builder/image_builder/image_builder.py +++ b/backend/builder/image_builder/image_builder.py @@ -23,6 +23,7 @@ from substrapp.compute_tasks.volumes import get_worker_subtuple_pvc_name from substrapp.docker_registry import USER_IMAGE_REPOSITORY from substrapp.kubernetes_utils import delete_pod +from substrapp.kubernetes_utils import get_resources_requirements from substrapp.kubernetes_utils import get_security_context from substrapp.lock_local import lock_resource from substrapp.utils import timeit @@ -306,6 +307,7 @@ def _build_container(dockerfile_mount_path: str, image_tag: str) -> kubernetes.c args=args, volume_mounts=volume_mounts, security_context=container_security_context, + resources=get_resources_requirements(cpu_request="1000m", memory_request="4Gi", memory_limit="32Gi"), ) diff --git a/backend/substrapp/compute_tasks/compute_pod.py b/backend/substrapp/compute_tasks/compute_pod.py index bc12bea02..5e929104a 100644 --- a/backend/substrapp/compute_tasks/compute_pod.py +++ b/backend/substrapp/compute_tasks/compute_pod.py @@ -6,6 +6,7 @@ from substrapp.kubernetes_utils import delete_pod from substrapp.kubernetes_utils import get_pod_security_context +from substrapp.kubernetes_utils import get_resources_requirements from substrapp.kubernetes_utils import get_security_context NAMESPACE = settings.NAMESPACE @@ -112,6 +113,7 @@ def create_pod( args=None, volume_mounts=volume_mounts + gpu_volume_mounts, security_context=get_security_context(), + resources=get_resources_requirements(cpu_request="1000m", memory_request="1Gi", memory_limit="64Gi"), env=[kubernetes.client.V1EnvVar(name=env_name, value=env_value) for env_name, env_value in environment.items()], **container_optional_kwargs, ) diff --git a/backend/substrapp/kubernetes_utils.py b/backend/substrapp/kubernetes_utils.py index 5bb6ac65f..1486e198a 100644 --- a/backend/substrapp/kubernetes_utils.py +++ b/backend/substrapp/kubernetes_utils.py @@ -47,6 +47,14 @@ def get_security_context(root: bool = False, capabilities: list[str] = None) -> return security_context +def get_resources_requirements( + *, cpu_request: str = "1000m", memory_request: str = "200M", memory_limit: str = "2G" +) -> kubernetes.client.V1ResourceRequirements: + return kubernetes.client.V1ResourceRequirements( + requests={"cpu": cpu_request, "memory": memory_request}, limits={"memory": memory_limit} + ) + + def pod_exists_by_label_selector(k8s_client: kubernetes.client.CoreV1Api, label_selector: str) -> bool: """Return True if the pod exists, else False. 
diff --git a/charts/substra-backend/CHANGELOG.md b/charts/substra-backend/CHANGELOG.md index f480d971c..e5d926cca 100644 --- a/charts/substra-backend/CHANGELOG.md +++ b/charts/substra-backend/CHANGELOG.md @@ -2,17 +2,6 @@ -## [26.3.0] - 2024-04-19 - -### Removed - -- Revert resources limits and requests (CPU and memory) for all containers. - -## [26.2.0] - 2024-04-18 - -### Added - -- User docker registry in configurable through value in `containerRegistry.userImageRepository` ## [26.1.0] - 2024-04-17 diff --git a/charts/substra-backend/Chart.yaml b/charts/substra-backend/Chart.yaml index 34114016d..a6f1911c3 100644 --- a/charts/substra-backend/Chart.yaml +++ b/charts/substra-backend/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 name: substra-backend home: https://github.com/Substra -version: 26.3.0 +version: 26.1.0 appVersion: 0.45.0 kubeVersion: ">= 1.19.0-0" description: Main package for Substra diff --git a/charts/substra-backend/README.md b/charts/substra-backend/README.md index 758fc4116..32dea59e3 100644 --- a/charts/substra-backend/README.md +++ b/charts/substra-backend/README.md @@ -72,7 +72,10 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `server.ingress.extraHosts` | The list of additional hostnames to be covered with this ingress record | `[]` | | `server.ingress.extraTls` | The tls configuration for hostnames to be coverred by the ingress | `[]` | | `server.ingress.ingressClassName` | _IngressClass_ that will be used to implement the Ingress | `nil` | -| `server.resources` | Server container resources requests and limits | `{}` | +| `server.resources.requests.cpu` | Server container cpu request | `1000m` | +| `server.resources.requests.memory` | Server container memory request | `6Gi` | +| `server.resources.limits.cpu` | Server container cpu limit | `2000m` | +| `server.resources.limits.memory` | Server container memory limit | `12Gi` | | `server.persistence.storageClass` | Specify the _StorageClass_ used to provision the volume. Or the default _StorageClass_ will be used. 
Set it to `-` to disable dynamic provisioning | `""` | | `server.persistence.servermedias.size` | Servermedias volume size | `10Gi` | | `server.persistence.servermedias.existingClaim` | use this PVC rather than creating a new one | `nil` | @@ -119,7 +122,10 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `worker.podSecurityContext.runAsUser` | User ID for the pod | `1001` | | `worker.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` | | `worker.podSecurityContext.fsGroup` | FileSystem group ID for the pod | `1001` | -| `worker.resources` | Worker container resources requests and limits | `{}` | +| `worker.resources.requests.cpu` | Worker container cpu request | `1000m` | +| `worker.resources.requests.memory` | Worker container memory request | `4Gi` | +| `worker.resources.limits.cpu` | Worker container cpu limit | `2000m` | +| `worker.resources.limits.memory` | Worker container memory limit | `8Gi` | | `worker.nodeSelector` | Node labels for pod assignment | `{}` | | `worker.tolerations` | Toleration labels for pod assignment | `[]` | | `worker.affinity` | Affinity settings for pod assignment, ignored if `DataSampleStorageInServerMedia` is `true` | `{}` | @@ -163,7 +169,10 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `schedulerWorker.nodeSelector` | Node labels for pod assignment | `{}` | | `schedulerWorker.tolerations` | Toleration labels for pod assignment | `[]` | | `schedulerWorker.affinity` | Affinity settings for pod assignment | `{}` | -| `schedulerWorker.resources` | Scheduler container resources requests and limits | `{}` | +| `schedulerWorker.resources.requests.cpu` | Scheduler container cpu request | `250m` | +| `schedulerWorker.resources.requests.memory` | Scheduler container memory request | `200Mi` | +| `schedulerWorker.resources.limits.cpu` | Scheduler container cpu limit | `250m` | +| `schedulerWorker.resources.limits.memory` | Scheduler container memory limit | `400Mi` | | `schedulerWorker.podSecurityContext.enabled` | Enable security context | `true` | | `schedulerWorker.podSecurityContext.runAsUser` | User ID for the pod | `1001` | | `schedulerWorker.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` | @@ -180,7 +189,10 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `scheduler.image.tag` | Substra backend tasks scheduler image tag (defaults to AppVersion) | `nil` | | `scheduler.image.pullPolicy` | Substra backend task scheduler image pull policy | `IfNotPresent` | | `scheduler.image.pullSecrets` | Specify image pull secrets | `[]` | -| `scheduler.resources` | Scheduler container resources requests and limits | `{}` | +| `scheduler.resources.requests.cpu` | Scheduler container cpu request | `250m` | +| `scheduler.resources.requests.memory` | Scheduler container memory request | `200Mi` | +| `scheduler.resources.limits.cpu` | Scheduler container cpu limit | `250m` | +| `scheduler.resources.limits.memory` | Scheduler container memory limit | `400Mi` | | `scheduler.nodeSelector` | Node labels for pod assignment | `{}` | | `scheduler.tolerations` | Toleration labels for pod assignment | `[]` | | `scheduler.affinity` | Affinity settings for pod assignment | `{}` | @@ -194,8 +206,8 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | Name | Description | Value | | --------------------------------------- | 
-------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------- | | `builder.replicaCount` | Number of builder replicas | `1` | -| `builder.enabled` | Enable worker service | `true` | -| `builder.replicaCount` | Replica count for the worker service | `1` | +| `builder.enabled` | Enable builder service | `true` | +| `builder.replicaCount` | Replica count for the builder service | `1` | | `builder.concurrency` | Maximum amount of tasks to process in parallel | `1` | | `builder.image.registry` | Substra backend server image registry | `ghcr.io` | | `builder.image.repository` | Substra backend server image repository | `substra/substra-backend` | @@ -206,7 +218,10 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `builder.podSecurityContext.runAsUser` | User ID for the pod | `1001` | | `builder.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` | | `builder.podSecurityContext.fsGroup` | FileSystem group ID for the pod | `1001` | -| `builder.resources` | Builder container resources requests and limits | `{}` | +| `builder.resources.requests.cpu` | Builder container cpu request | `2000m` | +| `builder.resources.requests.memory` | Builder container memory request | `4Gi` | +| `builder.resources.limits.cpu` | Builder container cpu limit | `2000m` | +| `builder.resources.limits.memory` | Builder container memory limit | `8Gi` | | `builder.nodeSelector` | Node labels for pod assignment | `{}` | | `builder.tolerations` | Toleration labels for pod assignment | `[]` | | `builder.affinity` | Affinity settings for pod assignment, ignored if `DataSampleStorageInServerMedia` is `true` | `{}` | diff --git a/charts/substra-backend/templates/deployment-api-events.yaml b/charts/substra-backend/templates/deployment-api-events.yaml index 8df3831d1..eb0b5df7e 100644 --- a/charts/substra-backend/templates/deployment-api-events.yaml +++ b/charts/substra-backend/templates/deployment-api-events.yaml @@ -41,6 +41,13 @@ spec: - name: api-event-app image: {{ include "substra-backend.images.name" (dict "img" .Values.api.events.image "defaultTag" $.Chart.AppVersion) }} imagePullPolicy: {{ .Values.api.events.image.pullPolicy }} + resources: + requests: + memory: "200Mi" + cpu: "500m" + limits: + memory: "400Mi" + cpu: "500m" command: ["/bin/bash"] {{- if eq .Values.settings "prod" }} args: ["-c", "python manage.py consume"] diff --git a/charts/substra-backend/templates/deployment-registry-prepopulate.yaml b/charts/substra-backend/templates/deployment-registry-prepopulate.yaml index 2f1da5668..b52c58f5f 100644 --- a/charts/substra-backend/templates/deployment-registry-prepopulate.yaml +++ b/charts/substra-backend/templates/deployment-registry-prepopulate.yaml @@ -23,9 +23,21 @@ spec: initContainers: - name: wait-registry image: jwilder/dockerize:0.6.1 + resources: + requests: + memory: "200Mi" + cpu: "500m" + limits: + memory: "400Mi" command: ['dockerize', '-wait', 'tcp://{{ $.Release.Name }}-docker-registry:5000'] - name: kaniko image: {{ include "common.images.name" $.Values.kaniko.image }} + resources: + requests: + memory: "2Gi" + cpu: "1000m" + limits: + memory: "8Gi" args: - "--context=/docker-context" {{- if .dstImage }} @@ -48,6 +60,12 @@ spec: containers: - image: gcr.io/google-containers/pause:3.2 name: pause + resources: + requests: + memory: "64Mi" + cpu: "50m" + limits: + memory: "128Mi" volumes: - name: kaniko-dir emptyDir: {} diff --git 
a/charts/substra-backend/templates/deployment-worker-events.yaml b/charts/substra-backend/templates/deployment-worker-events.yaml index 83aef2ead..aa93b8fb4 100644 --- a/charts/substra-backend/templates/deployment-worker-events.yaml +++ b/charts/substra-backend/templates/deployment-worker-events.yaml @@ -41,6 +41,13 @@ spec: - name: worker-event-app image: {{ include "substra-backend.images.name" (dict "img" .Values.worker.events.image "defaultTag" $.Chart.AppVersion) }} imagePullPolicy: {{ .Values.worker.events.image.pullPolicy }} + resources: + requests: + memory: "200Mi" + cpu: "500m" + limits: + memory: "400Mi" + cpu: "500m" command: ["/bin/bash"] {{- if eq .Values.settings "prod" }} args: ["-c", "python manage.py consume"] diff --git a/charts/substra-backend/values.yaml b/charts/substra-backend/values.yaml index 17fd787a1..6596ce55b 100644 --- a/charts/substra-backend/values.yaml +++ b/charts/substra-backend/values.yaml @@ -172,17 +172,19 @@ server: ## ingressClassName: - ## @param server.resources Server container resources requests and limits - ## e.g: - ## resources: - ## limits: - ## cpu: 100m - ## memory: 128Mi - ## requests: - ## cpu: 100m - ## memory: 128Mi + ## @param server.resources.requests.cpu Server container cpu request + ## @param server.resources.requests.memory Server container memory request + ## @param server.resources.limits.cpu Server container cpu limit + ## @param server.resources.limits.memory Server container memory limit ## - resources: {} + resources: + requests: + cpu: "1000m" + memory: "6Gi" + limits: + cpu: "2000m" + memory: "12Gi" + persistence: ## @param server.persistence.storageClass Specify the _StorageClass_ used to provision the volume. Or the default _StorageClass_ will be used. Set it to `-` to disable dynamic provisioning @@ -304,9 +306,18 @@ worker: runAsUser: 1001 runAsGroup: 1001 fsGroup: 1001 - ## @param worker.resources Worker container resources requests and limits - ## - resources: {} + ## @param worker.resources.requests.cpu Worker container cpu request + ## @param worker.resources.requests.memory Worker container memory request + ## @param worker.resources.limits.cpu Worker container cpu limit + ## @param worker.resources.limits.memory Worker container memory limit + ## + resources: + requests: + cpu: "1000m" + memory: "4Gi" + limits: + cpu: "2000m" + memory: "8Gi" ## @param worker.nodeSelector Node labels for pod assignment ## nodeSelector: {} @@ -422,9 +433,18 @@ schedulerWorker: ## @param schedulerWorker.affinity Affinity settings for pod assignment ## affinity: {} - ## @param schedulerWorker.resources Scheduler container resources requests and limits - ## - resources: {} + ## @param schedulerWorker.resources.requests.cpu Scheduler container cpu request + ## @param schedulerWorker.resources.requests.memory Scheduler container memory request + ## @param schedulerWorker.resources.limits.cpu Scheduler container cpu limit + ## @param schedulerWorker.resources.limits.memory Scheduler container memory limit + ## + resources: + requests: + cpu: "250m" + memory: "200Mi" + limits: + cpu: "250m" + memory: "400Mi" ## @param schedulerWorker.podSecurityContext.enabled Enable security context ## @param schedulerWorker.podSecurityContext.runAsUser User ID for the pod ## @param schedulerWorker.podSecurityContext.runAsGroup Group ID for the pod @@ -456,9 +476,18 @@ scheduler: tag: null pullPolicy: IfNotPresent pullSecrets: [] - ## @param scheduler.resources Scheduler container resources requests and limits - ## - resources: {} + ## @param 
scheduler.resources.requests.cpu Scheduler container cpu request + ## @param scheduler.resources.requests.memory Scheduler container memory request + ## @param scheduler.resources.limits.cpu Scheduler container cpu limit + ## @param scheduler.resources.limits.memory Scheduler container memory limit + ## + resources: + requests: + cpu: "250m" + memory: "200Mi" + limits: + cpu: "250m" + memory: "400Mi" ## @param scheduler.nodeSelector Node labels for pod assignment ## nodeSelector: {} @@ -484,10 +513,10 @@ scheduler: ## @param builder.replicaCount Number of builder replicas ## builder: - ## @param builder.enabled Enable worker service + ## @param builder.enabled Enable builder service ## enabled: true - ## @param builder.replicaCount Replica count for the worker service + ## @param builder.replicaCount Replica count for the builder service ## replicaCount: 1 @@ -524,17 +553,18 @@ builder: fsGroup: 1001 - ## @param builder.resources Builder container resources requests and limits - ## e.g: - ## resources: - ## limits: - ## cpu: 100m - ## memory: 128Mi - ## requests: - ## cpu: 100m - ## memory: 128Mi + ## @param builder.resources.requests.cpu Builder container cpu request + ## @param builder.resources.requests.memory Builder container memory request + ## @param builder.resources.limits.cpu Builder container cpu limit + ## @param builder.resources.limits.memory Builder container memory limit ## - resources: {} + resources: + requests: + cpu: "2000m" + memory: "4Gi" + limits: + cpu: "2000m" + memory: "8Gi" ## @param builder.nodeSelector Node labels for pod assignment ## @@ -846,6 +876,13 @@ postgresql: capabilities: drop: - ALL + resources: + requests: + cpu: "1000m" + memory: "2Gi" + limits: + cpu: "1000m" + memory: "4Gi" ## @skip redis ## @@ -860,6 +897,13 @@ redis: service: ports: redis: 6379 + resources: + requests: + cpu: "500m" + memory: "512Mi" + limits: + cpu: "500m" + memory: "1024Mi" replica: replicaCount: 0 commonConfiguration: |- @@ -875,10 +919,17 @@ docker-registry: storage: filesystem persistence: enabled: true - size: 10Gi + size: 50Gi deleteEnabled: true service: type: NodePort + resources: + requests: + cpu: "500m" + memory: "16Gi" + limits: + cpu: "500m" + memory: "64Gi" ## @skip minio ## @@ -899,6 +950,13 @@ minio: capabilities: drop: - ALL + resources: + requests: + cpu: "500m" + memory: "16Gi" + limits: + cpu: "1000m" + memory: "64Gi" ## @skip localstack ## @@ -907,6 +965,13 @@ localstack: service: edgeService: nodePort: "" + resources: + requests: + cpu: "500m" + memory: "16Gi" + limits: + cpu: "500m" + memory: "64Gi" environment: - name: SERVICES value: s3 From 0708d413eed9b26ca7c7016001111ce1a64066b8 Mon Sep 17 00:00:00 2001 From: ThibaultFy Date: Thu, 25 Apr 2024 09:52:54 +0200 Subject: [PATCH 02/11] chore: all resources in values Signed-off-by: ThibaultFy --- charts/substra-backend/CHANGELOG.md | 11 ++++ charts/substra-backend/Chart.yaml | 2 +- charts/substra-backend/README.md | 14 +++++ .../templates/deployment-api-events.yaml | 7 +-- .../deployment-registry-prepopulate.yaml | 18 +----- .../templates/deployment-worker-events.yaml | 7 +-- charts/substra-backend/values.yaml | 61 +++++++++++++++++++ 7 files changed, 92 insertions(+), 28 deletions(-) diff --git a/charts/substra-backend/CHANGELOG.md b/charts/substra-backend/CHANGELOG.md index e5d926cca..f480d971c 100644 --- a/charts/substra-backend/CHANGELOG.md +++ b/charts/substra-backend/CHANGELOG.md @@ -2,6 +2,17 @@ +## [26.3.0] - 2024-04-19 + +### Removed + +- Revert resources limits and requests (CPU and memory) for 
all containers. + +## [26.2.0] - 2024-04-18 + +### Added + +- User docker registry in configurable through value in `containerRegistry.userImageRepository` ## [26.1.0] - 2024-04-17 diff --git a/charts/substra-backend/Chart.yaml b/charts/substra-backend/Chart.yaml index a6f1911c3..34114016d 100644 --- a/charts/substra-backend/Chart.yaml +++ b/charts/substra-backend/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 name: substra-backend home: https://github.com/Substra -version: 26.1.0 +version: 26.3.0 appVersion: 0.45.0 kubeVersion: ">= 1.19.0-0" description: Main package for Substra diff --git a/charts/substra-backend/README.md b/charts/substra-backend/README.md index 32dea59e3..3989f4c9f 100644 --- a/charts/substra-backend/README.md +++ b/charts/substra-backend/README.md @@ -302,6 +302,20 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `addAccountOperator.incomingOrganizations` | Incoming organizations credentials for substra backend organization-to-organization communications | `[]` | | `addAccountOperator.users` | A list of administrators users who can log into the substra backend server with admin privileges | `[]` | +### Registry prepopulate + +| Name | Description | Value | +| ------------------------------------------------------------ | -------------------------------------- | ------- | +| `registryPrepopulate.waitRegistry.resources.requests.cpu` | Wait registry container cpu request | `500m` | +| `registryPrepopulate.waitRegistry.resources.requests.memory` | Wait registry container memory request | `200Mi` | +| `registryPrepopulate.waitRegistry.resources.limits.memory` | Wait registry container memory limit | `400Mi` | +| `registryPrepopulate.kaniko.resources.requests.cpu` | Kaniko container cpu request | `1000m` | +| `registryPrepopulate.kaniko.resources.requests.memory` | Kaniko container memory request | `2Gi` | +| `registryPrepopulate.kaniko.resources.limits.memory` | Kaniko container memory limit | `8Gi` | +| `registryPrepopulate.pause.resources.requests.cpu` | Pause container cpu request | `50m` | +| `registryPrepopulate.pause.resources.requests.memory` | Pause container memory request | `64Mi` | +| `registryPrepopulate.pause.resources.limits.memory` | Pause container memory limit | `128Mi` | + ### Single Sign-On through OpenID Connect Uses the authorization code flow. 
diff --git a/charts/substra-backend/templates/deployment-api-events.yaml b/charts/substra-backend/templates/deployment-api-events.yaml index eb0b5df7e..b2e8bbd00 100644 --- a/charts/substra-backend/templates/deployment-api-events.yaml +++ b/charts/substra-backend/templates/deployment-api-events.yaml @@ -42,12 +42,7 @@ spec: image: {{ include "substra-backend.images.name" (dict "img" .Values.api.events.image "defaultTag" $.Chart.AppVersion) }} imagePullPolicy: {{ .Values.api.events.image.pullPolicy }} resources: - requests: - memory: "200Mi" - cpu: "500m" - limits: - memory: "400Mi" - cpu: "500m" + {{- toYaml .Values.api.events.resources | nindent 12 }} command: ["/bin/bash"] {{- if eq .Values.settings "prod" }} args: ["-c", "python manage.py consume"] diff --git a/charts/substra-backend/templates/deployment-registry-prepopulate.yaml b/charts/substra-backend/templates/deployment-registry-prepopulate.yaml index b52c58f5f..b60d7e47b 100644 --- a/charts/substra-backend/templates/deployment-registry-prepopulate.yaml +++ b/charts/substra-backend/templates/deployment-registry-prepopulate.yaml @@ -24,20 +24,12 @@ spec: - name: wait-registry image: jwilder/dockerize:0.6.1 resources: - requests: - memory: "200Mi" - cpu: "500m" - limits: - memory: "400Mi" + {{- toYaml .Values.registryPrepopulate.waitRegistry | nindent 12 }} command: ['dockerize', '-wait', 'tcp://{{ $.Release.Name }}-docker-registry:5000'] - name: kaniko image: {{ include "common.images.name" $.Values.kaniko.image }} resources: - requests: - memory: "2Gi" - cpu: "1000m" - limits: - memory: "8Gi" + {{- toYaml .Values.registryPrepopulate.kaniko.resources | nindent 12 }} args: - "--context=/docker-context" {{- if .dstImage }} @@ -61,11 +53,7 @@ spec: - image: gcr.io/google-containers/pause:3.2 name: pause resources: - requests: - memory: "64Mi" - cpu: "50m" - limits: - memory: "128Mi" + {{- toYaml .Values.registryPrepopulate.pause.resources | nindent 12 }} volumes: - name: kaniko-dir emptyDir: {} diff --git a/charts/substra-backend/templates/deployment-worker-events.yaml b/charts/substra-backend/templates/deployment-worker-events.yaml index aa93b8fb4..60767f333 100644 --- a/charts/substra-backend/templates/deployment-worker-events.yaml +++ b/charts/substra-backend/templates/deployment-worker-events.yaml @@ -42,12 +42,7 @@ spec: image: {{ include "substra-backend.images.name" (dict "img" .Values.worker.events.image "defaultTag" $.Chart.AppVersion) }} imagePullPolicy: {{ .Values.worker.events.image.pullPolicy }} resources: - requests: - memory: "200Mi" - cpu: "500m" - limits: - memory: "400Mi" - cpu: "500m" + {{- toYaml .Values.worker.events.resources | nindent 12 }} command: ["/bin/bash"] {{- if eq .Values.settings "prod" }} args: ["-c", "python manage.py consume"] diff --git a/charts/substra-backend/values.yaml b/charts/substra-backend/values.yaml index 6596ce55b..9d826ee7e 100644 --- a/charts/substra-backend/values.yaml +++ b/charts/substra-backend/values.yaml @@ -371,6 +371,18 @@ worker: tag: null pullPolicy: IfNotPresent pullSecrets: [] + ## @param worker.events.resources.requests.cpu Worker events container cpu request + ## @param worker.events.resources.requests.memory Worker events container memory request + ## @param worker.events.resources.limits.cpu Worker events container cpu limit + ## @param worker.events.resources.limits.memory Worker events container memory limit + ## + resources: + requests: + memory: "200Mi" + cpu: "500m" + limits: + memory: "400Mi" + cpu: "500m" ## @param worker.events.podSecurityContext.enabled Enable 
security context ## @param worker.events.podSecurityContext.runAsUser User ID for the pod ## @param worker.events.podSecurityContext.runAsGroup Group ID for the pod @@ -678,6 +690,18 @@ api: ## If not set and create is true, a name is generated using the substra.fullname template ## name: "" + ## @param api.events.resources.requests.cpu Api events container cpu request + ## @param api.events.resources.requests.memory Api events container memory request + ## @param api.events.resources.limits.cpu Api events container cpu limit + ## @param api.events.resources.limits.memory Api events container memory limit + ## + resources: + requests: + memory: "200Mi" + cpu: "500m" + limits: + memory: "400Mi" + cpu: "500m" ## @section Orchestrator settings ## @@ -776,6 +800,43 @@ addAccountOperator: ## users: [] +## @section Registry prepopulate +## +registryPrepopulate: + ## @param registryPrepopulate.waitRegistry.resources.requests.cpu Wait registry container cpu request + ## @param registryPrepopulate.waitRegistry.resources.requests.memory Wait registry container memory request + ## @param registryPrepopulate.waitRegistry.resources.limits.memory Wait registry container memory limit + ## + waitRegistry: + resources: + requests: + memory: "200Mi" + cpu: "500m" + limits: + memory: "400Mi" + ## @param registryPrepopulate.kaniko.resources.requests.cpu Kaniko container cpu request + ## @param registryPrepopulate.kaniko.resources.requests.memory Kaniko container memory request + ## @param registryPrepopulate.kaniko.resources.limits.memory Kaniko container memory limit + ## + kaniko: + resources: + requests: + memory: "2Gi" + cpu: "1000m" + limits: + memory: "8Gi" + ## @param registryPrepopulate.pause.resources.requests.cpu Pause container cpu request + ## @param registryPrepopulate.pause.resources.requests.memory Pause container memory request + ## @param registryPrepopulate.pause.resources.limits.memory Pause container memory limit + ## + pause: + resources: + requests: + memory: "64Mi" + cpu: "50m" + limits: + memory: "128Mi" + ## @section Single Sign-On through OpenID Connect ## @descriptionStart Uses the authorization code flow. 
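For reference, the resource values wired into the chart in the patches above are consumed on the Python side through the `get_resources_requirements` helper introduced in patch 01: it builds a `kubernetes.client.V1ResourceRequirements` carrying CPU and memory requests plus a memory-only limit (no CPU limit). A minimal usage sketch follows; the container name and image are illustrative placeholders, not values taken from this series.

    import kubernetes

    from substrapp.kubernetes_utils import get_resources_requirements

    # Same call pattern as the kaniko build container in patch 01: CPU and
    # memory requests, plus a memory limit only.
    resources = get_resources_requirements(
        cpu_request="1000m", memory_request="4Gi", memory_limit="32Gi"
    )

    container = kubernetes.client.V1Container(
        name="example-container",      # placeholder name
        image="example-image:latest",  # placeholder image
        resources=resources,
    )

Patch 05 below replaces these hard-coded arguments with values read from the chart through environment variables.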
From 955b15cba979764dda38a708d1f5b552ab50178b Mon Sep 17 00:00:00 2001 From: ThibaultFy Date: Thu, 25 Apr 2024 10:04:04 +0200 Subject: [PATCH 03/11] chore: all resources in values Signed-off-by: ThibaultFy --- backend/substrapp/tasks/task.py | 2 +- charts/substra-backend/README.md | 10 +++++++++- charts/substra-backend/values.yaml | 26 +++++++++++++------------- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/backend/substrapp/tasks/task.py b/backend/substrapp/tasks/task.py index c5adda2be..a87083b57 100644 --- a/backend/substrapp/tasks/task.py +++ b/backend/substrapp/tasks/task.py @@ -73,7 +73,7 @@ def on_success(self, retval: dict[str, Any], task_id: str, args: tuple, kwargs: # Celery does not provide unpacked arguments, we are doing it in `split_args` def on_retry(self, exc: Exception, task_id: str, args: tuple, kwargs: dict[str, Any], einfo: ExceptionInfo) -> None: _, task = self.split_args(args) - # delete compute pod to reset hardware ressources + # delete compute pod to reset hardware resources delete_compute_plan_pods(task.compute_plan_key) logger.info( "Retrying task", diff --git a/charts/substra-backend/README.md b/charts/substra-backend/README.md index 3989f4c9f..79ae134fc 100644 --- a/charts/substra-backend/README.md +++ b/charts/substra-backend/README.md @@ -64,7 +64,7 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `server.service.externalIPs` | A list of IP addresses for which nodes in the cluster will also accept traffic for this service | `[]` | | `server.service.annotations` | Additional annotations for the _Service_ resource. | `{}` | | `server.ingress.enabled` | Deploy an ingress for the substra backend server | `false` | -| `server.ingress.hostname` | Default host for the ingress ressource | `substra.backend.local` | +| `server.ingress.hostname` | Default host for the ingress resource | `substra.backend.local` | | `server.ingress.pathType` | Ingress path type | `ImplementationSpecific` | | `server.ingress.path` | Path for the default host | `/` | | `server.ingress.extraPaths` | The list of extra paths to be created for the default host | `[]` | @@ -144,6 +144,10 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `worker.events.image.tag` | Substra event app image tag (defaults to AppVersion) | `nil` | | `worker.events.image.pullPolicy` | Substra event app image pull policy | `IfNotPresent` | | `worker.events.image.pullSecrets` | Specify image pull secrets | `[]` | +| `worker.events.resources.requests.cpu` | Worker events container cpu request | `500m` | +| `worker.events.resources.requests.memory` | Worker events container memory request | `200Mi` | +| `worker.events.resources.limits.cpu` | Worker events container cpu limit | `500m` | +| `worker.events.resources.limits.memory` | Worker events container memory limit | `400Mi` | | `worker.events.podSecurityContext.enabled` | Enable security context | `true` | | `worker.events.podSecurityContext.runAsUser` | User ID for the pod | `1001` | | `worker.events.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` | @@ -253,6 +257,10 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `api.events.image.tag` | Substra event app image tag (defaults to AppVersion) | `nil` | | `api.events.image.pullPolicy` | Substra event app image pull policy | `IfNotPresent` | | `api.events.image.pullSecrets` | Specify image pull secrets | `[]` | +| `api.events.resources.requests.cpu` | Api events container cpu 
request | `500m` | +| `api.events.resources.requests.memory` | Api events container memory request | `200Mi` | +| `api.events.resources.limits.cpu` | Api events container cpu limit | `500m` | +| `api.events.resources.limits.memory` | Api events container memory limit | `400Mi` | | `api.events.podSecurityContext.enabled` | Enable security context | `true` | | `api.events.podSecurityContext.runAsUser` | User ID for the pod | `1001` | | `api.events.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` | diff --git a/charts/substra-backend/values.yaml b/charts/substra-backend/values.yaml index 9d826ee7e..5aef65bda 100644 --- a/charts/substra-backend/values.yaml +++ b/charts/substra-backend/values.yaml @@ -121,7 +121,7 @@ server: ## enabled: false - ## @param server.ingress.hostname Default host for the ingress ressource + ## @param server.ingress.hostname Default host for the ingress resource ## hostname: substra.backend.local @@ -659,6 +659,18 @@ api: tag: null pullPolicy: IfNotPresent pullSecrets: [] + ## @param api.events.resources.requests.cpu Api events container cpu request + ## @param api.events.resources.requests.memory Api events container memory request + ## @param api.events.resources.limits.cpu Api events container cpu limit + ## @param api.events.resources.limits.memory Api events container memory limit + ## + resources: + requests: + memory: "200Mi" + cpu: "500m" + limits: + memory: "400Mi" + cpu: "500m" ## @param api.events.podSecurityContext.enabled Enable security context ## @param api.events.podSecurityContext.runAsUser User ID for the pod ## @param api.events.podSecurityContext.runAsGroup Group ID for the pod @@ -690,18 +702,6 @@ api: ## If not set and create is true, a name is generated using the substra.fullname template ## name: "" - ## @param api.events.resources.requests.cpu Api events container cpu request - ## @param api.events.resources.requests.memory Api events container memory request - ## @param api.events.resources.limits.cpu Api events container cpu limit - ## @param api.events.resources.limits.memory Api events container memory limit - ## - resources: - requests: - memory: "200Mi" - cpu: "500m" - limits: - memory: "400Mi" - cpu: "500m" ## @section Orchestrator settings ## From e95b1554d8533d82734c559f75d2e8ec1c29ca19 Mon Sep 17 00:00:00 2001 From: ThibaultFy Date: Thu, 25 Apr 2024 11:27:36 +0200 Subject: [PATCH 04/11] chore: lower requests Signed-off-by: ThibaultFy --- .../deployment-registry-prepopulate.yaml | 6 +- charts/substra-backend/values.yaml | 56 +++++++++---------- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/charts/substra-backend/templates/deployment-registry-prepopulate.yaml b/charts/substra-backend/templates/deployment-registry-prepopulate.yaml index b60d7e47b..9ca60e39f 100644 --- a/charts/substra-backend/templates/deployment-registry-prepopulate.yaml +++ b/charts/substra-backend/templates/deployment-registry-prepopulate.yaml @@ -24,12 +24,12 @@ spec: - name: wait-registry image: jwilder/dockerize:0.6.1 resources: - {{- toYaml .Values.registryPrepopulate.waitRegistry | nindent 12 }} + {{- toYaml $.Values.registryPrepopulate.waitRegistry | nindent 12 }} command: ['dockerize', '-wait', 'tcp://{{ $.Release.Name }}-docker-registry:5000'] - name: kaniko image: {{ include "common.images.name" $.Values.kaniko.image }} resources: - {{- toYaml .Values.registryPrepopulate.kaniko.resources | nindent 12 }} + {{- toYaml $.Values.registryPrepopulate.kaniko.resources | nindent 12 }} args: - "--context=/docker-context" {{- if 
.dstImage }} @@ -53,7 +53,7 @@ spec: - image: gcr.io/google-containers/pause:3.2 name: pause resources: - {{- toYaml .Values.registryPrepopulate.pause.resources | nindent 12 }} + {{- toYaml $.Values.registryPrepopulate.pause.resources | nindent 12 }} volumes: - name: kaniko-dir emptyDir: {} diff --git a/charts/substra-backend/values.yaml b/charts/substra-backend/values.yaml index 5aef65bda..1109118a8 100644 --- a/charts/substra-backend/values.yaml +++ b/charts/substra-backend/values.yaml @@ -179,8 +179,8 @@ server: ## resources: requests: - cpu: "1000m" - memory: "6Gi" + cpu: "500m" + memory: "512Mi" limits: cpu: "2000m" memory: "12Gi" @@ -313,8 +313,8 @@ worker: ## resources: requests: - cpu: "1000m" - memory: "4Gi" + cpu: "500m" + memory: "512Mi" limits: cpu: "2000m" memory: "8Gi" @@ -378,8 +378,8 @@ worker: ## resources: requests: - memory: "200Mi" - cpu: "500m" + memory: "50Mi" + cpu: "100m" limits: memory: "400Mi" cpu: "500m" @@ -452,8 +452,8 @@ schedulerWorker: ## resources: requests: - cpu: "250m" - memory: "200Mi" + cpu: "100m" + memory: "50Mi" limits: cpu: "250m" memory: "400Mi" @@ -495,8 +495,8 @@ scheduler: ## resources: requests: - cpu: "250m" - memory: "200Mi" + cpu: "100m" + memory: "50Mi" limits: cpu: "250m" memory: "400Mi" @@ -572,8 +572,8 @@ builder: ## resources: requests: - cpu: "2000m" - memory: "4Gi" + cpu: "500m" + memory: "512Mi" limits: cpu: "2000m" memory: "8Gi" @@ -666,8 +666,8 @@ api: ## resources: requests: - memory: "200Mi" - cpu: "500m" + memory: "50Mi" + cpu: "100m" limits: memory: "400Mi" cpu: "500m" @@ -810,8 +810,8 @@ registryPrepopulate: waitRegistry: resources: requests: - memory: "200Mi" - cpu: "500m" + memory: "50Mi" + cpu: "100m" limits: memory: "400Mi" ## @param registryPrepopulate.kaniko.resources.requests.cpu Kaniko container cpu request @@ -821,8 +821,8 @@ registryPrepopulate: kaniko: resources: requests: - memory: "2Gi" - cpu: "1000m" + memory: "256Mi" + cpu: "500m" limits: memory: "8Gi" ## @param registryPrepopulate.pause.resources.requests.cpu Pause container cpu request @@ -939,8 +939,8 @@ postgresql: - ALL resources: requests: - cpu: "1000m" - memory: "2Gi" + cpu: "100m" + memory: "256Mi" limits: cpu: "1000m" memory: "4Gi" @@ -960,8 +960,8 @@ redis: redis: 6379 resources: requests: - cpu: "500m" - memory: "512Mi" + cpu: "100m" + memory: "256Mi" limits: cpu: "500m" memory: "1024Mi" @@ -986,8 +986,8 @@ docker-registry: type: NodePort resources: requests: - cpu: "500m" - memory: "16Gi" + cpu: "100m" + memory: "256Mi" limits: cpu: "500m" memory: "64Gi" @@ -1013,8 +1013,8 @@ minio: - ALL resources: requests: - cpu: "500m" - memory: "16Gi" + cpu: "100m" + memory: "1Gi" limits: cpu: "1000m" memory: "64Gi" @@ -1028,8 +1028,8 @@ localstack: nodePort: "" resources: requests: - cpu: "500m" - memory: "16Gi" + cpu: "100m" + memory: "1Gi" limits: cpu: "500m" memory: "64Gi" From babacf40442ff38089882ea4497fa11082cb3690 Mon Sep 17 00:00:00 2001 From: ThibaultFy Date: Thu, 25 Apr 2024 15:04:29 +0200 Subject: [PATCH 05/11] chore: all resources in values Signed-off-by: ThibaultFy --- backend/backend/settings/common.py | 2 + .../builder/image_builder/image_builder.py | 5 ++- .../substrapp/compute_tasks/compute_pod.py | 5 ++- backend/substrapp/kubernetes_utils.py | 9 ++-- charts/substra-backend/README.md | 41 ++++++++++--------- .../deployment-registry-prepopulate.yaml | 2 +- .../templates/statefulset-builder.yaml | 2 + .../templates/statefulset-worker.yaml | 2 + charts/substra-backend/values.yaml | 32 +++++++++------ 9 files changed, 61 insertions(+), 39 
deletions(-) diff --git a/backend/backend/settings/common.py b/backend/backend/settings/common.py index 23ece165a..5cae475e4 100644 --- a/backend/backend/settings/common.py +++ b/backend/backend/settings/common.py @@ -197,6 +197,7 @@ "KANIKO_MIRROR": to_bool(os.environ.get("KANIKO_MIRROR", False)), "KANIKO_IMAGE": os.environ.get("KANIKO_IMAGE"), "KANIKO_DOCKER_CONFIG_SECRET_NAME": os.environ.get("KANIKO_DOCKER_CONFIG_SECRET_NAME"), + "KANIKO_RESOURCES": os.environ.get("KANIKO_RESOURCES"), "COMPUTE_POD_STARTUP_TIMEOUT_SECONDS": int(os.environ.get("COMPUTE_POD_STARTUP_TIMEOUT_SECONDS", 300)), "PRIVATE_CA_ENABLED": to_bool(os.environ.get("PRIVATE_CA_ENABLED")), } @@ -223,6 +224,7 @@ COMPUTE_POD_RUN_AS_GROUP = os.environ.get("COMPUTE_POD_RUN_AS_GROUP") COMPUTE_POD_FS_GROUP = os.environ.get("COMPUTE_POD_FS_GROUP") COMPUTE_POD_GKE_GPUS_LIMITS = int(os.environ.get("COMPUTE_POD_GKE_GPUS_LIMITS", 0)) +COMPUTE_POD_RESOURCES = os.environ.get("COMPUTE_POD_RESOURCES") # Prometheus configuration ENABLE_METRICS = to_bool(os.environ.get("ENABLE_METRICS", False)) diff --git a/backend/builder/image_builder/image_builder.py b/backend/builder/image_builder/image_builder.py index ed25b48ed..3270e0fc1 100644 --- a/backend/builder/image_builder/image_builder.py +++ b/backend/builder/image_builder/image_builder.py @@ -23,7 +23,7 @@ from substrapp.compute_tasks.volumes import get_worker_subtuple_pvc_name from substrapp.docker_registry import USER_IMAGE_REPOSITORY from substrapp.kubernetes_utils import delete_pod -from substrapp.kubernetes_utils import get_resources_requirements +from substrapp.kubernetes_utils import get_resources_requirements_from_yaml from substrapp.kubernetes_utils import get_security_context from substrapp.lock_local import lock_resource from substrapp.utils import timeit @@ -43,6 +43,7 @@ IMAGE_BUILD_TIMEOUT = settings.IMAGE_BUILD_TIMEOUT KANIKO_CONTAINER_NAME = "kaniko" HOSTNAME = settings.HOSTNAME +KANIKO_RESOURCES = settings.TASK["KANIKO_RESOURCES"] def container_image_tag_from_function(function: orchestrator.Function) -> str: @@ -307,7 +308,7 @@ def _build_container(dockerfile_mount_path: str, image_tag: str) -> kubernetes.c args=args, volume_mounts=volume_mounts, security_context=container_security_context, - resources=get_resources_requirements(cpu_request="1000m", memory_request="4Gi", memory_limit="32Gi"), + resources=get_resources_requirements_from_yaml(yaml_resources=KANIKO_RESOURCES), ) diff --git a/backend/substrapp/compute_tasks/compute_pod.py b/backend/substrapp/compute_tasks/compute_pod.py index 5e929104a..2f46df4b0 100644 --- a/backend/substrapp/compute_tasks/compute_pod.py +++ b/backend/substrapp/compute_tasks/compute_pod.py @@ -6,10 +6,11 @@ from substrapp.kubernetes_utils import delete_pod from substrapp.kubernetes_utils import get_pod_security_context -from substrapp.kubernetes_utils import get_resources_requirements +from substrapp.kubernetes_utils import get_resources_requirements_from_yaml from substrapp.kubernetes_utils import get_security_context NAMESPACE = settings.NAMESPACE +COMPUTE_POD_RESOURCES = settings.COMPUTE_POD_RESOURCES logger = structlog.get_logger(__name__) @@ -113,7 +114,7 @@ def create_pod( args=None, volume_mounts=volume_mounts + gpu_volume_mounts, security_context=get_security_context(), - resources=get_resources_requirements(cpu_request="1000m", memory_request="1Gi", memory_limit="64Gi"), + resources=get_resources_requirements_from_yaml(yaml_resources=COMPUTE_POD_RESOURCES), env=[kubernetes.client.V1EnvVar(name=env_name, value=env_value) for 
env_name, env_value in environment.items()], **container_optional_kwargs, ) diff --git a/backend/substrapp/kubernetes_utils.py b/backend/substrapp/kubernetes_utils.py index 1486e198a..663542fcc 100644 --- a/backend/substrapp/kubernetes_utils.py +++ b/backend/substrapp/kubernetes_utils.py @@ -1,4 +1,5 @@ import kubernetes +import yaml import structlog from django.conf import settings @@ -47,11 +48,13 @@ def get_security_context(root: bool = False, capabilities: list[str] = None) -> return security_context -def get_resources_requirements( - *, cpu_request: str = "1000m", memory_request: str = "200M", memory_limit: str = "2G" +def get_resources_requirements_from_yaml( + *, + yaml_resources: str, ) -> kubernetes.client.V1ResourceRequirements: + resources_dict = yaml.load(yaml_resources, Loader=yaml.FullLoader) return kubernetes.client.V1ResourceRequirements( - requests={"cpu": cpu_request, "memory": memory_request}, limits={"memory": memory_limit} + requests=resources_dict["requests"], limits=resources_dict["limits"] ) diff --git a/charts/substra-backend/README.md b/charts/substra-backend/README.md index 79ae134fc..11d1acafe 100644 --- a/charts/substra-backend/README.md +++ b/charts/substra-backend/README.md @@ -72,8 +72,8 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `server.ingress.extraHosts` | The list of additional hostnames to be covered with this ingress record | `[]` | | `server.ingress.extraTls` | The tls configuration for hostnames to be coverred by the ingress | `[]` | | `server.ingress.ingressClassName` | _IngressClass_ that will be used to implement the Ingress | `nil` | -| `server.resources.requests.cpu` | Server container cpu request | `1000m` | -| `server.resources.requests.memory` | Server container memory request | `6Gi` | +| `server.resources.requests.cpu` | Server container cpu request | `500m` | +| `server.resources.requests.memory` | Server container memory request | `512Mi` | | `server.resources.limits.cpu` | Server container cpu limit | `2000m` | | `server.resources.limits.memory` | Server container memory limit | `12Gi` | | `server.persistence.storageClass` | Specify the _StorageClass_ used to provision the volume. Or the default _StorageClass_ will be used. 
Set it to `-` to disable dynamic provisioning | `""` | @@ -122,8 +122,8 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `worker.podSecurityContext.runAsUser` | User ID for the pod | `1001` | | `worker.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` | | `worker.podSecurityContext.fsGroup` | FileSystem group ID for the pod | `1001` | -| `worker.resources.requests.cpu` | Worker container cpu request | `1000m` | -| `worker.resources.requests.memory` | Worker container memory request | `4Gi` | +| `worker.resources.requests.cpu` | Worker container cpu request | `500m` | +| `worker.resources.requests.memory` | Worker container memory request | `512Mi` | | `worker.resources.limits.cpu` | Worker container cpu limit | `2000m` | | `worker.resources.limits.memory` | Worker container memory limit | `8Gi` | | `worker.nodeSelector` | Node labels for pod assignment | `{}` | @@ -138,14 +138,17 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `worker.computePod.securityContext.fsGroup` | Set the filesystem group for the Compute pod | `1001` | | `worker.computePod.securityContext.runAsUser` | Set the user for the Compute pod | `1001` | | `worker.computePod.securityContext.runAsGroup` | Set the group for the Compute pod | `1001` | +| `worker.computePod.resources.requests.cpu` | Worker compute pod container cpu request | `500m` | +| `worker.computePod.resources.requests.memory` | Worker compute pod container memory request | `512Mi` | +| `worker.computePod.resources.limits.memory` | Worker compute pod container memory limit | `64Gi` | | `worker.events.enabled` | Enable event service | `true` | | `worker.events.image.registry` | Substra event app image registry | `ghcr.io` | | `worker.events.image.repository` | Substra event app image repository | `substra/substra-backend` | | `worker.events.image.tag` | Substra event app image tag (defaults to AppVersion) | `nil` | | `worker.events.image.pullPolicy` | Substra event app image pull policy | `IfNotPresent` | | `worker.events.image.pullSecrets` | Specify image pull secrets | `[]` | -| `worker.events.resources.requests.cpu` | Worker events container cpu request | `500m` | -| `worker.events.resources.requests.memory` | Worker events container memory request | `200Mi` | +| `worker.events.resources.requests.cpu` | Worker events container cpu request | `100m` | +| `worker.events.resources.requests.memory` | Worker events container memory request | `50Mi` | | `worker.events.resources.limits.cpu` | Worker events container cpu limit | `500m` | | `worker.events.resources.limits.memory` | Worker events container memory limit | `400Mi` | | `worker.events.podSecurityContext.enabled` | Enable security context | `true` | @@ -173,8 +176,8 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `schedulerWorker.nodeSelector` | Node labels for pod assignment | `{}` | | `schedulerWorker.tolerations` | Toleration labels for pod assignment | `[]` | | `schedulerWorker.affinity` | Affinity settings for pod assignment | `{}` | -| `schedulerWorker.resources.requests.cpu` | Scheduler container cpu request | `250m` | -| `schedulerWorker.resources.requests.memory` | Scheduler container memory request | `200Mi` | +| `schedulerWorker.resources.requests.cpu` | Scheduler container cpu request | `100m` | +| `schedulerWorker.resources.requests.memory` | Scheduler container memory request | `50Mi` | | `schedulerWorker.resources.limits.cpu` | Scheduler container cpu limit | `250m` 
| | `schedulerWorker.resources.limits.memory` | Scheduler container memory limit | `400Mi` | | `schedulerWorker.podSecurityContext.enabled` | Enable security context | `true` | @@ -193,8 +196,8 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `scheduler.image.tag` | Substra backend tasks scheduler image tag (defaults to AppVersion) | `nil` | | `scheduler.image.pullPolicy` | Substra backend task scheduler image pull policy | `IfNotPresent` | | `scheduler.image.pullSecrets` | Specify image pull secrets | `[]` | -| `scheduler.resources.requests.cpu` | Scheduler container cpu request | `250m` | -| `scheduler.resources.requests.memory` | Scheduler container memory request | `200Mi` | +| `scheduler.resources.requests.cpu` | Scheduler container cpu request | `100m` | +| `scheduler.resources.requests.memory` | Scheduler container memory request | `50Mi` | | `scheduler.resources.limits.cpu` | Scheduler container cpu limit | `250m` | | `scheduler.resources.limits.memory` | Scheduler container memory limit | `400Mi` | | `scheduler.nodeSelector` | Node labels for pod assignment | `{}` | @@ -222,8 +225,8 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `builder.podSecurityContext.runAsUser` | User ID for the pod | `1001` | | `builder.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` | | `builder.podSecurityContext.fsGroup` | FileSystem group ID for the pod | `1001` | -| `builder.resources.requests.cpu` | Builder container cpu request | `2000m` | -| `builder.resources.requests.memory` | Builder container memory request | `4Gi` | +| `builder.resources.requests.cpu` | Builder container cpu request | `500m` | +| `builder.resources.requests.memory` | Builder container memory request | `512Mi` | | `builder.resources.limits.cpu` | Builder container cpu limit | `2000m` | | `builder.resources.limits.memory` | Builder container memory limit | `8Gi` | | `builder.nodeSelector` | Node labels for pod assignment | `{}` | @@ -257,8 +260,8 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `api.events.image.tag` | Substra event app image tag (defaults to AppVersion) | `nil` | | `api.events.image.pullPolicy` | Substra event app image pull policy | `IfNotPresent` | | `api.events.image.pullSecrets` | Specify image pull secrets | `[]` | -| `api.events.resources.requests.cpu` | Api events container cpu request | `500m` | -| `api.events.resources.requests.memory` | Api events container memory request | `200Mi` | +| `api.events.resources.requests.cpu` | Api events container cpu request | `100m` | +| `api.events.resources.requests.memory` | Api events container memory request | `50Mi` | | `api.events.resources.limits.cpu` | Api events container cpu limit | `500m` | | `api.events.resources.limits.memory` | Api events container memory limit | `400Mi` | | `api.events.podSecurityContext.enabled` | Enable security context | `true` | @@ -293,6 +296,9 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `kaniko.image.registry` | Kaniko image registry | `gcr.io` | | `kaniko.image.repository` | Kaniko image repository | `kaniko-project/executor` | | `kaniko.image.tag` | Kaniko image tag | `v1.8.1` | +| `kaniko.resources.requests.cpu` | Kaniko container cpu request | `500m` | +| `kaniko.resources.requests.memory` | Kaniko container memory request | `256Mi` | +| `kaniko.resources.limits.memory` | Kaniko container memory limit | `32Gi` | | `kaniko.mirror` | If set to `true` pull base 
images from the local registry. | `false` | | `kaniko.dockerConfigSecretName` | A Docker config to use for pulling base images | `nil` | | `kaniko.cache.warmer.image.registry` | Kaniko cache warmer registry | `gcr.io` | @@ -314,12 +320,9 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | Name | Description | Value | | ------------------------------------------------------------ | -------------------------------------- | ------- | -| `registryPrepopulate.waitRegistry.resources.requests.cpu` | Wait registry container cpu request | `500m` | -| `registryPrepopulate.waitRegistry.resources.requests.memory` | Wait registry container memory request | `200Mi` | +| `registryPrepopulate.waitRegistry.resources.requests.cpu` | Wait registry container cpu request | `100m` | +| `registryPrepopulate.waitRegistry.resources.requests.memory` | Wait registry container memory request | `50Mi` | | `registryPrepopulate.waitRegistry.resources.limits.memory` | Wait registry container memory limit | `400Mi` | -| `registryPrepopulate.kaniko.resources.requests.cpu` | Kaniko container cpu request | `1000m` | -| `registryPrepopulate.kaniko.resources.requests.memory` | Kaniko container memory request | `2Gi` | -| `registryPrepopulate.kaniko.resources.limits.memory` | Kaniko container memory limit | `8Gi` | | `registryPrepopulate.pause.resources.requests.cpu` | Pause container cpu request | `50m` | | `registryPrepopulate.pause.resources.requests.memory` | Pause container memory request | `64Mi` | | `registryPrepopulate.pause.resources.limits.memory` | Pause container memory limit | `128Mi` | diff --git a/charts/substra-backend/templates/deployment-registry-prepopulate.yaml b/charts/substra-backend/templates/deployment-registry-prepopulate.yaml index 9ca60e39f..d1f319b1c 100644 --- a/charts/substra-backend/templates/deployment-registry-prepopulate.yaml +++ b/charts/substra-backend/templates/deployment-registry-prepopulate.yaml @@ -29,7 +29,7 @@ spec: - name: kaniko image: {{ include "common.images.name" $.Values.kaniko.image }} resources: - {{- toYaml $.Values.registryPrepopulate.kaniko.resources | nindent 12 }} + {{- toYaml $.Values.kaniko.resources | nindent 12 }} args: - "--context=/docker-context" {{- if .dstImage }} diff --git a/charts/substra-backend/templates/statefulset-builder.yaml b/charts/substra-backend/templates/statefulset-builder.yaml index 50a100afd..e636876f4 100644 --- a/charts/substra-backend/templates/statefulset-builder.yaml +++ b/charts/substra-backend/templates/statefulset-builder.yaml @@ -160,6 +160,8 @@ spec: value: {{ .Values.kaniko.dockerConfigSecretName | quote }} - name: OBJECTSTORE_URL value: {{ include "substra-backend.objectStore.url" . 
| quote }} + - name: KANIKO_RESOURCES + value: {{ toYaml .Values.kaniko.resources | quote }} ports: - name: http containerPort: 8000 diff --git a/charts/substra-backend/templates/statefulset-worker.yaml b/charts/substra-backend/templates/statefulset-worker.yaml index e797e4661..48ce3c1b4 100644 --- a/charts/substra-backend/templates/statefulset-worker.yaml +++ b/charts/substra-backend/templates/statefulset-worker.yaml @@ -109,6 +109,8 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName + - name: COMPUTE_POD_RESOURCES + value: {{ toYaml .Values.worker.computePod.resources | quote }} - name: COMPUTE_POD_MAX_STARTUP_WAIT_SECONDS value: {{ .Values.worker.computePod.maxStartupWaitSeconds | quote }} - name: OBJECTSTORE_URL diff --git a/charts/substra-backend/values.yaml b/charts/substra-backend/values.yaml index 1109118a8..cf53e89ac 100644 --- a/charts/substra-backend/values.yaml +++ b/charts/substra-backend/values.yaml @@ -355,6 +355,16 @@ worker: fsGroup: 1001 runAsUser: 1001 runAsGroup: 1001 + ## @param worker.computePod.resources.requests.cpu Worker compute pod container cpu request + ## @param worker.computePod.resources.requests.memory Worker compute pod container memory request + ## @param worker.computePod.resources.limits.memory Worker compute pod container memory limit + ## + resources: + requests: + cpu: "500m" + memory: "512Mi" + limits: + memory: "64Gi" events: ## @param worker.events.enabled Enable event service ## @@ -577,7 +587,6 @@ builder: limits: cpu: "2000m" memory: "8Gi" - ## @param builder.nodeSelector Node labels for pod assignment ## nodeSelector: { } @@ -749,6 +758,16 @@ kaniko: registry: gcr.io repository: kaniko-project/executor tag: v1.8.1 + ## @param kaniko.resources.requests.cpu Kaniko container cpu request + ## @param kaniko.resources.requests.memory Kaniko container memory request + ## @param kaniko.resources.limits.memory Kaniko container memory limit + ## + resources: + requests: + cpu: "500m" + memory: "256Mi" + limits: + memory: "32Gi" ## @param kaniko.mirror If set to `true` pull base images from the local registry. 
## mirror: false @@ -814,17 +833,6 @@ registryPrepopulate: cpu: "100m" limits: memory: "400Mi" - ## @param registryPrepopulate.kaniko.resources.requests.cpu Kaniko container cpu request - ## @param registryPrepopulate.kaniko.resources.requests.memory Kaniko container memory request - ## @param registryPrepopulate.kaniko.resources.limits.memory Kaniko container memory limit - ## - kaniko: - resources: - requests: - memory: "256Mi" - cpu: "500m" - limits: - memory: "8Gi" ## @param registryPrepopulate.pause.resources.requests.cpu Pause container cpu request ## @param registryPrepopulate.pause.resources.requests.memory Pause container memory request ## @param registryPrepopulate.pause.resources.limits.memory Pause container memory limit From 9e56fdd65fcae62d0b3db011f9cdfd49245b3417 Mon Sep 17 00:00:00 2001 From: ThibaultFy Date: Thu, 25 Apr 2024 15:17:50 +0200 Subject: [PATCH 06/11] chore: all resources in values Signed-off-by: ThibaultFy --- backend/substrapp/kubernetes_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/substrapp/kubernetes_utils.py b/backend/substrapp/kubernetes_utils.py index 663542fcc..dfc1ed737 100644 --- a/backend/substrapp/kubernetes_utils.py +++ b/backend/substrapp/kubernetes_utils.py @@ -1,6 +1,6 @@ import kubernetes -import yaml import structlog +import yaml from django.conf import settings from substrapp.exceptions import KubernetesError From d193ffcd816f49c7ec1d48acc0735e7e700f7ccc Mon Sep 17 00:00:00 2001 From: ThibaultFy Date: Thu, 25 Apr 2024 15:28:54 +0200 Subject: [PATCH 07/11] chore: chart version and changelog Signed-off-by: ThibaultFy --- changes/892.added | 1 + charts/substra-backend/CHANGELOG.md | 6 ++++++ charts/substra-backend/Chart.yaml | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 changes/892.added diff --git a/changes/892.added b/changes/892.added new file mode 100644 index 000000000..ca73aaa08 --- /dev/null +++ b/changes/892.added @@ -0,0 +1 @@ +Resources to Kaniko pods and computeTask pods \ No newline at end of file diff --git a/charts/substra-backend/CHANGELOG.md b/charts/substra-backend/CHANGELOG.md index f480d971c..4591628f0 100644 --- a/charts/substra-backend/CHANGELOG.md +++ b/charts/substra-backend/CHANGELOG.md @@ -2,6 +2,12 @@ +## [26.4.0] - 2024-04-17 + +### Added + +- Resources limits and requests (CPU and memory) for all containers are set and configurable in values. 
+ ## [26.3.0] - 2024-04-19 ### Removed diff --git a/charts/substra-backend/Chart.yaml b/charts/substra-backend/Chart.yaml index 34114016d..48d346719 100644 --- a/charts/substra-backend/Chart.yaml +++ b/charts/substra-backend/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 name: substra-backend home: https://github.com/Substra -version: 26.3.0 +version: 26.4.0 appVersion: 0.45.0 kubeVersion: ">= 1.19.0-0" description: Main package for Substra From bd89e78c4ddc68ebc70d3eb28c72c60bd1085ebc Mon Sep 17 00:00:00 2001 From: ThibaultFy Date: Thu, 25 Apr 2024 15:45:00 +0200 Subject: [PATCH 08/11] chore: lower resources Signed-off-by: ThibaultFy --- charts/substra-backend/values.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/charts/substra-backend/values.yaml b/charts/substra-backend/values.yaml index cf53e89ac..c628eb247 100644 --- a/charts/substra-backend/values.yaml +++ b/charts/substra-backend/values.yaml @@ -179,7 +179,7 @@ server: ## resources: requests: - cpu: "500m" + cpu: "200m" memory: "512Mi" limits: cpu: "2000m" @@ -313,7 +313,7 @@ worker: ## resources: requests: - cpu: "500m" + cpu: "200m" memory: "512Mi" limits: cpu: "2000m" @@ -462,7 +462,7 @@ schedulerWorker: ## resources: requests: - cpu: "100m" + cpu: "50m" memory: "50Mi" limits: cpu: "250m" @@ -505,7 +505,7 @@ scheduler: ## resources: requests: - cpu: "100m" + cpu: "50m" memory: "50Mi" limits: cpu: "250m" @@ -582,7 +582,7 @@ builder: ## resources: requests: - cpu: "500m" + cpu: "200m" memory: "512Mi" limits: cpu: "2000m" @@ -947,7 +947,7 @@ postgresql: - ALL resources: requests: - cpu: "100m" + cpu: "50m" memory: "256Mi" limits: cpu: "1000m" @@ -968,7 +968,7 @@ redis: redis: 6379 resources: requests: - cpu: "100m" + cpu: "50m" memory: "256Mi" limits: cpu: "500m" @@ -1036,7 +1036,7 @@ localstack: nodePort: "" resources: requests: - cpu: "100m" + cpu: "50m" memory: "1Gi" limits: cpu: "500m" From 0df805e0d73325fa53339a8a7c96c79ece947ce3 Mon Sep 17 00:00:00 2001 From: ThibaultFy Date: Thu, 25 Apr 2024 15:54:31 +0200 Subject: [PATCH 09/11] chore: yaml safe load Signed-off-by: ThibaultFy --- backend/substrapp/kubernetes_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/substrapp/kubernetes_utils.py b/backend/substrapp/kubernetes_utils.py index dfc1ed737..110474428 100644 --- a/backend/substrapp/kubernetes_utils.py +++ b/backend/substrapp/kubernetes_utils.py @@ -52,7 +52,9 @@ def get_resources_requirements_from_yaml( *, yaml_resources: str, ) -> kubernetes.client.V1ResourceRequirements: - resources_dict = yaml.load(yaml_resources, Loader=yaml.FullLoader) + """Return a kubernetes.client.V1ResourceRequirements object from a yaml string.""" + resources_dict = yaml.safe_load(yaml_resources) + return kubernetes.client.V1ResourceRequirements( requests=resources_dict["requests"], limits=resources_dict["limits"] ) From 052bd5070fbcf7d2173f88642170fed0d9318481 Mon Sep 17 00:00:00 2001 From: ThibaultFy Date: Thu, 25 Apr 2024 15:56:00 +0200 Subject: [PATCH 10/11] chore: update chart changelog and readme Signed-off-by: ThibaultFy --- charts/substra-backend/CHANGELOG.md | 2 +- charts/substra-backend/README.md | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/charts/substra-backend/CHANGELOG.md b/charts/substra-backend/CHANGELOG.md index 4591628f0..dee495c90 100644 --- a/charts/substra-backend/CHANGELOG.md +++ b/charts/substra-backend/CHANGELOG.md @@ -2,7 +2,7 @@ -## [26.4.0] - 2024-04-17 +## [26.4.0] - 2024-04-25 ### Added diff --git 
a/charts/substra-backend/README.md b/charts/substra-backend/README.md index 11d1acafe..82229aaf9 100644 --- a/charts/substra-backend/README.md +++ b/charts/substra-backend/README.md @@ -72,7 +72,7 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `server.ingress.extraHosts` | The list of additional hostnames to be covered with this ingress record | `[]` | | `server.ingress.extraTls` | The tls configuration for hostnames to be coverred by the ingress | `[]` | | `server.ingress.ingressClassName` | _IngressClass_ that will be used to implement the Ingress | `nil` | -| `server.resources.requests.cpu` | Server container cpu request | `500m` | +| `server.resources.requests.cpu` | Server container cpu request | `200m` | | `server.resources.requests.memory` | Server container memory request | `512Mi` | | `server.resources.limits.cpu` | Server container cpu limit | `2000m` | | `server.resources.limits.memory` | Server container memory limit | `12Gi` | @@ -122,7 +122,7 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `worker.podSecurityContext.runAsUser` | User ID for the pod | `1001` | | `worker.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` | | `worker.podSecurityContext.fsGroup` | FileSystem group ID for the pod | `1001` | -| `worker.resources.requests.cpu` | Worker container cpu request | `500m` | +| `worker.resources.requests.cpu` | Worker container cpu request | `200m` | | `worker.resources.requests.memory` | Worker container memory request | `512Mi` | | `worker.resources.limits.cpu` | Worker container cpu limit | `2000m` | | `worker.resources.limits.memory` | Worker container memory limit | `8Gi` | @@ -176,7 +176,7 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `schedulerWorker.nodeSelector` | Node labels for pod assignment | `{}` | | `schedulerWorker.tolerations` | Toleration labels for pod assignment | `[]` | | `schedulerWorker.affinity` | Affinity settings for pod assignment | `{}` | -| `schedulerWorker.resources.requests.cpu` | Scheduler container cpu request | `100m` | +| `schedulerWorker.resources.requests.cpu` | Scheduler container cpu request | `50m` | | `schedulerWorker.resources.requests.memory` | Scheduler container memory request | `50Mi` | | `schedulerWorker.resources.limits.cpu` | Scheduler container cpu limit | `250m` | | `schedulerWorker.resources.limits.memory` | Scheduler container memory limit | `400Mi` | @@ -196,7 +196,7 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `scheduler.image.tag` | Substra backend tasks scheduler image tag (defaults to AppVersion) | `nil` | | `scheduler.image.pullPolicy` | Substra backend task scheduler image pull policy | `IfNotPresent` | | `scheduler.image.pullSecrets` | Specify image pull secrets | `[]` | -| `scheduler.resources.requests.cpu` | Scheduler container cpu request | `100m` | +| `scheduler.resources.requests.cpu` | Scheduler container cpu request | `50m` | | `scheduler.resources.requests.memory` | Scheduler container memory request | `50Mi` | | `scheduler.resources.limits.cpu` | Scheduler container cpu limit | `250m` | | `scheduler.resources.limits.memory` | Scheduler container memory limit | `400Mi` | @@ -225,7 +225,7 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `builder.podSecurityContext.runAsUser` | User ID for the pod | `1001` | | `builder.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` | | 
`builder.podSecurityContext.fsGroup` | FileSystem group ID for the pod | `1001` | -| `builder.resources.requests.cpu` | Builder container cpu request | `500m` | +| `builder.resources.requests.cpu` | Builder container cpu request | `200m` | | `builder.resources.requests.memory` | Builder container memory request | `512Mi` | | `builder.resources.limits.cpu` | Builder container cpu limit | `2000m` | | `builder.resources.limits.memory` | Builder container memory limit | `8Gi` | From 70fb2c77cda0dddce105f2c420d5b5018ba4cb2e Mon Sep 17 00:00:00 2001 From: ThibaultFy Date: Thu, 25 Apr 2024 15:56:32 +0200 Subject: [PATCH 11/11] chore: update backend documentation Signed-off-by: ThibaultFy --- docs/settings.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/settings.md b/docs/settings.md index 3fd315b99..0beb8b5e9 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -17,6 +17,7 @@ Accepted true values for `bool` are: `1`, `ON`, `On`, `on`, `T`, `t`, `TRUE`, `T | string | `COMMON_HOST_DOMAIN` | nil | | | string | `COMPUTE_POD_FS_GROUP` | nil | | | int | `COMPUTE_POD_GKE_GPUS_LIMITS` | `0` | | +| string | `COMPUTE_POD_RESOURCES` | nil | | | string | `COMPUTE_POD_RUN_AS_GROUP` | nil | | | string | `COMPUTE_POD_RUN_AS_USER` | nil | | | int | `COMPUTE_POD_STARTUP_TIMEOUT_SECONDS` | `300` | | @@ -43,6 +44,7 @@ Accepted true values for `bool` are: `1`, `ON`, `On`, `on`, `T`, `t`, `TRUE`, `T | string | `KANIKO_DOCKER_CONFIG_SECRET_NAME` | nil | | | string | `KANIKO_IMAGE` | nil | | | bool | `KANIKO_MIRROR` | `False` | | +| string | `KANIKO_RESOURCES` | nil | | | bool | `LOGGING_USE_COLORS` | `True` | | | string | `LOG_LEVEL` | `INFO` | | | string | `NAMESPACE` | nil | |
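
---

Note on the pattern introduced by this series: the chart serializes the `worker.computePod.resources` and `kaniko.resources` values into YAML strings exposed through the `COMPUTE_POD_RESOURCES` and `KANIKO_RESOURCES` environment variables (via `toYaml ... | quote`), and the backend parses those strings back into `kubernetes.client.V1ResourceRequirements` objects with `yaml.safe_load` (see `get_resources_requirements_from_yaml` in patch 09). The sketch below illustrates that round trip only; the `resources_from_env` helper, the environment lookup with a fallback default, and the example container are illustrative assumptions and not code from these patches.

```python
# Minimal sketch, assuming the consuming process reads the same YAML shape
# as the chart values ({requests: ..., limits: ...}) from an env var.
# The helper name, the fallback default and the container below are hypothetical.
import os

import kubernetes
import yaml

_DEFAULT_RESOURCES = """
requests:
  cpu: 200m
  memory: 512Mi
limits:
  memory: 64Gi
"""


def resources_from_env(var_name: str) -> kubernetes.client.V1ResourceRequirements:
    # The chart renders e.g. `toYaml .Values.worker.computePod.resources | quote`,
    # so the env var holds a small YAML document with `requests` and `limits` keys.
    yaml_resources = os.environ.get(var_name) or _DEFAULT_RESOURCES
    resources_dict = yaml.safe_load(yaml_resources)
    return kubernetes.client.V1ResourceRequirements(
        requests=resources_dict["requests"],
        limits=resources_dict["limits"],
    )


# Example: attach the parsed requirements to a container spec.
container = kubernetes.client.V1Container(
    name="compute",
    image="example/compute:latest",
    resources=resources_from_env("COMPUTE_POD_RESOURCES"),
)
print(container.resources)
```

Since the default values set CPU and memory requests plus a memory limit but no CPU limit for the compute and Kaniko containers, these pods land in the Burstable QoS class: they are scheduled against their requests, may use spare CPU beyond them, and have memory usage capped by the limit.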