From c7ca4a1b0df6ee3807b79a472728a9af54930cc2 Mon Sep 17 00:00:00 2001 From: ThibaultFy <50656860+ThibaultFy@users.noreply.github.com> Date: Fri, 26 Apr 2024 09:43:20 +0200 Subject: [PATCH] chore: add limits and requests configuration in values (#867) (#892) Signed-off-by: SdgJlbl Signed-off-by: ThibaultFy Co-authored-by: SdgJlbl --- backend/backend/settings/common.py | 2 + .../builder/image_builder/image_builder.py | 3 + .../substrapp/compute_tasks/compute_pod.py | 3 + backend/substrapp/kubernetes_utils.py | 13 ++ backend/substrapp/tasks/task.py | 2 +- changes/892.added | 1 + charts/substra-backend/CHANGELOG.md | 6 + charts/substra-backend/Chart.yaml | 2 +- charts/substra-backend/README.md | 56 ++++- .../templates/deployment-api-events.yaml | 2 + .../deployment-registry-prepopulate.yaml | 6 + .../templates/deployment-worker-events.yaml | 2 + .../templates/statefulset-builder.yaml | 2 + .../templates/statefulset-worker.yaml | 2 + charts/substra-backend/values.yaml | 206 +++++++++++++++--- docs/settings.md | 2 + 16 files changed, 264 insertions(+), 46 deletions(-) create mode 100644 changes/892.added diff --git a/backend/backend/settings/common.py b/backend/backend/settings/common.py index 23ece165a..5cae475e4 100644 --- a/backend/backend/settings/common.py +++ b/backend/backend/settings/common.py @@ -197,6 +197,7 @@ "KANIKO_MIRROR": to_bool(os.environ.get("KANIKO_MIRROR", False)), "KANIKO_IMAGE": os.environ.get("KANIKO_IMAGE"), "KANIKO_DOCKER_CONFIG_SECRET_NAME": os.environ.get("KANIKO_DOCKER_CONFIG_SECRET_NAME"), + "KANIKO_RESOURCES": os.environ.get("KANIKO_RESOURCES"), "COMPUTE_POD_STARTUP_TIMEOUT_SECONDS": int(os.environ.get("COMPUTE_POD_STARTUP_TIMEOUT_SECONDS", 300)), "PRIVATE_CA_ENABLED": to_bool(os.environ.get("PRIVATE_CA_ENABLED")), } @@ -223,6 +224,7 @@ COMPUTE_POD_RUN_AS_GROUP = os.environ.get("COMPUTE_POD_RUN_AS_GROUP") COMPUTE_POD_FS_GROUP = os.environ.get("COMPUTE_POD_FS_GROUP") COMPUTE_POD_GKE_GPUS_LIMITS = int(os.environ.get("COMPUTE_POD_GKE_GPUS_LIMITS", 0)) +COMPUTE_POD_RESOURCES = os.environ.get("COMPUTE_POD_RESOURCES") # Prometheus configuration ENABLE_METRICS = to_bool(os.environ.get("ENABLE_METRICS", False)) diff --git a/backend/builder/image_builder/image_builder.py b/backend/builder/image_builder/image_builder.py index 38810abac..3270e0fc1 100644 --- a/backend/builder/image_builder/image_builder.py +++ b/backend/builder/image_builder/image_builder.py @@ -23,6 +23,7 @@ from substrapp.compute_tasks.volumes import get_worker_subtuple_pvc_name from substrapp.docker_registry import USER_IMAGE_REPOSITORY from substrapp.kubernetes_utils import delete_pod +from substrapp.kubernetes_utils import get_resources_requirements_from_yaml from substrapp.kubernetes_utils import get_security_context from substrapp.lock_local import lock_resource from substrapp.utils import timeit @@ -42,6 +43,7 @@ IMAGE_BUILD_TIMEOUT = settings.IMAGE_BUILD_TIMEOUT KANIKO_CONTAINER_NAME = "kaniko" HOSTNAME = settings.HOSTNAME +KANIKO_RESOURCES = settings.TASK["KANIKO_RESOURCES"] def container_image_tag_from_function(function: orchestrator.Function) -> str: @@ -306,6 +308,7 @@ def _build_container(dockerfile_mount_path: str, image_tag: str) -> kubernetes.c args=args, volume_mounts=volume_mounts, security_context=container_security_context, + resources=get_resources_requirements_from_yaml(yaml_resources=KANIKO_RESOURCES), ) diff --git a/backend/substrapp/compute_tasks/compute_pod.py b/backend/substrapp/compute_tasks/compute_pod.py index bc12bea02..2f46df4b0 100644 --- 
a/backend/substrapp/compute_tasks/compute_pod.py +++ b/backend/substrapp/compute_tasks/compute_pod.py @@ -6,9 +6,11 @@ from substrapp.kubernetes_utils import delete_pod from substrapp.kubernetes_utils import get_pod_security_context +from substrapp.kubernetes_utils import get_resources_requirements_from_yaml from substrapp.kubernetes_utils import get_security_context NAMESPACE = settings.NAMESPACE +COMPUTE_POD_RESOURCES = settings.COMPUTE_POD_RESOURCES logger = structlog.get_logger(__name__) @@ -112,6 +114,7 @@ def create_pod( args=None, volume_mounts=volume_mounts + gpu_volume_mounts, security_context=get_security_context(), + resources=get_resources_requirements_from_yaml(yaml_resources=COMPUTE_POD_RESOURCES), env=[kubernetes.client.V1EnvVar(name=env_name, value=env_value) for env_name, env_value in environment.items()], **container_optional_kwargs, ) diff --git a/backend/substrapp/kubernetes_utils.py b/backend/substrapp/kubernetes_utils.py index 5bb6ac65f..110474428 100644 --- a/backend/substrapp/kubernetes_utils.py +++ b/backend/substrapp/kubernetes_utils.py @@ -1,5 +1,6 @@ import kubernetes import structlog +import yaml from django.conf import settings from substrapp.exceptions import KubernetesError @@ -47,6 +48,18 @@ def get_security_context(root: bool = False, capabilities: list[str] = None) -> return security_context +def get_resources_requirements_from_yaml( + *, + yaml_resources: str, +) -> kubernetes.client.V1ResourceRequirements: + """Return a kubernetes.client.V1ResourceRequirements object from a yaml string.""" + resources_dict = yaml.safe_load(yaml_resources) + + return kubernetes.client.V1ResourceRequirements( + requests=resources_dict["requests"], limits=resources_dict["limits"] + ) + + def pod_exists_by_label_selector(k8s_client: kubernetes.client.CoreV1Api, label_selector: str) -> bool: """Return True if the pod exists, else False. diff --git a/backend/substrapp/tasks/task.py b/backend/substrapp/tasks/task.py index c5adda2be..a87083b57 100644 --- a/backend/substrapp/tasks/task.py +++ b/backend/substrapp/tasks/task.py @@ -73,7 +73,7 @@ def on_success(self, retval: dict[str, Any], task_id: str, args: tuple, kwargs: # Celery does not provide unpacked arguments, we are doing it in `split_args` def on_retry(self, exc: Exception, task_id: str, args: tuple, kwargs: dict[str, Any], einfo: ExceptionInfo) -> None: _, task = self.split_args(args) - # delete compute pod to reset hardware ressources + # delete compute pod to reset hardware resources delete_compute_plan_pods(task.compute_plan_key) logger.info( "Retrying task", diff --git a/changes/892.added b/changes/892.added new file mode 100644 index 000000000..ca73aaa08 --- /dev/null +++ b/changes/892.added @@ -0,0 +1 @@ +Resources to Kaniko pods and computeTask pods \ No newline at end of file diff --git a/charts/substra-backend/CHANGELOG.md b/charts/substra-backend/CHANGELOG.md index f480d971c..dee495c90 100644 --- a/charts/substra-backend/CHANGELOG.md +++ b/charts/substra-backend/CHANGELOG.md @@ -2,6 +2,12 @@ +## [26.4.0] - 2024-04-25 + +### Added + +- Resources limits and requests (CPU and memory) for all containers are set and configurable in values. 
+ ## [26.3.0] - 2024-04-19 ### Removed diff --git a/charts/substra-backend/Chart.yaml b/charts/substra-backend/Chart.yaml index 34114016d..48d346719 100644 --- a/charts/substra-backend/Chart.yaml +++ b/charts/substra-backend/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 name: substra-backend home: https://github.com/Substra -version: 26.3.0 +version: 26.4.0 appVersion: 0.45.0 kubeVersion: ">= 1.19.0-0" description: Main package for Substra diff --git a/charts/substra-backend/README.md b/charts/substra-backend/README.md index 758fc4116..82229aaf9 100644 --- a/charts/substra-backend/README.md +++ b/charts/substra-backend/README.md @@ -64,7 +64,7 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `server.service.externalIPs` | A list of IP addresses for which nodes in the cluster will also accept traffic for this service | `[]` | | `server.service.annotations` | Additional annotations for the _Service_ resource. | `{}` | | `server.ingress.enabled` | Deploy an ingress for the substra backend server | `false` | -| `server.ingress.hostname` | Default host for the ingress ressource | `substra.backend.local` | +| `server.ingress.hostname` | Default host for the ingress resource | `substra.backend.local` | | `server.ingress.pathType` | Ingress path type | `ImplementationSpecific` | | `server.ingress.path` | Path for the default host | `/` | | `server.ingress.extraPaths` | The list of extra paths to be created for the default host | `[]` | @@ -72,7 +72,10 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `server.ingress.extraHosts` | The list of additional hostnames to be covered with this ingress record | `[]` | | `server.ingress.extraTls` | The tls configuration for hostnames to be coverred by the ingress | `[]` | | `server.ingress.ingressClassName` | _IngressClass_ that will be used to implement the Ingress | `nil` | -| `server.resources` | Server container resources requests and limits | `{}` | +| `server.resources.requests.cpu` | Server container cpu request | `200m` | +| `server.resources.requests.memory` | Server container memory request | `512Mi` | +| `server.resources.limits.cpu` | Server container cpu limit | `2000m` | +| `server.resources.limits.memory` | Server container memory limit | `12Gi` | | `server.persistence.storageClass` | Specify the _StorageClass_ used to provision the volume. Or the default _StorageClass_ will be used. 
Set it to `-` to disable dynamic provisioning | `""` | | `server.persistence.servermedias.size` | Servermedias volume size | `10Gi` | | `server.persistence.servermedias.existingClaim` | use this PVC rather than creating a new one | `nil` | @@ -119,7 +122,10 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `worker.podSecurityContext.runAsUser` | User ID for the pod | `1001` | | `worker.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` | | `worker.podSecurityContext.fsGroup` | FileSystem group ID for the pod | `1001` | -| `worker.resources` | Worker container resources requests and limits | `{}` | +| `worker.resources.requests.cpu` | Worker container cpu request | `200m` | +| `worker.resources.requests.memory` | Worker container memory request | `512Mi` | +| `worker.resources.limits.cpu` | Worker container cpu limit | `2000m` | +| `worker.resources.limits.memory` | Worker container memory limit | `8Gi` | | `worker.nodeSelector` | Node labels for pod assignment | `{}` | | `worker.tolerations` | Toleration labels for pod assignment | `[]` | | `worker.affinity` | Affinity settings for pod assignment, ignored if `DataSampleStorageInServerMedia` is `true` | `{}` | @@ -132,12 +138,19 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `worker.computePod.securityContext.fsGroup` | Set the filesystem group for the Compute pod | `1001` | | `worker.computePod.securityContext.runAsUser` | Set the user for the Compute pod | `1001` | | `worker.computePod.securityContext.runAsGroup` | Set the group for the Compute pod | `1001` | +| `worker.computePod.resources.requests.cpu` | Worker compute pod container cpu request | `500m` | +| `worker.computePod.resources.requests.memory` | Worker compute pod container memory request | `512Mi` | +| `worker.computePod.resources.limits.memory` | Worker compute pod container memory limit | `64Gi` | | `worker.events.enabled` | Enable event service | `true` | | `worker.events.image.registry` | Substra event app image registry | `ghcr.io` | | `worker.events.image.repository` | Substra event app image repository | `substra/substra-backend` | | `worker.events.image.tag` | Substra event app image tag (defaults to AppVersion) | `nil` | | `worker.events.image.pullPolicy` | Substra event app image pull policy | `IfNotPresent` | | `worker.events.image.pullSecrets` | Specify image pull secrets | `[]` | +| `worker.events.resources.requests.cpu` | Worker events container cpu request | `100m` | +| `worker.events.resources.requests.memory` | Worker events container memory request | `50Mi` | +| `worker.events.resources.limits.cpu` | Worker events container cpu limit | `500m` | +| `worker.events.resources.limits.memory` | Worker events container memory limit | `400Mi` | | `worker.events.podSecurityContext.enabled` | Enable security context | `true` | | `worker.events.podSecurityContext.runAsUser` | User ID for the pod | `1001` | | `worker.events.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` | @@ -163,7 +176,10 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `schedulerWorker.nodeSelector` | Node labels for pod assignment | `{}` | | `schedulerWorker.tolerations` | Toleration labels for pod assignment | `[]` | | `schedulerWorker.affinity` | Affinity settings for pod assignment | `{}` | -| `schedulerWorker.resources` | Scheduler container resources requests and limits | `{}` | +| `schedulerWorker.resources.requests.cpu` | Scheduler container cpu request | 
`50m` | +| `schedulerWorker.resources.requests.memory` | Scheduler container memory request | `50Mi` | +| `schedulerWorker.resources.limits.cpu` | Scheduler container cpu limit | `250m` | +| `schedulerWorker.resources.limits.memory` | Scheduler container memory limit | `400Mi` | | `schedulerWorker.podSecurityContext.enabled` | Enable security context | `true` | | `schedulerWorker.podSecurityContext.runAsUser` | User ID for the pod | `1001` | | `schedulerWorker.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` | @@ -180,7 +196,10 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `scheduler.image.tag` | Substra backend tasks scheduler image tag (defaults to AppVersion) | `nil` | | `scheduler.image.pullPolicy` | Substra backend task scheduler image pull policy | `IfNotPresent` | | `scheduler.image.pullSecrets` | Specify image pull secrets | `[]` | -| `scheduler.resources` | Scheduler container resources requests and limits | `{}` | +| `scheduler.resources.requests.cpu` | Scheduler container cpu request | `50m` | +| `scheduler.resources.requests.memory` | Scheduler container memory request | `50Mi` | +| `scheduler.resources.limits.cpu` | Scheduler container cpu limit | `250m` | +| `scheduler.resources.limits.memory` | Scheduler container memory limit | `400Mi` | | `scheduler.nodeSelector` | Node labels for pod assignment | `{}` | | `scheduler.tolerations` | Toleration labels for pod assignment | `[]` | | `scheduler.affinity` | Affinity settings for pod assignment | `{}` | @@ -194,8 +213,8 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | Name | Description | Value | | --------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------- | | `builder.replicaCount` | Number of builder replicas | `1` | -| `builder.enabled` | Enable worker service | `true` | -| `builder.replicaCount` | Replica count for the worker service | `1` | +| `builder.enabled` | Enable builder service | `true` | +| `builder.replicaCount` | Replica count for the builder service | `1` | | `builder.concurrency` | Maximum amount of tasks to process in parallel | `1` | | `builder.image.registry` | Substra backend server image registry | `ghcr.io` | | `builder.image.repository` | Substra backend server image repository | `substra/substra-backend` | @@ -206,7 +225,10 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `builder.podSecurityContext.runAsUser` | User ID for the pod | `1001` | | `builder.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` | | `builder.podSecurityContext.fsGroup` | FileSystem group ID for the pod | `1001` | -| `builder.resources` | Builder container resources requests and limits | `{}` | +| `builder.resources.requests.cpu` | Builder container cpu request | `200m` | +| `builder.resources.requests.memory` | Builder container memory request | `512Mi` | +| `builder.resources.limits.cpu` | Builder container cpu limit | `2000m` | +| `builder.resources.limits.memory` | Builder container memory limit | `8Gi` | | `builder.nodeSelector` | Node labels for pod assignment | `{}` | | `builder.tolerations` | Toleration labels for pod assignment | `[]` | | `builder.affinity` | Affinity settings for pod assignment, ignored if `DataSampleStorageInServerMedia` is `true` | `{}` | @@ -238,6 +260,10 @@ See 
[UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `api.events.image.tag` | Substra event app image tag (defaults to AppVersion) | `nil` | | `api.events.image.pullPolicy` | Substra event app image pull policy | `IfNotPresent` | | `api.events.image.pullSecrets` | Specify image pull secrets | `[]` | +| `api.events.resources.requests.cpu` | Api events container cpu request | `100m` | +| `api.events.resources.requests.memory` | Api events container memory request | `50Mi` | +| `api.events.resources.limits.cpu` | Api events container cpu limit | `500m` | +| `api.events.resources.limits.memory` | Api events container memory limit | `400Mi` | | `api.events.podSecurityContext.enabled` | Enable security context | `true` | | `api.events.podSecurityContext.runAsUser` | User ID for the pod | `1001` | | `api.events.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` | @@ -270,6 +296,9 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `kaniko.image.registry` | Kaniko image registry | `gcr.io` | | `kaniko.image.repository` | Kaniko image repository | `kaniko-project/executor` | | `kaniko.image.tag` | Kaniko image tag | `v1.8.1` | +| `kaniko.resources.requests.cpu` | Kaniko container cpu request | `500m` | +| `kaniko.resources.requests.memory` | Kaniko container memory request | `256Mi` | +| `kaniko.resources.limits.memory` | Kaniko container memory limit | `32Gi` | | `kaniko.mirror` | If set to `true` pull base images from the local registry. | `false` | | `kaniko.dockerConfigSecretName` | A Docker config to use for pulling base images | `nil` | | `kaniko.cache.warmer.image.registry` | Kaniko cache warmer registry | `gcr.io` | @@ -287,6 +316,17 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub | `addAccountOperator.incomingOrganizations` | Incoming organizations credentials for substra backend organization-to-organization communications | `[]` | | `addAccountOperator.users` | A list of administrators users who can log into the substra backend server with admin privileges | `[]` | +### Registry prepopulate + +| Name | Description | Value | +| ------------------------------------------------------------ | -------------------------------------- | ------- | +| `registryPrepopulate.waitRegistry.resources.requests.cpu` | Wait registry container cpu request | `100m` | +| `registryPrepopulate.waitRegistry.resources.requests.memory` | Wait registry container memory request | `50Mi` | +| `registryPrepopulate.waitRegistry.resources.limits.memory` | Wait registry container memory limit | `400Mi` | +| `registryPrepopulate.pause.resources.requests.cpu` | Pause container cpu request | `50m` | +| `registryPrepopulate.pause.resources.requests.memory` | Pause container memory request | `64Mi` | +| `registryPrepopulate.pause.resources.limits.memory` | Pause container memory limit | `128Mi` | + ### Single Sign-On through OpenID Connect Uses the authorization code flow. 
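As an illustration of how the new defaults documented in the README table above are consumed, any of the `*.resources` blocks can be overridden from a custom values file at install or upgrade time. The key paths below follow the table; the figures are examples only, not recommendations from this patch:

```yaml
# example-values.yaml -- illustrative override of the new resource blocks
worker:
  resources:
    requests:
      cpu: "500m"
      memory: "1Gi"
    limits:
      cpu: "4000m"
      memory: "16Gi"
  computePod:
    resources:
      requests:
        cpu: "1000m"
        memory: "2Gi"
      limits:
        memory: "32Gi"
kaniko:
  resources:
    requests:
      cpu: "500m"
      memory: "512Mi"
    limits:
      memory: "16Gi"
```

Such a file is passed with the usual `-f`/`--values` flag (for example `helm upgrade --install <release> <chart> -f example-values.yaml`, with release and chart references as placeholders). The `worker.computePod.resources` and `kaniko.resources` blocks additionally reach the backend through the `COMPUTE_POD_RESOURCES` and `KANIKO_RESOURCES` environment variables added by this patch.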
diff --git a/charts/substra-backend/templates/deployment-api-events.yaml b/charts/substra-backend/templates/deployment-api-events.yaml index 8df3831d1..b2e8bbd00 100644 --- a/charts/substra-backend/templates/deployment-api-events.yaml +++ b/charts/substra-backend/templates/deployment-api-events.yaml @@ -41,6 +41,8 @@ spec: - name: api-event-app image: {{ include "substra-backend.images.name" (dict "img" .Values.api.events.image "defaultTag" $.Chart.AppVersion) }} imagePullPolicy: {{ .Values.api.events.image.pullPolicy }} + resources: + {{- toYaml .Values.api.events.resources | nindent 12 }} command: ["/bin/bash"] {{- if eq .Values.settings "prod" }} args: ["-c", "python manage.py consume"] diff --git a/charts/substra-backend/templates/deployment-registry-prepopulate.yaml b/charts/substra-backend/templates/deployment-registry-prepopulate.yaml index 2f1da5668..d1f319b1c 100644 --- a/charts/substra-backend/templates/deployment-registry-prepopulate.yaml +++ b/charts/substra-backend/templates/deployment-registry-prepopulate.yaml @@ -23,9 +23,13 @@ spec: initContainers: - name: wait-registry image: jwilder/dockerize:0.6.1 + resources: + {{- toYaml $.Values.registryPrepopulate.waitRegistry | nindent 12 }} command: ['dockerize', '-wait', 'tcp://{{ $.Release.Name }}-docker-registry:5000'] - name: kaniko image: {{ include "common.images.name" $.Values.kaniko.image }} + resources: + {{- toYaml $.Values.kaniko.resources | nindent 12 }} args: - "--context=/docker-context" {{- if .dstImage }} @@ -48,6 +52,8 @@ spec: containers: - image: gcr.io/google-containers/pause:3.2 name: pause + resources: + {{- toYaml $.Values.registryPrepopulate.pause.resources | nindent 12 }} volumes: - name: kaniko-dir emptyDir: {} diff --git a/charts/substra-backend/templates/deployment-worker-events.yaml b/charts/substra-backend/templates/deployment-worker-events.yaml index 83aef2ead..60767f333 100644 --- a/charts/substra-backend/templates/deployment-worker-events.yaml +++ b/charts/substra-backend/templates/deployment-worker-events.yaml @@ -41,6 +41,8 @@ spec: - name: worker-event-app image: {{ include "substra-backend.images.name" (dict "img" .Values.worker.events.image "defaultTag" $.Chart.AppVersion) }} imagePullPolicy: {{ .Values.worker.events.image.pullPolicy }} + resources: + {{- toYaml .Values.worker.events.resources | nindent 12 }} command: ["/bin/bash"] {{- if eq .Values.settings "prod" }} args: ["-c", "python manage.py consume"] diff --git a/charts/substra-backend/templates/statefulset-builder.yaml b/charts/substra-backend/templates/statefulset-builder.yaml index 50a100afd..e636876f4 100644 --- a/charts/substra-backend/templates/statefulset-builder.yaml +++ b/charts/substra-backend/templates/statefulset-builder.yaml @@ -160,6 +160,8 @@ spec: value: {{ .Values.kaniko.dockerConfigSecretName | quote }} - name: OBJECTSTORE_URL value: {{ include "substra-backend.objectStore.url" . 
| quote }} + - name: KANIKO_RESOURCES + value: {{ toYaml .Values.kaniko.resources | quote }} ports: - name: http containerPort: 8000 diff --git a/charts/substra-backend/templates/statefulset-worker.yaml b/charts/substra-backend/templates/statefulset-worker.yaml index e797e4661..48ce3c1b4 100644 --- a/charts/substra-backend/templates/statefulset-worker.yaml +++ b/charts/substra-backend/templates/statefulset-worker.yaml @@ -109,6 +109,8 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName + - name: COMPUTE_POD_RESOURCES + value: {{ toYaml .Values.worker.computePod.resources | quote }} - name: COMPUTE_POD_MAX_STARTUP_WAIT_SECONDS value: {{ .Values.worker.computePod.maxStartupWaitSeconds | quote }} - name: OBJECTSTORE_URL diff --git a/charts/substra-backend/values.yaml b/charts/substra-backend/values.yaml index 17fd787a1..c628eb247 100644 --- a/charts/substra-backend/values.yaml +++ b/charts/substra-backend/values.yaml @@ -121,7 +121,7 @@ server: ## enabled: false - ## @param server.ingress.hostname Default host for the ingress ressource + ## @param server.ingress.hostname Default host for the ingress resource ## hostname: substra.backend.local @@ -172,17 +172,19 @@ server: ## ingressClassName: - ## @param server.resources Server container resources requests and limits - ## e.g: - ## resources: - ## limits: - ## cpu: 100m - ## memory: 128Mi - ## requests: - ## cpu: 100m - ## memory: 128Mi - ## - resources: {} + ## @param server.resources.requests.cpu Server container cpu request + ## @param server.resources.requests.memory Server container memory request + ## @param server.resources.limits.cpu Server container cpu limit + ## @param server.resources.limits.memory Server container memory limit + ## + resources: + requests: + cpu: "200m" + memory: "512Mi" + limits: + cpu: "2000m" + memory: "12Gi" + persistence: ## @param server.persistence.storageClass Specify the _StorageClass_ used to provision the volume. Or the default _StorageClass_ will be used. 
Set it to `-` to disable dynamic provisioning @@ -304,9 +306,18 @@ worker: runAsUser: 1001 runAsGroup: 1001 fsGroup: 1001 - ## @param worker.resources Worker container resources requests and limits - ## - resources: {} + ## @param worker.resources.requests.cpu Worker container cpu request + ## @param worker.resources.requests.memory Worker container memory request + ## @param worker.resources.limits.cpu Worker container cpu limit + ## @param worker.resources.limits.memory Worker container memory limit + ## + resources: + requests: + cpu: "200m" + memory: "512Mi" + limits: + cpu: "2000m" + memory: "8Gi" ## @param worker.nodeSelector Node labels for pod assignment ## nodeSelector: {} @@ -344,6 +355,16 @@ worker: fsGroup: 1001 runAsUser: 1001 runAsGroup: 1001 + ## @param worker.computePod.resources.requests.cpu Worker compute pod container cpu request + ## @param worker.computePod.resources.requests.memory Worker compute pod container memory request + ## @param worker.computePod.resources.limits.memory Worker compute pod container memory limit + ## + resources: + requests: + cpu: "500m" + memory: "512Mi" + limits: + memory: "64Gi" events: ## @param worker.events.enabled Enable event service ## @@ -360,6 +381,18 @@ worker: tag: null pullPolicy: IfNotPresent pullSecrets: [] + ## @param worker.events.resources.requests.cpu Worker events container cpu request + ## @param worker.events.resources.requests.memory Worker events container memory request + ## @param worker.events.resources.limits.cpu Worker events container cpu limit + ## @param worker.events.resources.limits.memory Worker events container memory limit + ## + resources: + requests: + memory: "50Mi" + cpu: "100m" + limits: + memory: "400Mi" + cpu: "500m" ## @param worker.events.podSecurityContext.enabled Enable security context ## @param worker.events.podSecurityContext.runAsUser User ID for the pod ## @param worker.events.podSecurityContext.runAsGroup Group ID for the pod @@ -422,9 +455,18 @@ schedulerWorker: ## @param schedulerWorker.affinity Affinity settings for pod assignment ## affinity: {} - ## @param schedulerWorker.resources Scheduler container resources requests and limits - ## - resources: {} + ## @param schedulerWorker.resources.requests.cpu Scheduler container cpu request + ## @param schedulerWorker.resources.requests.memory Scheduler container memory request + ## @param schedulerWorker.resources.limits.cpu Scheduler container cpu limit + ## @param schedulerWorker.resources.limits.memory Scheduler container memory limit + ## + resources: + requests: + cpu: "50m" + memory: "50Mi" + limits: + cpu: "250m" + memory: "400Mi" ## @param schedulerWorker.podSecurityContext.enabled Enable security context ## @param schedulerWorker.podSecurityContext.runAsUser User ID for the pod ## @param schedulerWorker.podSecurityContext.runAsGroup Group ID for the pod @@ -456,9 +498,18 @@ scheduler: tag: null pullPolicy: IfNotPresent pullSecrets: [] - ## @param scheduler.resources Scheduler container resources requests and limits - ## - resources: {} + ## @param scheduler.resources.requests.cpu Scheduler container cpu request + ## @param scheduler.resources.requests.memory Scheduler container memory request + ## @param scheduler.resources.limits.cpu Scheduler container cpu limit + ## @param scheduler.resources.limits.memory Scheduler container memory limit + ## + resources: + requests: + cpu: "50m" + memory: "50Mi" + limits: + cpu: "250m" + memory: "400Mi" ## @param scheduler.nodeSelector Node labels for pod assignment ## nodeSelector: {} @@ 
-484,10 +535,10 @@ scheduler: ## @param builder.replicaCount Number of builder replicas ## builder: - ## @param builder.enabled Enable worker service + ## @param builder.enabled Enable builder service ## enabled: true - ## @param builder.replicaCount Replica count for the worker service + ## @param builder.replicaCount Replica count for the builder service ## replicaCount: 1 @@ -524,18 +575,18 @@ builder: fsGroup: 1001 - ## @param builder.resources Builder container resources requests and limits - ## e.g: - ## resources: - ## limits: - ## cpu: 100m - ## memory: 128Mi - ## requests: - ## cpu: 100m - ## memory: 128Mi - ## - resources: {} - + ## @param builder.resources.requests.cpu Builder container cpu request + ## @param builder.resources.requests.memory Builder container memory request + ## @param builder.resources.limits.cpu Builder container cpu limit + ## @param builder.resources.limits.memory Builder container memory limit + ## + resources: + requests: + cpu: "200m" + memory: "512Mi" + limits: + cpu: "2000m" + memory: "8Gi" ## @param builder.nodeSelector Node labels for pod assignment ## nodeSelector: { } @@ -617,6 +668,18 @@ api: tag: null pullPolicy: IfNotPresent pullSecrets: [] + ## @param api.events.resources.requests.cpu Api events container cpu request + ## @param api.events.resources.requests.memory Api events container memory request + ## @param api.events.resources.limits.cpu Api events container cpu limit + ## @param api.events.resources.limits.memory Api events container memory limit + ## + resources: + requests: + memory: "50Mi" + cpu: "100m" + limits: + memory: "400Mi" + cpu: "500m" ## @param api.events.podSecurityContext.enabled Enable security context ## @param api.events.podSecurityContext.runAsUser User ID for the pod ## @param api.events.podSecurityContext.runAsGroup Group ID for the pod @@ -695,6 +758,16 @@ kaniko: registry: gcr.io repository: kaniko-project/executor tag: v1.8.1 + ## @param kaniko.resources.requests.cpu Kaniko container cpu request + ## @param kaniko.resources.requests.memory Kaniko container memory request + ## @param kaniko.resources.limits.memory Kaniko container memory limit + ## + resources: + requests: + cpu: "500m" + memory: "256Mi" + limits: + memory: "32Gi" ## @param kaniko.mirror If set to `true` pull base images from the local registry. ## mirror: false @@ -746,6 +819,32 @@ addAccountOperator: ## users: [] +## @section Registry prepopulate +## +registryPrepopulate: + ## @param registryPrepopulate.waitRegistry.resources.requests.cpu Wait registry container cpu request + ## @param registryPrepopulate.waitRegistry.resources.requests.memory Wait registry container memory request + ## @param registryPrepopulate.waitRegistry.resources.limits.memory Wait registry container memory limit + ## + waitRegistry: + resources: + requests: + memory: "50Mi" + cpu: "100m" + limits: + memory: "400Mi" + ## @param registryPrepopulate.pause.resources.requests.cpu Pause container cpu request + ## @param registryPrepopulate.pause.resources.requests.memory Pause container memory request + ## @param registryPrepopulate.pause.resources.limits.memory Pause container memory limit + ## + pause: + resources: + requests: + memory: "64Mi" + cpu: "50m" + limits: + memory: "128Mi" + ## @section Single Sign-On through OpenID Connect ## @descriptionStart Uses the authorization code flow. 
@@ -846,6 +945,13 @@ postgresql: capabilities: drop: - ALL + resources: + requests: + cpu: "50m" + memory: "256Mi" + limits: + cpu: "1000m" + memory: "4Gi" ## @skip redis ## @@ -860,6 +966,13 @@ redis: service: ports: redis: 6379 + resources: + requests: + cpu: "50m" + memory: "256Mi" + limits: + cpu: "500m" + memory: "1024Mi" replica: replicaCount: 0 commonConfiguration: |- @@ -875,10 +988,17 @@ docker-registry: storage: filesystem persistence: enabled: true - size: 10Gi + size: 50Gi deleteEnabled: true service: type: NodePort + resources: + requests: + cpu: "100m" + memory: "256Mi" + limits: + cpu: "500m" + memory: "64Gi" ## @skip minio ## @@ -899,6 +1019,13 @@ minio: capabilities: drop: - ALL + resources: + requests: + cpu: "100m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "64Gi" ## @skip localstack ## @@ -907,6 +1034,13 @@ localstack: service: edgeService: nodePort: "" + resources: + requests: + cpu: "50m" + memory: "1Gi" + limits: + cpu: "500m" + memory: "64Gi" environment: - name: SERVICES value: s3 diff --git a/docs/settings.md b/docs/settings.md index 3fd315b99..0beb8b5e9 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -17,6 +17,7 @@ Accepted true values for `bool` are: `1`, `ON`, `On`, `on`, `T`, `t`, `TRUE`, `T | string | `COMMON_HOST_DOMAIN` | nil | | | string | `COMPUTE_POD_FS_GROUP` | nil | | | int | `COMPUTE_POD_GKE_GPUS_LIMITS` | `0` | | +| string | `COMPUTE_POD_RESOURCES` | nil | | | string | `COMPUTE_POD_RUN_AS_GROUP` | nil | | | string | `COMPUTE_POD_RUN_AS_USER` | nil | | | int | `COMPUTE_POD_STARTUP_TIMEOUT_SECONDS` | `300` | | @@ -43,6 +44,7 @@ Accepted true values for `bool` are: `1`, `ON`, `On`, `on`, `T`, `t`, `TRUE`, `T | string | `KANIKO_DOCKER_CONFIG_SECRET_NAME` | nil | | | string | `KANIKO_IMAGE` | nil | | | bool | `KANIKO_MIRROR` | `False` | | +| string | `KANIKO_RESOURCES` | nil | | | bool | `LOGGING_USE_COLORS` | `True` | | | string | `LOG_LEVEL` | `INFO` | | | string | `NAMESPACE` | nil | |
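For reference on the two new string settings listed above: the chart templates render the matching values blocks with `toYaml ... | quote`, so `COMPUTE_POD_RESOURCES` and `KANIKO_RESOURCES` each carry a small YAML document that `get_resources_requirements_from_yaml` turns into a `kubernetes.client.V1ResourceRequirements`. A sketch of the payload the worker receives with the chart defaults (mirroring `worker.computePod.resources`):

```yaml
# Illustrative content of COMPUTE_POD_RESOURCES, matching the default
# worker.computePod.resources values; the parsing helper expects both a
# "requests" and a "limits" mapping to be present.
requests:
  cpu: "500m"
  memory: "512Mi"
limits:
  memory: "64Gi"
```

Because the helper indexes `requests` and `limits` directly, an override that removes either mapping entirely would raise a `KeyError` when the backend builds the pod spec; keep both keys when customizing these blocks.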