Skip to content

Commit

Permalink
chore: add limits and requests to pods (#867)
Browse files Browse the repository at this point in the history
Signed-off-by: SdgJlbl <sarah.diot-girard@owkin.com>
  • Loading branch information
SdgJlbl authored and ThibaultFy committed Apr 23, 2024
1 parent 671e000 commit e537054
Show file tree
Hide file tree
Showing 10 changed files with 164 additions and 51 deletions.
2 changes: 2 additions & 0 deletions backend/builder/image_builder/image_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from substrapp.compute_tasks.volumes import get_worker_subtuple_pvc_name
from substrapp.docker_registry import USER_IMAGE_REPOSITORY
from substrapp.kubernetes_utils import delete_pod
from substrapp.kubernetes_utils import get_resources_requirements
from substrapp.kubernetes_utils import get_security_context
from substrapp.lock_local import lock_resource
from substrapp.utils import timeit
Expand Down Expand Up @@ -306,6 +307,7 @@ def _build_container(dockerfile_mount_path: str, image_tag: str) -> kubernetes.c
args=args,
volume_mounts=volume_mounts,
security_context=container_security_context,
resources=get_resources_requirements(cpu_request="1000m", memory_request="4Gi", memory_limit="32Gi"),
)


Expand Down
2 changes: 2 additions & 0 deletions backend/substrapp/compute_tasks/compute_pod.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from substrapp.kubernetes_utils import delete_pod
from substrapp.kubernetes_utils import get_pod_security_context
from substrapp.kubernetes_utils import get_resources_requirements
from substrapp.kubernetes_utils import get_security_context

NAMESPACE = settings.NAMESPACE
Expand Down Expand Up @@ -112,6 +113,7 @@ def create_pod(
args=None,
volume_mounts=volume_mounts + gpu_volume_mounts,
security_context=get_security_context(),
resources=get_resources_requirements(cpu_request="1000m", memory_request="1Gi", memory_limit="64Gi"),
env=[kubernetes.client.V1EnvVar(name=env_name, value=env_value) for env_name, env_value in environment.items()],
**container_optional_kwargs,
)
Expand Down
8 changes: 8 additions & 0 deletions backend/substrapp/kubernetes_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,14 @@ def get_security_context(root: bool = False, capabilities: list[str] = None) ->
return security_context


def get_resources_requirements(
    *, cpu_request: str = "1000m", memory_request: str = "200M", memory_limit: str = "2G"
) -> kubernetes.client.V1ResourceRequirements:
    """Build the resource requests and limits to attach to a container.

    All arguments are keyword-only strings that are passed through unchanged.

    Args:
        cpu_request: Value stored under the ``"cpu"`` key of ``requests``.
        memory_request: Value stored under the ``"memory"`` key of ``requests``.
        memory_limit: Value stored under the ``"memory"`` key of ``limits``.

    Returns:
        A ``V1ResourceRequirements`` carrying a CPU and memory request and a
        memory limit. No CPU limit is set by this helper.
    """
    requested = {"cpu": cpu_request, "memory": memory_request}
    capped = {"memory": memory_limit}
    return kubernetes.client.V1ResourceRequirements(requests=requested, limits=capped)


def pod_exists_by_label_selector(k8s_client: kubernetes.client.CoreV1Api, label_selector: str) -> bool:
"""Return True if the pod exists, else False.
Expand Down
11 changes: 0 additions & 11 deletions charts/substra-backend/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,6 @@

<!-- towncrier release notes start -->

## [26.3.0] - 2024-04-19

### Removed

- Revert resources limits and requests (CPU and memory) for all containers.

## [26.2.0] - 2024-04-18

### Added

- User docker registry is configurable through value in `containerRegistry.userImageRepository`

## [26.1.0] - 2024-04-17

Expand Down
2 changes: 1 addition & 1 deletion charts/substra-backend/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
apiVersion: v2
name: substra-backend
home: https://github.com/Substra
version: 26.3.0
version: 26.1.0
appVersion: 0.45.0
kubeVersion: ">= 1.19.0-0"
description: Main package for Substra
Expand Down
29 changes: 22 additions & 7 deletions charts/substra-backend/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,10 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub
| `server.ingress.extraHosts` | The list of additional hostnames to be covered with this ingress record | `[]` |
| `server.ingress.extraTls` | The tls configuration for hostnames to be covered by the ingress | `[]` |
| `server.ingress.ingressClassName` | _IngressClass_ that will be used to implement the Ingress | `nil` |
| `server.resources` | Server container resources requests and limits | `{}` |
| `server.resources.requests.cpu` | Server container cpu request | `1000m` |
| `server.resources.requests.memory` | Server container memory request | `6Gi` |
| `server.resources.limits.cpu` | Server container cpu limit | `2000m` |
| `server.resources.limits.memory` | Server container memory limit | `12Gi` |
| `server.persistence.storageClass` | Specify the _StorageClass_ used to provision the volume. Or the default _StorageClass_ will be used. Set it to `-` to disable dynamic provisioning | `""` |
| `server.persistence.servermedias.size` | Servermedias volume size | `10Gi` |
| `server.persistence.servermedias.existingClaim` | use this PVC rather than creating a new one | `nil` |
Expand Down Expand Up @@ -119,7 +122,10 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub
| `worker.podSecurityContext.runAsUser` | User ID for the pod | `1001` |
| `worker.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` |
| `worker.podSecurityContext.fsGroup` | FileSystem group ID for the pod | `1001` |
| `worker.resources` | Worker container resources requests and limits | `{}` |
| `worker.resources.requests.cpu` | Worker container cpu request | `1000m` |
| `worker.resources.requests.memory` | Worker container memory request | `4Gi` |
| `worker.resources.limits.cpu` | Worker container cpu limit | `2000m` |
| `worker.resources.limits.memory` | Worker container memory limit | `8Gi` |
| `worker.nodeSelector` | Node labels for pod assignment | `{}` |
| `worker.tolerations` | Toleration labels for pod assignment | `[]` |
| `worker.affinity` | Affinity settings for pod assignment, ignored if `DataSampleStorageInServerMedia` is `true` | `{}` |
Expand Down Expand Up @@ -163,7 +169,10 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub
| `schedulerWorker.nodeSelector` | Node labels for pod assignment | `{}` |
| `schedulerWorker.tolerations` | Toleration labels for pod assignment | `[]` |
| `schedulerWorker.affinity` | Affinity settings for pod assignment | `{}` |
| `schedulerWorker.resources` | Scheduler container resources requests and limits | `{}` |
| `schedulerWorker.resources.requests.cpu` | Scheduler container cpu request | `250m` |
| `schedulerWorker.resources.requests.memory` | Scheduler container memory request | `200Mi` |
| `schedulerWorker.resources.limits.cpu` | Scheduler container cpu limit | `250m` |
| `schedulerWorker.resources.limits.memory` | Scheduler container memory limit | `400Mi` |
| `schedulerWorker.podSecurityContext.enabled` | Enable security context | `true` |
| `schedulerWorker.podSecurityContext.runAsUser` | User ID for the pod | `1001` |
| `schedulerWorker.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` |
Expand All @@ -180,7 +189,10 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub
| `scheduler.image.tag` | Substra backend tasks scheduler image tag (defaults to AppVersion) | `nil` |
| `scheduler.image.pullPolicy` | Substra backend task scheduler image pull policy | `IfNotPresent` |
| `scheduler.image.pullSecrets` | Specify image pull secrets | `[]` |
| `scheduler.resources` | Scheduler container resources requests and limits | `{}` |
| `scheduler.resources.requests.cpu` | Scheduler container cpu request | `250m` |
| `scheduler.resources.requests.memory` | Scheduler container memory request | `200Mi` |
| `scheduler.resources.limits.cpu` | Scheduler container cpu limit | `250m` |
| `scheduler.resources.limits.memory` | Scheduler container memory limit | `400Mi` |
| `scheduler.nodeSelector` | Node labels for pod assignment | `{}` |
| `scheduler.tolerations` | Toleration labels for pod assignment | `[]` |
| `scheduler.affinity` | Affinity settings for pod assignment | `{}` |
Expand All @@ -194,8 +206,8 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub
| Name | Description | Value |
| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------- |
| `builder.replicaCount` | Number of builder replicas | `1` |
| `builder.enabled` | Enable worker service | `true` |
| `builder.replicaCount` | Replica count for the worker service | `1` |
| `builder.enabled` | Enable builder service | `true` |
| `builder.replicaCount` | Replica count for the builder service | `1` |
| `builder.concurrency` | Maximum amount of tasks to process in parallel | `1` |
| `builder.image.registry` | Substra backend server image registry | `ghcr.io` |
| `builder.image.repository` | Substra backend server image repository | `substra/substra-backend` |
Expand All @@ -206,7 +218,10 @@ See [UPGRADE.md](https://github.com/Substra/substra-backend/blob/main/charts/sub
| `builder.podSecurityContext.runAsUser` | User ID for the pod | `1001` |
| `builder.podSecurityContext.runAsGroup` | Group ID for the pod | `1001` |
| `builder.podSecurityContext.fsGroup` | FileSystem group ID for the pod | `1001` |
| `builder.resources` | Builder container resources requests and limits | `{}` |
| `builder.resources.requests.cpu` | Builder container cpu request | `2000m` |
| `builder.resources.requests.memory` | Builder container memory request | `4Gi` |
| `builder.resources.limits.cpu` | Builder container cpu limit | `2000m` |
| `builder.resources.limits.memory` | Builder container memory limit | `8Gi` |
| `builder.nodeSelector` | Node labels for pod assignment | `{}` |
| `builder.tolerations` | Toleration labels for pod assignment | `[]` |
| `builder.affinity` | Affinity settings for pod assignment, ignored if `DataSampleStorageInServerMedia` is `true` | `{}` |
Expand Down
7 changes: 7 additions & 0 deletions charts/substra-backend/templates/deployment-api-events.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@ spec:
- name: api-event-app
image: {{ include "substra-backend.images.name" (dict "img" .Values.api.events.image "defaultTag" $.Chart.AppVersion) }}
imagePullPolicy: {{ .Values.api.events.image.pullPolicy }}
resources:
requests:
memory: "200Mi"
cpu: "500m"
limits:
memory: "400Mi"
cpu: "500m"
command: ["/bin/bash"]
{{- if eq .Values.settings "prod" }}
args: ["-c", "python manage.py consume"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,21 @@ spec:
initContainers:
- name: wait-registry
image: jwilder/dockerize:0.6.1
resources:
requests:
memory: "200Mi"
cpu: "500m"
limits:
memory: "400Mi"
command: ['dockerize', '-wait', 'tcp://{{ $.Release.Name }}-docker-registry:5000']
- name: kaniko
image: {{ include "common.images.name" $.Values.kaniko.image }}
resources:
requests:
memory: "2Gi"
cpu: "1000m"
limits:
memory: "8Gi"
args:
- "--context=/docker-context"
{{- if .dstImage }}
Expand All @@ -48,6 +60,12 @@ spec:
containers:
- image: gcr.io/google-containers/pause:3.2
name: pause
resources:
requests:
memory: "64Mi"
cpu: "50m"
limits:
memory: "128Mi"
volumes:
- name: kaniko-dir
emptyDir: {}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@ spec:
- name: worker-event-app
image: {{ include "substra-backend.images.name" (dict "img" .Values.worker.events.image "defaultTag" $.Chart.AppVersion) }}
imagePullPolicy: {{ .Values.worker.events.image.pullPolicy }}
resources:
requests:
memory: "200Mi"
cpu: "500m"
limits:
memory: "400Mi"
cpu: "500m"
command: ["/bin/bash"]
{{- if eq .Values.settings "prod" }}
args: ["-c", "python manage.py consume"]
Expand Down
Loading

0 comments on commit e537054

Please sign in to comment.