Skip to content

Commit

Permalink
chore: add limits and requests configuration in values (#867) (#892)
Browse files Browse the repository at this point in the history
Signed-off-by: SdgJlbl <sarah.diot-girard@owkin.com>
Signed-off-by: ThibaultFy <thibault.fouqueray@gmail.com>
Co-authored-by: SdgJlbl <sarah.diot-girard@owkin.com>
  • Loading branch information
ThibaultFy and SdgJlbl committed Apr 26, 2024
1 parent 671e000 commit c7ca4a1
Show file tree
Hide file tree
Showing 16 changed files with 264 additions and 46 deletions.
2 changes: 2 additions & 0 deletions backend/backend/settings/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@
"KANIKO_MIRROR": to_bool(os.environ.get("KANIKO_MIRROR", False)),
"KANIKO_IMAGE": os.environ.get("KANIKO_IMAGE"),
"KANIKO_DOCKER_CONFIG_SECRET_NAME": os.environ.get("KANIKO_DOCKER_CONFIG_SECRET_NAME"),
"KANIKO_RESOURCES": os.environ.get("KANIKO_RESOURCES"),
"COMPUTE_POD_STARTUP_TIMEOUT_SECONDS": int(os.environ.get("COMPUTE_POD_STARTUP_TIMEOUT_SECONDS", 300)),
"PRIVATE_CA_ENABLED": to_bool(os.environ.get("PRIVATE_CA_ENABLED")),
}
Expand All @@ -223,6 +224,7 @@
COMPUTE_POD_RUN_AS_GROUP = os.environ.get("COMPUTE_POD_RUN_AS_GROUP")
COMPUTE_POD_FS_GROUP = os.environ.get("COMPUTE_POD_FS_GROUP")
COMPUTE_POD_GKE_GPUS_LIMITS = int(os.environ.get("COMPUTE_POD_GKE_GPUS_LIMITS", 0))
COMPUTE_POD_RESOURCES = os.environ.get("COMPUTE_POD_RESOURCES")

# Prometheus configuration
ENABLE_METRICS = to_bool(os.environ.get("ENABLE_METRICS", False))
Expand Down
3 changes: 3 additions & 0 deletions backend/builder/image_builder/image_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from substrapp.compute_tasks.volumes import get_worker_subtuple_pvc_name
from substrapp.docker_registry import USER_IMAGE_REPOSITORY
from substrapp.kubernetes_utils import delete_pod
from substrapp.kubernetes_utils import get_resources_requirements_from_yaml
from substrapp.kubernetes_utils import get_security_context
from substrapp.lock_local import lock_resource
from substrapp.utils import timeit
Expand All @@ -42,6 +43,7 @@
IMAGE_BUILD_TIMEOUT = settings.IMAGE_BUILD_TIMEOUT
KANIKO_CONTAINER_NAME = "kaniko"
HOSTNAME = settings.HOSTNAME
KANIKO_RESOURCES = settings.TASK["KANIKO_RESOURCES"]


def container_image_tag_from_function(function: orchestrator.Function) -> str:
Expand Down Expand Up @@ -306,6 +308,7 @@ def _build_container(dockerfile_mount_path: str, image_tag: str) -> kubernetes.c
args=args,
volume_mounts=volume_mounts,
security_context=container_security_context,
resources=get_resources_requirements_from_yaml(yaml_resources=KANIKO_RESOURCES),
)


Expand Down
3 changes: 3 additions & 0 deletions backend/substrapp/compute_tasks/compute_pod.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@

from substrapp.kubernetes_utils import delete_pod
from substrapp.kubernetes_utils import get_pod_security_context
from substrapp.kubernetes_utils import get_resources_requirements_from_yaml
from substrapp.kubernetes_utils import get_security_context

NAMESPACE = settings.NAMESPACE
COMPUTE_POD_RESOURCES = settings.COMPUTE_POD_RESOURCES
logger = structlog.get_logger(__name__)


Expand Down Expand Up @@ -112,6 +114,7 @@ def create_pod(
args=None,
volume_mounts=volume_mounts + gpu_volume_mounts,
security_context=get_security_context(),
resources=get_resources_requirements_from_yaml(yaml_resources=COMPUTE_POD_RESOURCES),
env=[kubernetes.client.V1EnvVar(name=env_name, value=env_value) for env_name, env_value in environment.items()],
**container_optional_kwargs,
)
Expand Down
13 changes: 13 additions & 0 deletions backend/substrapp/kubernetes_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import kubernetes
import structlog
import yaml
from django.conf import settings

from substrapp.exceptions import KubernetesError
Expand Down Expand Up @@ -47,6 +48,18 @@ def get_security_context(root: bool = False, capabilities: list[str] = None) ->
return security_context


def get_resources_requirements_from_yaml(
    *,
    yaml_resources: str,
) -> kubernetes.client.V1ResourceRequirements:
    """Return a kubernetes.client.V1ResourceRequirements object from a yaml string.

    The YAML document is expected to be a mapping with optional ``requests``
    and ``limits`` keys, e.g.::

        requests:
          cpu: 500m
          memory: 256Mi
        limits:
          memory: 1Gi

    Args:
        yaml_resources: YAML text describing the resources. ``None`` or a
            blank string is accepted and means "no resources configured".

    Returns:
        A V1ResourceRequirements whose ``requests`` / ``limits`` are taken
        from the document; a key that is absent is left unset (``None``).

    Raises:
        ValueError: if the YAML document is not a mapping.
        yaml.YAMLError: if the text is not valid YAML.
    """
    # Callers feed this from settings read with os.environ.get(...) and no
    # default, so the value is None when the variable is not defined.
    # yaml.safe_load cannot take None, hence the guard before parsing.
    resources_dict = yaml.safe_load(yaml_resources) if yaml_resources else None

    # A blank or comment-only document parses to None: nothing to apply.
    if resources_dict is None:
        return kubernetes.client.V1ResourceRequirements()

    if not isinstance(resources_dict, dict):
        raise ValueError(
            f"Resources must be a YAML mapping with 'requests' and/or 'limits' keys, got: {resources_dict!r}"
        )

    # Either key may legitimately be omitted (e.g. requests without limits).
    return kubernetes.client.V1ResourceRequirements(
        requests=resources_dict.get("requests"),
        limits=resources_dict.get("limits"),
    )


def pod_exists_by_label_selector(k8s_client: kubernetes.client.CoreV1Api, label_selector: str) -> bool:
"""Return True if the pod exists, else False.
Expand Down
2 changes: 1 addition & 1 deletion backend/substrapp/tasks/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def on_success(self, retval: dict[str, Any], task_id: str, args: tuple, kwargs:
# Celery does not provide unpacked arguments, we are doing it in `split_args`
def on_retry(self, exc: Exception, task_id: str, args: tuple, kwargs: dict[str, Any], einfo: ExceptionInfo) -> None:
_, task = self.split_args(args)
# delete compute pod to reset hardware ressources
# delete compute pod to reset hardware resources
delete_compute_plan_pods(task.compute_plan_key)
logger.info(
"Retrying task",
Expand Down
1 change: 1 addition & 0 deletions changes/892.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Resource requests and limits for Kaniko pods and compute task pods
6 changes: 6 additions & 0 deletions charts/substra-backend/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

<!-- towncrier release notes start -->

## [26.4.0] - 2024-04-25

### Added

- Resource limits and requests (CPU and memory) are now set for all containers and are configurable in values.

## [26.3.0] - 2024-04-19

### Removed
Expand Down
2 changes: 1 addition & 1 deletion charts/substra-backend/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
apiVersion: v2
name: substra-backend
home: https://github.com/Substra
version: 26.3.0
version: 26.4.0
appVersion: 0.45.0
kubeVersion: ">= 1.19.0-0"
description: Main package for Substra
Expand Down
Loading

0 comments on commit c7ca4a1

Please sign in to comment.