Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: add limits and requests configuration in values (#867) #892

Merged
merged 11 commits into from
Apr 26, 2024
2 changes: 2 additions & 0 deletions backend/backend/settings/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@
"KANIKO_MIRROR": to_bool(os.environ.get("KANIKO_MIRROR", False)),
"KANIKO_IMAGE": os.environ.get("KANIKO_IMAGE"),
"KANIKO_DOCKER_CONFIG_SECRET_NAME": os.environ.get("KANIKO_DOCKER_CONFIG_SECRET_NAME"),
"KANIKO_RESOURCES": os.environ.get("KANIKO_RESOURCES"),
"COMPUTE_POD_STARTUP_TIMEOUT_SECONDS": int(os.environ.get("COMPUTE_POD_STARTUP_TIMEOUT_SECONDS", 300)),
"PRIVATE_CA_ENABLED": to_bool(os.environ.get("PRIVATE_CA_ENABLED")),
}
Expand All @@ -223,6 +224,7 @@
COMPUTE_POD_RUN_AS_GROUP = os.environ.get("COMPUTE_POD_RUN_AS_GROUP")
COMPUTE_POD_FS_GROUP = os.environ.get("COMPUTE_POD_FS_GROUP")
COMPUTE_POD_GKE_GPUS_LIMITS = int(os.environ.get("COMPUTE_POD_GKE_GPUS_LIMITS", 0))
# Raw value of the COMPUTE_POD_RESOURCES environment variable; None when the
# variable is unset. Consumers parse it as a YAML document describing the
# container resource `requests` and `limits`.
COMPUTE_POD_RESOURCES = os.environ.get("COMPUTE_POD_RESOURCES")

# Prometheus configuration
ENABLE_METRICS = to_bool(os.environ.get("ENABLE_METRICS", False))
Expand Down
3 changes: 3 additions & 0 deletions backend/builder/image_builder/image_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from substrapp.compute_tasks.volumes import get_worker_subtuple_pvc_name
from substrapp.docker_registry import USER_IMAGE_REPOSITORY
from substrapp.kubernetes_utils import delete_pod
from substrapp.kubernetes_utils import get_resources_requirements_from_yaml
from substrapp.kubernetes_utils import get_security_context
from substrapp.lock_local import lock_resource
from substrapp.utils import timeit
Expand All @@ -42,6 +43,7 @@
IMAGE_BUILD_TIMEOUT = settings.IMAGE_BUILD_TIMEOUT
KANIKO_CONTAINER_NAME = "kaniko"
HOSTNAME = settings.HOSTNAME
# YAML string (from settings.TASK) that is parsed into the resource
# requests/limits of the container built by _build_container.
KANIKO_RESOURCES = settings.TASK["KANIKO_RESOURCES"]


def container_image_tag_from_function(function: orchestrator.Function) -> str:
Expand Down Expand Up @@ -306,6 +308,7 @@ def _build_container(dockerfile_mount_path: str, image_tag: str) -> kubernetes.c
args=args,
volume_mounts=volume_mounts,
security_context=container_security_context,
resources=get_resources_requirements_from_yaml(yaml_resources=KANIKO_RESOURCES),
)


Expand Down
3 changes: 3 additions & 0 deletions backend/substrapp/compute_tasks/compute_pod.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@

from substrapp.kubernetes_utils import delete_pod
from substrapp.kubernetes_utils import get_pod_security_context
from substrapp.kubernetes_utils import get_resources_requirements_from_yaml
from substrapp.kubernetes_utils import get_security_context

NAMESPACE = settings.NAMESPACE
# YAML string (from Django settings) that is parsed into the resource
# requests/limits of the container created by create_pod.
COMPUTE_POD_RESOURCES = settings.COMPUTE_POD_RESOURCES
logger = structlog.get_logger(__name__)


Expand Down Expand Up @@ -112,6 +114,7 @@ def create_pod(
args=None,
volume_mounts=volume_mounts + gpu_volume_mounts,
security_context=get_security_context(),
resources=get_resources_requirements_from_yaml(yaml_resources=COMPUTE_POD_RESOURCES),
env=[kubernetes.client.V1EnvVar(name=env_name, value=env_value) for env_name, env_value in environment.items()],
**container_optional_kwargs,
)
Expand Down
13 changes: 13 additions & 0 deletions backend/substrapp/kubernetes_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import kubernetes
import structlog
import yaml
from django.conf import settings

from substrapp.exceptions import KubernetesError
Expand Down Expand Up @@ -47,6 +48,18 @@ def get_security_context(root: bool = False, capabilities: list[str] = None) ->
return security_context


def get_resources_requirements_from_yaml(
    *,
    yaml_resources: str,
) -> kubernetes.client.V1ResourceRequirements:
    """Return a kubernetes.client.V1ResourceRequirements object from a yaml string.

    The YAML document is expected to be a mapping with optional ``requests``
    and ``limits`` keys, e.g.::

        requests:
          cpu: 500m
          memory: 512Mi
        limits:
          memory: 1Gi

    Args:
        yaml_resources: YAML document describing the resources. ``None`` or an
            empty/blank document is tolerated and yields requirements with
            neither requests nor limits, because the settings feeding this
            function are read from environment variables that may be unset.

    Returns:
        The resource requirements; a section absent from the YAML is left
        unset (``None``) instead of raising ``KeyError``.

    Raises:
        ValueError: if the YAML document is not a mapping.
        yaml.YAMLError: if the string is not valid YAML.
    """
    # `yaml.safe_load(None)` fails, and an empty document parses to None, so
    # both cases are normalised to "no resources configured".
    resources_dict = yaml.safe_load(yaml_resources) if yaml_resources else None
    if resources_dict is None:
        return kubernetes.client.V1ResourceRequirements()

    if not isinstance(resources_dict, dict):
        raise ValueError(
            "Resources must be a YAML mapping with optional 'requests' and 'limits' keys, "
            f"got {type(resources_dict).__name__}"
        )

    # `.get` keeps a document that only sets one of the two sections valid.
    return kubernetes.client.V1ResourceRequirements(
        requests=resources_dict.get("requests"),
        limits=resources_dict.get("limits"),
    )


def pod_exists_by_label_selector(k8s_client: kubernetes.client.CoreV1Api, label_selector: str) -> bool:
"""Return True if the pod exists, else False.

Expand Down
2 changes: 1 addition & 1 deletion backend/substrapp/tasks/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def on_success(self, retval: dict[str, Any], task_id: str, args: tuple, kwargs:
# Celery does not provide unpacked arguments, we are doing it in `split_args`
def on_retry(self, exc: Exception, task_id: str, args: tuple, kwargs: dict[str, Any], einfo: ExceptionInfo) -> None:
_, task = self.split_args(args)
# delete compute pod to reset hardware ressources
# delete compute pod to reset hardware resources
delete_compute_plan_pods(task.compute_plan_key)
logger.info(
"Retrying task",
Expand Down
1 change: 1 addition & 0 deletions changes/892.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Resource requests and limits for Kaniko pods and compute task pods
6 changes: 6 additions & 0 deletions charts/substra-backend/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

<!-- towncrier release notes start -->

## [26.4.0] - 2024-04-25

### Added

- Resource limits and requests (CPU and memory) for all containers are set and configurable in values.

## [26.3.0] - 2024-04-19

### Removed
Expand Down
2 changes: 1 addition & 1 deletion charts/substra-backend/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
apiVersion: v2
name: substra-backend
home: https://github.com/Substra
version: 26.3.0
version: 26.4.0
appVersion: 0.45.0
kubeVersion: ">= 1.19.0-0"
description: Main package for Substra
Expand Down
Loading
Loading