Support config GPU vendor #3029

Merged: 11 commits, Nov 30, 2022
4 changes: 3 additions & 1 deletion docs/source/user_guide/pipelines.md
@@ -285,7 +285,9 @@ The following alphabetically sorted list identifies the node properties that are

##### Resources: CPU, GPU, and RAM
- Resources that the notebook or script requires. RAM takes units of gigabytes (10<sup>9</sup> bytes).
- The values are ignored when the pipeline is executed locally.
- Specify a custom Kubernetes GPU vendor, if desired. The default vendor is `nvidia.com/gpu`. See [this topic in the Kubernetes documentation](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/) for more information.
- The values are ignored when the pipeline is executed locally.
- Example: `amd.com/gpu`

##### Runtime image

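For context on how the vendor setting is ultimately applied: the string is used verbatim as the extended-resource name in the container's resource limits. A minimal sketch with the kubernetes Python client (the container name and image are illustrative, not taken from Elyra):

```python
from kubernetes import client

# A node that requests 2 GPUs with vendor "amd.com/gpu" runs in a container
# whose resource limits carry that exact key.
resources = client.V1ResourceRequirements(limits={"amd.com/gpu": "2"})

container = client.V1Container(
    name="notebook-node",                  # illustrative
    image="tensorflow/tensorflow:latest",  # illustrative
    resources=resources,
)
print(container.resources.limits)  # {'amd.com/gpu': '2'}
```
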
8 changes: 4 additions & 4 deletions elyra/pipeline/kfp/processor_kfp.py
@@ -784,10 +784,10 @@ def _generate_workflow_tasks(
"size": operation.memory,
"units": "G",
}
workflow_task["task_modifiers"]["gpu_limit"] = {
"size": operation.gpu,
"vendor": workflow_task["task_modifiers"]["env_variables"].get("GPU_VENDOR", "nvidia"),
}
gpu_vendor = "nvidia.com/gpu"
if operation.gpu_vendor:
gpu_vendor = operation.gpu_vendor
workflow_task["task_modifiers"]["gpu_limit"] = {"size": operation.gpu, "vendor": gpu_vendor}

if is_crio_runtime:
# Attach empty dir volume
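
Previously the vendor was read from a GPU_VENDOR environment variable and defaulted to the bare string "nvidia"; the processor now takes it from the node's gpu_vendor property and defaults to the full resource name. A small sketch of the resolution and resulting task modifier (keys from the diff, values illustrative):

```python
from typing import Optional

def build_gpu_limit(gpu: str, gpu_vendor: Optional[str]) -> dict:
    # Prefer the node-level vendor; otherwise fall back to the standard
    # NVIDIA device-plugin resource name, matching the logic above.
    return {"size": gpu, "vendor": gpu_vendor or "nvidia.com/gpu"}

print(build_gpu_limit("2", "amd.com/gpu"))  # {'size': '2', 'vendor': 'amd.com/gpu'}
print(build_gpu_limit("1", None))           # {'size': '1', 'vendor': 'nvidia.com/gpu'}
```
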
8 changes: 7 additions & 1 deletion elyra/pipeline/pipeline.py
@@ -246,6 +246,7 @@ def __init__(
cpu: number of cpus requested to run the operation
memory: amount of memory requested to run the operation (in Gi)
gpu: number of gpus requested to run the operation
gpu_vendor: GPU resource type, e.g. nvidia.com/gpu, amd.com/gpu, etc.
Entries for other (non-built-in) component types are a function of the respective component.

:param elyra_params: dictionary of parameter key:value pairs that are owned by Elyra
@@ -270,8 +271,9 @@ def __init__(
self._component_params["dependencies"] = Operation._scrub_list(component_params.get("dependencies", []))
self._component_params["include_subdirectories"] = component_params.get("include_subdirectories", False)
self._component_params["cpu"] = component_params.get("cpu")
self._component_params["gpu"] = component_params.get("gpu")
self._component_params["memory"] = component_params.get("memory")
self._component_params["gpu"] = component_params.get("gpu")
self._component_params["gpu_vendor"] = component_params.get("gpu_vendor")

if not elyra_params:
elyra_params = {}
@@ -319,6 +321,10 @@ def memory(self) -> Optional[str]:
def gpu(self) -> Optional[str]:
return self._component_params.get("gpu")

@property
def gpu_vendor(self) -> Optional[str]:
return self._component_params.get("gpu_vendor")

def __eq__(self, other: GenericOperation) -> bool:
if isinstance(self, other.__class__):
return super().__eq__(other)
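
The operation stores the vendor verbatim and exposes it read-only; when a node never set one, the property returns None and the runtime processor applies the nvidia.com/gpu default. A stand-alone sketch of that behaviour (the real GenericOperation constructor takes more arguments than shown here):

```python
from typing import Optional

class _OperationSketch:
    """Minimal stand-in for the gpu/gpu_vendor handling shown above."""

    def __init__(self, component_params: dict):
        self._component_params = dict(component_params)

    @property
    def gpu(self) -> Optional[str]:
        return self._component_params.get("gpu")

    @property
    def gpu_vendor(self) -> Optional[str]:
        return self._component_params.get("gpu_vendor")

op = _OperationSketch({"gpu": "2", "gpu_vendor": "amd.com/gpu"})
print(op.gpu, op.gpu_vendor)            # 2 amd.com/gpu
print(_OperationSketch({}).gpu_vendor)  # None -> processor falls back to nvidia.com/gpu
```
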
15 changes: 15 additions & 0 deletions elyra/pipeline/validation.py
@@ -41,6 +41,7 @@
from elyra.pipeline.pipeline_definition import PipelineDefinition
from elyra.pipeline.processor import PipelineProcessorManager
from elyra.pipeline.runtime_type import RuntimeProcessorType
from elyra.util.kubernetes import is_valid_kubernetes_device_plugin_name
from elyra.util.path import get_expanded_path


@@ -428,6 +429,20 @@ async def _validate_generic_node_properties(self, node: Node, response: Validati
resource_value=resource_value,
response=response,
)
for resource_vendor in ["gpu_vendor"]:
vendor = node.get_component_parameter(resource_vendor)
if vendor and not is_valid_kubernetes_device_plugin_name(vendor):
response.add_message(
severity=ValidationSeverity.Error,
message_type="invalidNodeProperty",
message="Property is not a valid resource vendor name.",
data={
"nodeID": node.id,
"nodeName": node_label,
"propertyName": resource_vendor,
"value": vendor,
},
)

for param in node.elyra_owned_properties:
required = self._is_required_property(component_props, param)
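
Assuming is_valid_kubernetes_device_plugin_name enforces the standard Kubernetes qualified-name rules (it delegates to is_valid_annotation_key, as shown below), typical vendor strings pass and malformed ones surface as "invalidNodeProperty" errors against the node. A quick sketch of exercising the check directly:

```python
from elyra.util.kubernetes import is_valid_kubernetes_device_plugin_name

# Well-formed device-plugin resource names should pass; a string with a
# space (or other disallowed character) should be rejected and reported
# against the node's gpu_vendor property.
for candidate in ("nvidia.com/gpu", "amd.com/gpu", "amd com/gpu"):
    print(candidate, "->", is_valid_kubernetes_device_plugin_name(candidate))
```
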
18 changes: 13 additions & 5 deletions elyra/templates/components/generic_properties_template.jinja2
@@ -41,17 +41,25 @@
"description": "For CPU-intensive workloads, you can choose more than 1 CPU (e.g. 1.5).",
"minimum": 0
},
"memory": {
"type": "integer",
"title": "RAM(GB)",
"description": "The total amount of RAM specified.",
"minimum": 0
},
"gpu": {
"type": "integer",
"title": "GPU",
"description": "For GPU-intensive workloads, you can choose more than 1 GPU. Must be an integer.",
"minimum": 0
},
"memory": {
"type": "integer",
"title": "RAM(GB)",
"description": "The total amount of RAM specified.",
"minimum": 0
"gpu_vendor": {
"type": "string",
"title": "GPU Vendor",
"description": "GPU Vendor, or K8s GPU resource type, default 'nvidia.com/gpu'.",
"uihints": {
"ui:placeholder": "nvidia.com/gpu"
}
},
"dependencies": {
"title": "File Dependencies",
2 changes: 1 addition & 1 deletion elyra/templates/kubeflow/v1/python_dsl_template.jinja2
@@ -65,7 +65,7 @@ def generated_pipeline(
{{ task_name }}.container.set_memory_request(memory="{{ workflow_task.task_modifiers.mem_request.size }}{{ workflow_task.task_modifiers.mem_request.units }}")
{% endif %}
{% if workflow_task.task_modifiers.gpu_limit and workflow_task.task_modifiers.gpu_limit.size %}
{{ task_name }}.container.set_gpu_limit(gpu="{{ workflow_task.task_modifiers.gpu_limit.size }}", vendor="{{ workflow_task.task_modifiers.gpu_limit.vendor }}")
{{ task_name }}.container.add_resource_limit(resource_name="{{ workflow_task.task_modifiers.gpu_limit.vendor }}", value="{{ workflow_task.task_modifiers.gpu_limit.size }}")
{% endif %}
{% if workflow_task.task_modifiers.env_variables %}
{% for env_var_name, env_var_value in workflow_task.task_modifiers.env_variables.items() %}
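
With this change the rendered DSL attaches the vendor string directly as a container resource limit, so any device-plugin resource name passes through unchanged rather than being derived from a short vendor keyword. A hedged sketch of what the generated code amounts to for one task, using the KFP v1 SDK (task name, image, and command are illustrative):

```python
from kfp import dsl

@dsl.pipeline(name="gpu-vendor-example")
def generated_pipeline():
    # Illustrative task; Elyra's generated code is more involved.
    notebook_node = dsl.ContainerOp(
        name="notebook-node",
        image="tensorflow/tensorflow:latest",
        command=["python", "-c", "print('hello')"],
    )
    # Equivalent of the template line above: the vendor becomes the resource
    # name of a limit, e.g. limits: {"amd.com/gpu": "2"}.
    notebook_node.container.add_resource_limit(
        resource_name="amd.com/gpu", value="2"
    )
```
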
7 changes: 5 additions & 2 deletions elyra/tests/pipeline/test_pipeline_constructor.py
@@ -353,8 +353,9 @@ def test_validate_resource_values():
component_parameters = {
"filename": "elyra/pipeline/tests/resources/archive/test.ipynb",
"cpu": "4",
"gpu": "6",
"memory": "10",
"gpu": "6",
"gpu_vendor": "example.com/gpu",
"runtime_image": "tensorflow/tensorflow:latest",
}
test_operation = GenericOperation(
@@ -367,6 +368,7 @@

assert test_operation.cpu == "4"
assert test_operation.gpu == "6"
assert test_operation.gpu_vendor == "example.com/gpu"
assert test_operation.memory == "10"


@@ -385,8 +387,9 @@ def test_validate_resource_values_as_none():
)

assert test_operation.cpu is None
assert test_operation.gpu is None
assert test_operation.memory is None
assert test_operation.gpu is None
assert test_operation.gpu_vendor is None


def test_validate_gpu_accepts_zero_as_value():
10 changes: 10 additions & 0 deletions elyra/util/kubernetes.py
@@ -67,6 +67,16 @@ def is_valid_kubernetes_key(name: str) -> bool:
return re.match(r"^[\w\-_.]+$", name) is not None


def is_valid_kubernetes_device_plugin_name(key: str) -> bool:
"""
Returns a truthy value indicating whether key meets the Kubernetes naming
constraints for a device plugin's custom schedulable resource, as outlined in the link below.

https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/#using-device-plugins
"""
return is_valid_annotation_key(key)


def is_valid_annotation_key(key: str) -> bool:
"""
Returns a truthy value indicating whether name meets the kubernetes
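
Device-plugin resource names have the same shape as annotation keys: an optional DNS-subdomain prefix, a slash, and a short name segment. The rough, stand-alone approximation below illustrates that rule; it is an assumption for illustration, not Elyra's actual is_valid_annotation_key implementation:

```python
import re

# Approximate Kubernetes qualified-name rule: optional DNS-1123 subdomain
# prefix, then a name of 1-63 alphanumerics, '-', '_' or '.' that starts
# and ends with an alphanumeric.
_NAME = r"[A-Za-z0-9]([-A-Za-z0-9_.]{0,61}[A-Za-z0-9])?"
_PREFIX = r"[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*"
_QUALIFIED = re.compile(rf"^({_PREFIX}/)?{_NAME}$")

def looks_like_device_plugin_name(candidate: str) -> bool:
    return _QUALIFIED.match(candidate) is not None

print(looks_like_device_plugin_name("nvidia.com/gpu"))  # True
print(looks_like_device_plugin_name("amd.com/gpu"))     # True
print(looks_like_device_plugin_name("gpu vendor"))      # False (space not allowed)
```
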
16 changes: 15 additions & 1 deletion packages/pipeline-editor/style/index.css
@@ -440,7 +440,7 @@ span.bx--list-box__label {
}

.elyra-PipelineEditor .form-group.field.field-integer {
width: 30%;
width: 100%;
}

.elyra-PipelineEditor .array-pipelineDefaults.form-control {
@@ -836,6 +836,20 @@ input.elyra-Dialog-checkbox.jp-mod-styled {
padding-bottom: 4px;
}

div#root_component_parameters_cpu,
div#root_component_parameters_memory,
div#root_component_parameters_gpu,
div#root_component_parameters_gpu_vendor {
width: 50%;
}

input#root_component_parameters_cpu,
input#root_component_parameters_memory,
input#root_component_parameters_gpu,
input#root_component_parameters_gpu_vendor {
width: 100%;
}

#cpu,
#gpu,
#memory {