Skip to content

Commit

Permalink
support claiming device resources for tfjob/pytorchjob/custom-serving…
Browse files Browse the repository at this point in the history
…, such as amd.com/gpu=1

Signed-off-by: lizhiboo <lizhiboo@yeah.net>
  • Loading branch information
lizhiboo committed Aug 30, 2024
1 parent b500f9e commit 74923fd
Show file tree
Hide file tree
Showing 35 changed files with 403 additions and 175 deletions.
3 changes: 3 additions & 0 deletions charts/custom-serving/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,9 @@ spec:
{{- if gt (int $gpuCount) 0}}
nvidia.com/gpu: {{ .Values.gpuCount }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if gt (int $gpuMemory) 0}}
aliyun.com/gpu-mem: {{ .Values.gpuMemory }}
{{- end }}
Expand Down
3 changes: 3 additions & 0 deletions charts/custom-serving/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ restApiPort: 8501
metricsPort: 0
replicas: 1

# device resources
#devices: amd.com/gpu=1

# repository: "cheyang/tf-model-server-gpu"
image: "tensorflow/serving:latest"

Expand Down
6 changes: 6 additions & 0 deletions charts/etjob/templates/etjob.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,9 @@ spec:
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.cpu }}
cpu: {{ .Values.cpu | quote }}
{{- end }}
Expand All @@ -531,6 +534,9 @@ spec:
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.cpu }}
cpu: {{ .Values.cpu | quote }}
{{- end }}
Expand Down
3 changes: 3 additions & 0 deletions charts/mpijob/templates/mpijob.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,9 @@ spec:
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.cpu }}
cpu: {{ .Values.cpu | quote }}
{{- end }}
Expand Down
20 changes: 16 additions & 4 deletions charts/pytorchjob/templates/pytorchjob.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -239,10 +239,13 @@ spec:
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.cpu }}
cpu: {{ .Values.cpu | quote }}
{{- end }}
Expand All @@ -256,10 +259,13 @@ spec:
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.cpu }}
cpu: {{ .Values.cpu | quote }}
{{- end }}
Expand Down Expand Up @@ -531,10 +537,13 @@ spec:
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.cpu }}
cpu: {{ .Values.cpu | quote }}
{{- end }}
Expand All @@ -548,10 +557,13 @@ spec:
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.cpu }}
cpu: {{ .Values.cpu | quote }}
{{- end }}
Expand Down
3 changes: 3 additions & 0 deletions charts/pytorchjob/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ useHostPID: true
useHostIPC: true
gpuCount: 0 # user define

# device resources
#devices: amd.com/gpu=1

# rsync image
rsyncImage: registry.cn-zhangjiakou.aliyuncs.com/acs/rsync:v3.1.0-aliyun
# git sync image
Expand Down
48 changes: 36 additions & 12 deletions charts/tfjob/templates/tfjob.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,9 @@ spec:
{{- if gt (int $psGpuCount) 0}}
nvidia.com/gpu: {{ .Values.psGPU | quote }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.psCPU }}
cpu: {{ .Values.psCPU | quote }}
{{- end}}
Expand All @@ -301,6 +304,9 @@ spec:
{{- if gt (int $psGpuCount) 0}}
nvidia.com/gpu: {{ .Values.psGPU | quote }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.psCPULimit }}
cpu: {{ .Values.psCPULimit | quote }}
{{- else if .Values.psCPU }}
Expand Down Expand Up @@ -519,7 +525,7 @@ spec:
- name: code-sync
emptyDir: {}
{{- end}}
{{- if gt (int $gpuCount) 0}}
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
- hostPath:
path: "{{ .Values.nvidiaPath }}"
Expand Down Expand Up @@ -628,10 +634,13 @@ spec:
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end}}
{{- end}}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.workerCPU }}
cpu: {{ .Values.workerCPU | quote }}
{{- end}}
Expand All @@ -642,13 +651,16 @@ spec:
rdma/hca: "1"
{{- end}}
limits:
{{- if gt (int $gpuCount) 0}}
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end}}
{{- end}}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.workerCPULimit }}
cpu: {{ .Values.workerCPULimit | quote }}
{{- else if .Values.workerCPU }}
Expand Down Expand Up @@ -977,13 +989,16 @@ spec:
{{- end }}
resources:
requests:
{{- if gt (int $gpuCount) 0}}
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end}}
{{- end}}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.chiefCPU }}
cpu: {{ .Values.chiefCPU | quote }}
{{- end}}
Expand All @@ -994,13 +1009,16 @@ spec:
rdma/hca: "1"
{{- end}}
limits:
{{- if gt (int $gpuCount) 0}}
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end}}
{{- end}}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.chiefCPULimit }}
cpu: {{ .Values.chiefCPULimit | quote }}
{{- else if .Values.chiefCPU }}
Expand Down Expand Up @@ -1284,27 +1302,33 @@ spec:
{{- end }}
resources:
requests:
{{- if gt (int $gpuCount) 0}}
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end}}
{{- end}}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.evaluatorCPU }}
cpu: {{ .Values.evaluatorCPU | quote }}
{{- end}}
{{- if .Values.evaluatorMemory }}
memory: {{ .Values.evaluatorMemory | quote }}
{{- end}}
limits:
{{- if gt (int $gpuCount) 0}}
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end}}
{{- end}}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.evaluatorCPULimit }}
cpu: {{ .Values.evaluatorCPULimit | quote }}
{{- else if .Values.evaluatorCPU }}
Expand Down
4 changes: 4 additions & 0 deletions charts/tfjob/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ useHostIPC: true
gpuCount: 0 # user define
privileged: false

# device resources
#devices: amd.com/gpu=1

chief: 0
# Possible value: Chief/Master
chiefName: Chief
Expand All @@ -25,6 +28,7 @@ ps: 0
psImage: kubeflow/tf-dist-mnist-test:1.0
# psCPU: 1
# psMemory: 1 Gi
# psGPU: 1
# psPort: 2223
annotations: {}

Expand Down
1 change: 1 addition & 0 deletions docs/cli/arena_serve_custom.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ arena serve custom [flags]
-d, --data stringArray specify the trained models datasource to mount for serving, like <name_of_datasource>:<mount_point_on_job>
--data-subpath-expr stringArray specify the datasource subpath to mount to the job by expression, like <name_of_datasource>:<mount_subpath_expr>
--data-dir stringArray specify the trained models datasource on host to mount for serving, like <host_path>:<mount_point_on_job>
--device stringArray specify the chip vendors and counts that are used for resources, such as amd.com/gpu=1 gpu.intel.com/i915=1.
--enable-istio enable Istio for serving or not (disable Istio by default)
-e, --env stringArray the environment variables
--expose-service expose service using Istio gateway for external access or not (not expose by default)
Expand Down
1 change: 1 addition & 0 deletions docs/cli/arena_submit_etjob.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ arena submit etjob [flags]
--cpu string the cpu resource to use for the training, like 1 for 1 core.
-d, --data strings specify the datasource to mount to the job, like <name_of_datasource>:<mount_point_on_job>
--data-dir strings the data dir. If you specify /data, it means mounting hostpath /data into container path /data
--device stringArray specify the chip vendors and counts that are used for resources, such as amd.com/gpu=1 gpu.intel.com/i915=1.
-e, --env strings the environment variables
--gang enable gang scheduling
--gpus int the GPU count of each worker to run the training.
Expand Down
1 change: 1 addition & 0 deletions docs/cli/arena_submit_horovodjob.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ arena submit horovodjob [flags]
--cpu string the cpu resource to use for the training, like 1 for 1 core.
-d, --data strings specify the datasource to mount to the job, like <name_of_datasource>:<mount_point_on_job>
--data-dir strings the data dir. If you specify /data, it means mounting hostpath /data into container path /data
--device stringArray specify the chip vendors and counts that are used for resources, such as amd.com/gpu=1 gpu.intel.com/i915=1.
-e, --env strings the environment variables
--gang enable gang scheduling
--gpus int the GPU count of each worker to run the training.
Expand Down
1 change: 1 addition & 0 deletions docs/cli/arena_submit_mpijob.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ arena submit mpijob [flags]
--cpu string the cpu resource to use for the training, like 1 for 1 core.
-d, --data strings specify the datasource to mount to the job, like <name_of_datasource>:<mount_point_on_job>
--data-dir strings the data dir. If you specify /data, it means mounting hostpath /data into container path /data
--device stringArray specify the chip vendors and counts that are used for resources, such as amd.com/gpu=1 gpu.intel.com/i915=1.
-e, --env strings the environment variables
--gang enable gang scheduling
--gpus int the GPU count of each worker to run the training.
Expand Down
1 change: 1 addition & 0 deletions docs/cli/arena_submit_pytorchjob.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ arena submit pytorchjob [flags]
--cpu string the cpu resource to use for the training, like 1 for 1 core.
-d, --data strings specify the datasource to mount to the job, like <name_of_datasource>:<mount_point_on_job>
--data-dir strings the data dir. If you specify /data, it means mounting hostpath /data into container path /data
--device stringArray specify the chip vendors and counts that are used for resources, such as amd.com/gpu=1 gpu.intel.com/i915=1.
-e, --env strings the environment variables
--gang enable gang scheduling
--gpus int the GPU count of each worker to run the training.
Expand Down
1 change: 1 addition & 0 deletions docs/cli/arena_submit_tfjob.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ arena submit tfjob [flags]
--config-file strings giving configuration files when submitting jobs, usage: "--config-file <host_path_file>:<container_path_file>"
-d, --data strings specify the datasource to mount to the job, like <name_of_datasource>:<mount_point_on_job>
--data-dir strings the data dir. If you specify /data, it means mounting hostpath /data into container path /data
--device stringArray specify the chip vendors and counts that are used for resources, such as amd.com/gpu=1 gpu.intel.com/i915=1.
-e, --env strings the environment variables
--evaluator enable evaluator, which is optional for estimator.
--evaluator-cpu string the cpu resource to use for the evaluator, like 1 for 1 core.
Expand Down
Loading

0 comments on commit 74923fd

Please sign in to comment.