Skip to content

Commit

Permalink
support claiming device resources for tfjob/pytorchjob/custom-serving…
Browse files Browse the repository at this point in the history
…, such as amd.com/gpu=1

Signed-off-by: lizhiboo <lizhiboo@yeah.net>
  • Loading branch information
lizhiboo committed Aug 30, 2024
1 parent b500f9e commit 74923fd
Show file tree
Hide file tree
Showing 35 changed files with 403 additions and 175 deletions.
3 changes: 3 additions & 0 deletions charts/custom-serving/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,9 @@ spec:
{{- if gt (int $gpuCount) 0}}
nvidia.com/gpu: {{ .Values.gpuCount }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if gt (int $gpuMemory) 0}}
aliyun.com/gpu-mem: {{ .Values.gpuMemory }}
{{- end }}
Expand Down
3 changes: 3 additions & 0 deletions charts/custom-serving/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ restApiPort: 8501
metricsPort: 0
replicas: 1

# device resources
#devices: amd.com/gpu=1

# repository: "cheyang/tf-model-server-gpu"
image: "tensorflow/serving:latest"

Expand Down
6 changes: 6 additions & 0 deletions charts/etjob/templates/etjob.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,9 @@ spec:
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.cpu }}
cpu: {{ .Values.cpu | quote }}
{{- end }}
Expand All @@ -531,6 +534,9 @@ spec:
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.cpu }}
cpu: {{ .Values.cpu | quote }}
{{- end }}
Expand Down
3 changes: 3 additions & 0 deletions charts/mpijob/templates/mpijob.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,9 @@ spec:
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.cpu }}
cpu: {{ .Values.cpu | quote }}
{{- end }}
Expand Down
20 changes: 16 additions & 4 deletions charts/pytorchjob/templates/pytorchjob.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -239,10 +239,13 @@ spec:
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.cpu }}
cpu: {{ .Values.cpu | quote }}
{{- end }}
Expand All @@ -256,10 +259,13 @@ spec:
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.cpu }}
cpu: {{ .Values.cpu | quote }}
{{- end }}
Expand Down Expand Up @@ -531,10 +537,13 @@ spec:
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.cpu }}
cpu: {{ .Values.cpu | quote }}
{{- end }}
Expand All @@ -548,10 +557,13 @@ spec:
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.cpu }}
cpu: {{ .Values.cpu | quote }}
{{- end }}
Expand Down
3 changes: 3 additions & 0 deletions charts/pytorchjob/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ useHostPID: true
useHostIPC: true
gpuCount: 0 # user define

# device resources
#devices: amd.com/gpu=1

# rsync image
rsyncImage: registry.cn-zhangjiakou.aliyuncs.com/acs/rsync:v3.1.0-aliyun
# git sync image
Expand Down
48 changes: 36 additions & 12 deletions charts/tfjob/templates/tfjob.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,9 @@ spec:
{{- if gt (int $psGpuCount) 0}}
nvidia.com/gpu: {{ .Values.psGPU | quote }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.psCPU }}
cpu: {{ .Values.psCPU | quote }}
{{- end}}
Expand All @@ -301,6 +304,9 @@ spec:
{{- if gt (int $psGpuCount) 0}}
nvidia.com/gpu: {{ .Values.psGPU | quote }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.psCPULimit }}
cpu: {{ .Values.psCPULimit | quote }}
{{- else if .Values.psCPU }}
Expand Down Expand Up @@ -519,7 +525,7 @@ spec:
- name: code-sync
emptyDir: {}
{{- end}}
{{- if gt (int $gpuCount) 0}}
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
- hostPath:
path: "{{ .Values.nvidiaPath }}"
Expand Down Expand Up @@ -628,10 +634,13 @@ spec:
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end}}
{{- end}}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.workerCPU }}
cpu: {{ .Values.workerCPU | quote }}
{{- end}}
Expand All @@ -642,13 +651,16 @@ spec:
rdma/hca: "1"
{{- end}}
limits:
{{- if gt (int $gpuCount) 0}}
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end}}
{{- end}}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.workerCPULimit }}
cpu: {{ .Values.workerCPULimit | quote }}
{{- else if .Values.workerCPU }}
Expand Down Expand Up @@ -977,13 +989,16 @@ spec:
{{- end }}
resources:
requests:
{{- if gt (int $gpuCount) 0}}
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end}}
{{- end}}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.chiefCPU }}
cpu: {{ .Values.chiefCPU | quote }}
{{- end}}
Expand All @@ -994,13 +1009,16 @@ spec:
rdma/hca: "1"
{{- end}}
limits:
{{- if gt (int $gpuCount) 0}}
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end}}
{{- end}}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.chiefCPULimit }}
cpu: {{ .Values.chiefCPULimit | quote }}
{{- else if .Values.chiefCPU }}
Expand Down Expand Up @@ -1284,27 +1302,33 @@ spec:
{{- end }}
resources:
requests:
{{- if gt (int $gpuCount) 0}}
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end}}
{{- end}}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.evaluatorCPU }}
cpu: {{ .Values.evaluatorCPU | quote }}
{{- end}}
{{- if .Values.evaluatorMemory }}
memory: {{ .Values.evaluatorMemory | quote }}
{{- end}}
limits:
{{- if gt (int $gpuCount) 0}}
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end}}
{{- end}}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.evaluatorCPULimit }}
cpu: {{ .Values.evaluatorCPULimit | quote }}
{{- else if .Values.evaluatorCPU }}
Expand Down
4 changes: 4 additions & 0 deletions charts/tfjob/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ useHostIPC: true
gpuCount: 0 # user define
privileged: false

# device resources
#devices: amd.com/gpu=1

chief: 0
# Possible value: Chief/Master
chiefName: Chief
Expand All @@ -25,6 +28,7 @@ ps: 0
psImage: kubeflow/tf-dist-mnist-test:1.0
# psCPU: 1
# psMemory: 1 Gi
# psGPU: 1
# psPort: 2223
annotations: {}

Expand Down
1 change: 1 addition & 0 deletions docs/cli/arena_serve_custom.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ arena serve custom [flags]
-d, --data stringArray specify the trained models datasource to mount for serving, like <name_of_datasource>:<mount_point_on_job>
--data-subpath-expr stringArray specify the datasource subpath to mount to the job by expression, like <name_of_datasource>:<mount_subpath_expr>
--data-dir stringArray specify the trained models datasource on host to mount for serving, like <host_path>:<mount_point_on_job>
--device stringArray specify the chip vendors and counts that are used for resources, such as amd.com/gpu=1 gpu.intel.com/i915=1.
--enable-istio enable Istio for serving or not (disable Istio by default)
-e, --env stringArray the environment variables
--expose-service expose service using Istio gateway for external access or not (not expose by default)
Expand Down
1 change: 1 addition & 0 deletions docs/cli/arena_submit_etjob.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ arena submit etjob [flags]
--cpu string the cpu resource to use for the training, like 1 for 1 core.
-d, --data strings specify the datasource to mount to the job, like <name_of_datasource>:<mount_point_on_job>
--data-dir strings the data dir. If you specify /data, it means mounting hostpath /data into container path /data
--device stringArray specify the chip vendors and counts that are used for resources, such as amd.com/gpu=1 gpu.intel.com/i915=1.
-e, --env strings the environment variables
--gang enable gang scheduling
--gpus int the GPU count of each worker to run the training.
Expand Down
1 change: 1 addition & 0 deletions docs/cli/arena_submit_horovodjob.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ arena submit horovodjob [flags]
--cpu string the cpu resource to use for the training, like 1 for 1 core.
-d, --data strings specify the datasource to mount to the job, like <name_of_datasource>:<mount_point_on_job>
--data-dir strings the data dir. If you specify /data, it means mounting hostpath /data into container path /data
--device stringArray specify the chip vendors and counts that are used for resources, such as amd.com/gpu=1 gpu.intel.com/i915=1.
-e, --env strings the environment variables
--gang enable gang scheduling
--gpus int the GPU count of each worker to run the training.
Expand Down
1 change: 1 addition & 0 deletions docs/cli/arena_submit_mpijob.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ arena submit mpijob [flags]
--cpu string the cpu resource to use for the training, like 1 for 1 core.
-d, --data strings specify the datasource to mount to the job, like <name_of_datasource>:<mount_point_on_job>
--data-dir strings the data dir. If you specify /data, it means mounting hostpath /data into container path /data
--device stringArray specify the chip vendors and counts that are used for resources, such as amd.com/gpu=1 gpu.intel.com/i915=1.
-e, --env strings the environment variables
--gang enable gang scheduling
--gpus int the GPU count of each worker to run the training.
Expand Down
1 change: 1 addition & 0 deletions docs/cli/arena_submit_pytorchjob.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ arena submit pytorchjob [flags]
--cpu string the cpu resource to use for the training, like 1 for 1 core.
-d, --data strings specify the datasource to mount to the job, like <name_of_datasource>:<mount_point_on_job>
--data-dir strings the data dir. If you specify /data, it means mounting hostpath /data into container path /data
--device stringArray specify the chip vendors and counts that are used for resources, such as amd.com/gpu=1 gpu.intel.com/i915=1.
-e, --env strings the environment variables
--gang enable gang scheduling
--gpus int the GPU count of each worker to run the training.
Expand Down
1 change: 1 addition & 0 deletions docs/cli/arena_submit_tfjob.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ arena submit tfjob [flags]
--config-file strings giving configuration files when submitting jobs, usage: "--config-file <host_path_file>:<container_path_file>"
-d, --data strings specify the datasource to mount to the job, like <name_of_datasource>:<mount_point_on_job>
--data-dir strings the data dir. If you specify /data, it means mounting hostpath /data into container path /data
--device stringArray specify the chip vendors and counts that are used for resources, such as amd.com/gpu=1 gpu.intel.com/i915=1.
-e, --env strings the environment variables
--evaluator enable evaluator, which is optional for estimator.
--evaluator-cpu string the cpu resource to use for the evaluator, like 1 for 1 core.
Expand Down
Loading

0 comments on commit 74923fd

Please sign in to comment.