Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#1121 Support multiple type devices #1122

Merged
merged 1 commit into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions charts/custom-serving/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,9 @@ spec:
{{- if gt (int $gpuCount) 0}}
nvidia.com/gpu: {{ .Values.gpuCount }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if gt (int $gpuMemory) 0}}
aliyun.com/gpu-mem: {{ .Values.gpuMemory }}
{{- end }}
Expand Down
3 changes: 3 additions & 0 deletions charts/custom-serving/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ restApiPort: 8501
metricsPort: 0
replicas: 1

# device resources
#devices: amd.com/gpu=1

# repository: "cheyang/tf-model-server-gpu"
image: "tensorflow/serving:latest"

Expand Down
6 changes: 6 additions & 0 deletions charts/etjob/templates/etjob.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,9 @@ spec:
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.cpu }}
cpu: {{ .Values.cpu | quote }}
{{- end }}
Expand All @@ -531,6 +534,9 @@ spec:
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.cpu }}
cpu: {{ .Values.cpu | quote }}
{{- end }}
Expand Down
3 changes: 3 additions & 0 deletions charts/mpijob/templates/mpijob.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,9 @@ spec:
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.cpu }}
cpu: {{ .Values.cpu | quote }}
{{- end }}
Expand Down
20 changes: 16 additions & 4 deletions charts/pytorchjob/templates/pytorchjob.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -239,10 +239,13 @@ spec:
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.cpu }}
cpu: {{ .Values.cpu | quote }}
{{- end }}
Expand All @@ -256,10 +259,13 @@ spec:
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.cpu }}
cpu: {{ .Values.cpu | quote }}
{{- end }}
Expand Down Expand Up @@ -531,10 +537,13 @@ spec:
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.cpu }}
cpu: {{ .Values.cpu | quote }}
{{- end }}
Expand All @@ -548,10 +557,13 @@ spec:
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.cpu }}
cpu: {{ .Values.cpu | quote }}
{{- end }}
Expand Down
3 changes: 3 additions & 0 deletions charts/pytorchjob/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ useHostPID: true
useHostIPC: true
gpuCount: 0 # user define

# device resources
#devices: amd.com/gpu=1

# rsync image
rsyncImage: registry.cn-zhangjiakou.aliyuncs.com/acs/rsync:v3.1.0-aliyun
# git sync image
Expand Down
48 changes: 36 additions & 12 deletions charts/tfjob/templates/tfjob.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,9 @@ spec:
{{- if gt (int $psGpuCount) 0}}
nvidia.com/gpu: {{ .Values.psGPU | quote }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.psCPU }}
cpu: {{ .Values.psCPU | quote }}
{{- end}}
Expand All @@ -301,6 +304,9 @@ spec:
{{- if gt (int $psGpuCount) 0}}
nvidia.com/gpu: {{ .Values.psGPU | quote }}
{{- end }}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.psCPULimit }}
cpu: {{ .Values.psCPULimit | quote }}
{{- else if .Values.psCPU }}
Expand Down Expand Up @@ -519,7 +525,7 @@ spec:
- name: code-sync
emptyDir: {}
{{- end}}
{{- if gt (int $gpuCount) 0}}
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
- hostPath:
path: "{{ .Values.nvidiaPath }}"
Expand Down Expand Up @@ -628,10 +634,13 @@ spec:
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end}}
{{- end}}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.workerCPU }}
cpu: {{ .Values.workerCPU | quote }}
{{- end}}
Expand All @@ -642,13 +651,16 @@ spec:
rdma/hca: "1"
{{- end}}
limits:
{{- if gt (int $gpuCount) 0}}
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end}}
{{- end}}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.workerCPULimit }}
cpu: {{ .Values.workerCPULimit | quote }}
{{- else if .Values.workerCPU }}
Expand Down Expand Up @@ -977,13 +989,16 @@ spec:
{{- end }}
resources:
requests:
{{- if gt (int $gpuCount) 0}}
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end}}
{{- end}}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.chiefCPU }}
cpu: {{ .Values.chiefCPU | quote }}
{{- end}}
Expand All @@ -994,13 +1009,16 @@ spec:
rdma/hca: "1"
{{- end}}
limits:
{{- if gt (int $gpuCount) 0}}
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end}}
{{- end}}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.chiefCPULimit }}
cpu: {{ .Values.chiefCPULimit | quote }}
{{- else if .Values.chiefCPU }}
Expand Down Expand Up @@ -1284,27 +1302,33 @@ spec:
{{- end }}
resources:
requests:
{{- if gt (int $gpuCount) 0}}
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end}}
{{- end}}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.evaluatorCPU }}
cpu: {{ .Values.evaluatorCPU | quote }}
{{- end}}
{{- if .Values.evaluatorMemory }}
memory: {{ .Values.evaluatorMemory | quote }}
{{- end}}
limits:
{{- if gt (int $gpuCount) 0}}
{{- if gt (int $gpuCount) 0}}
{{- if .Values.nvidiaPath }}
alpha.kubernetes.io/nvidia-gpu: {{ $gpuCount | quote }}
{{- else}}
{{- else }}
nvidia.com/gpu: {{ $gpuCount | quote }}
{{- end}}
{{- end}}
{{- range $key, $value := .Values.devices }}
{{ $key }}: {{ $value }}
{{- end }}
{{- if .Values.evaluatorCPULimit }}
cpu: {{ .Values.evaluatorCPULimit | quote }}
{{- else if .Values.evaluatorCPU }}
Expand Down
4 changes: 4 additions & 0 deletions charts/tfjob/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ useHostIPC: true
gpuCount: 0 # user define
privileged: false

# device resources
#devices: amd.com/gpu=1

chief: 0
# Possible value: Chief/Master
chiefName: Chief
Expand All @@ -25,6 +28,7 @@ ps: 0
psImage: kubeflow/tf-dist-mnist-test:1.0
# psCPU: 1
# psMemory: 1 Gi
# psGPU: 1
# psPort: 2223
annotations: {}

Expand Down
1 change: 1 addition & 0 deletions docs/cli/arena_serve_custom.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ arena serve custom [flags]
-d, --data stringArray specify the trained models datasource to mount for serving, like <name_of_datasource>:<mount_point_on_job>
--data-subpath-expr stringArray specify the datasource subpath to mount to the job by expression, like <name_of_datasource>:<mount_subpath_expr>
--data-dir stringArray specify the trained models datasource on host to mount for serving, like <host_path>:<mount_point_on_job>
      --device stringArray                         specify the chip vendors and the counts that are used as resources, such as amd.com/gpu=1 gpu.intel.com/i915=1.
--enable-istio enable Istio for serving or not (disable Istio by default)
-e, --env stringArray the environment variables
--expose-service expose service using Istio gateway for external access or not (not expose by default)
Expand Down
1 change: 1 addition & 0 deletions docs/cli/arena_submit_etjob.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ arena submit etjob [flags]
--cpu string the cpu resource to use for the training, like 1 for 1 core.
-d, --data strings specify the datasource to mount to the job, like <name_of_datasource>:<mount_point_on_job>
--data-dir strings the data dir. If you specify /data, it means mounting hostpath /data into container path /data
      --device stringArray        specify the chip vendors and the counts that are used as resources, such as amd.com/gpu=1 gpu.intel.com/i915=1.
-e, --env strings the environment variables
--gang enable gang scheduling
--gpus int the GPU count of each worker to run the training.
Expand Down
1 change: 1 addition & 0 deletions docs/cli/arena_submit_horovodjob.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ arena submit horovodjob [flags]
--cpu string the cpu resource to use for the training, like 1 for 1 core.
-d, --data strings specify the datasource to mount to the job, like <name_of_datasource>:<mount_point_on_job>
--data-dir strings the data dir. If you specify /data, it means mounting hostpath /data into container path /data
      --device stringArray        specify the chip vendors and the counts that are used as resources, such as amd.com/gpu=1 gpu.intel.com/i915=1.
-e, --env strings the environment variables
--gang enable gang scheduling
--gpus int the GPU count of each worker to run the training.
Expand Down
1 change: 1 addition & 0 deletions docs/cli/arena_submit_mpijob.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ arena submit mpijob [flags]
--cpu string the cpu resource to use for the training, like 1 for 1 core.
-d, --data strings specify the datasource to mount to the job, like <name_of_datasource>:<mount_point_on_job>
--data-dir strings the data dir. If you specify /data, it means mounting hostpath /data into container path /data
      --device stringArray        specify the chip vendors and the counts that are used as resources, such as amd.com/gpu=1 gpu.intel.com/i915=1.
-e, --env strings the environment variables
--gang enable gang scheduling
--gpus int the GPU count of each worker to run the training.
Expand Down
1 change: 1 addition & 0 deletions docs/cli/arena_submit_pytorchjob.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ arena submit pytorchjob [flags]
--cpu string the cpu resource to use for the training, like 1 for 1 core.
-d, --data strings specify the datasource to mount to the job, like <name_of_datasource>:<mount_point_on_job>
--data-dir strings the data dir. If you specify /data, it means mounting hostpath /data into container path /data
      --device stringArray        specify the chip vendors and the counts that are used as resources, such as amd.com/gpu=1 gpu.intel.com/i915=1.
-e, --env strings the environment variables
--gang enable gang scheduling
--gpus int the GPU count of each worker to run the training.
Expand Down
1 change: 1 addition & 0 deletions docs/cli/arena_submit_tfjob.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ arena submit tfjob [flags]
      --config-file strings              giving configuration files when submitting jobs, usage: "--config-file <host_path_file>:<container_path_file>"
-d, --data strings specify the datasource to mount to the job, like <name_of_datasource>:<mount_point_on_job>
--data-dir strings the data dir. If you specify /data, it means mounting hostpath /data into container path /data
      --device stringArray               specify the chip vendors and the counts that are used as resources, such as amd.com/gpu=1 gpu.intel.com/i915=1.
-e, --env strings the environment variables
--evaluator enable evaluator, which is optional for estimator.
--evaluator-cpu string the cpu resource to use for the evaluator, like 1 for 1 core.
Expand Down
Loading
Loading