# mpijob-horovod.yaml
---
# Kubeflow (Katib) Experiment that tunes two hyperparameters of a Horovod
# MNIST training script. Each trial is launched as an MPIJob with one
# launcher pod and two worker pods.
apiVersion: kubeflow.org/v1beta1
kind: Experiment
metadata:
  namespace: kubeflow
  name: mpijob-horovod
spec:
  # Minimize the metric named `loss`; 0.01 is the target value.
  objective:
    type: minimize
    goal: 0.01
    objectiveMetricName: loss
  algorithm:
    algorithmName: random
  # Trial budget: 2 trials in parallel, 6 in total, 3 failures allowed.
  parallelTrialCount: 2
  maxTrialCount: 6
  maxFailedTrialCount: 3
  # Search space. Bounds are written as quoted strings, as in the original.
  parameters:
    - name: lr
      parameterType: double
      feasibleSpace:
        min: "0.001"
        max: "0.003"
    - name: num-steps
      parameterType: int
      feasibleSpace:
        min: "50"
        max: "150"
        step: "10"
  trialTemplate:
    # The primary pod/container of a trial is the MPI launcher: these values
    # match the `Launcher` replica and its `mpi-launcher` container below.
    primaryPodLabels:
      mpi-job-role: launcher
    primaryContainerName: mpi-launcher
    # Select `status.conditions` entries whose type is Succeeded / Failed and
    # whose status is "True". Single-quoted because the expressions contain
    # `#` and `"` characters; the parsed string is unchanged.
    successCondition: 'status.conditions.#(type=="Succeeded")#|#(status=="True")#'
    failureCondition: 'status.conditions.#(type=="Failed")#|#(status=="True")#'
    # Each entry binds a search-space parameter (`reference`) to the
    # `${trialParameters.<name>}` placeholder used inside `trialSpec`.
    trialParameters:
      - name: learningRate
        description: Learning rate for the training model
        reference: lr
      - name: numberSteps
        description: Number of training steps
        reference: num-steps
    trialSpec:
      apiVersion: kubeflow.org/v1
      kind: MPIJob
      spec:
        slotsPerWorker: 1
        cleanPodPolicy: Running
        mpiReplicaSpecs:
          Launcher:
            replicas: 1
            template:
              spec:
                containers:
                  # NOTE(review): the image has no tag, so it presumably
                  # resolves to `latest` — consider pinning a version.
                  - image: docker.io/kubeflow/mpi-horovod-mnist
                    name: mpi-launcher
                    command:
                      - mpirun
                    args:
                      # "2" equals Worker replicas (2) x slotsPerWorker (1);
                      # presumably these must be kept in sync — confirm
                      # before changing either.
                      - -np
                      - "2"
                      - --allow-run-as-root
                      - -bind-to
                      - none
                      - -map-by
                      - slot
                      - -x
                      - LD_LIBRARY_PATH
                      - -x
                      - PATH
                      - -mca
                      - pml
                      - ob1
                      - -mca
                      - btl
                      - ^openib
                      # Training script and the per-trial hyperparameters.
                      - python
                      - /examples/tensorflow_mnist.py
                      - --lr
                      - ${trialParameters.learningRate}
                      - --num-steps
                      - ${trialParameters.numberSteps}
                    resources:
                      limits:
                        cpu: 500m
                        memory: 2Gi
          Worker:
            replicas: 2
            template:
              spec:
                containers:
                  - image: docker.io/kubeflow/mpi-horovod-mnist
                    name: mpi-worker
                    resources:
                      limits:
                        cpu: 500m
                        memory: 4Gi