Add an MPIJob (Horovod MNIST) example for Katib v1beta1.

  * examples/v1beta1/README.md: list the docker.io/kubeflow/mpi-horovod-mnist
    example image and end the FPGA XGBoost entry with a period.
  * examples/v1beta1/mpijob-horovod.yaml: new Experiment that uses the random
    algorithm over lr and num-steps and runs each trial as a kubeflow.org/v1
    MPIJob (1 launcher replica, 2 worker replicas).
  * manifests/v1beta1/katib-controller/katib-controller.yaml: pass
    "--trial-resources=MPIJob.v1.kubeflow.org" to katib-controller.
  * manifests/v1beta1/katib-controller/rbac.yaml: add mpijobs to the rule
    that already lists tfjobs and pytorchjobs.

diff --git a/examples/v1beta1/README.md b/examples/v1beta1/README.md
index e5364b588d5..05cd07aabe6 100644
--- a/examples/v1beta1/README.md
+++ b/examples/v1beta1/README.md
@@ -384,8 +384,14 @@ gcr.io/kubeflow-ci/pytorch-dist-mnist-test
 gcr.io/kubeflow-ci/tf-mnist-with-summaries
 ```
 
-- FPGA XGBoost Parameter Tuning example, [source](https://github.com/inaccel/jupyter/blob/master/lab/dot/XGBoost/parameter-tuning.py)
+- FPGA XGBoost Parameter Tuning example, [source](https://github.com/inaccel/jupyter/blob/master/lab/dot/XGBoost/parameter-tuning.py).
 
 ```
 docker.io/inaccel/jupyter:lab
 ```
+
+- MPI operator horovod mnist example, [source](https://github.com/kubeflow/mpi-operator/tree/master/examples/horovod).
+
+```
+docker.io/kubeflow/mpi-horovod-mnist
+```
diff --git a/examples/v1beta1/mpijob-horovod.yaml b/examples/v1beta1/mpijob-horovod.yaml
new file mode 100644
index 00000000000..7d89890782b
--- /dev/null
+++ b/examples/v1beta1/mpijob-horovod.yaml
@@ -0,0 +1,95 @@
+apiVersion: "kubeflow.org/v1beta1"
+kind: Experiment
+metadata:
+  namespace: kubeflow
+  name: mpi-horovod-mnist
+spec:
+  objective:
+    type: minimize
+    goal: 0.01
+    objectiveMetricName: loss
+  algorithm:
+    algorithmName: random
+  parallelTrialCount: 2
+  maxTrialCount: 6
+  maxFailedTrialCount: 3
+  parameters:
+    - name: lr
+      parameterType: double
+      feasibleSpace:
+        min: "0.001"
+        max: "0.003"
+    - name: num-steps
+      parameterType: int
+      feasibleSpace:
+        min: "50"
+        max: "150"
+        step: "10"
+  trialTemplate:
+    primaryPodLabels:
+      mpi-job-role: launcher
+    primaryContainerName: mpi-launcher
+    successCondition: status.conditions.#(type=="Succeeded")#|#(status=="True")#
+    failureCondition: status.conditions.#(type=="Failed")#|#(status=="True")#
+    trialParameters:
+      - name: learningRate
+        description: Learning rate for the training model
+        reference: lr
+      - name: numberSteps
+        description: Number of training steps
+        reference: num-steps
+    trialSpec:
+      apiVersion: kubeflow.org/v1
+      kind: MPIJob
+      spec:
+        slotsPerWorker: 1
+        cleanPodPolicy: Running
+        mpiReplicaSpecs:
+          Launcher:
+            replicas: 1
+            template:
+              spec:
+                containers:
+                  - image: docker.io/kubeflow/mpi-horovod-mnist
+                    name: mpi-launcher
+                    command:
+                      - mpirun
+                    args:
+                      - -np
+                      - "2"
+                      - --allow-run-as-root
+                      - -bind-to
+                      - none
+                      - -map-by
+                      - slot
+                      - -x
+                      - LD_LIBRARY_PATH
+                      - -x
+                      - PATH
+                      - -mca
+                      - pml
+                      - ob1
+                      - -mca
+                      - btl
+                      - ^openib
+                      - python
+                      - /examples/tensorflow_mnist.py
+                      - --lr
+                      - ${trialParameters.learningRate}
+                      - --num-steps
+                      - ${trialParameters.numberSteps}
+                    resources:
+                      limits:
+                        cpu: 500m
+                        memory: 2Gi
+          Worker:
+            replicas: 2
+            template:
+              spec:
+                containers:
+                  - image: docker.io/kubeflow/mpi-horovod-mnist
+                    name: mpi-worker
+                    resources:
+                      limits:
+                        cpu: 500m
+                        memory: 4Gi
diff --git a/manifests/v1beta1/katib-controller/katib-controller.yaml b/manifests/v1beta1/katib-controller/katib-controller.yaml
index 779f255bad5..130d185fc7b 100644
--- a/manifests/v1beta1/katib-controller/katib-controller.yaml
+++ b/manifests/v1beta1/katib-controller/katib-controller.yaml
@@ -25,6 +25,7 @@ spec:
           command: ["./katib-controller"]
           args:
             - "--webhook-port=8443"
+            - "--trial-resources=MPIJob.v1.kubeflow.org"
           ports:
             - containerPort: 8443
               name: webhook
diff --git a/manifests/v1beta1/katib-controller/rbac.yaml b/manifests/v1beta1/katib-controller/rbac.yaml
index dec9b5373f4..dc12f9f3db7 100644
--- a/manifests/v1beta1/katib-controller/rbac.yaml
+++ b/manifests/v1beta1/katib-controller/rbac.yaml
@@ -70,6 +70,7 @@ rules:
     resources:
       - tfjobs
       - pytorchjobs
+      - mpijobs
     verbs:
       - "*"
 ---