From 3a7f45e1d4400bdfd432b456011b3da6c1c70409 Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Tue, 27 Oct 2020 06:43:58 +0000 Subject: [PATCH] Add Tekton Pipeline example (#1339) * Tekton example Add README for Tekton examples Add yaml with PipelineRun * Fix README * Remove istio annotation * Fix comment --- examples/v1beta1/tekton/README.md | 40 ++++++++ examples/v1beta1/tekton/pipeline-run.yaml | 96 +++++++++++++++++++ .../katib-controller/katib-controller.yaml | 1 + manifests/v1beta1/katib-controller/rbac.yaml | 6 ++ 4 files changed, 143 insertions(+) create mode 100644 examples/v1beta1/tekton/README.md create mode 100644 examples/v1beta1/tekton/pipeline-run.yaml diff --git a/examples/v1beta1/tekton/README.md b/examples/v1beta1/tekton/README.md new file mode 100644 index 00000000000..2caa44867cf --- /dev/null +++ b/examples/v1beta1/tekton/README.md @@ -0,0 +1,40 @@ +# Katib examples with Tekton integration + +Here you can find examples of using Katib with [Tekton](https://github.com/tektoncd/pipeline). + +Check [here](https://github.com/tektoncd/pipeline/blob/master/docs/install.md#installing-tekton-pipelines-on-kubernetes) +how to install Tekton on your cluster. + +**Note** that you must modify Tekton [`nop`](https://github.com/tektoncd/pipeline/tree/master/cmd/nop) +image to run Tekton pipelines. `Nop` image is used to stop sidecar containers after main container +is completed. Metrics collector should not be stopped after training container is finished. +To avoid this problem, set `nop` image to metrics collector sidecar image. + +For example, if you are using +[StdOut](https://www.kubeflow.org/docs/components/hyperparameter-tuning/experiment/#metrics-collector) metrics collector, +`nop` image must be equal to `gcr.io/kubeflow-images-public/katib/v1beta1/file-metrics-collector`. 
+ +After deploying Tekton on your cluster, run the command below to modify the `nop` image: + +```bash +kubectl patch deploy tekton-pipelines-controller -n tekton-pipelines --type='json' \ + -p='[{"op": "replace", "path": "/spec/template/spec/containers/0/args/9", "value": "gcr.io/kubeflow-images-public/katib/v1beta1/file-metrics-collector"}]' +``` + +Check that Tekton controller's pod was restarted: + +```bash +$ kubectl get pods -n tekton-pipelines + +NAME READY STATUS RESTARTS AGE +tekton-pipelines-controller-7fcb6c6cd4-p8zf2 1/1 Running 0 2m2s +tekton-pipelines-webhook-7f9888f9b-7d6mr 1/1 Running 0 12h +``` + +Check that `nop` image was modified: + +```bash +$ kubectl get pod -n tekton-pipelines -o yaml | grep katib/v1beta1/file-metrics-collector + + - gcr.io/kubeflow-images-public/katib/v1beta1/file-metrics-collector +``` diff --git a/examples/v1beta1/tekton/pipeline-run.yaml b/examples/v1beta1/tekton/pipeline-run.yaml new file mode 100644 index 00000000000..4a9d12bc335 --- /dev/null +++ b/examples/v1beta1/tekton/pipeline-run.yaml @@ -0,0 +1,96 @@ +# This example shows how you can use Tekton Pipelines in Katib, transfer parameters from one Task to another and run HP job. +# It uses simple random algorithm and tunes only learning rate. +# The Pipeline contains 2 Tasks: the first is data-preprocessing, the second is model-training. +# First Task shows how you can prepare your training data (here: simply divide number of training examples) before running HP job. +# Number of training examples is transferred to the second Task. +# Second Task is the actual training, into which the metrics collector sidecar is injected. +# Note that for this example Tekton controller's nop image must be equal to StdOut metrics collector image. 
+apiVersion: "kubeflow.org/v1beta1" +kind: Experiment +metadata: + namespace: kubeflow + name: tekton-pipeline-run +spec: + objective: + type: maximize + goal: 0.99 + objectiveMetricName: Validation-accuracy + additionalMetricNames: + - Train-accuracy + algorithm: + algorithmName: random + parallelTrialCount: 2 + maxTrialCount: 4 + maxFailedTrialCount: 3 + parameters: + - name: lr + parameterType: double + feasibleSpace: + min: "0.01" + max: "0.03" + trialTemplate: + retain: true + primaryPodLabels: + tekton.dev/pipelineTask: model-training + primaryContainerName: step-model-training + successCondition: status.conditions.#(type=="Succeeded")#|#(status=="True")# + failureCondition: status.conditions.#(type=="Succeeded")#|#(status=="False")# + trialParameters: + - name: learningRate + description: Learning rate for the training model + reference: lr + trialSpec: + apiVersion: tekton.dev/v1beta1 + kind: PipelineRun + spec: + params: + - name: lr + value: ${trialParameters.learningRate} + - name: num-examples-init + value: "60000" + pipelineSpec: + params: + - name: lr + description: Learning rate for the training model + - name: num-examples-init + description: Initial value for number of training examples + tasks: + - name: data-preprocessing + params: + - name: num-examples-pre + value: $(params.num-examples-init) + taskSpec: + params: + - name: num-examples-pre + description: Number of training examples before optimization + results: + - name: num-examples-post + description: Number of training examples after optimization + steps: + - name: num-examples-optimize + image: python:alpine3.6 + command: + - sh + - -c + args: + - python3 -c "import random; print($(params.num-examples-pre)//random.randint(10,100),end='')" | tee $(results.num-examples-post.path) + - name: model-training + params: + - name: lr + value: $(params.lr) + - name: num-examples + value: $(tasks.data-preprocessing.results.num-examples-post) + taskSpec: + params: + - name: lr + description: Learning 
rate for the training model + - name: num-examples + description: Number of training examples + steps: + - name: model-training + image: docker.io/kubeflowkatib/mxnet-mnist + command: + - "python3" + - "/opt/mxnet-mnist/mnist.py" + - "--num-examples=$(params.num-examples)" + - "--lr=$(params.lr)" diff --git a/manifests/v1beta1/katib-controller/katib-controller.yaml b/manifests/v1beta1/katib-controller/katib-controller.yaml index 4dfb01fd445..4d362383bcc 100644 --- a/manifests/v1beta1/katib-controller/katib-controller.yaml +++ b/manifests/v1beta1/katib-controller/katib-controller.yaml @@ -29,6 +29,7 @@ spec: - "--trial-resources=TFJob.v1.kubeflow.org" - "--trial-resources=PyTorchJob.v1.kubeflow.org" - "--trial-resources=MPIJob.v1.kubeflow.org" + - "--trial-resources=PipelineRun.v1beta1.tekton.dev" ports: - containerPort: 8443 name: webhook diff --git a/manifests/v1beta1/katib-controller/rbac.yaml b/manifests/v1beta1/katib-controller/rbac.yaml index dc12f9f3db7..a902f47faf8 100644 --- a/manifests/v1beta1/katib-controller/rbac.yaml +++ b/manifests/v1beta1/katib-controller/rbac.yaml @@ -73,6 +73,12 @@ rules: - mpijobs verbs: - "*" + - apiGroups: + - tekton.dev + resources: + - pipelineruns + verbs: + - "*" --- apiVersion: v1 kind: ServiceAccount