Tekton example
Add README for Tekton examples
Add yaml with PipelineRun
andreyvelich committed Sep 16, 2020
1 parent 721a382 commit edc119c
Showing 4 changed files with 152 additions and 0 deletions.
41 changes: 41 additions & 0 deletions examples/v1beta1/tekton/README.md
@@ -0,0 +1,41 @@
# Katib examples with Tekton integration

Here you can find examples of using Katib with [Tekton](https://github.com/tektoncd/pipeline).
See [here](https://github.com/tektoncd/pipeline/blob/master/docs/install.md#installing-tekton-pipelines-on-kubernetes) for how to install Tekton on your cluster.

**Note** that you must modify the Tekton [`nop`](https://github.com/tektoncd/pipeline/tree/master/cmd/nop) image to run Tekton pipelines with Katib. The `nop` image is used to stop sidecar containers after the main container is completed. However, the metrics collector sidecar must not be stopped after the training container finishes. To avoid this problem, the `nop` image should be equal to the metrics collector sidecar image.

For example, if you are using the [StdOut](https://www.kubeflow.org/docs/components/hyperparameter-tuning/experiment/#metrics-collector) metrics collector, the `nop` image must be `gcr.io/kubeflow-images-public/katib/v1beta1/file-metrics-collector`.

After deploying Tekton on your cluster, run the command below to modify the `nop` image:

```bash
kubectl patch deploy tekton-pipelines-controller -n tekton-pipelines --type='json' \
-p='[{"op": "replace", "path": "/spec/template/spec/containers/0/args/9", "value": "gcr.io/kubeflow-images-public/katib/v1beta1/file-metrics-collector"}]'
```
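The `-p` payload above is a standard JSON Patch (RFC 6902) document with a single `replace` operation targeting the 10th `args` entry (index 9) of the controller container. As a sanity check, here is a minimal Python sketch that builds the same payload:

```python
import json

# Build the JSON Patch payload used in the kubectl command above.
# It replaces args[9] of the first container in the Tekton controller
# Deployment with the Katib metrics collector image.
image = "gcr.io/kubeflow-images-public/katib/v1beta1/file-metrics-collector"
patch = [
    {
        "op": "replace",
        "path": "/spec/template/spec/containers/0/args/9",
        "value": image,
    }
]
print(json.dumps(patch))
```

If your Tekton version orders the controller flags differently, check the current `args` list first so the index in `path` points at the `nop` image flag.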

Check that the Tekton controller's pod was restarted:

```bash
kubectl get pods -n tekton-pipelines
```

Expected output:

```
NAME READY STATUS RESTARTS AGE
tekton-pipelines-controller-7fcb6c6cd4-p8zf2 1/1 Running 0 2m2s
tekton-pipelines-webhook-7f9888f9b-7d6mr 1/1 Running 0 12h
```

Check that the `nop` image was modified:

```bash
kubectl get pod <tekton-controller-pod-name> -n tekton-pipelines -o yaml | grep katib/v1beta1/file-metrics-collector
```

Expected output:

```
- gcr.io/kubeflow-images-public/katib/v1beta1/file-metrics-collector
```
104 changes: 104 additions & 0 deletions examples/v1beta1/tekton/pipeline-run.yaml
@@ -0,0 +1,104 @@
# This example shows how you can use Tekton Pipelines in Katib.
# The PipelineRun shows how to transfer parameters from one Task to another and run an HP tuning job.
# It uses the simple random search algorithm and tunes only the learning rate.
# The Pipeline contains 2 Tasks: the first is data-preprocessing, the second is model-training.
# The first Task shows how you can prepare your training data (here, simply dividing the number of training examples) before running the HP job.
# The number of examples is transferred to the second Task.
# The second Task runs the actual training, into which the metrics collector sidecar is injected.
# Note that for this example the Tekton controller's nop image must be equal to the StdOut metrics collector image.
apiVersion: "kubeflow.org/v1beta1"
kind: Experiment
metadata:
namespace: kubeflow
name: tekton-pipeline-run
spec:
objective:
type: maximize
goal: 0.99
objectiveMetricName: Validation-accuracy
additionalMetricNames:
- Train-accuracy
algorithm:
algorithmName: random
parallelTrialCount: 2
maxTrialCount: 4
maxFailedTrialCount: 3
parameters:
- name: lr
parameterType: double
feasibleSpace:
min: "0.01"
max: "0.03"
trialTemplate:
retain: true
primaryPodLabels:
tekton.dev/pipelineTask: model-training
primaryContainerName: step-model-training
successCondition: status.conditions.#(type=="Succeeded")#|#(status=="True")#
failureCondition: status.conditions.#(type=="Succeeded")#|#(status=="False")#
trialParameters:
- name: learningRate
description: Learning rate for the training model
reference: lr
trialSpec:
apiVersion: tekton.dev/v1beta1
kind: PipelineRun
spec:
params:
- name: lr
value: ${trialParameters.learningRate}
- name: num-examples-init
value: "60000"
pipelineSpec:
params:
- name: lr
description: Learning rate for the training model
- name: num-examples-init
description: Initial value for number of training examples
tasks:
- name: data-preprocessing
params:
- name: num-examples-pre
value: $(params.num-examples-init)
taskSpec:
metadata:
annotations:
sidecar.istio.io/inject: "false"
params:
- name: num-examples-pre
description: Number of training examples before optimization
results:
- name: num-examples-post
description: Number of training examples after optimization
steps:
- name: num-examples-optimize
image: python:alpine3.6
command:
- sh
- -c
args:
- python3 -c "import random; print($(params.num-examples-pre)//random.randint(10,100),end='')" | tee $(results.num-examples-post.path)
- name: model-training
params:
- name: lr
value: $(params.lr)
- name: num-examples
value: $(tasks.data-preprocessing.results.num-examples-post)
taskSpec:
metadata:
annotations:
sidecar.istio.io/inject: "false"
params:
- name: lr
description: Learning rate for the training model
- name: num-examples
description: Number of training examples
steps:
- name: model-training
image: docker.io/kubeflowkatib/mxnet-mnist
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"
- "--batch-size=64"
- "--num-examples=$(params.num-examples)"
- "--lr=$(params.lr)"
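The data-preprocessing step above boils down to one line of Python inside the Task. A standalone sketch of the same logic, runnable outside the Pipeline:

```python
import random

def optimize_num_examples(num_examples_pre: int) -> int:
    """Mirror the data-preprocessing step: shrink the training set by
    dividing its size by a random integer factor between 10 and 100."""
    return num_examples_pre // random.randint(10, 100)

# With the initial value of 60000 used in the PipelineRun, the result
# always lands between 600 and 6000 examples.
num_examples_post = optimize_num_examples(60000)
print(num_examples_post)
```

In the Pipeline, this value is written to `$(results.num-examples-post.path)` so the model-training Task can consume it via `$(tasks.data-preprocessing.results.num-examples-post)`.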
1 change: 1 addition & 0 deletions manifests/v1beta1/katib-controller/katib-controller.yaml
@@ -25,6 +25,7 @@ spec:
command: ["./katib-controller"]
args:
- "--webhook-port=8443"
- "--trial-resources=PipelineRun.v1beta1.tekton.dev"
ports:
- containerPort: 8443
name: webhook
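The `--trial-resources` value added above appears to follow a `Kind.version.group` format (here `PipelineRun`, `v1beta1`, `tekton.dev`). A small sketch of how such a value splits into its parts; the `split(".", 2)` approach is an assumption about the format for illustration, not Katib's actual parser:

```python
def parse_trial_resource(resource: str) -> tuple:
    # Split "Kind.version.group" into its three parts; the API group
    # itself may contain dots (e.g. "tekton.dev"), so split at most twice.
    kind, version, group = resource.split(".", 2)
    return kind, version, group

kind, version, group = parse_trial_resource("PipelineRun.v1beta1.tekton.dev")
print(kind, version, group)
```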
6 changes: 6 additions & 0 deletions manifests/v1beta1/katib-controller/rbac.yaml
@@ -72,6 +72,12 @@ rules:
- pytorchjobs
verbs:
- "*"
- apiGroups:
- tekton.dev
resources:
- pipelineruns
verbs:
- "*"
---
apiVersion: v1
kind: ServiceAccount
