From 3a7f45e1d4400bdfd432b456011b3da6c1c70409 Mon Sep 17 00:00:00 2001
From: Andrey Velichkevich <andrey.velichkevich@gmail.com>
Date: Tue, 27 Oct 2020 06:43:58 +0000
Subject: [PATCH] Add Tekton Pipeline example (#1339)

* Tekton example

Add README for Tekton examples
Add yaml with PipelineRun

* Fix README

* Remove istio annotation

* Fix comment
---
 examples/v1beta1/tekton/README.md             | 40 ++++++++
 examples/v1beta1/tekton/pipeline-run.yaml     | 96 +++++++++++++++++++
 .../katib-controller/katib-controller.yaml    |  1 +
 manifests/v1beta1/katib-controller/rbac.yaml  |  6 ++
 4 files changed, 143 insertions(+)
 create mode 100644 examples/v1beta1/tekton/README.md
 create mode 100644 examples/v1beta1/tekton/pipeline-run.yaml

diff --git a/examples/v1beta1/tekton/README.md b/examples/v1beta1/tekton/README.md
new file mode 100644
index 00000000000..2caa44867cf
--- /dev/null
+++ b/examples/v1beta1/tekton/README.md
@@ -0,0 +1,40 @@
+# Katib examples with Tekton integration
+
+Here you can find examples of using Katib with [Tekton](https://github.com/tektoncd/pipeline).
+
+See the [Tekton installation guide](https://github.com/tektoncd/pipeline/blob/master/docs/install.md#installing-tekton-pipelines-on-kubernetes)
+for how to install Tekton on your cluster.
+
+**Note** that you must modify the Tekton [`nop`](https://github.com/tektoncd/pipeline/tree/master/cmd/nop)
+image to run Tekton pipelines. The `nop` image is used to stop sidecar containers after the main container
+has completed, but the metrics collector sidecar must not be stopped when the training container finishes.
+To avoid this problem, set the `nop` image to the metrics collector sidecar image.
+
+For example, if you are using the
+[StdOut](https://www.kubeflow.org/docs/components/hyperparameter-tuning/experiment/#metrics-collector) metrics collector,
+the `nop` image must be equal to `gcr.io/kubeflow-images-public/katib/v1beta1/file-metrics-collector`.
+
+After deploying Tekton on your cluster, run the command below to modify the `nop` image:
+
+```bash
+kubectl patch deploy tekton-pipelines-controller -n tekton-pipelines --type='json' \
+  -p='[{"op": "replace", "path": "/spec/template/spec/containers/0/args/9", "value": "gcr.io/kubeflow-images-public/katib/v1beta1/file-metrics-collector"}]'
+```
+
+Check that Tekton controller's pod was restarted:
+
+```bash
+$ kubectl get pods -n tekton-pipelines
+
+NAME                                           READY   STATUS    RESTARTS   AGE
+tekton-pipelines-controller-7fcb6c6cd4-p8zf2   1/1     Running   0          2m2s
+tekton-pipelines-webhook-7f9888f9b-7d6mr       1/1     Running   0          12h
+```
+
+Check that `nop` image was modified:
+
+```bash
+$ kubectl get pod <tekton-controller-pod-name> -n tekton-pipelines -o yaml | grep katib/v1beta1/file-metrics-collector
+
+   - gcr.io/kubeflow-images-public/katib/v1beta1/file-metrics-collector
+```
diff --git a/examples/v1beta1/tekton/pipeline-run.yaml b/examples/v1beta1/tekton/pipeline-run.yaml
new file mode 100644
index 00000000000..4a9d12bc335
--- /dev/null
+++ b/examples/v1beta1/tekton/pipeline-run.yaml
@@ -0,0 +1,96 @@
+# This example shows how you can use Tekton Pipelines in Katib, transfer parameters from one Task to another and run HP job.
+# It uses simple random algorithm and tunes only learning rate.
+# The Pipeline contains 2 Tasks: the first is data-preprocessing, the second is model-training.
+# The first Task shows how you can prepare your training data (here: simply divide the number of training examples) before running the HP job.
+# The number of training examples is transferred to the second Task.
+# The second Task is the actual training, into which the metrics collector sidecar is injected.
+# Note that for this example Tekton controller's nop image must be equal to StdOut metrics collector image.
+apiVersion: "kubeflow.org/v1beta1"
+kind: Experiment  # Katib Experiment whose trial template (trialSpec below) is a Tekton PipelineRun
+metadata:
+  namespace: kubeflow
+  name: tekton-pipeline-run
+spec:
+  objective:
+    type: maximize
+    goal: 0.99
+    objectiveMetricName: Validation-accuracy
+    additionalMetricNames:
+      - Train-accuracy
+  algorithm:
+    algorithmName: random
+  parallelTrialCount: 2
+  maxTrialCount: 4
+  maxFailedTrialCount: 3
+  parameters:
+    - name: lr  # the only tuned parameter; referenced by trialParameters below
+      parameterType: double
+      feasibleSpace:
+        min: "0.01"
+        max: "0.03"
+  trialTemplate:
+    retain: true
+    primaryPodLabels:
+      tekton.dev/pipelineTask: model-training  # same value as the second task's name in pipelineSpec.tasks
+    primaryContainerName: step-model-training  # NOTE(review): presumably "step-" + the step name "model-training" - confirm
+    successCondition: status.conditions.#(type=="Succeeded")#|#(status=="True")#
+    failureCondition: status.conditions.#(type=="Succeeded")#|#(status=="False")#
+    trialParameters:
+      - name: learningRate  # placeholder used as ${trialParameters.learningRate} in trialSpec
+        description: Learning rate for the training model
+        reference: lr  # name of the entry under spec.parameters
+    trialSpec:
+      apiVersion: tekton.dev/v1beta1
+      kind: PipelineRun
+      spec:
+        params:
+          - name: lr
+            value: ${trialParameters.learningRate}
+          - name: num-examples-init
+            value: "60000"  # initial example count; the data-preprocessing task derives a smaller one from it
+        pipelineSpec:
+          params:
+            - name: lr
+              description: Learning rate for the training model
+            - name: num-examples-init
+              description: Initial value for number of training examples
+          tasks:
+            - name: data-preprocessing  # first task: computes the number of training examples to use
+              params:
+                - name: num-examples-pre
+                  value: $(params.num-examples-init)
+              taskSpec:
+                params:
+                  - name: num-examples-pre
+                    description: Number of training examples before optimization
+                results:
+                  - name: num-examples-post  # read by the model-training task's num-examples param
+                    description: Number of training examples after optimization
+                steps:
+                  - name: num-examples-optimize
+                    image: python:alpine3.6
+                    command:
+                      - sh
+                      - -c
+                    args:  # integer-divides num-examples-pre by a random int in 10..100; tee writes the output to the result path
+                      - python3 -c "import random; print($(params.num-examples-pre)//random.randint(10,100),end='')" | tee $(results.num-examples-post.path)
+            - name: model-training  # second task: runs the training step with lr and the preprocessed example count
+              params:
+                - name: lr
+                  value: $(params.lr)
+                - name: num-examples
+                  value: $(tasks.data-preprocessing.results.num-examples-post)  # result of the first task
+              taskSpec:
+                params:
+                  - name: lr
+                    description: Learning rate for the training model
+                  - name: num-examples
+                    description: Number of training examples
+                steps:
+                  - name: model-training
+                    image: docker.io/kubeflowkatib/mxnet-mnist
+                    command:
+                      - "python3"
+                      - "/opt/mxnet-mnist/mnist.py"
+                      - "--num-examples=$(params.num-examples)"
+                      - "--lr=$(params.lr)"
diff --git a/manifests/v1beta1/katib-controller/katib-controller.yaml b/manifests/v1beta1/katib-controller/katib-controller.yaml
index 4dfb01fd445..4d362383bcc 100644
--- a/manifests/v1beta1/katib-controller/katib-controller.yaml
+++ b/manifests/v1beta1/katib-controller/katib-controller.yaml
@@ -29,6 +29,7 @@ spec:
             - "--trial-resources=TFJob.v1.kubeflow.org"
             - "--trial-resources=PyTorchJob.v1.kubeflow.org"
             - "--trial-resources=MPIJob.v1.kubeflow.org"
+            - "--trial-resources=PipelineRun.v1beta1.tekton.dev"
           ports:
             - containerPort: 8443
               name: webhook
diff --git a/manifests/v1beta1/katib-controller/rbac.yaml b/manifests/v1beta1/katib-controller/rbac.yaml
index dc12f9f3db7..a902f47faf8 100644
--- a/manifests/v1beta1/katib-controller/rbac.yaml
+++ b/manifests/v1beta1/katib-controller/rbac.yaml
@@ -73,6 +73,12 @@ rules:
       - mpijobs
     verbs:
       - "*"
+  - apiGroups:
+      - tekton.dev
+    resources:
+      - pipelineruns
+    verbs:
+      - "*"
 ---
 apiVersion: v1
 kind: ServiceAccount