diff --git a/examples/v1beta1/tekton/README.md b/examples/v1beta1/tekton/README.md new file mode 100644 index 00000000000..d1eb9ca90cf --- /dev/null +++ b/examples/v1beta1/tekton/README.md @@ -0,0 +1,41 @@ +# Katib examples with Tekton integration + +Here you can find examples of using Katib with [Tekton](https://github.com/tektoncd/pipeline). +Check [here](https://github.com/tektoncd/pipeline/blob/master/docs/install.md#installing-tekton-pipelines-on-kubernetes) how to install Tekton on your cluster. + +**Note** that you must modify the Tekton [`nop`](https://github.com/tektoncd/pipeline/tree/master/cmd/nop) image to run Tekton pipelines. The `nop` image is used to stop sidecar containers after the main container has completed. The metrics collector must not be stopped after the training container has finished. To avoid this problem, the `nop` image should be equal to the metrics collector sidecar image. + +For example, if you are using the [StdOut](https://www.kubeflow.org/docs/components/hyperparameter-tuning/experiment/#metrics-collector) metrics collector, the `nop` image must be equal to `gcr.io/kubeflow-images-public/katib/v1beta1/file-metrics-collector`. + +After deploying Tekton on your cluster, run the command below to modify the `nop` image. 
+ +```bash +kubectl patch deploy tekton-pipelines-controller -n tekton-pipelines --type='json' \ +-p='[{"op": "replace", "path": "/spec/template/spec/containers/0/args/9", "value": "gcr.io/kubeflow-images-public/katib/v1beta1/file-metrics-collector"}]' +``` + +Check that the Tekton controller's pod was restarted: + +``` +kubectl get pods -n tekton-pipelines +``` + +Expected output: + +``` +NAME READY STATUS RESTARTS AGE +tekton-pipelines-controller-7fcb6c6cd4-p8zf2 1/1 Running 0 2m2s +tekton-pipelines-webhook-7f9888f9b-7d6mr 1/1 Running 0 12h +``` + +Check that the `nop` image was modified: + +``` +kubectl get pod <tekton-pipelines-controller-pod-name> -n tekton-pipelines -o yaml | grep katib/v1beta1/file-metrics-collector +``` + +Expected output: + +``` +- gcr.io/kubeflow-images-public/katib/v1beta1/file-metrics-collector +``` diff --git a/examples/v1beta1/tekton/pipeline-run.yaml b/examples/v1beta1/tekton/pipeline-run.yaml new file mode 100644 index 00000000000..dfe0de8b350 --- /dev/null +++ b/examples/v1beta1/tekton/pipeline-run.yaml @@ -0,0 +1,104 @@ +# This example shows how you can use Tekton Pipelines in Katib. +# The PipelineRun shows how you can pass parameters from one Task to another and run a hyperparameter (HP) tuning job. +# It uses the simple random algorithm and tunes only the learning rate. +# The Pipeline contains 2 Tasks: the first is data-preprocessing, the second is model-training. +# The first Task shows how you can prepare your training data (it simply divides the number of training examples by a random integer) before running the HP job. +# The resulting number of examples is passed to the second Task. +# The second Task is the actual training, into which the metrics collector sidecar is injected. +# Note that for this example the Tekton controller's nop image must be equal to the StdOut metrics collector image. 
+apiVersion: "kubeflow.org/v1beta1" +kind: Experiment +metadata: + namespace: kubeflow + name: tekton-pipeline-run +spec: + objective: + type: maximize + goal: 0.99 + objectiveMetricName: Validation-accuracy + additionalMetricNames: + - Train-accuracy + algorithm: + algorithmName: random + parallelTrialCount: 2 + maxTrialCount: 4 + maxFailedTrialCount: 3 + parameters: + - name: lr + parameterType: double + feasibleSpace: + min: "0.01" + max: "0.03" + trialTemplate: + retain: true + primaryPodLabels: + tekton.dev/pipelineTask: model-training + primaryContainerName: step-model-training + successCondition: status.conditions.#(type=="Succeeded")#|#(status=="True")# + failureCondition: status.conditions.#(type=="Succeeded")#|#(status=="False")# + trialParameters: + - name: learningRate + description: Learning rate for the training model + reference: lr + trialSpec: + apiVersion: tekton.dev/v1beta1 + kind: PipelineRun + spec: + params: + - name: lr + value: ${trialParameters.learningRate} + - name: num-examples-init + value: "60000" + pipelineSpec: + params: + - name: lr + description: Learning rate for the training model + - name: num-examples-init + description: Initial value for number of training examples + tasks: + - name: data-preprocessing + params: + - name: num-examples-pre + value: $(params.num-examples-init) + taskSpec: + metadata: + annotations: + sidecar.istio.io/inject: "false" + params: + - name: num-examples-pre + description: Number of training examples before optimization + results: + - name: num-examples-post + description: Number of training examples after optimization + steps: + - name: num-examples-optimize + image: python:alpine3.6 + command: + - sh + - -c + args: + - python3 -c "import random; print($(params.num-examples-pre)//random.randint(10,100),end='')" | tee $(results.num-examples-post.path) + - name: model-training + params: + - name: lr + value: $(params.lr) + - name: num-examples + value: 
$(tasks.data-preprocessing.results.num-examples-post) + taskSpec: + metadata: + annotations: + sidecar.istio.io/inject: "false" + params: + - name: lr + description: Learning rate for the training model + - name: num-examples + description: Number of training examples + steps: + - name: model-training + image: docker.io/kubeflowkatib/mxnet-mnist + command: + - "python3" + - "/opt/mxnet-mnist/mnist.py" + - "--batch-size=64" + - "--num-examples=$(params.num-examples)" + - "--lr=$(params.lr)" diff --git a/manifests/v1beta1/katib-controller/katib-controller.yaml b/manifests/v1beta1/katib-controller/katib-controller.yaml index 779f255bad5..c0e7cd4f944 100644 --- a/manifests/v1beta1/katib-controller/katib-controller.yaml +++ b/manifests/v1beta1/katib-controller/katib-controller.yaml @@ -25,6 +25,7 @@ spec: command: ["./katib-controller"] args: - "--webhook-port=8443" + - "--trial-resources=PipelineRun.v1beta1.tekton.dev" ports: - containerPort: 8443 name: webhook diff --git a/manifests/v1beta1/katib-controller/rbac.yaml b/manifests/v1beta1/katib-controller/rbac.yaml index dec9b5373f4..f9ee2ba1e36 100644 --- a/manifests/v1beta1/katib-controller/rbac.yaml +++ b/manifests/v1beta1/katib-controller/rbac.yaml @@ -72,6 +72,12 @@ rules: - pytorchjobs verbs: - "*" + - apiGroups: + - tekton.dev + resources: + - pipelineruns + verbs: + - "*" --- apiVersion: v1 kind: ServiceAccount