From 98e6141a81f3d216ada8e3cd8564c7067c3075ef Mon Sep 17 00:00:00 2001 From: avelichk Date: Tue, 3 Aug 2021 23:55:04 +0100 Subject: [PATCH 1/7] Add Support for Argo Workflows --- examples/v1beta1/argo/README.md | 45 ++++++++++ examples/v1beta1/argo/argo-workflow.yaml | 83 +++++++++++++++++++ .../components/controller/controller.yaml | 1 + .../v1beta1/components/controller/rbac.yaml | 6 ++ 4 files changed, 135 insertions(+) create mode 100644 examples/v1beta1/argo/README.md create mode 100644 examples/v1beta1/argo/argo-workflow.yaml diff --git a/examples/v1beta1/argo/README.md b/examples/v1beta1/argo/README.md new file mode 100644 index 00000000000..5971e5ae95c --- /dev/null +++ b/examples/v1beta1/argo/README.md @@ -0,0 +1,45 @@ +# Katib examples with Argo Workflows integration + +Here you can find examples of using Katib with [Argo Workflows](https://github.com/argoproj/argo-workflows). +**Note**:: You have to install Argo Workflows >= `v3.1` to use it in Katib Experiments. + +## Installation + +To deploy Argo Workflows `v3.1.3`, run the following commands: + +```bash +kubectl create namespace argo +kubectl apply -n argo -f https://github.com/argoproj/argo-workflows/releases/download/v3.1.3/install.yaml +``` + +Check that Argo Workflow components are running: + +```bash +$ kubectl get pods -n argo + +``` + +After that, run bellow command to enable +[Katib Metrics Collector sidecar injection](https://www.kubeflow.org/docs/components/katib/experiment/#metrics-collector): + +```bash +kubectl patch namespace argo -p '{"metadata":{"labels":{"katib-metricscollector-injection":"enabled"}}}' +``` + +**Note**: Argo Workflows is using `docker` as a +[default container runtime executor](https://argoproj.github.io/argo-workflows/workflow-executors/#workflow-executors). +Since Katib is using Metrics Collector sidecar container, you should modify this +executor to [`emissary`](https://argoproj.github.io/argo-workflows/workflow-executors/#emissary-emissary). 
+ +Run the following command to change the `containerRuntimeExecutor` to `emissary` in the +Argo `workflow-controller-configmap`. + +```bash +kubectl patch ConfigMap -n argo workflow-controller-configmap --type='merge' -p='{"data":{"containerRuntimeExecutor":"emissary"}}' +``` + +Verify that `containerRuntimeExecutor` has been modified: + +```bash +kubectl get ConfigMap -n argo workflow-controller-configmap -o yaml | grep containerRuntimeExecutor +``` diff --git a/examples/v1beta1/argo/argo-workflow.yaml b/examples/v1beta1/argo/argo-workflow.yaml new file mode 100644 index 00000000000..a5a37f7f82e --- /dev/null +++ b/examples/v1beta1/argo/argo-workflow.yaml @@ -0,0 +1,83 @@ +# This example shows how you can use Argo Workflows in Katib, transfer parameters from one Step to another and run HP job. +# It uses simple random algorithm and tunes only learning rate. +# Workflow contains 2 Steps, first is data-preprocessing second is model-training. +# First Step shows how you can prepare your training data (here: simply divide number of training examples) before running HP job. +# Number of training examples is transferred to the second Step. +# Second Step is the actual training, into which the metrics collector sidecar is injected. +# Note that for this example Argo Container Runtime Executor must be "emissary". 
+apiVersion: kubeflow.org/v1beta1 +kind: Experiment +metadata: + namespace: argo + name: katib-argo-workflow +spec: + objective: + type: maximize + goal: 0.99 + objectiveMetricName: Validation-accuracy + additionalMetricNames: + - Train-accuracy + algorithm: + algorithmName: random + parallelTrialCount: 2 + maxTrialCount: 5 + maxFailedTrialCount: 1 + parameters: + - name: lr + parameterType: double + feasibleSpace: + min: "0.01" + max: "0.03" + trialTemplate: + retain: true + primaryPodLabels: + katib.kubeflow.org/model-training: "true" + primaryContainerName: main + successCondition: status.[@this].#(phase=="Succeeded")# + failureCondition: status.[@this].#(phase=="Failed")# + trialParameters: + - name: learningRate + description: Learning rate for the training model + reference: lr + trialSpec: + apiVersion: argoproj.io/v1alpha1 + kind: Workflow + spec: + serviceAccountName: argo + entrypoint: hp-workflow + templates: + - name: hp-workflow + steps: + - - name: data-preprocessing + template: gen-num-examples + - - name: model-training + template: model-training + arguments: + parameters: + - name: num-examples + value: "{{steps.data-preprocessing.outputs.result}}" + + - name: gen-num-examples + script: + image: python:alpine3.6 + command: + - python + source: | + import random + print(60000//random.randint(10, 100)) + + - name: model-training + metadata: + labels: + katib.kubeflow.org/model-training: "true" + inputs: + parameters: + - name: num-examples + container: + name: model-training + image: docker.io/kubeflowkatib/mxnet-mnist:v1beta1-45c5727 + command: + - "python3" + - "/opt/mxnet-mnist/mnist.py" + - "--lr=${trialParameters.learningRate}" + - "--num-examples={{inputs.parameters.num-examples}}" diff --git a/manifests/v1beta1/components/controller/controller.yaml b/manifests/v1beta1/components/controller/controller.yaml index 23738828865..4362d6d89a0 100644 --- a/manifests/v1beta1/components/controller/controller.yaml +++ 
b/manifests/v1beta1/components/controller/controller.yaml @@ -33,6 +33,7 @@ spec: # TODO (andreyvelich): Change to v1.kubeflow.org once all-in-one operator is finished. - "--trial-resources=XGBoostJob.v1.xgboostjob.kubeflow.org" - "--trial-resources=PipelineRun.v1beta1.tekton.dev" + - "--trial-resources=Workflow.v1alpha1.argoproj.io" ports: - containerPort: 8443 name: webhook diff --git a/manifests/v1beta1/components/controller/rbac.yaml b/manifests/v1beta1/components/controller/rbac.yaml index 73d1ba3b05d..d125c0b10fa 100644 --- a/manifests/v1beta1/components/controller/rbac.yaml +++ b/manifests/v1beta1/components/controller/rbac.yaml @@ -69,6 +69,12 @@ rules: - taskruns verbs: - "*" + - apiGroups: + - argoproj.io + resources: + - workflows + verbs: + - "*" --- apiVersion: v1 kind: ServiceAccount From 6ad77d6ac39f5ff283610d1064c293536d1dc468 Mon Sep 17 00:00:00 2001 From: avelichk Date: Wed, 4 Aug 2021 00:18:19 +0100 Subject: [PATCH 2/7] Few changes in README --- docs/new-algorithm-service.md | 2 +- docs/presentations.md | 2 +- examples/v1beta1/argo/README.md | 21 ++++++++++++------- .../nas/darts-cnn-cifar10/architect.py | 2 +- examples/v1beta1/tekton/README.md | 2 +- 5 files changed, 18 insertions(+), 11 deletions(-) diff --git a/docs/new-algorithm-service.md b/docs/new-algorithm-service.md index 2ad10de7049..9ac6ce599d1 100644 --- a/docs/new-algorithm-service.md +++ b/docs/new-algorithm-service.md @@ -150,7 +150,7 @@ You can setup the GRPC server using `grpc_testing`, then define your own test ca #### E2E Test (Optional) E2e tests help Katib verify that the algorithm works well. -Follow bellow steps to add your algorithm (Suggestion) to the Katib CI +Follow below steps to add your algorithm (Suggestion) to the Katib CI (replace `` with your Suggestion name): 1. 
Submit a PR to add a new ECR private registry to the AWS diff --git a/docs/presentations.md b/docs/presentations.md index 759aff21508..4703fb60d93 100644 --- a/docs/presentations.md +++ b/docs/presentations.md @@ -1,6 +1,6 @@ # Katib Presentations and Demos -Bellow are the list of Katib presentations and demos. If you want to add your +Below is the list of Katib presentations and demos. If you want to add your presentation or demo in this list please send a pull request. Please keep the list in reverse chronological order. diff --git a/examples/v1beta1/argo/README.md b/examples/v1beta1/argo/README.md index 5971e5ae95c..190783818a2 100644 --- a/examples/v1beta1/argo/README.md +++ b/examples/v1beta1/argo/README.md @@ -1,7 +1,8 @@ -# Katib examples with Argo Workflows integration +# Katib Examples with Argo Workflows Integration Here you can find examples of using Katib with [Argo Workflows](https://github.com/argoproj/argo-workflows). -**Note**:: You have to install Argo Workflows >= `v3.1` to use it in Katib Experiments. + +**Note:** You have to install `Argo >= v3.1` to use it in Katib Experiments. ## Installation @@ -17,22 +18,26 @@ Check that Argo Workflow components are running: ```bash $ kubectl get pods -n argo +NAME READY STATUS RESTARTS AGE +argo-server-5bbd69cc6b-6nvb6 1/1 Running 0 20s +workflow-controller-5f48fb7c8-vw9bp 1/1 Running 0 20s ``` -After that, run bellow command to enable +After that, run below command to enable [Katib Metrics Collector sidecar injection](https://www.kubeflow.org/docs/components/katib/experiment/#metrics-collector): ```bash kubectl patch namespace argo -p '{"metadata":{"labels":{"katib-metricscollector-injection":"enabled"}}}' ``` -**Note**: Argo Workflows is using `docker` as a +**Note:** Argo Workflows is using `docker` as a [default container runtime executor](https://argoproj.github.io/argo-workflows/workflow-executors/#workflow-executors). 
-Since Katib is using Metrics Collector sidecar container, you should modify this +Since Katib is using Metrics Collector sidecar container and Argo Workflows controller +should not kill sidecar containers, you have to modify this executor to [`emissary`](https://argoproj.github.io/argo-workflows/workflow-executors/#emissary-emissary). Run the following command to change the `containerRuntimeExecutor` to `emissary` in the -Argo `workflow-controller-configmap`. +Argo `workflow-controller-configmap` ```bash kubectl patch ConfigMap -n argo workflow-controller-configmap --type='merge' -p='{"data":{"containerRuntimeExecutor":"emissary"}}' @@ -41,5 +46,7 @@ kubectl patch ConfigMap -n argo workflow-controller-configmap --type='merge' -p= Verify that `containerRuntimeExecutor` has been modified: ```bash -kubectl get ConfigMap -n argo workflow-controller-configmap -o yaml | grep containerRuntimeExecutor +$ kubectl get ConfigMap -n argo workflow-controller-configmap -o yaml | grep containerRuntimeExecutor + + containerRuntimeExecutor: emissary ``` diff --git a/examples/v1beta1/nas/darts-cnn-cifar10/architect.py b/examples/v1beta1/nas/darts-cnn-cifar10/architect.py index 65217706af3..e18c983ecdf 100644 --- a/examples/v1beta1/nas/darts-cnn-cifar10/architect.py +++ b/examples/v1beta1/nas/darts-cnn-cifar10/architect.py @@ -33,7 +33,7 @@ def virtual_step(self, train_x, train_y, xi, w_optim): gradients = torch.autograd.grad(loss, self.model.getWeights()) # Do virtual step (Update gradient) - # Bellow opeartions do not need gradient tracking + # Below operations do not need gradient tracking with torch.no_grad(): # dict key is not the value, but the pointer. So original network weight have to # be iterated also. 
diff --git a/examples/v1beta1/tekton/README.md b/examples/v1beta1/tekton/README.md index b5ecacd1e71..5394d511f25 100644 --- a/examples/v1beta1/tekton/README.md +++ b/examples/v1beta1/tekton/README.md @@ -14,7 +14,7 @@ For example, if you are using [StdOut](https://www.kubeflow.org/docs/components/katib/experiment/#metrics-collector) metrics collector, `nop` image must be equal to `docker.io/kubeflowkatib/file-metrics-collector`. -After deploying Tekton on your cluster, run bellow command to modify `nop` image: +After deploying Tekton on your cluster, run below command to modify `nop` image: ```bash kubectl patch deploy tekton-pipelines-controller -n tekton-pipelines --type='json' \ From 97ddd158359a45319a759587525749e3f8a7d459 Mon Sep 17 00:00:00 2001 From: avelichk Date: Wed, 4 Aug 2021 00:29:25 +0100 Subject: [PATCH 3/7] Add Argo to README --- README.md | 2 ++ examples/v1beta1/argo/README.md | 2 +- examples/v1beta1/argo/argo-workflow.yaml | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a5291fcb6d9..b6fc2bc1372 100644 --- a/README.md +++ b/README.md @@ -129,6 +129,8 @@ Katib has these CRD examples in upstream: - [Tekton `Pipeline`](https://github.com/tektoncd/pipeline) +- [Argo `Workflows`](https://github.com/argoproj/argo-workflows) + Thus, Katib supports multiple frameworks with the help of different job kinds. ### Search Algorithms diff --git a/examples/v1beta1/argo/README.md b/examples/v1beta1/argo/README.md index 190783818a2..9625ac73689 100644 --- a/examples/v1beta1/argo/README.md +++ b/examples/v1beta1/argo/README.md @@ -30,7 +30,7 @@ After that, run below command to enable kubectl patch namespace argo -p '{"metadata":{"labels":{"katib-metricscollector-injection":"enabled"}}}' ``` -**Note:** Argo Workflows is using `docker` as a +**Note:** Argo Workflows are using `docker` as a [default container runtime executor](https://argoproj.github.io/argo-workflows/workflow-executors/#workflow-executors). 
Since Katib is using Metrics Collector sidecar container and Argo Workflows controller should not kill sidecar containers, you have to modify this diff --git a/examples/v1beta1/argo/argo-workflow.yaml b/examples/v1beta1/argo/argo-workflow.yaml index a5a37f7f82e..fcc105552c4 100644 --- a/examples/v1beta1/argo/argo-workflow.yaml +++ b/examples/v1beta1/argo/argo-workflow.yaml @@ -1,5 +1,5 @@ # This example shows how you can use Argo Workflows in Katib, transfer parameters from one Step to another and run HP job. -# It uses simple random algorithm and tunes only learning rate. +# It uses a simple random algorithm and tunes only learning rate. # Workflow contains 2 Steps, first is data-preprocessing second is model-training. # First Step shows how you can prepare your training data (here: simply divide number of training examples) before running HP job. # Number of training examples is transferred to the second Step. From f1fc0d051eecf231d6cb9b830e3c9caed372375e Mon Sep 17 00:00:00 2001 From: avelichk Date: Wed, 4 Aug 2021 16:28:32 +0100 Subject: [PATCH 4/7] Remove Argo access from Katib manifests --- examples/v1beta1/argo/README.md | 49 ++++++++++++++++++- .../components/controller/controller.yaml | 2 - .../v1beta1/components/controller/rbac.yaml | 13 ----- 3 files changed, 48 insertions(+), 16 deletions(-) diff --git a/examples/v1beta1/argo/README.md b/examples/v1beta1/argo/README.md index 9625ac73689..6c479b2a030 100644 --- a/examples/v1beta1/argo/README.md +++ b/examples/v1beta1/argo/README.md @@ -2,10 +2,12 @@ Here you can find examples of using Katib with [Argo Workflows](https://github.com/argoproj/argo-workflows). -**Note:** You have to install `Argo >= v3.1` to use it in Katib Experiments. +**Note:** You have to install `Argo >= v3.1.3` to use it in Katib Experiments. 
## Installation +### Argo Workflow + To deploy Argo Workflows `v3.1.3`, run the following commands: ```bash @@ -50,3 +52,48 @@ $ kubectl get ConfigMap -n argo workflow-controller-configmap -o yaml | grep con containerRuntimeExecutor: emissary ``` + +### Katib Controller + +To run Argo Workflow within Katib Trials you have to update Katib +[ClusterRole's rules](https://github.com/kubeflow/katib/blob/master/manifests/v1beta1/components/controller/rbac.yaml#L5) +with appropriate permission: + +```yaml +- apiGroups: + - argoproj.io + resources: + - workflows + verbs: + - "*" +``` + +Run the following command to update Katib ClusterRole: + +```bash +kubectl patch ClusterRole katib-controller -n kubeflow --type=json \ + -p='[{"op": "add", "path": "/rules/-", "value": {"apiGroups":["argoproj.io"],"resources":["workflows"],"verbs":["*"]}}]' +``` + +In addition to that, you have to modify Katib +[Controller args](https://github.com/kubeflow/katib/blob/master/manifests/v1beta1/components/controller/controller.yaml#L27) +with the new flag `--trial-resources`. 
+ +Run the following command to update Katib Controller args: + +```bash +kubectl patch Deployment katib-controller -n kubeflow --type=json \ + -p='[{"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--trial-resources=Workflow.v1alpha1.argoproj.io"}]' +``` + +After these changes, check logs from Katib controller to verify Argo Workflow integration: + +```bash +kubectl logs $(kubectl get pods -n kubeflow -o name | grep katib-controller) -n kubeflow +``` + +Expected output: + +```shell +{"level":"info","ts":1628032648.6285546,"logger":"trial-controller","msg":"Job watch added successfully","CRD Group":"argoproj.io","CRD Version":"v1alpha1","CRD Kind":"Workflow"} +``` diff --git a/manifests/v1beta1/components/controller/controller.yaml b/manifests/v1beta1/components/controller/controller.yaml index 4362d6d89a0..7c2a5c876b7 100644 --- a/manifests/v1beta1/components/controller/controller.yaml +++ b/manifests/v1beta1/components/controller/controller.yaml @@ -32,8 +32,6 @@ spec: - "--trial-resources=MPIJob.v1.kubeflow.org" # TODO (andreyvelich): Change to v1.kubeflow.org once all-in-one operator is finished. 
- "--trial-resources=XGBoostJob.v1.xgboostjob.kubeflow.org" - - "--trial-resources=PipelineRun.v1beta1.tekton.dev" - - "--trial-resources=Workflow.v1alpha1.argoproj.io" ports: - containerPort: 8443 name: webhook diff --git a/manifests/v1beta1/components/controller/rbac.yaml b/manifests/v1beta1/components/controller/rbac.yaml index d125c0b10fa..9b4ba763b1f 100644 --- a/manifests/v1beta1/components/controller/rbac.yaml +++ b/manifests/v1beta1/components/controller/rbac.yaml @@ -62,19 +62,6 @@ rules: - xgboostjobs verbs: - "*" - - apiGroups: - - tekton.dev - resources: - - pipelineruns - - taskruns - verbs: - - "*" - - apiGroups: - - argoproj.io - resources: - - workflows - verbs: - - "*" --- apiVersion: v1 kind: ServiceAccount From 5d0779b55d5134e06696e26fccf0ae236ec582c9 Mon Sep 17 00:00:00 2001 From: avelichk Date: Wed, 4 Aug 2021 17:06:03 +0100 Subject: [PATCH 5/7] Remove Tekton access from Katib manifests --- README.md | 4 +- examples/v1beta1/argo/README.md | 24 ++++++-- examples/v1beta1/tekton/README.md | 95 ++++++++++++++++++++++++++++--- 3 files changed, 107 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index b6fc2bc1372..83a694450d8 100644 --- a/README.md +++ b/README.md @@ -127,9 +127,9 @@ Katib has these CRD examples in upstream: - [Kubeflow `XGBoostJob`](https://github.com/kubeflow/xgboost-operator) -- [Tekton `Pipeline`](https://github.com/tektoncd/pipeline) +- [Tekton `Pipeline`](./examples/v1beta1/tekton) -- [Argo `Workflows`](https://github.com/argoproj/argo-workflows) +- [Argo `Workflows`](./examples/v1beta1/argo) Thus, Katib supports multiple frameworks with the help of different job kinds. 
diff --git a/examples/v1beta1/argo/README.md b/examples/v1beta1/argo/README.md index 6c479b2a030..c375f8df67a 100644 --- a/examples/v1beta1/argo/README.md +++ b/examples/v1beta1/argo/README.md @@ -57,7 +57,7 @@ $ kubectl get ConfigMap -n argo workflow-controller-configmap -o yaml | grep con To run Argo Workflow within Katib Trials you have to update Katib [ClusterRole's rules](https://github.com/kubeflow/katib/blob/master/manifests/v1beta1/components/controller/rbac.yaml#L5) -with appropriate permission: +with the appropriate permission: ```yaml - apiGroups: @@ -86,14 +86,28 @@ kubectl patch Deployment katib-controller -n kubeflow --type=json \ -p='[{"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--trial-resources=Workflow.v1alpha1.argoproj.io"}]' ``` -After these changes, check logs from Katib controller to verify Argo Workflow integration: +Check that Katib Controller's pod was restarted: ```bash -kubectl logs $(kubectl get pods -n kubeflow -o name | grep katib-controller) -n kubeflow +$ kubectl get pods -n kubeflow + +NAME READY STATUS RESTARTS AGE +katib-cert-generator-hnv6q 0/1 Completed 0 6m12s +katib-controller-784994d449-9bgj9 1/1 Running 0 28s +katib-db-manager-78697c7bd4-ck7l8 1/1 Running 0 6m13s +katib-mysql-854cdb87c4-krcm9 1/1 Running 0 6m13s +katib-ui-57b9d7f6dd-cv6gn 1/1 Running 0 6m13s ``` -Expected output: +Check logs from Katib Controller to verify Argo Workflow integration: + +```bash +$ kubectl logs $(kubectl get pods -n kubeflow -o name | grep katib-controller) -n kubeflow | grep '"CRD Kind":"Workflow"' -```shell {"level":"info","ts":1628032648.6285546,"logger":"trial-controller","msg":"Job watch added successfully","CRD Group":"argoproj.io","CRD Version":"v1alpha1","CRD Kind":"Workflow"} ``` + +If you ran the above steps successfully, you should be able to run Argo Workflow examples. 
+ +Learn more about using custom Kubernetes resource as a Trial template in the +[official Kubeflow guides](https://www.kubeflow.org/docs/components/katib/trial-template/#use-custom-kubernetes-resource-as-a-trial-template) diff --git a/examples/v1beta1/tekton/README.md b/examples/v1beta1/tekton/README.md index 5394d511f25..89cec8de8e0 100644 --- a/examples/v1beta1/tekton/README.md +++ b/examples/v1beta1/tekton/README.md @@ -1,12 +1,29 @@ -# Katib examples with Tekton integration +# Katib Examples with Tekton Integration Here you can find examples of using Katib with [Tekton](https://github.com/tektoncd/pipeline). -Check [here](https://github.com/tektoncd/pipeline/blob/master/docs/install.md#installing-tekton-pipelines-on-kubernetes) -how to install Tekton on your cluster. +## Installation -**Note** that you must modify Tekton [`nop`](https://github.com/tektoncd/pipeline/tree/master/cmd/nop) -image to run Tekton pipelines. `Nop` image is used to stop sidecar containers after main container +### Tekton Pipelines + +To deploy Tekton Pipelines `v0.26.0`, run the following command: + +```bash +kubectl apply -f https://storage.googleapis.com/tekton-releases/pipeline/previous/v0.26.0/release.yaml +``` + +Check that Tekton Pipelines components are running: + +```bash +$ kubectl get pods -n tekton-pipelines + +NAME READY STATUS RESTARTS AGE +tekton-pipelines-controller-799cdc78fc-sm4vl 1/1 Running 0 50s +tekton-pipelines-webhook-79d8f4f9bc-qmk97 1/1 Running 0 50s +``` + +**Note:** You must modify Tekton [`nop`](https://github.com/tektoncd/pipeline/tree/master/cmd/nop) +image to run Tekton Pipelines. `Nop` image is used to stop sidecar containers after main container is completed. Metrics collector should not be stopped after training container is finished. To avoid this problem, set `nop` image to metrics collector sidecar image. 
@@ -14,27 +31,87 @@ For example, if you are using [StdOut](https://www.kubeflow.org/docs/components/katib/experiment/#metrics-collector) metrics collector, `nop` image must be equal to `docker.io/kubeflowkatib/file-metrics-collector`. -After deploying Tekton on your cluster, run below command to modify `nop` image: +Run the following command to modify the `nop` image: ```bash kubectl patch deploy tekton-pipelines-controller -n tekton-pipelines --type='json' \ -p='[{"op": "replace", "path": "/spec/template/spec/containers/0/args/9", "value": "docker.io/kubeflowkatib/file-metrics-collector"}]' ``` -Check that Tekton controller's pod was restarted: +Check that Tekton Controller's pod was restarted: ```bash $ kubectl get pods -n tekton-pipelines NAME READY STATUS RESTARTS AGE tekton-pipelines-controller-7fcb6c6cd4-p8zf2 1/1 Running 0 2m2s -tekton-pipelines-webhook-7f9888f9b-7d6mr 1/1 Running 0 12h +tekton-pipelines-webhook-7f9888f9b-7d6mr 1/1 Running 0 3m ``` -Check that `nop` image was modified: +Verify that `nop` image was modified: ```bash $ kubectl get $(kubectl get pods -o name -n tekton-pipelines | grep tekton-pipelines-controller) -n tekton-pipelines -o yaml | grep katib - docker.io/kubeflowkatib/file-metrics-collector ``` + +### Katib Controller + +To run Tekton Pipelines within Katib Trials you have to update Katib +[ClusterRole's rules](https://github.com/kubeflow/katib/blob/master/manifests/v1beta1/components/controller/rbac.yaml#L5) +with the appropriate permission: + +```yaml +- apiGroups: + - tekton.dev + resources: + - pipelineruns + - taskruns + verbs: + - "*" +``` + +Run the following command to update Katib ClusterRole: + +```bash +kubectl patch ClusterRole katib-controller -n kubeflow --type=json \ + -p='[{"op": "add", "path": "/rules/-", "value": {"apiGroups":["tekton.dev"],"resources":["pipelineruns", "taskruns"],"verbs":["*"]}}]' +``` + +In addition to that, you have to modify Katib +[Controller 
args](https://github.com/kubeflow/katib/blob/master/manifests/v1beta1/components/controller/controller.yaml#L27) +with the new flag `--trial-resources`. + +Run the following command to update Katib Controller args: + +```bash +kubectl patch Deployment katib-controller -n kubeflow --type=json \ + -p='[{"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--trial-resources=PipelineRun.v1beta1.tekton.dev"}]' +``` + +Check that Katib Controller's pod was restarted: + +```bash +$ kubectl get pods -n kubeflow + +NAME READY STATUS RESTARTS AGE +katib-cert-generator-hnv6q 0/1 Completed 0 6m12s +katib-controller-784994d449-9bgj9 1/1 Running 0 28s +katib-db-manager-78697c7bd4-ck7l8 1/1 Running 0 6m13s +katib-mysql-854cdb87c4-krcm9 1/1 Running 0 6m13s +katib-ui-57b9d7f6dd-cv6gn 1/1 Running 0 6m13s +``` + +Check logs from Katib Controller to verify Tekton Pipelines integration: + +```bash +$ kubectl logs $(kubectl get pods -n kubeflow -o name | grep katib-controller) -n kubeflow | grep '"CRD Kind":"PipelineRun"' + +{"level":"info","ts":1628032648.6285546,"logger":"trial-controller","msg":"Job watch added successfully","CRD Group":"tekton.dev","CRD Version":"v1beta1","CRD Kind":"PipelineRun"} +``` + +If you ran the above steps successfully, you should be able to run Tekton Pipelines examples. 
+ +Learn more about using custom Kubernetes resource as a Trial template in the +[official Kubeflow guides](https://www.kubeflow.org/docs/components/katib/trial-template/#use-custom-kubernetes-resource-as-a-trial-template) From 17455d7beaa2e3cab71381434393b9293d04831f Mon Sep 17 00:00:00 2001 From: avelichk Date: Wed, 4 Aug 2021 17:22:21 +0100 Subject: [PATCH 6/7] Few changes in README --- examples/v1beta1/tekton/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/v1beta1/tekton/README.md b/examples/v1beta1/tekton/README.md index 89cec8de8e0..f191bfeea98 100644 --- a/examples/v1beta1/tekton/README.md +++ b/examples/v1beta1/tekton/README.md @@ -24,11 +24,12 @@ tekton-pipelines-webhook-79d8f4f9bc-qmk97 1/1 Running 0 50s **Note:** You must modify Tekton [`nop`](https://github.com/tektoncd/pipeline/tree/master/cmd/nop) image to run Tekton Pipelines. `Nop` image is used to stop sidecar containers after main container -is completed. Metrics collector should not be stopped after training container is finished. -To avoid this problem, set `nop` image to metrics collector sidecar image. +is completed. Since Katib is using Metrics Collector sidecar container +and Tekton Pipelines controller should not kill sidecar containers, you have to +set this `nop` image to Metrics Collector image. For example, if you are using -[StdOut](https://www.kubeflow.org/docs/components/katib/experiment/#metrics-collector) metrics collector, +[StdOut](https://www.kubeflow.org/docs/components/katib/experiment/#metrics-collector) Metrics Collector, `nop` image must be equal to `docker.io/kubeflowkatib/file-metrics-collector`. 
Run the following command to modify the `nop` image: From c2ece2ac02f1baefcb2ef93444e0254cd4c4eb84 Mon Sep 17 00:00:00 2001 From: avelichk Date: Wed, 4 Aug 2021 17:30:30 +0100 Subject: [PATCH 7/7] Change to Pipelines --- README.md | 2 +- examples/v1beta1/tekton/README.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 83a694450d8..c6c1ae7c771 100644 --- a/README.md +++ b/README.md @@ -127,7 +127,7 @@ Katib has these CRD examples in upstream: - [Kubeflow `XGBoostJob`](https://github.com/kubeflow/xgboost-operator) -- [Tekton `Pipeline`](./examples/v1beta1/tekton) +- [Tekton `Pipelines`](./examples/v1beta1/tekton) - [Argo `Workflows`](./examples/v1beta1/argo) diff --git a/examples/v1beta1/tekton/README.md b/examples/v1beta1/tekton/README.md index f191bfeea98..9efd0bd63dc 100644 --- a/examples/v1beta1/tekton/README.md +++ b/examples/v1beta1/tekton/README.md @@ -1,4 +1,4 @@ -# Katib Examples with Tekton Integration +# Katib Examples with Tekton Pipelines Integration Here you can find examples of using Katib with [Tekton](https://github.com/tektoncd/pipeline). @@ -25,7 +25,7 @@ tekton-pipelines-webhook-79d8f4f9bc-qmk97 1/1 Running 0 50s **Note:** You must modify Tekton [`nop`](https://github.com/tektoncd/pipeline/tree/master/cmd/nop) image to run Tekton Pipelines. `Nop` image is used to stop sidecar containers after main container is completed. Since Katib is using Metrics Collector sidecar container -and Tekton Pipelines controller should not kill sidecar containers, you have to +and Tekton Pipelines Controller should not kill sidecar containers, you have to set this `nop` image to Metrics Collector image. 
For example, if you are using @@ -39,7 +39,7 @@ kubectl patch deploy tekton-pipelines-controller -n tekton-pipelines --type='jso -p='[{"op": "replace", "path": "/spec/template/spec/containers/0/args/9", "value": "docker.io/kubeflowkatib/file-metrics-collector"}]' ``` -Check that Tekton Controller's pod was restarted: +Check that Tekton Pipelines Controller's pod was restarted: ```bash $ kubectl get pods -n tekton-pipelines