Add CI to build example images (#1731)

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
kubeflow · Jan 20, 2023 · e6b4300
1 parent 1d88c0e
commit e6b4300
Show file tree

Hide file tree

Showing 22 changed files with 176 additions and 70 deletions.
diff --git a/.github/workflows/build-and-publish-images.yaml b/.github/workflows/build-and-publish-images.yaml
@@ -0,0 +1,60 @@
+# Reusable workflow that builds a Training Operator image and pushes it only from release refs.
+name: Build And Publish Images
+
+on:
+  workflow_call:
+    inputs:
+      component-name:
+        required: true
+        type: string
+      platforms:
+        required: true
+        type: string
+      dockerfile:
+        required: true
+        type: string
+    secrets:
+      DOCKERHUB_USERNAME:
+        required: false
+      DOCKERHUB_TOKEN:
+        required: false
+
+jobs:
+  build-and-publish:
+    name: Publish Image
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Docker Login
+        # Run this step only for the kubeflow/training-operator repository on master, v*-branch branches, or v* tags.
+        if: >-
+          github.repository == 'kubeflow/training-operator' &&
+          (github.ref == 'refs/heads/master' || (startsWith(github.ref, 'refs/heads/v') && endsWith(github.ref, '-branch')) || startsWith(github.ref, 'refs/tags/v'))
+        id: login
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Publish Component ${{ inputs.component-name }}
+        # Push only after a successful login, so the publish condition is defined in exactly one place.
+        if: steps.login.outcome == 'success'
+        id: publish
+        uses: ./.github/workflows/template-publish-image
+        with:
+          image: docker.io/kubeflow/${{ inputs.component-name }}
+          dockerfile: ${{ inputs.dockerfile }}
+          platforms: ${{ inputs.platforms }}
+          push: true
+
+      - name: Test Build For Component ${{ inputs.component-name }}
+        # When publishing was skipped, still build (without pushing) to verify the Dockerfile.
+        if: steps.publish.outcome == 'skipped'
+        uses: ./.github/workflows/template-publish-image
+        with:
+          image: docker.io/kubeflow/${{ inputs.component-name }}
+          dockerfile: ${{ inputs.dockerfile }}
+          platforms: ${{ inputs.platforms }}
+          push: false
diff --git a/.github/workflows/publish-core-images.yaml b/.github/workflows/publish-core-images.yaml
@@ -0,0 +1,24 @@
+# Runs the reusable build-and-publish workflow for the core image on every push and pull request.
+name: Publish Training Operator Core Images
+
+on:
+  - push
+  - pull_request
+
+jobs:
+  core:
+    name: Publish Image
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - component-name: training-operator
+            dockerfile: build/images/training-operator/Dockerfile
+    uses: ./.github/workflows/build-and-publish-images.yaml
+    with:
+      component-name: ${{ matrix.component-name }}
+      platforms: linux/amd64,linux/arm64,linux/ppc64le
+      dockerfile: ${{ matrix.dockerfile }}
+    secrets:
+      DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
+      DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
diff --git a/.github/workflows/publish-example-images.yaml b/.github/workflows/publish-example-images.yaml
@@ -0,0 +1,55 @@
+name: Publish Training Operator Example Images
+
+on:
+  - push
+  - pull_request
+
+jobs:
+  example:
+    name: Publish Image
+    uses: ./.github/workflows/build-and-publish-images.yaml
+    with:
+      component-name: ${{ matrix.component-name }}
+      # TODO (tenzen-y): Support linux/arm64 platform
+      platforms: linux/amd64
+      dockerfile: ${{ matrix.dockerfile }}
+    secrets:
+      DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
+      DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
+
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - component-name: tf-dist-mnist-test
+            dockerfile: examples/tensorflow/dist-mnist/Dockerfile
+          - component-name: tf-distributed-worker
+            dockerfile: examples/tensorflow/distribution_strategy/estimator-API/Dockerfile
+          - component-name: tf-multi-worker-strategy
+            dockerfile: examples/tensorflow/distribution_strategy/keras-API/Dockerfile
+          - component-name: tf-mnist-with-summaries
+            dockerfile: examples/tensorflow/mnist_with_summaries/Dockerfile
+          - component-name: tf-smoke
+            dockerfile: examples/tensorflow/tf_sample/Dockerfile
+          - component-name: pytorch-dist-sendrecv-test
+            dockerfile: examples/pytorch/smoke-dist/Dockerfile
+          - component-name: pytorch-elastic-example-imagenet
+            dockerfile: examples/pytorch/elastic/imagenet/Dockerfile
+          - component-name: pytorch-elastic-example-echo
+            dockerfile: examples/pytorch/elastic/echo/Dockerfile
+
+# TODO (tenzen-y): Fix the broken Dockerfiles below, then move these entries into the matrix above.
+#          - component-name: lightgbm-dist-py-test
+#            dockerfile: examples/xgboost/lightgbm-dist/Dockerfile
+#          - component-name: xgboost-dist-rabit-test
+#            dockerfile: examples/xgboost/smoke-dist/Dockerfile
+#          - component-name: xgboost-dist-iris
+#            dockerfile: examples/xgboost/xgboost-dist/Dockerfile
+#          - component-name: mxnet-gpu
+#            dockerfile: examples/mxnet/train/Dockerfile
+#          - component-name: mxnet-auto-tuning
+#            dockerfile: examples/mxnet/tune/Dockerfile
+#          - component-name: pytorch-dist-mnist-mpi
+#            dockerfile: examples/pytorch/mnist/Dockerfile-mpi
+#          - component-name: pytorch-dist-mnist
+#            dockerfile: examples/pytorch/mnist/Dockerfile
diff --git a/.github/workflows/publish-images.yaml b/.github/workflows/publish-images.yaml
diff --git a/.github/workflows/template-publish-image/action.yaml b/.github/workflows/template-publish-image/action.yaml
@@ -1,29 +1,31 @@
-# Template run for publishing images.
+# Composite action to publish Training Operator images.
+name: Build And Publish Container Images
+description: Build Multiplatform Supporting Container Images
 
 inputs:
   image:
     required: true
-    type: string
+    description: image tag
   dockerfile:
     required: true
-    type: string
+    description: path for Dockerfile
+  platforms:
+    required: true
+    description: e.g, linux/amd64
+  push:
+    required: true
+    description: whether to push container images or not
 
 runs:
   using: composite
   steps:
-    - name: Set Up Docker Buildx
-      uses: docker/setup-buildx-action@v2
-
     - name: Setup QEMU
       uses: docker/setup-qemu-action@v2
       with:
         platforms: amd64,ppc64le,arm64
 
-    - name: Docker Login
-      uses: docker/login-action@v2
-      with:
-        username: ${{ env.DOCKERHUB_USERNAME }}
-        password: ${{ env.DOCKERHUB_TOKEN }}
+    - name: Set Up Docker Buildx
+      uses: docker/setup-buildx-action@v2
 
     - name: Add Docker Tags
       id: meta
@@ -37,10 +39,10 @@ runs:
     - name: Build and Push
       uses: docker/build-push-action@v3
       with:
-        platforms: linux/amd64,linux/ppc64le,linux/arm64
+        platforms: ${{ inputs.platforms }}
         context: .
         file: ${{ inputs.dockerfile }}
-        push: true
+        push: ${{ inputs.push }}
         tags: ${{ steps.meta.outputs.tags }}
         cache-from: type=gha
-        cache-to: type=gha,mode=max
+        cache-to: type=gha,mode=max
diff --git a/examples/pytorch/elastic/echo/Dockerfile b/examples/pytorch/elastic/echo/Dockerfile
@@ -2,7 +2,7 @@ FROM python:3.8-buster
 WORKDIR /workspace
 RUN pip install torch==1.10.0 numpy
 # TODO Replace this with the PIP version when available
-ADD echo.py echo.py
+ADD examples/pytorch/elastic/echo/echo.py echo.py
 ENV PYTHONPATH /workspace
 ENV ALLOW_NONE_AUTHENTICATION yes
 ENTRYPOINT ["python", "-m", "torch.distributed.run"]
diff --git a/examples/pytorch/elastic/echo/echo.yaml b/examples/pytorch/elastic/echo/echo.yaml
@@ -15,7 +15,7 @@ spec:
         spec:
           containers:
             - name: pytorch
-              image: kubeflow/pytorch-elastic-example-echo:1.0.0
+              image: kubeflow/pytorch-elastic-example-echo:latest
               imagePullPolicy: IfNotPresent
               env:
               - name: LOGLEVEL

diff --git a/examples/pytorch/elastic/imagenet/Dockerfile b/examples/pytorch/elastic/imagenet/Dockerfile
@@ -7,7 +7,7 @@ WORKDIR /workspace
 RUN apt-get -q update && apt-get -q install -y wget unzip
 RUN wget -q http://cs231n.stanford.edu/tiny-imagenet-200.zip && unzip -q tiny-imagenet-200.zip -d data && rm tiny-imagenet-200.zip
 
-COPY . ./examples
+COPY examples/pytorch/elastic/imagenet/ ./examples
 
 USER root
 ENTRYPOINT ["python", "-m", "torch.distributed.run"]

diff --git a/examples/pytorch/elastic/imagenet/imagenet.yaml b/examples/pytorch/elastic/imagenet/imagenet.yaml
@@ -23,7 +23,7 @@ spec:
         spec:
           containers:
             - name: pytorch
-              image: quay.io/johnugeorge/pytorch-elastic-example-imagenet:0.1
+              image: kubeflow/pytorch-elastic-example-imagenet:latest
               imagePullPolicy: IfNotPresent
               resources:
                 requests:

diff --git a/examples/pytorch/smoke-dist/Dockerfile b/examples/pytorch/smoke-dist/Dockerfile
@@ -1,5 +1,5 @@
 FROM pytorch/pytorch:1.0-cuda10.0-cudnn7-runtime
 
 RUN mkdir -p /opt/mlkube
-COPY dist_sendrecv.py /opt/mlkube/
+COPY examples/pytorch/smoke-dist/dist_sendrecv.py /opt/mlkube/
 ENTRYPOINT ["python", "/opt/mlkube/dist_sendrecv.py"]
diff --git a/examples/pytorch/smoke-dist/pytorch_job_sendrecv.yaml b/examples/pytorch/smoke-dist/pytorch_job_sendrecv.yaml
@@ -11,12 +11,12 @@ spec:
         spec:
           containers:
             - name: pytorch
-              image: gcr.io/kubeflow-ci/pytorch-dist-sendrecv-test:1.0
+              image: kubeflow/pytorch-dist-sendrecv-test:latest
     Worker:
       replicas: 3
       restartPolicy: OnFailure
       template:
         spec:
           containers:
             - name: pytorch
-              image: gcr.io/kubeflow-ci/pytorch-dist-sendrecv-test:1.0
+              image: kubeflow/pytorch-dist-sendrecv-test:latest
diff --git a/examples/tensorflow/dist-mnist/Dockerfile b/examples/tensorflow/dist-mnist/Dockerfile
@@ -14,5 +14,5 @@
 
 FROM tensorflow/tensorflow:1.5.0
 
-ADD . /var/tf_dist_mnist
+ADD examples/tensorflow/dist-mnist/ /var/tf_dist_mnist
 ENTRYPOINT ["python", "/var/tf_dist_mnist/dist_mnist.py"]
diff --git a/examples/tensorflow/dist-mnist/tf_job_mnist.yaml b/examples/tensorflow/dist-mnist/tf_job_mnist.yaml
@@ -11,12 +11,12 @@ spec:
         spec:
           containers:
             - name: tensorflow
-              image: kubeflow/tf-dist-mnist-test:1.0
+              image: kubeflow/tf-dist-mnist-test:latest
     Worker:
       replicas: 4
       restartPolicy: Never
       template:
         spec:
           containers:
             - name: tensorflow
-              image: kubeflow/tf-dist-mnist-test:1.0
+              image: kubeflow/tf-dist-mnist-test:latest
diff --git a/examples/tensorflow/distribution_strategy/estimator-API/Dockerfile b/examples/tensorflow/distribution_strategy/estimator-API/Dockerfile
@@ -1,4 +1,4 @@
 FROM tensorflow/tensorflow:1.11.0
 
-COPY keras_model_to_estimator.py /
+COPY examples/tensorflow/distribution_strategy/estimator-API/keras_model_to_estimator.py /
 ENTRYPOINT ["python", "/keras_model_to_estimator.py", "/tmp/tfkeras_example/"]
diff --git a/examples/tensorflow/distribution_strategy/estimator-API/distributed_tfjob.yaml b/examples/tensorflow/distribution_strategy/estimator-API/distributed_tfjob.yaml
@@ -2,7 +2,6 @@ apiVersion: "kubeflow.org/v1"
 kind: "TFJob"
 metadata:
   name: "distributed-training"
-  namespace: "kf-latest"
 spec:
   runPolicy:
     cleanPodPolicy: None
@@ -12,9 +11,9 @@ spec:
       restartPolicy: Never
       template:
         metadata:
-              annotations:
-                scheduling.k8s.io/group-name: "distributed-training"
+          annotations:
+            scheduling.k8s.io/group-name: "distributed-training"
         spec:
           containers:
             - name: tensorflow
-              image: gcr.io/kubeflow-examples/distributed_worker:v20181031-513e107c
+              image: kubeflow/tf-distributed-worker:latest
diff --git a/examples/tensorflow/distribution_strategy/keras-API/Dockerfile b/examples/tensorflow/distribution_strategy/keras-API/Dockerfile
@@ -2,5 +2,5 @@ FROM python:3.9
 
 RUN pip install tensorflow==2.11.0 tensorflow_datasets==4.7.0
 
-COPY multi_worker_strategy-with-keras.py /
+COPY examples/tensorflow/distribution_strategy/keras-API/multi_worker_strategy-with-keras.py /
 ENTRYPOINT ["python", "/multi_worker_strategy-with-keras.py", "--saved_model_dir", "/train/saved_model/", "--checkpoint_dir", "/train/checkpoint"]
diff --git a/examples/tensorflow/distribution_strategy/keras-API/multi_worker_tfjob.yaml b/examples/tensorflow/distribution_strategy/keras-API/multi_worker_tfjob.yaml
@@ -13,7 +13,7 @@ spec:
         spec:
           containers:
             - name: tensorflow
-              image: kubeflowimages/multi_worker_strategy:v20200522-2a5b081c
+              image: kubeflow/tf-multi-worker-strategy:latest
               volumeMounts:
                 - mountPath: /train
                   name: training

diff --git a/examples/tensorflow/mnist_with_summaries/Dockerfile b/examples/tensorflow/mnist_with_summaries/Dockerfile
@@ -14,5 +14,5 @@
 
 FROM tensorflow/tensorflow:1.11.0
 
-ADD . /var/tf_mnist
+ADD examples/tensorflow/mnist_with_summaries/ /var/tf_mnist
 ENTRYPOINT ["python", "/var/tf_mnist/mnist_with_summaries.py"]
diff --git a/examples/tensorflow/mnist_with_summaries/Dockerfile.ppc64le b/examples/tensorflow/mnist_with_summaries/Dockerfile.ppc64le
@@ -14,5 +14,5 @@
 
 FROM ibmcom/tensorflow-ppc64le:1.13.1
 
-ADD . /var/tf_mnist
+ADD examples/tensorflow/mnist_with_summaries/ /var/tf_mnist
 ENTRYPOINT ["python", "/var/tf_mnist/mnist_with_summaries.py"]
diff --git a/examples/tensorflow/mnist_with_summaries/tf_job_mnist.yaml b/examples/tensorflow/mnist_with_summaries/tf_job_mnist.yaml
@@ -14,7 +14,7 @@ spec:
         spec:
           containers:
             - name: tensorflow
-              image: gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0
+              image: kubeflow/tf-mnist-with-summaries:latest
               command:
                 - "python"
                 - "/var/tf_mnist/mnist_with_summaries.py"

diff --git a/examples/tensorflow/simple.yaml b/examples/tensorflow/simple.yaml
@@ -12,7 +12,7 @@ spec:
         spec:
           containers:
             - name: tensorflow
-              image: gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0
+              image: kubeflow/tf-mnist-with-summaries:latest
               command:
                 - "python"
                 - "/var/tf_mnist/mnist_with_summaries.py"
diff --git a/examples/tensorflow/tf_sample/Dockerfile b/examples/tensorflow/tf_sample/Dockerfile
@@ -1,5 +1,5 @@
 FROM  tensorflow/tensorflow:1.8.0
 RUN pip install retrying
 RUN mkdir -p /opt/kubeflow
-COPY tf_smoke.py /opt/kubeflow/
+COPY examples/tensorflow/tf_sample/tf_smoke.py /opt/kubeflow/
 ENTRYPOINT ["python", "/opt/kubeflow/tf_smoke.py"]