Skip to content

Commit

Permalink
Add CI to build example images (#1731)
Browse files Browse the repository at this point in the history
Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
  • Loading branch information
tenzen-y authored Jan 20, 2023
1 parent 1d88c0e commit e6b4300
Show file tree
Hide file tree
Showing 22 changed files with 176 additions and 70 deletions.
60 changes: 60 additions & 0 deletions .github/workflows/build-and-publish-images.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Reusable workflow for building Training Operator images and, when the run
# qualifies (see PUBLISH_IMAGE below), publishing them to Docker Hub.
name: Build And Publish Images

on:
  workflow_call:
    inputs:
      # Image name; the image is tagged as docker.io/kubeflow/<component-name>.
      component-name:
        required: true
        type: string
      # Target platforms, forwarded unchanged to the composite build action.
      platforms:
        required: true
        type: string
      # Dockerfile path, forwarded unchanged to the composite build action.
      dockerfile:
        required: true
        type: string
    secrets:
      # Optional: only the Docker Login step reads these, and that step is
      # skipped whenever the image is not being published.
      DOCKERHUB_USERNAME:
        required: false
      DOCKERHUB_TOKEN:
        required: false

jobs:
  build-and-publish:
    name: Publish Image
    runs-on: ubuntu-latest
    env:
      # Single source of truth for "should this run push images?".
      # Evaluates to the string 'true' only for the kubeflow/training-operator
      # repository on a specific branch (master, v.*-branch) or tag (v.*).
      # Kept in one place so the login and publish steps cannot drift apart.
      PUBLISH_IMAGE: >-
        ${{
        github.repository == 'kubeflow/training-operator' &&
        (
        github.ref == 'refs/heads/master' ||
        (startsWith(github.ref, 'refs/heads/v') && endsWith(github.ref, '-branch')) ||
        startsWith(github.ref, 'refs/tags/v')
        )
        }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Docker Login
        # Credentials are only needed when the image is actually pushed.
        if: env.PUBLISH_IMAGE == 'true'
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Publish Component ${{ inputs.component-name }}
        # Build and push; runs under the same condition as Docker Login.
        if: env.PUBLISH_IMAGE == 'true'
        id: publish
        uses: ./.github/workflows/template-publish-image
        with:
          image: docker.io/kubeflow/${{ inputs.component-name }}
          dockerfile: ${{ inputs.dockerfile }}
          platforms: ${{ inputs.platforms }}
          push: true

      - name: Test Build For Component ${{ inputs.component-name }}
        # Fallback for runs that do not publish: build the image without
        # pushing so the Dockerfile is still exercised.
        if: steps.publish.outcome == 'skipped'
        uses: ./.github/workflows/template-publish-image
        with:
          image: docker.io/kubeflow/${{ inputs.component-name }}
          dockerfile: ${{ inputs.dockerfile }}
          platforms: ${{ inputs.platforms }}
          push: false
24 changes: 24 additions & 0 deletions .github/workflows/publish-core-images.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Calls the reusable build-and-publish workflow once per matrix entry for the
# core Training Operator image. Whether the image is pushed or only
# test-built is decided inside the called workflow.
name: Publish Training Operator Core Images

on:
  - push
  - pull_request

jobs:
  core:
    name: Publish Image

    # The matrix supplies the component name and Dockerfile path used below.
    strategy:
      fail-fast: false
      matrix:
        include:
          - component-name: training-operator
            dockerfile: build/images/training-operator/Dockerfile

    uses: ./.github/workflows/build-and-publish-images.yaml
    with:
      component-name: ${{ matrix.component-name }}
      dockerfile: ${{ matrix.dockerfile }}
      platforms: linux/amd64,linux/arm64,linux/ppc64le
    # Forward the Docker Hub credentials to the called workflow.
    secrets:
      DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
      DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
55 changes: 55 additions & 0 deletions .github/workflows/publish-example-images.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Calls the reusable build-and-publish workflow once per example image listed
# in the matrix. Whether an image is pushed or only test-built is decided
# inside the called workflow.
name: Publish Training Operator Example Images

on:
  - push
  - pull_request

jobs:
  example:
    name: Publish Image

    # One matrix entry per example image: its name and its Dockerfile path.
    strategy:
      fail-fast: false
      matrix:
        include:
          - component-name: tf-dist-mnist-test
            dockerfile: examples/tensorflow/dist-mnist/Dockerfile
          - component-name: tf-distributed-worker
            dockerfile: examples/tensorflow/distribution_strategy/estimator-API/Dockerfile
          - component-name: tf-multi-worker-strategy
            dockerfile: examples/tensorflow/distribution_strategy/keras-API/Dockerfile
          - component-name: tf-mnist-with-summaries
            dockerfile: examples/tensorflow/mnist_with_summaries/Dockerfile
          - component-name: tf-smoke
            dockerfile: examples/tensorflow/tf_sample/Dockerfile
          - component-name: pytorch-dist-sendrecv-test
            dockerfile: examples/pytorch/smoke-dist/Dockerfile
          - component-name: pytorch-elastic-example-imagenet
            dockerfile: examples/pytorch/elastic/imagenet/Dockerfile
          - component-name: pytorch-elastic-example-echo
            dockerfile: examples/pytorch/elastic/echo/Dockerfile

          # TODO (tenzen-y): Fix the below broken Dockerfiles
          # NOTE(review): the xgboost-dist-iris path has no trailing
          # "/Dockerfile", unlike every other entry; confirm before enabling.
          # - component-name: lightgbm-dist-py-test
          #   dockerfile: examples/xgboost/lightgbm-dist/Dockerfile
          # - component-name: xgboost-dist-rabit-test
          #   dockerfile: examples/xgboost/smoke-dist/Dockerfile
          # - component-name: xgboost-dist-iris
          #   dockerfile: examples/xgboost/xgboost-dist
          # - component-name: mxnet-gpu
          #   dockerfile: examples/mxnet/train/Dockerfile
          # - component-name: mxnet-auto-tuning
          #   dockerfile: examples/mxnet/tune/Dockerfile
          # - component-name: pytorch-dist-mnist-mpi
          #   dockerfile: examples/pytorch/mnist/Dockerfile-mpi
          # - component-name: pytorch-dist-mnist
          #   dockerfile: examples/pytorch/mnist/Dockerfile

    uses: ./.github/workflows/build-and-publish-images.yaml
    with:
      component-name: ${{ matrix.component-name }}
      dockerfile: ${{ matrix.dockerfile }}
      # TODO (tenzen-y): Support linux/arm64 platform
      platforms: linux/amd64
    # Forward the Docker Hub credentials to the called workflow.
    secrets:
      DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
      DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
34 changes: 0 additions & 34 deletions .github/workflows/publish-images.yaml

This file was deleted.

30 changes: 16 additions & 14 deletions .github/workflows/template-publish-image/action.yaml
Original file line number Diff line number Diff line change
@@ -1,29 +1,31 @@
# Template run for publishing images.
# Composite action to publish Training Operator images.
name: Build And Publish Container Images
description: Build Multiplatform Supporting Container Images

inputs:
image:
required: true
type: string
description: image tag
dockerfile:
required: true
type: string
description: path for Dockerfile
platforms:
required: true
description: e.g, linux/amd64
push:
required: true
description: whether to push container images or not

runs:
using: composite
steps:
- name: Set Up Docker Buildx
uses: docker/setup-buildx-action@v2

- name: Setup QEMU
uses: docker/setup-qemu-action@v2
with:
platforms: amd64,ppc64le,arm64

- name: Docker Login
uses: docker/login-action@v2
with:
username: ${{ env.DOCKERHUB_USERNAME }}
password: ${{ env.DOCKERHUB_TOKEN }}
- name: Set Up Docker Buildx
uses: docker/setup-buildx-action@v2

- name: Add Docker Tags
id: meta
Expand All @@ -37,10 +39,10 @@ runs:
- name: Build and Push
uses: docker/build-push-action@v3
with:
platforms: linux/amd64,linux/ppc64le,linux/arm64
platforms: ${{ inputs.platforms }}
context: .
file: ${{ inputs.dockerfile }}
push: true
push: ${{ inputs.push }}
tags: ${{ steps.meta.outputs.tags }}
cache-from: type=gha
cache-to: type=gha,mode=max
cache-to: type=gha,mode=max
2 changes: 1 addition & 1 deletion examples/pytorch/elastic/echo/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ FROM python:3.8-buster
WORKDIR /workspace
RUN pip install torch==1.10.0 numpy
# TODO Replace this with the PIP version when available
ADD echo.py echo.py
ADD examples/pytorch/elastic/echo/echo.py echo.py
ENV PYTHONPATH /workspace
ENV ALLOW_NONE_AUTHENTICATION yes
ENTRYPOINT ["python", "-m", "torch.distributed.run"]
2 changes: 1 addition & 1 deletion examples/pytorch/elastic/echo/echo.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ spec:
spec:
containers:
- name: pytorch
image: kubeflow/pytorch-elastic-example-echo:1.0.0
image: kubeflow/pytorch-elastic-example-echo:latest
imagePullPolicy: IfNotPresent
env:
- name: LOGLEVEL
Expand Down
2 changes: 1 addition & 1 deletion examples/pytorch/elastic/imagenet/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ WORKDIR /workspace
RUN apt-get -q update && apt-get -q install -y wget unzip
RUN wget -q http://cs231n.stanford.edu/tiny-imagenet-200.zip && unzip -q tiny-imagenet-200.zip -d data && rm tiny-imagenet-200.zip

COPY . ./examples
COPY examples/pytorch/elastic/imagenet/ ./examples

USER root
ENTRYPOINT ["python", "-m", "torch.distributed.run"]
Expand Down
2 changes: 1 addition & 1 deletion examples/pytorch/elastic/imagenet/imagenet.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ spec:
spec:
containers:
- name: pytorch
image: quay.io/johnugeorge/pytorch-elastic-example-imagenet:0.1
image: kubeflow/pytorch-elastic-example-imagenet:latest
imagePullPolicy: IfNotPresent
resources:
requests:
Expand Down
2 changes: 1 addition & 1 deletion examples/pytorch/smoke-dist/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
FROM pytorch/pytorch:1.0-cuda10.0-cudnn7-runtime

RUN mkdir -p /opt/mlkube
COPY dist_sendrecv.py /opt/mlkube/
COPY examples/pytorch/smoke-dist/dist_sendrecv.py /opt/mlkube/
ENTRYPOINT ["python", "/opt/mlkube/dist_sendrecv.py"]
4 changes: 2 additions & 2 deletions examples/pytorch/smoke-dist/pytorch_job_sendrecv.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@ spec:
spec:
containers:
- name: pytorch
image: gcr.io/kubeflow-ci/pytorch-dist-sendrecv-test:1.0
image: kubeflow/pytorch-dist-sendrecv-test:latest
Worker:
replicas: 3
restartPolicy: OnFailure
template:
spec:
containers:
- name: pytorch
image: gcr.io/kubeflow-ci/pytorch-dist-sendrecv-test:1.0
image: kubeflow/pytorch-dist-sendrecv-test:latest
2 changes: 1 addition & 1 deletion examples/tensorflow/dist-mnist/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,5 @@

FROM tensorflow/tensorflow:1.5.0

ADD . /var/tf_dist_mnist
ADD examples/tensorflow/dist-mnist/ /var/tf_dist_mnist
ENTRYPOINT ["python", "/var/tf_dist_mnist/dist_mnist.py"]
4 changes: 2 additions & 2 deletions examples/tensorflow/dist-mnist/tf_job_mnist.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@ spec:
spec:
containers:
- name: tensorflow
image: kubeflow/tf-dist-mnist-test:1.0
image: kubeflow/tf-dist-mnist-test:latest
Worker:
replicas: 4
restartPolicy: Never
template:
spec:
containers:
- name: tensorflow
image: kubeflow/tf-dist-mnist-test:1.0
image: kubeflow/tf-dist-mnist-test:latest
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM tensorflow/tensorflow:1.11.0

COPY keras_model_to_estimator.py /
COPY examples/tensorflow/distribution_strategy/estimator-API/keras_model_to_estimator.py /
ENTRYPOINT ["python", "/keras_model_to_estimator.py", "/tmp/tfkeras_example/"]
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ apiVersion: "kubeflow.org/v1"
kind: "TFJob"
metadata:
name: "distributed-training"
namespace: "kf-latest"
spec:
runPolicy:
cleanPodPolicy: None
Expand All @@ -12,9 +11,9 @@ spec:
restartPolicy: Never
template:
metadata:
annotations:
scheduling.k8s.io/group-name: "distributed-training"
annotations:
scheduling.k8s.io/group-name: "distributed-training"
spec:
containers:
- name: tensorflow
image: gcr.io/kubeflow-examples/distributed_worker:v20181031-513e107c
image: kubeflow/tf-distributed-worker:latest
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ FROM python:3.9

RUN pip install tensorflow==2.11.0 tensorflow_datasets==4.7.0

COPY multi_worker_strategy-with-keras.py /
COPY examples/tensorflow/distribution_strategy/keras-API/multi_worker_strategy-with-keras.py /
ENTRYPOINT ["python", "/multi_worker_strategy-with-keras.py", "--saved_model_dir", "/train/saved_model/", "--checkpoint_dir", "/train/checkpoint"]
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ spec:
spec:
containers:
- name: tensorflow
image: kubeflowimages/multi_worker_strategy:v20200522-2a5b081c
image: kubeflow/tf-multi-worker-strategy:latest
volumeMounts:
- mountPath: /train
name: training
Expand Down
2 changes: 1 addition & 1 deletion examples/tensorflow/mnist_with_summaries/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,5 @@

FROM tensorflow/tensorflow:1.11.0

ADD . /var/tf_mnist
ADD examples/tensorflow/mnist_with_summaries/ /var/tf_mnist
ENTRYPOINT ["python", "/var/tf_mnist/mnist_with_summaries.py"]
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,5 @@

FROM ibmcom/tensorflow-ppc64le:1.13.1

ADD . /var/tf_mnist
ADD examples/tensorflow/tf_sample/ /var/tf_mnist
ENTRYPOINT ["python", "/var/tf_mnist/mnist_with_summaries.py"]
2 changes: 1 addition & 1 deletion examples/tensorflow/mnist_with_summaries/tf_job_mnist.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ spec:
spec:
containers:
- name: tensorflow
image: gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0
image: kubeflow/tf-mnist-with-summaries:latest
command:
- "python"
- "/var/tf_mnist/mnist_with_summaries.py"
Expand Down
2 changes: 1 addition & 1 deletion examples/tensorflow/simple.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ spec:
spec:
containers:
- name: tensorflow
image: gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0
image: kubeflow/tf-mnist-with-summaries:latest
command:
- "python"
- "/var/tf_mnist/mnist_with_summaries.py"
2 changes: 1 addition & 1 deletion examples/tensorflow/tf_sample/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
FROM tensorflow/tensorflow:1.8.0
RUN pip install retrying
RUN mkdir -p /opt/kubeflow
COPY tf_smoke.py /opt/kubeflow/
COPY examples/tensorflow/tf_sample/tf_smoke.py /opt/kubeflow/
ENTRYPOINT ["python", "/opt/kubeflow/tf_smoke.py"]

0 comments on commit e6b4300

Please sign in to comment.