Skip to content

Commit

Permalink
Merge branch 'kubeflow:master' into specify-cpu-arch
Browse files Browse the repository at this point in the history
  • Loading branch information
tenzen-y authored Jun 6, 2022
2 parents dc5184c + 72fff88 commit b60690d
Show file tree
Hide file tree
Showing 56 changed files with 1,185 additions and 122 deletions.
33 changes: 33 additions & 0 deletions .github/workflows/darts-cifar10-e2e-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Pull-request e2e workflow for the darts-cnn-cifar10 trial image.
name: E2E Test with darts-cnn-cifar10
on:
  - pull_request

jobs:
  e2e:
    runs-on: ubuntu-20.04
    timeout-minutes: 120

    # One job runs per (kubernetes-version, experiments) combination.
    # fail-fast is disabled so a failing combination does not cancel the rest.
    strategy:
      fail-fast: false
      matrix:
        # Detail: https://hub.docker.com/r/kindest/node
        # TODO (tenzen-y): We need to consider running tests on more kubernetes versions.
        # kubernetes-version: ["v1.20.15", "v1.21.12", "v1.22.9", "v1.23.6", "v1.24.1"]
        kubernetes-version:
          - "v1.21.12"
          - "v1.22.9"
          - "v1.23.6"
        # Each entry is a comma-delimited list of experiments.
        experiments:
          - "darts-cpu"

    steps:
      - name: Checkout
        uses: actions/checkout@v2

      # Local composite action; receives the Kubernetes version of this matrix job.
      - name: Setup Test Env
        uses: ./.github/workflows/template-setup-e2e-test
        with:
          kubernetes-version: ${{ matrix.kubernetes-version }}

      # Local composite action that runs the experiments of this matrix job.
      - name: Run e2e test with ${{ matrix.experiments }} experiments
        uses: ./.github/workflows/template-e2e-test
        with:
          experiments: ${{ matrix.experiments }}
          # Comma-delimited list of trial images.
          trial-images: darts-cnn-cifar10-cpu
33 changes: 33 additions & 0 deletions .github/workflows/enas-cifar10-e2e-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Pull-request e2e workflow for the enas-cnn-cifar10 trial image.
name: E2E Test with enas-cnn-cifar10
on:
  - pull_request

jobs:
  e2e:
    runs-on: ubuntu-20.04
    timeout-minutes: 120

    # One job runs per (kubernetes-version, experiments) combination.
    # fail-fast is disabled so a failing combination does not cancel the rest.
    strategy:
      fail-fast: false
      matrix:
        # Detail: https://hub.docker.com/r/kindest/node
        # TODO (tenzen-y): We need to consider running tests on more kubernetes versions.
        # kubernetes-version: ["v1.20.15", "v1.21.12", "v1.22.9", "v1.23.6", "v1.24.1"]
        kubernetes-version:
          - "v1.21.12"
          - "v1.22.9"
          - "v1.23.6"
        # Each entry is a comma-delimited list of experiments.
        experiments:
          - "enas-cpu"

    steps:
      - name: Checkout
        uses: actions/checkout@v2

      # Local composite action; receives the Kubernetes version of this matrix job.
      - name: Setup Test Env
        uses: ./.github/workflows/template-setup-e2e-test
        with:
          kubernetes-version: ${{ matrix.kubernetes-version }}

      # Local composite action that runs the experiments of this matrix job.
      - name: Run e2e test with ${{ matrix.experiments }} experiments
        uses: ./.github/workflows/template-e2e-test
        with:
          experiments: ${{ matrix.experiments }}
          # Comma-delimited list of trial images.
          trial-images: enas-cnn-cifar10-cpu
30 changes: 30 additions & 0 deletions .github/workflows/katib-ui-e2e-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Pull-request e2e workflow for katib-ui.
name: E2E Test for katib-ui
on:
  - pull_request

jobs:
  e2e:
    runs-on: ubuntu-20.04
    timeout-minutes: 120

    # One job runs per Kubernetes version.
    # fail-fast is disabled so a failing version does not cancel the rest.
    strategy:
      fail-fast: false
      matrix:
        # Detail: https://hub.docker.com/r/kindest/node
        # TODO (tenzen-y): We need to consider running tests on more kubernetes versions.
        # kubernetes-version: ["v1.20.15", "v1.21.12", "v1.22.9", "v1.23.6", "v1.24.1"]
        kubernetes-version:
          - "v1.21.12"
          - "v1.22.9"
          - "v1.23.6"

    steps:
      - name: Checkout
        uses: actions/checkout@v2

      # Local composite action; receives the Kubernetes version of this matrix job.
      - name: Setup Test Env
        uses: ./.github/workflows/template-setup-e2e-test
        with:
          kubernetes-version: ${{ matrix.kubernetes-version }}

      # The scripts are called directly here instead of through the
      # template-e2e-test action; no experiments are run by this workflow.
      - name: Set Up KinD Cluster
        run: ./test/e2e/v1beta1/scripts/gh-actions/setup-kind.sh true

      - name: Start Katib
        run: ./test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh true false
39 changes: 39 additions & 0 deletions .github/workflows/mxnet-mnist-e2e-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Pull-request e2e workflow for the mxnet-mnist trial image.
name: E2E Test with mxnet-mnist
on:
  - pull_request

jobs:
  e2e:
    runs-on: ubuntu-20.04
    timeout-minutes: 120

    # One job runs per (kubernetes-version, experiments) combination.
    # fail-fast is disabled so a failing combination does not cancel the rest.
    strategy:
      fail-fast: false
      matrix:
        # Detail: https://hub.docker.com/r/kindest/node
        # TODO (tenzen-y): We need to consider running tests on more kubernetes versions.
        # kubernetes-version: ["v1.20.15", "v1.21.12", "v1.22.9", "v1.23.6", "v1.24.1"]
        kubernetes-version:
          - "v1.21.12"
          - "v1.22.9"
          - "v1.23.6"
        # Each entry is a comma-delimited list of experiments run by one job.
        experiments:
          # suggestion-hyperopt
          - "random,tpe,never-resume"
          - "median-stop,from-volume-resume"
          # others
          - "grid,bayesian-optimization,tpe"
          - "multivariate-tpe,cma-es,hyperband"

    steps:
      - name: Checkout
        uses: actions/checkout@v2

      # Local composite action; receives the Kubernetes version of this matrix job.
      - name: Setup Test Env
        uses: ./.github/workflows/template-setup-e2e-test
        with:
          kubernetes-version: ${{ matrix.kubernetes-version }}

      # Local composite action that runs the experiments of this matrix job.
      - name: Run e2e test with ${{ matrix.experiments }} experiments
        uses: ./.github/workflows/template-e2e-test
        with:
          experiments: ${{ matrix.experiments }}
          # Comma-delimited list of trial images.
          trial-images: mxnet-mnist
1 change: 1 addition & 0 deletions .github/workflows/publish-algorithm-images.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ jobs:
dockerfile: ${{ matrix.dockerfile }}

strategy:
fail-fast: false
matrix:
include:
- component-name: suggestion-hyperopt
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/publish-core-images.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ jobs:
dockerfile: ${{ matrix.dockerfile }}

strategy:
fail-fast: false
matrix:
include:
- component-name: katib-controller
Expand Down
7 changes: 5 additions & 2 deletions .github/workflows/publish-trial-images.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ jobs:
dockerfile: ${{ matrix.dockerfile }}

strategy:
fail-fast: false
matrix:
include:
- trial-name: mxnet-mnist
Expand All @@ -38,5 +39,7 @@ jobs:
dockerfile: examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu
- trial-name: enas-cnn-cifar10-cpu
dockerfile: examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu
- trial-name: darts-cnn-cifar10
dockerfile: examples/v1beta1/trial-images/darts-cnn-cifar10/Dockerfile
- trial-name: darts-cnn-cifar10-cpu
dockerfile: examples/v1beta1/trial-images/darts-cnn-cifar10/Dockerfile.cpu
- trial-name: darts-cnn-cifar10-gpu
dockerfile: examples/v1beta1/trial-images/darts-cnn-cifar10/Dockerfile.gpu
36 changes: 36 additions & 0 deletions .github/workflows/pytorch-mnist-e2e-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Pull-request e2e workflow for the pytorch-mnist trial image.
name: E2E Test with pytorch-mnist
on:
  - pull_request

jobs:
  e2e:
    runs-on: ubuntu-20.04
    timeout-minutes: 120

    # One job runs per (kubernetes-version, experiments) combination.
    # fail-fast is disabled so a failing combination does not cancel the rest.
    strategy:
      fail-fast: false
      matrix:
        # Detail: https://hub.docker.com/r/kindest/node
        # TODO (tenzen-y): We need to consider running tests on more kubernetes versions.
        # kubernetes-version: ["v1.20.15", "v1.21.12", "v1.22.9", "v1.23.6", "v1.24.1"]
        kubernetes-version:
          - "v1.21.12"
          - "v1.22.9"
          - "v1.23.6"
        # Each entry is a comma-delimited list of experiments run by one job.
        experiments:
          - "file-metrics-collector,pytorchjob-mnist"
          - "median-stop-with-json-format,file-metrics-collector-with-json-format"

    steps:
      - name: Checkout
        uses: actions/checkout@v2

      # Local composite action; receives the Kubernetes version of this matrix job.
      - name: Setup Test Env
        uses: ./.github/workflows/template-setup-e2e-test
        with:
          kubernetes-version: ${{ matrix.kubernetes-version }}

      # Local composite action that runs the experiments of this matrix job.
      - name: Run e2e test with ${{ matrix.experiments }} experiments
        uses: ./.github/workflows/template-e2e-test
        with:
          experiments: ${{ matrix.experiments }}
          # Forwarded by the template action to setup-katib.sh.
          training-operator: true
          # Comma-delimited list of trial images.
          trial-images: pytorch-mnist
35 changes: 35 additions & 0 deletions .github/workflows/template-e2e-test/action.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Template for e2e tests.
#
# Composite action that runs, in order, the setup-kind.sh, setup-katib.sh and
# run-e2e-experiment.sh scripts from test/e2e/v1beta1/scripts/gh-actions.
name: Run E2E Test
description: Set up the KinD cluster and Katib, then run the given e2e experiments.

inputs:
  cluster_name:
    description: Cluster name passed to setup-kind.sh.
    required: false
    default: katib-e2e-cluster
  experiments:
    description: Comma-delimited list of experiments to run.
    required: true
  training-operator:
    description: >-
      Flag passed as the second argument to setup-katib.sh
      (presumably whether to deploy the training operator; confirm in the script).
    required: false
    # Without a default, callers that omit this input would expand it to an
    # empty string and setup-katib.sh would receive only one argument.
    default: "false"
  trial-images:
    description: Comma-delimited list of trial images.
    required: true
  katib-ui:
    description: >-
      Flag passed as the first argument to setup-kind.sh and setup-katib.sh
      (presumably whether to deploy katib-ui; confirm in the scripts).
    # Optional: a default is provided and the e2e workflows do not set it.
    required: false
    default: "false"

runs:
  using: composite
  steps:
    # Arguments are quoted so every value stays in its own positional slot.
    - name: Set Up KinD Cluster
      shell: bash
      run: >-
        ./test/e2e/v1beta1/scripts/gh-actions/setup-kind.sh
        "${{ inputs.katib-ui }}"
        "${{ inputs.trial-images }}"
        "${{ inputs.cluster_name }}"
        "${{ inputs.experiments }}"

    - name: Set Up Katib
      shell: bash
      run: >-
        ./test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh
        "${{ inputs.katib-ui }}"
        "${{ inputs.training-operator }}"

    - name: Run E2E Experiment
      shell: bash
      run: >-
        ./test/e2e/v1beta1/scripts/gh-actions/run-e2e-experiment.sh
        "${{ inputs.experiments }}"
29 changes: 29 additions & 0 deletions .github/workflows/template-setup-e2e-test/action.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Template for e2e tests.
#
# Composite action that creates a KinD cluster and installs Docker Buildx and Go.
name: Setup E2E Test
description: Create a KinD cluster and set up Docker Buildx and Go for e2e tests.

inputs:
  kubernetes-version:
    description: Tag of the kindest/node image used for the cluster nodes, e.g. v1.23.6.
    required: true
  cluster_name:
    description: Name given to the KinD cluster.
    required: false
    default: katib-e2e-cluster

runs:
  using: composite
  steps:
    - name: Set Up KinD Cluster
      uses: helm/kind-action@v1.2.0
      with:
        version: v0.13.0
        # Node images are listed at https://hub.docker.com/r/kindest/node
        node_image: kindest/node:${{ inputs.kubernetes-version }}
        cluster_name: ${{ inputs.cluster_name }}
        wait: 120s

    - name: Set Up Docker Buildx
      uses: docker/setup-buildx-action@v1

    - name: Set Up Go env
      uses: actions/setup-go@v2
      with:
        # Quoted so the version is always read as a string.
        go-version: "1.17.10"
2 changes: 1 addition & 1 deletion .github/workflows/test-go.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jobs:
- name: Setup Go
uses: actions/setup-go@v2
with:
go-version: 1.17.1
go-version: 1.17.10

# Verify that go.mod and go.sum are synchronized
- name: Check Go modules
Expand Down
34 changes: 34 additions & 0 deletions .github/workflows/tf-mnist-with-summaries-e2e-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Pull-request e2e workflow for the tf-mnist-with-summaries trial image.
name: E2E Test with tf-mnist-with-summaries
on:
  - pull_request

jobs:
  e2e:
    runs-on: ubuntu-20.04
    timeout-minutes: 120

    # One job runs per (kubernetes-version, experiments) combination.
    # fail-fast is disabled so a failing combination does not cancel the rest.
    strategy:
      fail-fast: false
      matrix:
        # Detail: https://hub.docker.com/r/kindest/node
        # TODO (tenzen-y): We need to consider running tests on more kubernetes versions.
        # kubernetes-version: ["v1.20.15", "v1.21.12", "v1.22.9", "v1.23.6", "v1.24.1"]
        kubernetes-version:
          - "v1.21.12"
          - "v1.22.9"
          - "v1.23.6"
        # Each entry is a comma-delimited list of experiments.
        experiments:
          - "tfjob-mnist-with-summaries"

    steps:
      - name: Checkout
        uses: actions/checkout@v2

      # Local composite action; receives the Kubernetes version of this matrix job.
      - name: Setup Test Env
        uses: ./.github/workflows/template-setup-e2e-test
        with:
          kubernetes-version: ${{ matrix.kubernetes-version }}

      # Local composite action that runs the experiments of this matrix job.
      - name: Run e2e test with ${{ matrix.experiments }} experiments
        uses: ./.github/workflows/template-e2e-test
        with:
          experiments: ${{ matrix.experiments }}
          # Forwarded by the template action to setup-katib.sh.
          training-operator: true
          # Comma-delimited list of trial images.
          trial-images: tf-mnist-with-summaries
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ Follow the next steps to install Katib standalone.

These are the minimal requirements to install Katib:

- Kubernetes >= 1.17
- Kubernetes >= 1.21
- `kubectl` >= 1.21

## Latest Version
Expand Down
2 changes: 1 addition & 1 deletion cmd/earlystopping/medianstop/v1beta1/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.9
FROM python:3.9-slim

ENV TARGET_DIR /opt/katib
ENV EARLY_STOPPING_DIR cmd/earlystopping/medianstop/v1beta1
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.9
FROM python:3.9-slim

ENV TARGET_DIR /opt/katib
ENV METRICS_COLLECTOR_DIR cmd/metricscollector/v1beta1/tfevent-metricscollector
Expand Down
26 changes: 15 additions & 11 deletions cmd/suggestion/chocolate/v1beta1/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,15 +1,5 @@
FROM python:3.9

ENV TARGET_DIR /opt/katib
ENV SUGGESTION_DIR cmd/suggestion/chocolate/v1beta1
FROM alpine:3.15 AS downloader
ENV GRPC_HEALTH_PROBE_VERSION v0.4.11

RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \
apt-get -y update && \
apt-get -y install gfortran libopenblas-dev liblapack-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*; \
fi
RUN if [ "$(uname -m)" = "ppc64le" ]; then \
wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-ppc64le; \
elif [ "$(uname -m)" = "aarch64" ]; then \
Expand All @@ -19,8 +9,22 @@ RUN if [ "$(uname -m)" = "ppc64le" ]; then \
fi && \
chmod +x /bin/grpc_health_probe

FROM python:3.9-slim
ENV TARGET_DIR /opt/katib
ENV SUGGESTION_DIR cmd/suggestion/chocolate/v1beta1

RUN apt-get -y update && \
apt-get -y install git && \
if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \
apt-get -y install gfortran libopenblas-dev liblapack-dev; \
fi && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

ADD ./pkg/ ${TARGET_DIR}/pkg/
ADD ./${SUGGESTION_DIR}/ ${TARGET_DIR}/${SUGGESTION_DIR}/
COPY --from=downloader /bin/grpc_health_probe /bin/grpc_health_probe

WORKDIR ${TARGET_DIR}/${SUGGESTION_DIR}
RUN pip install --no-cache-dir -r requirements.txt

Expand Down
Loading

0 comments on commit b60690d

Please sign in to comment.