From f58fdab967bd5981b3efff4d08eafcb3d2ebad58 Mon Sep 17 00:00:00 2001
From: tenzen-y
Date: Mon, 8 Nov 2021 21:31:27 +0900
Subject: [PATCH 01/17] bump Python to 3.9

---
 .github/workflows/test-python.yaml            |  2 +-
 .../medianstop/v1beta1/Dockerfile             |  2 +-
 .../medianstop/v1beta1/requirements.txt       |  4 +-
 .../tfevent-metricscollector/Dockerfile       | 28 +++++++--
 .../Dockerfile.aarch64                        | 28 ---------
 .../Dockerfile.ppc64le                        |  2 +-
 .../tfevent-metricscollector/requirements.txt |  3 +
 cmd/suggestion/chocolate/v1beta1/Dockerfile   | 10 ++--
 .../chocolate/v1beta1/requirements.txt        | 14 ++---
 cmd/suggestion/goptuna/v1beta1/Dockerfile     |  5 +-
 cmd/suggestion/hyperband/v1beta1/Dockerfile   |  8 ++-
 .../hyperband/v1beta1/requirements.txt        | 10 ++--
 cmd/suggestion/hyperopt/v1beta1/Dockerfile    |  8 ++-
 .../hyperopt/v1beta1/requirements.txt         | 12 ++--
 cmd/suggestion/nas/darts/v1beta1/Dockerfile   |  9 +--
 .../nas/darts/v1beta1/requirements.txt        |  4 +-
 cmd/suggestion/nas/enas/v1beta1/Dockerfile    | 20 +++++--
 .../nas/enas/v1beta1/Dockerfile.aarch64       | 58 -------------------
 .../nas/enas/v1beta1/requirements.txt         |  6 +-
 cmd/suggestion/optuna/v1beta1/Dockerfile      |  9 +--
 .../optuna/v1beta1/requirements.txt           |  6 +-
 cmd/suggestion/skopt/v1beta1/Dockerfile       |  9 +--
 cmd/suggestion/skopt/v1beta1/requirements.txt | 12 ++--
 docs/developer-guide.md                       |  2 +-
 .../tfevent_loader.py                         |  4 +-
 pkg/suggestion/v1beta1/nas/enas/Controller.py | 26 ++++-----
 scripts/v1beta1/build.sh                      | 10 +---
 27 files changed, 130 insertions(+), 181 deletions(-)
 delete mode 100644 cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile.aarch64
 delete mode 100644 cmd/suggestion/nas/enas/v1beta1/Dockerfile.aarch64
 mode change 100755 => 100644 pkg/suggestion/v1beta1/nas/enas/Controller.py

diff --git a/.github/workflows/test-python.yaml b/.github/workflows/test-python.yaml
index da883afa2d1..4b497734e73 100644
--- a/.github/workflows/test-python.yaml
+++ b/.github/workflows/test-python.yaml
@@ -16,7 +16,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v2
         with:
-          python-version: 3.7
+          python-version: 3.9

       - name: Install Packages
         run: |
diff --git a/cmd/earlystopping/medianstop/v1beta1/Dockerfile b/cmd/earlystopping/medianstop/v1beta1/Dockerfile
index 0054713f12c..f840966cc3b 100644
--- a/cmd/earlystopping/medianstop/v1beta1/Dockerfile
+++ b/cmd/earlystopping/medianstop/v1beta1/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.6
+FROM python:3.9

 ENV TARGET_DIR /opt/katib
 ENV EARLY_STOPPING_DIR cmd/earlystopping/medianstop/v1beta1
diff --git a/cmd/earlystopping/medianstop/v1beta1/requirements.txt b/cmd/earlystopping/medianstop/v1beta1/requirements.txt
index 2f85202c257..19e57467b90 100644
--- a/cmd/earlystopping/medianstop/v1beta1/requirements.txt
+++ b/cmd/earlystopping/medianstop/v1beta1/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.23.0
-protobuf==3.9.1
+grpcio==1.41.1
+protobuf==3.19.1
 googleapis-common-protos==1.6.0
 kubernetes==11.0.0
diff --git a/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile b/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile
index b5d3c807d23..f94e7be7ca8 100644
--- a/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile
+++ b/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile
@@ -1,7 +1,25 @@
-FROM tensorflow/tensorflow:1.11.0
-RUN pip install rfc3339 grpcio googleapis-common-protos
-ADD . /usr/src/app/github.com/kubeflow/katib
-WORKDIR /usr/src/app/github.com/kubeflow/katib/cmd/metricscollector/v1beta1/tfevent-metricscollector/
+FROM python:3.9
+
+ENV TARGET_DIR /opt/katib
+ENV METRICS_COLLECTOR_DIR cmd/metricscollector/v1beta1/tfevent-metricscollector
+# tensorflow community build for aarch64
+# https://github.com/tensorflow/build#tensorflow-builds
+ENV PIP_EXTRA_INDEX_URL https://snapshots.linaro.org/ldcg/python-cache/
+
+ADD ./pkg/ ${TARGET_DIR}/pkg/
+ADD ./${METRICS_COLLECTOR_DIR}/ ${TARGET_DIR}/${METRICS_COLLECTOR_DIR}/
+WORKDIR ${TARGET_DIR}/${METRICS_COLLECTOR_DIR}
+
+RUN if [ "$(uname -m)" = "aarch64" ]; then \
+    pip install tensorflow-aarch64==2.7.0; \
+    else \
+    pip install tensorflow==2.7.0; \
+    fi;
 RUN pip install --no-cache-dir -r requirements.txt
-ENV PYTHONPATH /usr/src/app/github.com/kubeflow/katib:/usr/src/app/github.com/kubeflow/katib/pkg/apis/manager/v1beta1/python:/usr/src/app/github.com/kubeflow/katib/pkg/metricscollector/v1beta1/tfevent-metricscollector/:/usr/src/app/github.com/kubeflow/katib/pkg/metricscollector/v1beta1/common/
+
+RUN chgrp -R 0 ${TARGET_DIR} \
+    && chmod -R g+rwX ${TARGET_DIR}
+
+ENV PYTHONPATH ${TARGET_DIR}:${TARGET_DIR}/pkg/apis/manager/v1beta1/python:${TARGET_DIR}/pkg/metricscollector/v1beta1/tfevent-metricscollector/::${TARGET_DIR}/pkg/metricscollector/v1beta1/common/
+
 ENTRYPOINT ["python", "main.py"]
diff --git a/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile.aarch64 b/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile.aarch64
deleted file mode 100644
index 44746a40e0b..00000000000
--- a/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile.aarch64
+++ /dev/null
@@ -1,28 +0,0 @@
-FROM ubuntu:18.04
-
-RUN apt-get update \
-    && apt-get -y install software-properties-common \
-    autoconf \
-    automake \
-    build-essential \
-    cmake \
-    pkg-config \
-    wget \
-    python-pip \
-    libhdf5-dev \
-    libhdf5-serial-dev \
-    hdf5-tools\
-    && apt-get clean \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN wget https://github.com/lhelontra/tensorflow-on-arm/releases/download/v1.11.0/tensorflow-1.11.0-cp27-none-linux_aarch64.whl \
-    && pip install tensorflow-1.11.0-cp27-none-linux_aarch64.whl \
-    && rm tensorflow-1.11.0-cp27-none-linux_aarch64.whl \
-    && rm -rf .cache
-
-RUN pip install rfc3339 grpcio googleapis-common-protos jupyter
-ADD . /usr/src/app/github.com/kubeflow/katib
-WORKDIR /usr/src/app/github.com/kubeflow/katib/cmd/metricscollector/v1beta1/tfevent-metricscollector/
-RUN pip install --no-cache-dir -r requirements.txt
-ENV PYTHONPATH /usr/src/app/github.com/kubeflow/katib:/usr/src/app/github.com/kubeflow/katib/pkg/apis/manager/v1beta1/python:/usr/src/app/github.com/kubeflow/katib/pkg/metricscollector/v1beta1/tfevent-metricscollector/:/usr/src/app/github.com/kubeflow/katib/pkg/metricscollector/v1beta1/common/
-ENTRYPOINT ["python", "main.py"]
diff --git a/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile.ppc64le b/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile.ppc64le
index b8d2b637607..00a75703f6f 100644
--- a/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile.ppc64le
+++ b/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile.ppc64le
@@ -1,4 +1,4 @@
-FROM ibmcom/tensorflow-ppc64le:1.14.0-py3
+FROM ibmcom/tensorflow-ppc64le:2.2.0-py3
 RUN pip install rfc3339 grpcio googleapis-common-protos
 ADD . /usr/src/app/github.com/kubeflow/katib
 WORKDIR /usr/src/app/github.com/kubeflow/katib/cmd/metricscollector/v1beta1/tfevent-metricscollector/
diff --git a/cmd/metricscollector/v1beta1/tfevent-metricscollector/requirements.txt b/cmd/metricscollector/v1beta1/tfevent-metricscollector/requirements.txt
index d2ec0c34de0..cbc91372a46 100644
--- a/cmd/metricscollector/v1beta1/tfevent-metricscollector/requirements.txt
+++ b/cmd/metricscollector/v1beta1/tfevent-metricscollector/requirements.txt
@@ -1 +1,4 @@
 psutil==5.6.6
+rfc3339
+grpcio
+googleapis-common-protos
diff --git a/cmd/suggestion/chocolate/v1beta1/Dockerfile b/cmd/suggestion/chocolate/v1beta1/Dockerfile
index 52bb736fd24..7d623fcb5ce 100644
--- a/cmd/suggestion/chocolate/v1beta1/Dockerfile
+++ b/cmd/suggestion/chocolate/v1beta1/Dockerfile
@@ -1,15 +1,17 @@
-FROM python:3.6
+FROM python:3.9

 ENV TARGET_DIR /opt/katib
 ENV SUGGESTION_DIR cmd/suggestion/chocolate/v1beta1
+ENV GRPC_HEALTH_PROBE_VERSION v0.3.1

 RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \
     apt-get -y update && \
     apt-get -y install gfortran libopenblas-dev liblapack-dev && \
-    pip install cython 'numpy>=1.13.3'; \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* && \
+    pip install cython; \
     fi
-RUN GRPC_HEALTH_PROBE_VERSION=v0.3.1 && \
-    if [ "$(uname -m)" = "ppc64le" ]; then \
+RUN if [ "$(uname -m)" = "ppc64le" ]; then \
     wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-ppc64le; \
     elif [ "$(uname -m)" = "aarch64" ]; then \
     wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-arm64; \
diff --git a/cmd/suggestion/chocolate/v1beta1/requirements.txt b/cmd/suggestion/chocolate/v1beta1/requirements.txt
index 1b72ac01900..fd842e04670 100644
--- a/cmd/suggestion/chocolate/v1beta1/requirements.txt
+++ b/cmd/suggestion/chocolate/v1beta1/requirements.txt
@@ -1,11 +1,11 @@
-grpcio==1.23.0
+grpcio==1.41.1
 cloudpickle==0.5.6
-numpy>=1.13.3
-scikit-learn>=0.19.0
-scipy>=0.19.1
+numpy>=1.20.0
+scikit-learn>=0.24.0
+scipy>=1.5.4
 forestci==0.3
-protobuf==3.9.1
+protobuf==3.19.1
 googleapis-common-protos==1.6.0
-SQLAlchemy==1.3.8
+SQLAlchemy==1.4.26
 git+https://github.com/AIworx-Labs/chocolate@master
-ghalton>=0.6
+ghalton>=0.6.2
diff --git a/cmd/suggestion/goptuna/v1beta1/Dockerfile b/cmd/suggestion/goptuna/v1beta1/Dockerfile
index aad3a699205..5f3040622ec 100644
--- a/cmd/suggestion/goptuna/v1beta1/Dockerfile
+++ b/cmd/suggestion/goptuna/v1beta1/Dockerfile
@@ -1,6 +1,8 @@
 # Build the Goptuna Suggestion.
 FROM golang:alpine AS build-env

+ENV GRPC_HEALTH_PROBE_VERSION v0.3.1
+
 WORKDIR /go/src/github.com/kubeflow/katib

 # Download packages.
@@ -22,8 +24,7 @@ RUN if [ "$(uname -m)" = "ppc64le" ]; then \
     fi

 # Add GRPC health probe.
-RUN GRPC_HEALTH_PROBE_VERSION=v0.3.1 && \
-    if [ "$(uname -m)" = "ppc64le" ]; then \
+RUN if [ "$(uname -m)" = "ppc64le" ]; then \
     wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-ppc64le; \
     elif [ "$(uname -m)" = "aarch64" ]; then \
     wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-arm64; \
diff --git a/cmd/suggestion/hyperband/v1beta1/Dockerfile b/cmd/suggestion/hyperband/v1beta1/Dockerfile
index 58f92d842fd..068327e23a3 100644
--- a/cmd/suggestion/hyperband/v1beta1/Dockerfile
+++ b/cmd/suggestion/hyperband/v1beta1/Dockerfile
@@ -1,16 +1,18 @@
-FROM python:3.6
+FROM python:3.9

 ENV TARGET_DIR /opt/katib
 ENV SUGGESTION_DIR cmd/suggestion/hyperband/v1beta1
+ENV GRPC_HEALTH_PROBE_VERSION v0.3.1

 RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \
     apt-get -y update && \
     apt-get -y install gfortran libopenblas-dev liblapack-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* && \
     pip install cython; \
     fi
-RUN GRPC_HEALTH_PROBE_VERSION=v0.3.1 && \
-    if [ "$(uname -m)" = "ppc64le" ]; then \
+RUN if [ "$(uname -m)" = "ppc64le" ]; then \
     wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-ppc64le; \
     elif [ "$(uname -m)" = "aarch64" ]; then \
     wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-arm64; \
diff --git a/cmd/suggestion/hyperband/v1beta1/requirements.txt b/cmd/suggestion/hyperband/v1beta1/requirements.txt
index 6677b67f90e..05676a8f593 100644
--- a/cmd/suggestion/hyperband/v1beta1/requirements.txt
+++ b/cmd/suggestion/hyperband/v1beta1/requirements.txt
@@ -1,8 +1,8 @@
-grpcio==1.23.0
+grpcio==1.41.1
 cloudpickle==0.5.6
-numpy>=1.13.3
-scikit-learn>=0.19.0
-scipy>=0.19.1
+numpy>=1.20.0
+scikit-learn>=0.24.0
+scipy>=1.5.4
 forestci==0.3
-protobuf==3.9.1
+protobuf==3.19.1
 googleapis-common-protos==1.6.0
diff --git a/cmd/suggestion/hyperopt/v1beta1/Dockerfile b/cmd/suggestion/hyperopt/v1beta1/Dockerfile
index 2c1d227b160..c40baeba7ea 100644
--- a/cmd/suggestion/hyperopt/v1beta1/Dockerfile
+++ b/cmd/suggestion/hyperopt/v1beta1/Dockerfile
@@ -1,16 +1,18 @@
-FROM python:3.6
+FROM python:3.9

 ENV TARGET_DIR /opt/katib
 ENV SUGGESTION_DIR cmd/suggestion/hyperopt/v1beta1
+ENV GRPC_HEALTH_PROBE_VERSION v0.3.1

 RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \
     apt-get -y update && \
     apt-get -y install gfortran libopenblas-dev liblapack-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* && \
     pip install cython; \
     fi
-RUN GRPC_HEALTH_PROBE_VERSION=v0.3.1 && \
-    if [ "$(uname -m)" = "ppc64le" ]; then \
+RUN if [ "$(uname -m)" = "ppc64le" ]; then \
     wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-ppc64le; \
     elif [ "$(uname -m)" = "aarch64" ]; then \
     wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-arm64; \
diff --git a/cmd/suggestion/hyperopt/v1beta1/requirements.txt b/cmd/suggestion/hyperopt/v1beta1/requirements.txt
index a0a8fb20e4c..5af1f33e9c9 100644
--- a/cmd/suggestion/hyperopt/v1beta1/requirements.txt
+++ b/cmd/suggestion/hyperopt/v1beta1/requirements.txt
@@ -1,9 +1,9 @@
-grpcio==1.23.0
+grpcio==1.41.1
 cloudpickle==0.5.6
-numpy>=1.13.3
-scikit-learn>=0.19.0
-scipy>=0.19.1
+numpy>=1.20.0
+scikit-learn>=0.24.0
+scipy>=1.5.4
 forestci==0.3
-protobuf==3.9.1
+protobuf==3.19.1
 googleapis-common-protos==1.6.0
-hyperopt==0.2.3
+hyperopt==0.2.5
diff --git a/cmd/suggestion/nas/darts/v1beta1/Dockerfile b/cmd/suggestion/nas/darts/v1beta1/Dockerfile
index d95a12dad0b..71863f5dc34 100644
--- a/cmd/suggestion/nas/darts/v1beta1/Dockerfile
+++ b/cmd/suggestion/nas/darts/v1beta1/Dockerfile
@@ -1,16 +1,18 @@
-FROM python:3.6
+FROM python:3.9

 ENV TARGET_DIR /opt/katib
 ENV SUGGESTION_DIR cmd/suggestion/nas/darts/v1beta1
+ENV GRPC_HEALTH_PROBE_VERSION v0.3.1

 RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \
     apt-get -y update && \
     apt-get -y install gfortran libopenblas-dev liblapack-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* && \
     pip install cython; \
     fi
-RUN GRPC_HEALTH_PROBE_VERSION=v0.3.1 && \
-    if [ "$(uname -m)" = "ppc64le" ]; then \
+RUN if [ "$(uname -m)" = "ppc64le" ]; then \
     wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-ppc64le; \
     elif [ "$(uname -m)" = "aarch64" ]; then \
     wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-arm64; \
@@ -30,4 +32,3 @@ RUN chgrp -R 0 ${TARGET_DIR} \
 ENV PYTHONPATH ${TARGET_DIR}:${TARGET_DIR}/pkg/apis/manager/v1beta1/python:${TARGET_DIR}/pkg/apis/manager/health/python

 ENTRYPOINT ["python", "main.py"]
-
diff --git a/cmd/suggestion/nas/darts/v1beta1/requirements.txt b/cmd/suggestion/nas/darts/v1beta1/requirements.txt
index 92bd5706e11..5206862e66d 100644
--- a/cmd/suggestion/nas/darts/v1beta1/requirements.txt
+++ b/cmd/suggestion/nas/darts/v1beta1/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.23.0
-protobuf==3.9.1
+grpcio==1.41.1
+protobuf==3.19.1
 googleapis-common-protos==1.6.0
diff --git a/cmd/suggestion/nas/enas/v1beta1/Dockerfile b/cmd/suggestion/nas/enas/v1beta1/Dockerfile
index c5a77c87091..1ac0f873166 100644
--- a/cmd/suggestion/nas/enas/v1beta1/Dockerfile
+++ b/cmd/suggestion/nas/enas/v1beta1/Dockerfile
@@ -1,16 +1,24 @@
-FROM python:3.6
+FROM python:3.9

 ENV TARGET_DIR /opt/katib
 ENV SUGGESTION_DIR cmd/suggestion/nas/enas/v1beta1
+ENV GRPC_HEALTH_PROBE_VERSION v0.3.1
+# tensorflow community build for aarch64
+# https://github.com/tensorflow/build#tensorflow-builds
+ENV PIP_EXTRA_INDEX_URL https://snapshots.linaro.org/ldcg/python-cache/

-RUN if [ "$(uname -m)" = "ppc64le" ]; then \
+RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \
     apt-get -y update && \
     apt-get -y install gfortran libopenblas-dev liblapack-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* && \
     pip install cython; \
     fi
-RUN GRPC_HEALTH_PROBE_VERSION=v0.3.1 && \
-    if [ "$(uname -m)" = "ppc64le" ]; then \
+
+RUN if [ "$(uname -m)" = "ppc64le" ]; then \
     wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-ppc64le; \
+    elif [ "$(uname -m)" = "aarch64" ]; then \
+    wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-arm64; \
     else \
     wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-amd64; \
     fi && \
@@ -19,6 +27,10 @@ RUN GRPC_HEALTH_PROBE_VERSION=v0.3.1 && \
 ADD ./pkg/ ${TARGET_DIR}/pkg/
 ADD ./${SUGGESTION_DIR}/ ${TARGET_DIR}/${SUGGESTION_DIR}/
 WORKDIR ${TARGET_DIR}/${SUGGESTION_DIR}
+
+RUN if [ "$(uname -m)" = "aarch64" ]; then \
+    sed -i 's/tensorflow==/tensorflow-aarch64==/' requirements.txt; \
+    fi;
 RUN pip install --no-cache-dir -r requirements.txt

 RUN chgrp -R 0 ${TARGET_DIR} \
diff --git a/cmd/suggestion/nas/enas/v1beta1/Dockerfile.aarch64 b/cmd/suggestion/nas/enas/v1beta1/Dockerfile.aarch64
deleted file mode 100644
index 045bc1a1c8e..00000000000
--- a/cmd/suggestion/nas/enas/v1beta1/Dockerfile.aarch64
+++ /dev/null
@@ -1,58 +0,0 @@
-FROM golang:alpine AS build-env
-# The GOPATH in the image is /go.
-ADD . /go/src/github.com/kubeflow/katib
-RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \
-    apk --update add git gcc musl-dev && \
-    go get github.com/grpc-ecosystem/grpc-health-probe && \
-    mv $GOPATH/bin/grpc-health-probe /bin/grpc_health_probe && \
-    chmod +x /bin/grpc_health_probe; \
-    else \
-    GRPC_HEALTH_PROBE_VERSION=v0.3.1 && \
-    wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-amd64 && \
-    chmod +x /bin/grpc_health_probe; \
-    fi
-
-FROM python:3.7-slim-buster
-
-ENV TARGET_DIR /opt/katib
-ENV SUGGESTION_DIR cmd/suggestion/nas/enas/v1beta1
-
-RUN apt-get update \
-    && apt-get -y install software-properties-common \
-    autoconf \
-    automake \
-    build-essential \
-    cmake \
-    libtool \
-    pkg-config \
-    wget \
-    gfortran \
-    libopenblas-dev \
-    liblapack-dev \
-    libhdf5-dev \
-    libhdf5-serial-dev \
-    hdf5-tools \
-    && apt-get clean \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN pip install cython numpy
-
-RUN wget https://github.com/lhelontra/tensorflow-on-arm/releases/download/v1.14.0-buster/tensorflow-1.14.0-cp37-none-linux_aarch64.whl \
-    && pip install tensorflow-1.14.0-cp37-none-linux_aarch64.whl \
-    && rm tensorflow-1.14.0-cp37-none-linux_aarch64.whl \
-    && rm -rf .cache
-
-RUN pip install 'grpcio==1.23.0' 'protobuf==3.9.1' 'googleapis-common-protos==1.6.0'
-
-COPY --from=build-env /bin/grpc_health_probe /bin/
-
-ADD ./pkg/ ${TARGET_DIR}/pkg/
-ADD ./${SUGGESTION_DIR}/ ${TARGET_DIR}/${SUGGESTION_DIR}/
-WORKDIR ${TARGET_DIR}/${SUGGESTION_DIR}
-
-RUN chgrp -R 0 ${TARGET_DIR} \
-    && chmod -R g+rwX ${TARGET_DIR}
-
-ENV PYTHONPATH ${TARGET_DIR}:${TARGET_DIR}/pkg/apis/manager/v1beta1/python:${TARGET_DIR}/pkg/apis/manager/health/python
-
-ENTRYPOINT ["python", "main.py"]
diff --git a/cmd/suggestion/nas/enas/v1beta1/requirements.txt b/cmd/suggestion/nas/enas/v1beta1/requirements.txt
index bde25645c41..6fe48818dbb 100644
--- a/cmd/suggestion/nas/enas/v1beta1/requirements.txt
+++ b/cmd/suggestion/nas/enas/v1beta1/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.23.0
-protobuf==3.9.1
+grpcio==1.41.1
+protobuf==3.19.1
 googleapis-common-protos==1.6.0
-tensorflow==1.15.4
+tensorflow==2.7.0
diff --git a/cmd/suggestion/optuna/v1beta1/Dockerfile b/cmd/suggestion/optuna/v1beta1/Dockerfile
index bd7e43ecaf2..274bc94a6d6 100644
--- a/cmd/suggestion/optuna/v1beta1/Dockerfile
+++ b/cmd/suggestion/optuna/v1beta1/Dockerfile
@@ -2,14 +2,15 @@ FROM python:3.9

 ENV TARGET_DIR /opt/katib
 ENV SUGGESTION_DIR cmd/suggestion/optuna/v1beta1
+ENV GRPC_HEALTH_PROBE_VERSION v0.3.1

 RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \
     apt-get -y update && \
-    apt-get -y install gfortran libopenblas-dev liblapack-dev; \
+    apt-get -y install gfortran libopenblas-dev liblapack-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*; \
     fi
-
-RUN GRPC_HEALTH_PROBE_VERSION=v0.3.1 && \
-    if [ "$(uname -m)" = "ppc64le" ]; then \
+RUN if [ "$(uname -m)" = "ppc64le" ]; then \
     wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-ppc64le; \
     elif [ "$(uname -m)" = "aarch64" ]; then \
     wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-arm64; \
diff --git a/cmd/suggestion/optuna/v1beta1/requirements.txt b/cmd/suggestion/optuna/v1beta1/requirements.txt
index 09b0692fa06..5e35741485d 100644
--- a/cmd/suggestion/optuna/v1beta1/requirements.txt
+++ b/cmd/suggestion/optuna/v1beta1/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.39.0
-protobuf==3.17.3
+grpcio==1.41.1
+protobuf==3.19.1
 googleapis-common-protos==1.53.0
-optuna>=2.8.0
\ No newline at end of file
+optuna>=2.8.0
diff --git a/cmd/suggestion/skopt/v1beta1/Dockerfile b/cmd/suggestion/skopt/v1beta1/Dockerfile
index 2962715e53b..e04be778308 100644
--- a/cmd/suggestion/skopt/v1beta1/Dockerfile
+++ b/cmd/suggestion/skopt/v1beta1/Dockerfile
@@ -1,16 +1,17 @@
-FROM python:3.6
+FROM python:3.9

 ENV TARGET_DIR /opt/katib
 ENV SUGGESTION_DIR cmd/suggestion/skopt/v1beta1
+ENV GRPC_HEALTH_PROBE_VERSION v0.3.1

 RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \
     apt-get -y update && \
     apt-get -y install gfortran libopenblas-dev liblapack-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* && \
     pip install cython; \
     fi
-
-RUN GRPC_HEALTH_PROBE_VERSION=v0.3.1 && \
-    if [ "$(uname -m)" = "ppc64le" ]; then \
+RUN if [ "$(uname -m)" = "ppc64le" ]; then \
     wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-ppc64le; \
     elif [ "$(uname -m)" = "aarch64" ]; then \
     wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-arm64; \
diff --git a/cmd/suggestion/skopt/v1beta1/requirements.txt b/cmd/suggestion/skopt/v1beta1/requirements.txt
index 3734706b97d..8208877d973 100644
--- a/cmd/suggestion/skopt/v1beta1/requirements.txt
+++ b/cmd/suggestion/skopt/v1beta1/requirements.txt
@@ -1,9 +1,9 @@
-grpcio==1.23.0
+grpcio==1.41.1
 cloudpickle==0.5.6
-numpy>=1.13.3
-scikit-learn==0.22.0
-scipy>=0.19.1
+numpy>=1.20.0
+scikit-learn>=0.24.0
+scipy>=1.5.4
 forestci==0.3
-protobuf==3.9.1
+protobuf==3.19.1
 googleapis-common-protos==1.6.0
-scikit-optimize==0.5.2
+scikit-optimize>=0.9.0
diff --git a/docs/developer-guide.md b/docs/developer-guide.md
index 052fee5e79f..0826735b406 100644
--- a/docs/developer-guide.md
+++ b/docs/developer-guide.md
@@ -15,7 +15,7 @@ see the following user guides:
 - [Go](https://golang.org/) (1.17 or later)
 - [Docker](https://docs.docker.com/) (17.05 or later)
 - [Java](https://docs.oracle.com/javase/8/docs/technotes/guides/install/install_overview.html) (8 or later)
-- [Python](https://www.python.org/) (3.7 or later)
+- [Python](https://www.python.org/) (3.9 or later)
 - [kustomize](https://kustomize.io/) (4.0.5 or later)

 ## Build from source code
diff --git a/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py b/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py
index 5018fc82237..25478e56846 100644
--- a/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py
+++ b/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py
@@ -21,7 +21,6 @@
 # Check TFJob example for more information:
 # https://github.com/kubeflow/katib/blob/master/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml#L16-L22

-
 import tensorflow as tf
 import os
 from datetime import datetime
@@ -30,7 +29,6 @@
 from logging import getLogger, StreamHandler, INFO
 import const

-
 class TFEventFileParser:
     def find_all_files(self, directory):
         for root, dirs, files in os.walk(directory):
@@ -40,7 +38,7 @@ def find_all_files(self, directory):

     def parse_summary(self, tfefile, metrics):
         metric_logs = []
-        for summary in tf.train.summary_iterator(tfefile):
+        for summary in tf.compat.v1.train.summary_iterator(tfefile):
             paths = tfefile.split("/")
             for v in summary.summary.value:
                 for m in metrics:
diff --git a/pkg/suggestion/v1beta1/nas/enas/Controller.py b/pkg/suggestion/v1beta1/nas/enas/Controller.py
old mode 100755
new mode 100644
index c3f231d8045..11d31b038de
--- a/pkg/suggestion/v1beta1/nas/enas/Controller.py
+++ b/pkg/suggestion/v1beta1/nas/enas/Controller.py
@@ -54,7 +54,7 @@ def __init__(self,
     def _build_params(self):
         """Create TF parameters"""
         self.logger.info(">>> Building Controller Parameters\n")
-        initializer = tf.random_uniform_initializer(minval=-0.01, maxval=0.01)
+        initializer = tf.compat.v1.random_uniform_initializer(minval=-0.01, maxval=0.01)
         hidden_size = self.controller_hidden_size

         with tf.compat.v1.variable_scope(self.controller_name, initializer=initializer):
@@ -127,7 +127,7 @@ def _build_sampler(self):
                 entropy = log_prob * tf.exp(-log_prob)
                 entropy = tf.stop_gradient(entropy)
                 sample_entropies.append(entropy)
-                inputs = tf.nn.embedding_lookup(self.w_emb, func)
+                inputs = tf.nn.embedding_lookup(params=self.w_emb, ids=func)

                 next_c, next_h = _lstm(inputs, prev_c, prev_h, self.w_lstm)
                 prev_c, prev_h = next_c, next_h
@@ -154,26 +154,26 @@ def _build_sampler(self):

                     skip_prob = tf.sigmoid(logits)
                     kl = skip_prob * tf.math.log(skip_prob/skip_targets)
-                    kl = tf.reduce_sum(kl)
+                    kl = tf.reduce_sum(input_tensor=kl)
                     skip_penalties.append(kl)

                     log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
                         logits=logits, labels=skip_index)
-                    sample_log_probs.append(tf.reduce_sum(log_prob, keepdims=True))
+                    sample_log_probs.append(tf.reduce_sum(input_tensor=log_prob, keepdims=True))

                     entropy = tf.stop_gradient(
-                        tf.reduce_sum(log_prob * tf.exp(-log_prob), keepdims=True))
+                        tf.reduce_sum(input_tensor=log_prob * tf.exp(-log_prob), keepdims=True))
                     sample_entropies.append(entropy)

                     skip_index = tf.dtypes.cast(skip_index, tf.float32)
                     skip_index = tf.reshape(skip_index, [1, layer_id])
-                    skip_count.append(tf.reduce_sum(skip_index))
+                    skip_count.append(tf.reduce_sum(input_tensor=skip_index))

                     inputs = tf.matmul(skip_index, tf.concat(all_h, axis=0))
-                    inputs /= (1.0 + tf.reduce_sum(skip_index))
+                    inputs /= (1.0 + tf.reduce_sum(input_tensor=skip_index))
                 else:
                     inputs = self.g_emb
@@ -184,16 +184,16 @@ def _build_sampler(self):
         self.sample_arc = tf.reshape(arc_seq, [-1])

         sample_entropies = tf.stack(sample_entropies)
-        self.sample_entropy = tf.reduce_sum(sample_entropies)
+        self.sample_entropy = tf.reduce_sum(input_tensor=sample_entropies)

         sample_log_probs = tf.stack(sample_log_probs, axis=0)
-        self.sample_log_probs = tf.reduce_sum(sample_log_probs)
+        self.sample_log_probs = tf.reduce_sum(input_tensor=sample_log_probs)

         skip_penalties = tf.stack(skip_penalties)
-        self.skip_penalties = tf.reduce_mean(skip_penalties)
+        self.skip_penalties = tf.reduce_mean(input_tensor=skip_penalties)

         skip_count = tf.stack(skip_count)
-        self.skip_count = tf.reduce_sum(skip_count)
+        self.skip_count = tf.reduce_sum(input_tensor=skip_count)

     def build_trainer(self):
         """Build the train ops by connecting Controller with candidate."""
@@ -207,7 +207,7 @@ def build_trainer(self):
         if self.controller_entropy_weight is not None:
             self.reward += self.controller_entropy_weight * self.sample_entropy

-        self.sample_log_probs = tf.reduce_sum(self.sample_log_probs)
+        self.sample_log_probs = tf.reduce_sum(input_tensor=self.sample_log_probs)
         self.baseline = tf.Variable(0.0, dtype=tf.float32, trainable=False)
         baseline_update = tf.compat.v1.assign_sub(
             self.baseline, (1 - self.controller_baseline_decay) * (self.baseline - self.reward))
@@ -249,7 +249,7 @@ def _lstm(x, prev_c, prev_h, w_lstm):
 def _build_train_op(loss, tf_variables, train_step, learning_rate):
     """Build training ops from `loss` tensor."""
     optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate)
-    grads = tf.gradients(loss, tf_variables)
+    grads = tf.gradients(ys=loss, xs=tf_variables)
     grad_norm = tf.linalg.global_norm(grads)

     train_op = optimizer.apply_gradients(zip(grads, tf_variables), global_step=train_step)
diff --git a/scripts/v1beta1/build.sh b/scripts/v1beta1/build.sh
index abc020f1286..3773315b540 100755
--- a/scripts/v1beta1/build.sh
+++ b/scripts/v1beta1/build.sh
@@ -57,9 +57,7 @@ echo -e "\nBuilding file metrics collector image...\n"
 docker build -t ${REGISTRY}/file-metrics-collector:${TAG} -f ${CMD_PREFIX}/metricscollector/${VERSION}/file-metricscollector/Dockerfile .

 echo -e "\nBuilding TF Event metrics collector image...\n"
-if [ $MACHINE_ARCH == "aarch64" ]; then
-  docker build -t ${REGISTRY}/tfevent-metrics-collector:${TAG} -f ${CMD_PREFIX}/metricscollector/${VERSION}/tfevent-metricscollector/Dockerfile.aarch64 .
-elif [ $MACHINE_ARCH == "ppc64le" ]; then
+if [ $MACHINE_ARCH == "ppc64le" ]; then
   docker build -t ${REGISTRY}/tfevent-metrics-collector:${TAG} -f ${CMD_PREFIX}/metricscollector/${VERSION}/tfevent-metricscollector/Dockerfile.ppc64le .
 else
   docker build -t ${REGISTRY}/tfevent-metrics-collector:${TAG} -f ${CMD_PREFIX}/metricscollector/${VERSION}/tfevent-metricscollector/Dockerfile .
@@ -87,11 +85,7 @@ echo -e "\nBuilding optuna suggestion...\n"
 docker build -t ${REGISTRY}/suggestion-optuna:${TAG} -f ${CMD_PREFIX}/suggestion/optuna/${VERSION}/Dockerfile .

 echo -e "\nBuilding ENAS suggestion...\n"
-if [ $MACHINE_ARCH == "aarch64" ]; then
-  docker build -t ${REGISTRY}/suggestion-enas:${TAG} -f ${CMD_PREFIX}/suggestion/nas/enas/${VERSION}/Dockerfile.aarch64 .
-else
-  docker build -t ${REGISTRY}/suggestion-enas:${TAG} -f ${CMD_PREFIX}/suggestion/nas/enas/${VERSION}/Dockerfile .
-fi
+docker build -t ${REGISTRY}/suggestion-enas:${TAG} -f ${CMD_PREFIX}/suggestion/nas/enas/${VERSION}/Dockerfile .

 echo -e "\nBuilding DARTS suggestion...\n"
 docker build -t ${REGISTRY}/suggestion-darts:${TAG} -f ${CMD_PREFIX}/suggestion/nas/darts/${VERSION}/Dockerfile .
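
Note on the aarch64 handling in the patch above: instead of keeping separate Dockerfile.aarch64 variants, the rewritten tfevent-metricscollector image resolves the TensorFlow wheel at image-build time. A minimal shell sketch of that pattern, with the version pin and extra index URL taken from the Dockerfile above (an illustration, not a drop-in file):

    # Linaro community wheels back-fill aarch64 builds that PyPI lacks.
    export PIP_EXTRA_INDEX_URL=https://snapshots.linaro.org/ldcg/python-cache/
    # uname -m reports the CPU of the machine (or emulated platform) running
    # the build, so one Dockerfile can serve both amd64 and aarch64.
    if [ "$(uname -m)" = "aarch64" ]; then
        pip install tensorflow-aarch64==2.7.0
    else
        pip install tensorflow==2.7.0
    fi

The ENAS suggestion image reaches the same end a different way, rewriting requirements.txt with sed before `pip install -r`; either way, the architecture decision moves out of the build script and into the image build itself.
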
From 0f5c9f956b3ad2fca6d009d4c5c605908ec00a1f Mon Sep 17 00:00:00 2001 From: tenzen-y Date: Fri, 12 Nov 2021 18:25:31 +0900 Subject: [PATCH 02/17] modify script to build container image --- Makefile | 7 +- .../medianstop/v1beta1/Dockerfile | 7 +- .../medianstop/v1beta1/requirements.txt | 1 + .../tfevent-metricscollector/requirements.txt | 6 +- cmd/new-ui/v1beta1/Dockerfile.ppc64le | 63 +++++ cmd/suggestion/chocolate/v1beta1/Dockerfile | 6 +- .../chocolate/v1beta1/requirements.txt | 1 + cmd/suggestion/hyperband/v1beta1/Dockerfile | 3 +- .../hyperband/v1beta1/requirements.txt | 1 + cmd/suggestion/hyperopt/v1beta1/Dockerfile | 3 +- .../hyperopt/v1beta1/requirements.txt | 1 + cmd/suggestion/nas/darts/v1beta1/Dockerfile | 3 +- .../nas/darts/v1beta1/requirements.txt | 1 + cmd/suggestion/nas/enas/v1beta1/Dockerfile | 3 +- .../nas/enas/v1beta1/requirements.txt | 1 + cmd/suggestion/skopt/v1beta1/Dockerfile | 3 +- cmd/suggestion/skopt/v1beta1/requirements.txt | 1 + docs/developer-guide.md | 2 +- .../tfevent-metrics-collector.yaml | 49 ++++ .../enas-cnn-cifar10/Dockerfile.cpu | 4 +- .../enas-cnn-cifar10/Dockerfile.gpu | 4 +- .../trial-images/enas-cnn-cifar10/RunTrial.py | 4 +- .../enas-cnn-cifar10/requirements.txt | 1 - .../tf-mnist-with-summaries/Dockerfile | 14 ++ .../tf-mnist-with-summaries/README.md | 11 + .../tf-mnist-with-summaries/mnist.py | 217 ++++++++++++++++++ scripts/v1beta1/build.sh | 93 +++++--- test/e2e/v1beta1/argo_workflow.py | 42 ++-- 28 files changed, 466 insertions(+), 86 deletions(-) create mode 100644 cmd/new-ui/v1beta1/Dockerfile.ppc64le create mode 100644 examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml delete mode 100644 examples/v1beta1/trial-images/enas-cnn-cifar10/requirements.txt create mode 100644 examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile create mode 100644 examples/v1beta1/trial-images/tf-mnist-with-summaries/README.md create mode 100644 examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py diff --git a/Makefile b/Makefile index 3cf60071b70..1cc4dd85ef6 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,7 @@ HAS_LINT := $(shell command -v golangci-lint;) COMMIT := v1beta1-$(shell git rev-parse --short=7 HEAD) KATIB_REGISTRY := docker.io/kubeflowkatib +CPU_ARCH ?= amd64 # Run tests .PHONY: test @@ -49,10 +50,10 @@ endif # Build images for the Katib v1beta1 components. build: generate -ifeq ($(and $(REGISTRY),$(TAG)),) - $(error REGISTRY and TAG must be set. Usage: make build REGISTRY= TAG=) +ifeq ($(and $(REGISTRY),$(TAG),$(CPU_ARCH)),) + $(error REGISTRY and TAG must be set. Usage: make build REGISTRY= TAG= CPU_ARCH=) endif - bash scripts/v1beta1/build.sh $(REGISTRY) $(TAG) + bash scripts/v1beta1/build.sh $(REGISTRY) $(TAG) $(CPU_ARCH) # Build and push Katib images from the latest master commit. 
push-latest: generate diff --git a/cmd/earlystopping/medianstop/v1beta1/Dockerfile b/cmd/earlystopping/medianstop/v1beta1/Dockerfile index f840966cc3b..6e661aaecf2 100644 --- a/cmd/earlystopping/medianstop/v1beta1/Dockerfile +++ b/cmd/earlystopping/medianstop/v1beta1/Dockerfile @@ -4,9 +4,10 @@ ENV TARGET_DIR /opt/katib ENV EARLY_STOPPING_DIR cmd/earlystopping/medianstop/v1beta1 RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \ - apt-get -y update && \ - apt-get -y install gfortran libopenblas-dev liblapack-dev && \ - pip install cython; \ + apt-get -y update && \ + apt-get -y install gfortran libopenblas-dev liblapack-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/*; \ fi ADD ./pkg/ ${TARGET_DIR}/pkg/ diff --git a/cmd/earlystopping/medianstop/v1beta1/requirements.txt b/cmd/earlystopping/medianstop/v1beta1/requirements.txt index 19e57467b90..78475d4f80d 100644 --- a/cmd/earlystopping/medianstop/v1beta1/requirements.txt +++ b/cmd/earlystopping/medianstop/v1beta1/requirements.txt @@ -2,3 +2,4 @@ grpcio==1.41.1 protobuf==3.19.1 googleapis-common-protos==1.6.0 kubernetes==11.0.0 +cython>=0.29.24 diff --git a/cmd/metricscollector/v1beta1/tfevent-metricscollector/requirements.txt b/cmd/metricscollector/v1beta1/tfevent-metricscollector/requirements.txt index cbc91372a46..09f4e56eafb 100644 --- a/cmd/metricscollector/v1beta1/tfevent-metricscollector/requirements.txt +++ b/cmd/metricscollector/v1beta1/tfevent-metricscollector/requirements.txt @@ -1,4 +1,4 @@ psutil==5.6.6 -rfc3339 -grpcio -googleapis-common-protos +rfc3339>=6.2 +grpcio==1.41.1 +googleapis-common-protos==1.6.0 diff --git a/cmd/new-ui/v1beta1/Dockerfile.ppc64le b/cmd/new-ui/v1beta1/Dockerfile.ppc64le new file mode 100644 index 00000000000..1902becd801 --- /dev/null +++ b/cmd/new-ui/v1beta1/Dockerfile.ppc64le @@ -0,0 +1,63 @@ +# --- Clone the kubeflow/kubeflow code --- +FROM ubuntu AS fetch-kubeflow-kubeflow + +RUN apt-get update && apt-get install git -y + +WORKDIR /kf +RUN git clone https://github.com/kubeflow/kubeflow.git && \ + cd kubeflow && \ + git checkout 24bcb8e + +# --- Build the frontend kubeflow library --- +FROM ppc64le/node:12 AS frontend-kubeflow-lib + +WORKDIR /src + +ARG LIB=/kf/kubeflow/components/crud-web-apps/common/frontend/kubeflow-common-lib +COPY --from=fetch-kubeflow-kubeflow $LIB/package*.json ./ +RUN npm ci + +COPY --from=fetch-kubeflow-kubeflow $LIB/ ./ +RUN npm run build + +# --- Build the frontend --- +FROM ppc64le/node:12 AS frontend + +WORKDIR /src +COPY ./pkg/new-ui/v1beta1/frontend/package*.json ./ +RUN npm ci + +COPY ./pkg/new-ui/v1beta1/frontend/ . +COPY --from=frontend-kubeflow-lib /src/dist/kubeflow/ ./node_modules/kubeflow/ + +RUN npm run build:prod + +# --- Build the backend --- +FROM golang:alpine AS go-build + +WORKDIR /go/src/github.com/kubeflow/katib + +# Download packages. +COPY go.mod . +COPY go.sum . +RUN go mod download -x + +# Copy sources. +COPY cmd/ cmd/ +COPY pkg/ pkg/ + +# Build the binary. 
+RUN if [ "$(uname -m)" = "ppc64le" ]; then \ + CGO_ENABLED=0 GOOS=linux GOARCH=ppc64le go build -a -o katib-ui ./cmd/new-ui/v1beta1; \ + elif [ "$(uname -m)" = "aarch64" ]; then \ + CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build -a -o katib-ui ./cmd/new-ui/v1beta1; \ + else \ + CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -a -o katib-ui ./cmd/new-ui/v1beta1; \ + fi + +# --- Compose the web app --- +FROM alpine:3.7 +WORKDIR /app +COPY --from=go-build /go/src/github.com/kubeflow/katib/katib-ui /app/ +COPY --from=frontend /src/dist/static /app/build/static/ +ENTRYPOINT ["./katib-ui"] diff --git a/cmd/suggestion/chocolate/v1beta1/Dockerfile b/cmd/suggestion/chocolate/v1beta1/Dockerfile index 7d623fcb5ce..407f8a6852b 100644 --- a/cmd/suggestion/chocolate/v1beta1/Dockerfile +++ b/cmd/suggestion/chocolate/v1beta1/Dockerfile @@ -8,8 +8,7 @@ RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \ apt-get -y update && \ apt-get -y install gfortran libopenblas-dev liblapack-dev && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ - pip install cython; \ + rm -rf /var/lib/apt/lists/*; \ fi RUN if [ "$(uname -m)" = "ppc64le" ]; then \ wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-ppc64le; \ @@ -23,6 +22,9 @@ RUN if [ "$(uname -m)" = "ppc64le" ]; then \ ADD ./pkg/ ${TARGET_DIR}/pkg/ ADD ./${SUGGESTION_DIR}/ ${TARGET_DIR}/${SUGGESTION_DIR}/ WORKDIR ${TARGET_DIR}/${SUGGESTION_DIR} +RUN if [ "$(uname -m)" = "aarch64" ]; then \ + sed -i -e '$a git+https://github.com/fmder/ghalton@master' -e '/^ghalton/d' requirements.txt; \ + fi; RUN pip install --no-cache-dir -r requirements.txt RUN chgrp -R 0 ${TARGET_DIR} \ diff --git a/cmd/suggestion/chocolate/v1beta1/requirements.txt b/cmd/suggestion/chocolate/v1beta1/requirements.txt index fd842e04670..8466aa7d7c4 100644 --- a/cmd/suggestion/chocolate/v1beta1/requirements.txt +++ b/cmd/suggestion/chocolate/v1beta1/requirements.txt @@ -9,3 +9,4 @@ googleapis-common-protos==1.6.0 SQLAlchemy==1.4.26 git+https://github.com/AIworx-Labs/chocolate@master ghalton>=0.6.2 +cython>=0.29.24 diff --git a/cmd/suggestion/hyperband/v1beta1/Dockerfile b/cmd/suggestion/hyperband/v1beta1/Dockerfile index 068327e23a3..c1c1991044f 100644 --- a/cmd/suggestion/hyperband/v1beta1/Dockerfile +++ b/cmd/suggestion/hyperband/v1beta1/Dockerfile @@ -8,8 +8,7 @@ RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \ apt-get -y update && \ apt-get -y install gfortran libopenblas-dev liblapack-dev && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ - pip install cython; \ + rm -rf /var/lib/apt/lists/*; \ fi RUN if [ "$(uname -m)" = "ppc64le" ]; then \ diff --git a/cmd/suggestion/hyperband/v1beta1/requirements.txt b/cmd/suggestion/hyperband/v1beta1/requirements.txt index 05676a8f593..0b0b7450685 100644 --- a/cmd/suggestion/hyperband/v1beta1/requirements.txt +++ b/cmd/suggestion/hyperband/v1beta1/requirements.txt @@ -6,3 +6,4 @@ scipy>=1.5.4 forestci==0.3 protobuf==3.19.1 googleapis-common-protos==1.6.0 +cython>=0.29.24 diff --git a/cmd/suggestion/hyperopt/v1beta1/Dockerfile b/cmd/suggestion/hyperopt/v1beta1/Dockerfile index c40baeba7ea..e23d58d7538 100644 --- a/cmd/suggestion/hyperopt/v1beta1/Dockerfile +++ b/cmd/suggestion/hyperopt/v1beta1/Dockerfile @@ -8,8 +8,7 @@ RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \ apt-get -y update && \ apt-get -y install gfortran libopenblas-dev liblapack-dev && \ apt-get 
clean && \ - rm -rf /var/lib/apt/lists/* && \ - pip install cython; \ + rm -rf /var/lib/apt/lists/*; \ fi RUN if [ "$(uname -m)" = "ppc64le" ]; then \ diff --git a/cmd/suggestion/hyperopt/v1beta1/requirements.txt b/cmd/suggestion/hyperopt/v1beta1/requirements.txt index 5af1f33e9c9..f34047827b5 100644 --- a/cmd/suggestion/hyperopt/v1beta1/requirements.txt +++ b/cmd/suggestion/hyperopt/v1beta1/requirements.txt @@ -7,3 +7,4 @@ forestci==0.3 protobuf==3.19.1 googleapis-common-protos==1.6.0 hyperopt==0.2.5 +cython>=0.29.24 diff --git a/cmd/suggestion/nas/darts/v1beta1/Dockerfile b/cmd/suggestion/nas/darts/v1beta1/Dockerfile index 71863f5dc34..318fc4af86e 100644 --- a/cmd/suggestion/nas/darts/v1beta1/Dockerfile +++ b/cmd/suggestion/nas/darts/v1beta1/Dockerfile @@ -8,8 +8,7 @@ RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \ apt-get -y update && \ apt-get -y install gfortran libopenblas-dev liblapack-dev && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ - pip install cython; \ + rm -rf /var/lib/apt/lists/*; \ fi RUN if [ "$(uname -m)" = "ppc64le" ]; then \ diff --git a/cmd/suggestion/nas/darts/v1beta1/requirements.txt b/cmd/suggestion/nas/darts/v1beta1/requirements.txt index 5206862e66d..f5b413a47d4 100644 --- a/cmd/suggestion/nas/darts/v1beta1/requirements.txt +++ b/cmd/suggestion/nas/darts/v1beta1/requirements.txt @@ -1,3 +1,4 @@ grpcio==1.41.1 protobuf==3.19.1 googleapis-common-protos==1.6.0 +cython>=0.29.24 diff --git a/cmd/suggestion/nas/enas/v1beta1/Dockerfile b/cmd/suggestion/nas/enas/v1beta1/Dockerfile index 1ac0f873166..2584138f766 100644 --- a/cmd/suggestion/nas/enas/v1beta1/Dockerfile +++ b/cmd/suggestion/nas/enas/v1beta1/Dockerfile @@ -11,8 +11,7 @@ RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \ apt-get -y update && \ apt-get -y install gfortran libopenblas-dev liblapack-dev && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ - pip install cython; \ + rm -rf /var/lib/apt/lists/*; \ fi RUN if [ "$(uname -m)" = "ppc64le" ]; then \ diff --git a/cmd/suggestion/nas/enas/v1beta1/requirements.txt b/cmd/suggestion/nas/enas/v1beta1/requirements.txt index 6fe48818dbb..87d839b0ece 100644 --- a/cmd/suggestion/nas/enas/v1beta1/requirements.txt +++ b/cmd/suggestion/nas/enas/v1beta1/requirements.txt @@ -2,3 +2,4 @@ grpcio==1.41.1 protobuf==3.19.1 googleapis-common-protos==1.6.0 tensorflow==2.7.0 +cython>=0.29.24 diff --git a/cmd/suggestion/skopt/v1beta1/Dockerfile b/cmd/suggestion/skopt/v1beta1/Dockerfile index e04be778308..b71347ca3a7 100644 --- a/cmd/suggestion/skopt/v1beta1/Dockerfile +++ b/cmd/suggestion/skopt/v1beta1/Dockerfile @@ -8,8 +8,7 @@ RUN if [ "$(uname -m)" = "ppc64le" ] || [ "$(uname -m)" = "aarch64" ]; then \ apt-get -y update && \ apt-get -y install gfortran libopenblas-dev liblapack-dev && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ - pip install cython; \ + rm -rf /var/lib/apt/lists/*; \ fi RUN if [ "$(uname -m)" = "ppc64le" ]; then \ wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-ppc64le; \ diff --git a/cmd/suggestion/skopt/v1beta1/requirements.txt b/cmd/suggestion/skopt/v1beta1/requirements.txt index 8208877d973..2cd9502447d 100644 --- a/cmd/suggestion/skopt/v1beta1/requirements.txt +++ b/cmd/suggestion/skopt/v1beta1/requirements.txt @@ -7,3 +7,4 @@ forestci==0.3 protobuf==3.19.1 googleapis-common-protos==1.6.0 scikit-optimize>=0.9.0 +cython>=0.29.24 diff --git a/docs/developer-guide.md 
b/docs/developer-guide.md index 0826735b406..c5d8a375bc3 100644 --- a/docs/developer-guide.md +++ b/docs/developer-guide.md @@ -13,7 +13,7 @@ see the following user guides: ## Requirements - [Go](https://golang.org/) (1.17 or later) -- [Docker](https://docs.docker.com/) (17.05 or later) +- [Docker](https://docs.docker.com/) (20.10 or later) - [Java](https://docs.oracle.com/javase/8/docs/technotes/guides/install/install_overview.html) (8 or later) - [Python](https://www.python.org/) (3.9 or later) - [kustomize](https://kustomize.io/) (4.0.5 or later) diff --git a/examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml b/examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml new file mode 100644 index 00000000000..c95595543a7 --- /dev/null +++ b/examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml @@ -0,0 +1,49 @@ +apiVersion: kubeflow.org/v1beta1 +kind: Experiment +metadata: + namespace: kubeflow + name: tfevent-metrics-collector +spec: + parallelTrialCount: 3 + maxTrialCount: 12 + maxFailedTrialCount: 3 + objective: + type: maximize + goal: 0.99 + objectiveMetricName: accuracy_1 + algorithm: + algorithmName: random + metricsCollectorSpec: + source: + fileSystemPath: + path: /train + kind: Directory + collector: + kind: TensorFlowEvent + parameters: + - name: learning_rate + parameterType: double + feasibleSpace: + min: "0.01" + max: "0.05" + trialTemplate: + primaryContainerName: training-container + trialParameters: + - name: learningRate + description: Learning rate for the training model + reference: learning_rate + trialSpec: + apiVersion: batch/v1 + kind: Job + spec: + template: + spec: + containers: + - name: training-container + image: docker.io/kubeflowkatib/tf-mnist-with-summaries:latest + command: + - "python3" + - "/opt/tf-mnist-with-summaries/mnist.py" + - "--log_dir=/train/metrics" + - "--learning_rate=${trialParameters.learningRate}" + restartPolicy: Never diff --git a/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu b/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu index 3710a59f8fc..524d08e2506 100644 --- a/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu +++ b/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu @@ -1,12 +1,10 @@ -FROM tensorflow/tensorflow:1.15.4-py3 +FROM tensorflow/tensorflow:2.7.0 ENV TARGET_DIR /opt/enas-cnn-cifar10 ADD examples/v1beta1/trial-images/enas-cnn-cifar10 ${TARGET_DIR} WORKDIR ${TARGET_DIR} -RUN pip3 install --upgrade pip -RUN pip3 install --upgrade -r requirements.txt ENV PYTHONPATH ${TARGET_DIR} RUN chgrp -R 0 ${TARGET_DIR} \ diff --git a/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu b/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu index 5020d01ad36..316ddf8a8fe 100644 --- a/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu +++ b/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu @@ -1,12 +1,10 @@ -FROM tensorflow/tensorflow:1.15.4-gpu-py3 +FROM tensorflow/tensorflow:2.7.0-gpu ENV TARGET_DIR /opt/enas-cnn-cifar10 ADD examples/v1beta1/trial-images/enas-cnn-cifar10 ${TARGET_DIR} WORKDIR ${TARGET_DIR} -RUN pip3 install --upgrade pip -RUN pip3 install --upgrade -r requirements.txt ENV PYTHONPATH ${TARGET_DIR} RUN chgrp -R 0 ${TARGET_DIR} \ diff --git a/examples/v1beta1/trial-images/enas-cnn-cifar10/RunTrial.py b/examples/v1beta1/trial-images/enas-cnn-cifar10/RunTrial.py index 4672e079a27..625f6174d62 100644 --- a/examples/v1beta1/trial-images/enas-cnn-cifar10/RunTrial.py +++ 
b/examples/v1beta1/trial-images/enas-cnn-cifar10/RunTrial.py @@ -2,8 +2,8 @@ import numpy as np from keras.datasets import cifar10 from ModelConstructor import ModelConstructor -from keras.utils import to_categorical -from keras.utils import multi_gpu_model +from tensorflow.keras.utils import to_categorical +from tensorflow.python.keras.utils.multi_gpu_utils import multi_gpu_model from keras.preprocessing.image import ImageDataGenerator import argparse import time diff --git a/examples/v1beta1/trial-images/enas-cnn-cifar10/requirements.txt b/examples/v1beta1/trial-images/enas-cnn-cifar10/requirements.txt deleted file mode 100644 index 1a23c027782..00000000000 --- a/examples/v1beta1/trial-images/enas-cnn-cifar10/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -keras==2.2.4 diff --git a/examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile b/examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile new file mode 100644 index 00000000000..5f19be878a4 --- /dev/null +++ b/examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile @@ -0,0 +1,14 @@ +FROM tensorflow/tensorflow:2.7.0 + +ADD examples/v1beta1/trial-images/tf-mnist-with-summaries /opt/tf-mnist-with-summaries +WORKDIR /opt/tf-mnist-with-summaries + +# Add folder for the logs. +RUN mkdir /katib + +RUN chgrp -R 0 /opt/tf-mnist-with-summaries \ + && chmod -R g+rwX /opt/tf-mnist-with-summaries \ + && chgrp -R 0 /katib \ + && chmod -R g+rwX /katib + +ENTRYPOINT ["python3", "/opt/tf-mnist-with-summaries/mnist.py"] diff --git a/examples/v1beta1/trial-images/tf-mnist-with-summaries/README.md b/examples/v1beta1/trial-images/tf-mnist-with-summaries/README.md new file mode 100644 index 00000000000..56c75d68665 --- /dev/null +++ b/examples/v1beta1/trial-images/tf-mnist-with-summaries/README.md @@ -0,0 +1,11 @@ +# Tensorflow MNIST Classification With Summaries Example + +This is Tensorflow MNIST image classification training container that outputs TF summaries. +It uses convolutional neural network to train the model. + +If you want to read more about this example, visit the official +[tensorflow](https://github.com/tensorflow/tensorflow/blob/7462dcaae1e8cfe1dfd0c62dd6083f9749a9d827/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py) +GitHub repository. + +Katib uses this training container in some Experiments, for instance in the +[TF Event Metrics Collector](../../metrics-collector/tfevent-metrics-collector.yaml#L55-L64). diff --git a/examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py b/examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py new file mode 100644 index 00000000000..04315ad8a3f --- /dev/null +++ b/examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py @@ -0,0 +1,217 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""A simple MNIST classifier which displays summaries in TensorBoard. 
+ +This is an unimpressive MNIST model, but it is a good example of using +tf.name_scope to make a graph legible in the TensorBoard graph explorer, and of +naming summary tags so that they are grouped meaningfully in TensorBoard. + +It demonstrates the functionality of every TensorBoard dashboard. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os +import sys + +import tensorflow as tf + +from tensorflow.examples.tutorials.mnist import input_data + +FLAGS = None + + +def train(): + # Import data + mnist = input_data.read_data_sets(FLAGS.data_dir, + fake_data=FLAGS.fake_data) + + sess = tf.compat.v1.InteractiveSession() + # Create a multilayer model. + + # Input placeholders + with tf.compat.v1.name_scope('input'): + x = tf.compat.v1.placeholder(tf.float32, [None, 784], name='x-input') + y_ = tf.compat.v1.placeholder(tf.int64, [None], name='y-input') + + with tf.compat.v1.name_scope('input_reshape'): + image_shaped_input = tf.reshape(x, [-1, 28, 28, 1]) + tf.compat.v1.summary.image('input', image_shaped_input, 10) + + # We can't initialize these variables to 0 - the network will get stuck. + def weight_variable(shape): + """Create a weight variable with appropriate initialization.""" + initial = tf.random.truncated_normal(shape, stddev=0.1) + return tf.Variable(initial) + + def bias_variable(shape): + """Create a bias variable with appropriate initialization.""" + initial = tf.constant(0.1, shape=shape) + return tf.Variable(initial) + + def variable_summaries(var): + """Attach a lot of summaries to a Tensor (for TensorBoard visualization).""" + with tf.compat.v1.name_scope('summaries'): + mean = tf.reduce_mean(input_tensor=var) + tf.compat.v1.summary.scalar('mean', mean) + with tf.compat.v1.name_scope('stddev'): + stddev = tf.sqrt(tf.reduce_mean(input_tensor=tf.square(var - mean))) + tf.compat.v1.summary.scalar('stddev', stddev) + tf.compat.v1.summary.scalar('max', tf.reduce_max(input_tensor=var)) + tf.compat.v1.summary.scalar('min', tf.reduce_min(input_tensor=var)) + tf.compat.v1.summary.histogram('histogram', var) + + def nn_layer(input_tensor, input_dim, output_dim, layer_name, act=tf.nn.relu): + """Reusable code for making a simple neural net layer. + + It does a matrix multiply, bias add, and then uses ReLU to nonlinearize. + It also sets up name scoping so that the resultant graph is easy to read, + and adds a number of summary ops. + """ + # Adding a name scope ensures logical grouping of the layers in the graph. + with tf.compat.v1.name_scope(layer_name): + # This Variable will hold the state of the weights for the layer + with tf.compat.v1.name_scope('weights'): + weights = weight_variable([input_dim, output_dim]) + variable_summaries(weights) + with tf.compat.v1.name_scope('biases'): + biases = bias_variable([output_dim]) + variable_summaries(biases) + with tf.compat.v1.name_scope('Wx_plus_b'): + preactivate = tf.matmul(input_tensor, weights) + biases + tf.compat.v1.summary.histogram('pre_activations', preactivate) + activations = act(preactivate, name='activation') + tf.compat.v1.summary.histogram('activations', activations) + return activations + + hidden1 = nn_layer(x, 784, 500, 'layer1') + + with tf.compat.v1.name_scope('dropout'): + keep_prob = tf.compat.v1.placeholder(tf.float32) + tf.compat.v1.summary.scalar('dropout_keep_probability', keep_prob) + dropped = tf.nn.dropout(hidden1, rate=(1 - keep_prob)) + + # Do not apply softmax activation yet, see below. 
+ y = nn_layer(dropped, 500, 10, 'layer2', act=tf.identity) + + with tf.compat.v1.name_scope('cross_entropy'): + # The raw formulation of cross-entropy, + # + # tf.reduce_mean(-tf.reduce_sum(y_ * tf.math.log(tf.softmax(y)), + # reduction_indices=[1])) + # + # can be numerically unstable. + # + # So here we use tf.compat.v1.losses.sparse_softmax_cross_entropy on the + # raw logit outputs of the nn_layer above, and then average across + # the batch. + with tf.compat.v1.name_scope('total'): + cross_entropy = tf.compat.v1.losses.sparse_softmax_cross_entropy( + labels=y_, logits=y) + tf.compat.v1.summary.scalar('cross_entropy', cross_entropy) + + with tf.compat.v1.name_scope('train'): + train_step = tf.compat.v1.train.AdamOptimizer(FLAGS.learning_rate).minimize( + cross_entropy) + + with tf.compat.v1.name_scope('accuracy'): + with tf.compat.v1.name_scope('correct_prediction'): + correct_prediction = tf.equal(tf.argmax(input=y, axis=1), y_) + with tf.compat.v1.name_scope('accuracy'): + accuracy = tf.reduce_mean(input_tensor=tf.cast(correct_prediction, + tf.float32)) + tf.compat.v1.summary.scalar('accuracy', accuracy) + + # Merge all the summaries and write them out to + # /tmp/tensorflow/mnist/logs/mnist_with_summaries (by default) + merged = tf.compat.v1.summary.merge_all() + train_writer = tf.compat.v1.summary.FileWriter(FLAGS.log_dir + '/train', + sess.graph) + test_writer = tf.compat.v1.summary.FileWriter(FLAGS.log_dir + '/test') + tf.compat.v1.global_variables_initializer().run() + + # Train the model, and also write summaries. + # Every 10th step, measure test-set accuracy, and write test summaries + # All other steps, run train_step on training data, & add training summaries + + def feed_dict(train): + """Make a TensorFlow feed_dict: maps data onto Tensor placeholders.""" + if train or FLAGS.fake_data: + xs, ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data) + k = FLAGS.dropout + else: + xs, ys = mnist.test.images, mnist.test.labels + k = 1.0 + return {x: xs, y_: ys, keep_prob: k} + + for i in range(FLAGS.max_steps): + if i % 10 == 0: # Record summaries and test-set accuracy + summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False)) + test_writer.add_summary(summary, i) + print('Accuracy at step %s: %s' % (i, acc)) + else: # Record train set summaries, and train + if i % 100 == 99: # Record execution stats + run_options = tf.compat.v1.RunOptions( + trace_level=tf.compat.v1.RunOptions.FULL_TRACE) + run_metadata = tf.compat.v1.RunMetadata() + summary, _ = sess.run([merged, train_step], + feed_dict=feed_dict(True), + options=run_options, + run_metadata=run_metadata) + train_writer.add_run_metadata(run_metadata, 'step%03d' % i) + train_writer.add_summary(summary, i) + print('Adding run metadata for', i) + else: # Record a summary + summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True)) + train_writer.add_summary(summary, i) + train_writer.close() + test_writer.close() + + +def main(_): + if tf.io.gfile.exists(FLAGS.log_dir): + tf.io.gfile.rmtree(FLAGS.log_dir) + tf.io.gfile.makedirs(FLAGS.log_dir) + with tf.Graph().as_default(): + train() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--fake_data', nargs='?', const=True, type=bool, + default=False, + help='If true, uses fake data for unit testing.') + parser.add_argument('--max_steps', type=int, default=1000, + help='Number of steps to run trainer.') + parser.add_argument('--learning_rate', type=float, default=0.001, + help='Initial learning rate') + 
parser.add_argument('--dropout', type=float, default=0.9, + help='Keep probability for training dropout.') + parser.add_argument( + '--data_dir', + type=str, + default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'), + 'tensorflow/mnist/input_data'), + help='Directory for storing input data') + parser.add_argument( + '--log_dir', + type=str, + default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'), + 'tensorflow/mnist/logs/mnist_with_summaries'), + help='Summaries log directory') + FLAGS, unparsed = parser.parse_known_args() + tf.compat.v1.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/scripts/v1beta1/build.sh b/scripts/v1beta1/build.sh index 3773315b540..780903ba384 100755 --- a/scripts/v1beta1/build.sh +++ b/scripts/v1beta1/build.sh @@ -21,97 +21,120 @@ set -e REGISTRY=$1 TAG=$2 +ARCH=$3 -if [[ -z "$REGISTRY" || -z "$TAG" ]]; then - echo "Image registry and tag must be set" - echo "Usage: $0 " 1>&2 +if [[ -z "$REGISTRY" || -z "$TAG" || -z "$ARCH" ]]; then + echo "Image registry, tag and architecture must be set" + echo "Usage: $0 " 1>&2 exit 1 fi +SUPPORTED_CPU_ARCHS=(amd64 arm64 ppc64le) +function check_specified_cpu_arch() { + for SUPPORTED_ARCH in "${SUPPORTED_CPU_ARCHS[@]}"; do \ + if [ "$ARCH" = "$SUPPORTED_ARCH" ]; then \ + return 0 + fi; + done + echo "CPU architecture '$ARCH' is not supported" + echo "You can use '${SUPPORTED_CPU_ARCHS[*]}'" + return 1 +} +check_specified_cpu_arch + VERSION="v1beta1" CMD_PREFIX="cmd" -MACHINE_ARCH=$(uname -m) echo "Building images for Katib ${VERSION}..." echo "Image registry: ${REGISTRY}" echo "Image tag: ${TAG}" -SCRIPT_ROOT=$(dirname ${BASH_SOURCE})/../.. -cd ${SCRIPT_ROOT} +SCRIPT_ROOT=$(dirname "$0")/../.. +cd "${SCRIPT_ROOT}" # Katib core images echo -e "\nBuilding Katib controller image...\n" -docker build -t ${REGISTRY}/katib-controller:${TAG} -f ${CMD_PREFIX}/katib-controller/${VERSION}/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/katib-controller:${TAG}" -f ${CMD_PREFIX}/katib-controller/${VERSION}/Dockerfile . echo -e "\nBuilding Katib DB manager image...\n" -docker build -t ${REGISTRY}/katib-db-manager:${TAG} -f ${CMD_PREFIX}/db-manager/${VERSION}/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/katib-db-manager:${TAG}" -f ${CMD_PREFIX}/db-manager/${VERSION}/Dockerfile . # TODO (andreyvelich): Switch to ${CMD_PREFIX}/ui/${VERSION}/Dockerfile once old UI is deprecated. echo -e "\nBuilding Katib UI image...\n" -docker build -t ${REGISTRY}/katib-ui:${TAG} -f ${CMD_PREFIX}/new-ui/${VERSION}/Dockerfile . +if [ "$ARCH" == "ppc64le" ]; then + docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/katib-ui:${TAG}" -f ${CMD_PREFIX}/new-ui/${VERSION}/Dockerfile.ppc64le . +else \ + docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/katib-ui:${TAG}" -f ${CMD_PREFIX}/new-ui/${VERSION}/Dockerfile . +fi echo -e "\nBuilding Katib cert generator image...\n" -docker build -t ${REGISTRY}/cert-generator:${TAG} -f ${CMD_PREFIX}/cert-generator/${VERSION}/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/cert-generator:${TAG}" -f ${CMD_PREFIX}/cert-generator/${VERSION}/Dockerfile . echo -e "\nBuilding file metrics collector image...\n" -docker build -t ${REGISTRY}/file-metrics-collector:${TAG} -f ${CMD_PREFIX}/metricscollector/${VERSION}/file-metricscollector/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/file-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/file-metricscollector/Dockerfile . 
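+
+# NOTE: building for a non-native $ARCH with buildx assumes QEMU binfmt
+# emulation is registered on the host (for example via
+# `docker run --privileged --rm tonistiigi/binfmt --install all`).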
echo -e "\nBuilding TF Event metrics collector image...\n" -if [ $MACHINE_ARCH == "ppc64le" ]; then - docker build -t ${REGISTRY}/tfevent-metrics-collector:${TAG} -f ${CMD_PREFIX}/metricscollector/${VERSION}/tfevent-metricscollector/Dockerfile.ppc64le . -else - docker build -t ${REGISTRY}/tfevent-metrics-collector:${TAG} -f ${CMD_PREFIX}/metricscollector/${VERSION}/tfevent-metricscollector/Dockerfile . +if [ "$ARCH" == "ppc64le" ]; then + docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/tfevent-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/tfevent-metricscollector/Dockerfile.ppc64le . +else \ + docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/tfevent-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/tfevent-metricscollector/Dockerfile . fi # Suggestion images echo -e "\nBuilding suggestion images..." echo -e "\nBuilding hyperopt suggestion...\n" -docker build -t ${REGISTRY}/suggestion-hyperopt:${TAG} -f ${CMD_PREFIX}/suggestion/hyperopt/${VERSION}/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-hyperopt:${TAG}" -f ${CMD_PREFIX}/suggestion/hyperopt/${VERSION}/Dockerfile . echo -e "\nBuilding chocolate suggestion...\n" -docker build -t ${REGISTRY}/suggestion-chocolate:${TAG} -f ${CMD_PREFIX}/suggestion/chocolate/${VERSION}/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-chocolate:${TAG}" -f ${CMD_PREFIX}/suggestion/chocolate/${VERSION}/Dockerfile . echo -e "\nBuilding hyperband suggestion...\n" -docker build -t ${REGISTRY}/suggestion-hyperband:${TAG} -f ${CMD_PREFIX}/suggestion/hyperband/${VERSION}/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-hyperband:${TAG}" -f ${CMD_PREFIX}/suggestion/hyperband/${VERSION}/Dockerfile . echo -e "\nBuilding skopt suggestion...\n" -docker build -t ${REGISTRY}/suggestion-skopt:${TAG} -f ${CMD_PREFIX}/suggestion/skopt/${VERSION}/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-skopt:${TAG}" -f ${CMD_PREFIX}/suggestion/skopt/${VERSION}/Dockerfile . echo -e "\nBuilding goptuna suggestion...\n" -docker build -t ${REGISTRY}/suggestion-goptuna:${TAG} -f ${CMD_PREFIX}/suggestion/goptuna/${VERSION}/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-goptuna:${TAG}" -f ${CMD_PREFIX}/suggestion/goptuna/${VERSION}/Dockerfile . echo -e "\nBuilding optuna suggestion...\n" -docker build -t ${REGISTRY}/suggestion-optuna:${TAG} -f ${CMD_PREFIX}/suggestion/optuna/${VERSION}/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-optuna:${TAG}" -f ${CMD_PREFIX}/suggestion/optuna/${VERSION}/Dockerfile . echo -e "\nBuilding ENAS suggestion...\n" -docker build -t ${REGISTRY}/suggestion-enas:${TAG} -f ${CMD_PREFIX}/suggestion/nas/enas/${VERSION}/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-enas:${TAG}" -f ${CMD_PREFIX}/suggestion/nas/enas/${VERSION}/Dockerfile . echo -e "\nBuilding DARTS suggestion...\n" -docker build -t ${REGISTRY}/suggestion-darts:${TAG} -f ${CMD_PREFIX}/suggestion/nas/darts/${VERSION}/Dockerfile . +docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-darts:${TAG}" -f ${CMD_PREFIX}/suggestion/nas/darts/${VERSION}/Dockerfile . 
# Early stopping images
echo -e "\nBuilding early stopping images...\n"

echo -e "\nBuilding median stopping rule...\n"
-docker build -t ${REGISTRY}/earlystopping-medianstop:${TAG} -f ${CMD_PREFIX}/earlystopping/medianstop/${VERSION}/Dockerfile .
+docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/earlystopping-medianstop:${TAG}" -f ${CMD_PREFIX}/earlystopping/medianstop/${VERSION}/Dockerfile .

# Training container images
-echo -e "\nBuilding training container images..."
+if [ ! "$ARCH" = "amd64" ]; then \
+  echo -e "\nTraining container images are supported only on amd64."
+else \
+
+  echo -e "\nBuilding training container images..."

-echo -e "\nBuilding mxnet mnist training container example...\n"
-docker build -t ${REGISTRY}/mxnet-mnist:${TAG} -f examples/${VERSION}/trial-images/mxnet-mnist/Dockerfile .
+  echo -e "\nBuilding mxnet mnist training container example...\n"
+  docker buildx build --platform linux/amd64 -t "${REGISTRY}/mxnet-mnist:${TAG}" -f examples/${VERSION}/trial-images/mxnet-mnist/Dockerfile .

-echo -e "\nBuilding PyTorch mnist training container example...\n"
-docker build -t ${REGISTRY}/pytorch-mnist:${TAG} -f examples/${VERSION}/trial-images/pytorch-mnist/Dockerfile .
+  echo -e "\nBuilding PyTorch mnist training container example...\n"
+  docker buildx build --platform linux/amd64 -t "${REGISTRY}/pytorch-mnist:${TAG}" -f examples/${VERSION}/trial-images/pytorch-mnist/Dockerfile .

-echo -e "\nBuilding Keras CIFAR-10 CNN training container example for ENAS with GPU support...\n"
-docker build -t ${REGISTRY}/enas-cnn-cifar10-gpu:${TAG} -f examples/${VERSION}/trial-images/enas-cnn-cifar10/Dockerfile.gpu .
+  echo -e "\nBuilding Keras CIFAR-10 CNN training container example for ENAS with GPU support...\n"
+  docker buildx build --platform linux/amd64 -t "${REGISTRY}/enas-cnn-cifar10-gpu:${TAG}" -f examples/${VERSION}/trial-images/enas-cnn-cifar10/Dockerfile.gpu .

-echo -e "\nBuilding Keras CIFAR-10 CNN training container example for ENAS with CPU support...\n"
-docker build -t ${REGISTRY}/enas-cnn-cifar10-cpu:${TAG} -f examples/${VERSION}/trial-images/enas-cnn-cifar10/Dockerfile.cpu .
+  echo -e "\nBuilding Keras CIFAR-10 CNN training container example for ENAS with CPU support...\n"
+  docker buildx build --platform linux/amd64 -t "${REGISTRY}/enas-cnn-cifar10-cpu:${TAG}" -f examples/${VERSION}/trial-images/enas-cnn-cifar10/Dockerfile.cpu .

-echo -e "\nBuilding PyTorch CIFAR-10 CNN training container example for DARTS...\n"
-docker build -t ${REGISTRY}/darts-cnn-cifar10:${TAG} -f examples/${VERSION}/trial-images/darts-cnn-cifar10/Dockerfile .
+  echo -e "\nBuilding PyTorch CIFAR-10 CNN training container example for DARTS...\n"
+  docker buildx build --platform linux/amd64 -t "${REGISTRY}/darts-cnn-cifar10:${TAG}" -f examples/${VERSION}/trial-images/darts-cnn-cifar10/Dockerfile .
+
+fi

echo -e "\nAll Katib images with ${TAG} tag have been built successfully!\n"
diff --git a/test/e2e/v1beta1/argo_workflow.py b/test/e2e/v1beta1/argo_workflow.py
index 0345c262c9a..444f73c8036 100644
--- a/test/e2e/v1beta1/argo_workflow.py
+++ b/test/e2e/v1beta1/argo_workflow.py
@@ -41,27 +41,28 @@

# Dict with all Katib images.
# Key - image name, Value - dockerfile location.
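# (Paths are resolved relative to the repository root; the stray leading
# slash on the tfevent entry below is corrected in a later commit.)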
KATIB_IMAGES = { - "katib-controller": "cmd/katib-controller/v1beta1/Dockerfile", - "katib-db-manager": "cmd/db-manager/v1beta1/Dockerfile", + "katib-controller": "cmd/katib-controller/v1beta1/Dockerfile", + "katib-db-manager": "cmd/db-manager/v1beta1/Dockerfile", # TODO (andreyvelich): Change it to /cmd/ui/v1beta1/Dockerfile once old UI is deprecated. - "katib-ui": "cmd/new-ui/v1beta1/Dockerfile", - "cert-generator": "cmd/cert-generator/v1beta1/Dockerfile", - "file-metrics-collector": "cmd/metricscollector/v1beta1/file-metricscollector/Dockerfile", - "tfevent-metrics-collector": "/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile", - "suggestion-hyperopt": "cmd/suggestion/hyperopt/v1beta1/Dockerfile", - "suggestion-chocolate": "cmd/suggestion/chocolate/v1beta1/Dockerfile", - "suggestion-skopt": "cmd/suggestion/skopt/v1beta1/Dockerfile", - "suggestion-hyperband": "cmd/suggestion/hyperband/v1beta1/Dockerfile", - "suggestion-goptuna": "cmd/suggestion/goptuna/v1beta1/Dockerfile", - "suggestion-optuna": "cmd/suggestion/optuna/v1beta1/Dockerfile", - "suggestion-enas": "cmd/suggestion/nas/enas/v1beta1/Dockerfile", - "suggestion-darts": "cmd/suggestion/nas/darts/v1beta1/Dockerfile", - "earlystopping-medianstop": "cmd/earlystopping/medianstop/v1beta1/Dockerfile", - "trial-mxnet-mnist": "examples/v1beta1/trial-images/mxnet-mnist/Dockerfile", - "trial-pytorch-mnist": "examples/v1beta1/trial-images/pytorch-mnist/Dockerfile", - "trial-enas-cnn-cifar10-gpu": "examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu", - "trial-enas-cnn-cifar10-cpu": "examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu", - "trial-darts-cnn-cifar10": "examples/v1beta1/trial-images/darts-cnn-cifar10/Dockerfile", + "katib-ui": "cmd/new-ui/v1beta1/Dockerfile", + "cert-generator": "cmd/cert-generator/v1beta1/Dockerfile", + "file-metrics-collector": "cmd/metricscollector/v1beta1/file-metricscollector/Dockerfile", + "tfevent-metrics-collector": "/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile", + "suggestion-hyperopt": "cmd/suggestion/hyperopt/v1beta1/Dockerfile", + "suggestion-chocolate": "cmd/suggestion/chocolate/v1beta1/Dockerfile", + "suggestion-skopt": "cmd/suggestion/skopt/v1beta1/Dockerfile", + "suggestion-hyperband": "cmd/suggestion/hyperband/v1beta1/Dockerfile", + "suggestion-goptuna": "cmd/suggestion/goptuna/v1beta1/Dockerfile", + "suggestion-optuna": "cmd/suggestion/optuna/v1beta1/Dockerfile", + "suggestion-enas": "cmd/suggestion/nas/enas/v1beta1/Dockerfile", + "suggestion-darts": "cmd/suggestion/nas/darts/v1beta1/Dockerfile", + "earlystopping-medianstop": "cmd/earlystopping/medianstop/v1beta1/Dockerfile", + "trial-mxnet-mnist": "examples/v1beta1/trial-images/mxnet-mnist/Dockerfile", + "trial-pytorch-mnist": "examples/v1beta1/trial-images/pytorch-mnist/Dockerfile", +# "trial-tf-mnist-with-summaries": "examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile", + "trial-enas-cnn-cifar10-gpu": "examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu", + "trial-enas-cnn-cifar10-cpu": "examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu", + "trial-darts-cnn-cifar10": "examples/v1beta1/trial-images/darts-cnn-cifar10/Dockerfile", } # Dict with Katib Experiments to run during the test. 
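# (The commented-out tf-mnist-with-summaries image above and the
# tfevent-metricscollector Experiment below are re-enabled in a later
# commit, once CI builds the new trial image.)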
@@ -79,6 +80,7 @@ "pytorchjob": "examples/v1beta1/kubeflow-training-operator/pytorchjob-mnist.yaml", "tfjob": "examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml", "file-metricscollector": "examples/v1beta1/metrics-collector/file-metrics-collector.yaml", +# "tfevent-metricscollector": "examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml", "never-resume": "examples/v1beta1/resume-experiment/never-resume.yaml", "from-volume-resume": "examples/v1beta1/resume-experiment/from-volume-resume.yaml", "median-stop": "examples/v1beta1/early-stopping/median-stop.yaml" From ceeb7540cc87389dd93cc8ea6bd9c431c648c75e Mon Sep 17 00:00:00 2001 From: tenzen-y Date: Sat, 13 Nov 2021 14:47:08 +0900 Subject: [PATCH 03/17] fix example for enas --- .../Dockerfile.ppc64le | 1 - .../tfevent-metrics-collector.yaml | 1 + .../enas-cnn-cifar10/Dockerfile.cpu | 1 + .../trial-images/enas-cnn-cifar10/RunTrial.py | 20 +- .../enas-cnn-cifar10/requirements.txt | 1 + .../tf-mnist-with-summaries/Dockerfile | 7 +- .../tf-mnist-with-summaries/README.md | 2 +- .../tf-mnist-with-summaries/input_data.py | 333 ++++++++++++++++++ .../tf-mnist-with-summaries/mnist.py | 3 +- scripts/v1beta1/build.sh | 3 + test/e2e/v1beta1/argo_workflow.py | 4 +- 11 files changed, 354 insertions(+), 22 deletions(-) create mode 100644 examples/v1beta1/trial-images/enas-cnn-cifar10/requirements.txt create mode 100644 examples/v1beta1/trial-images/tf-mnist-with-summaries/input_data.py diff --git a/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile.ppc64le b/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile.ppc64le index 00a75703f6f..fbc819dce37 100644 --- a/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile.ppc64le +++ b/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile.ppc64le @@ -1,5 +1,4 @@ FROM ibmcom/tensorflow-ppc64le:2.2.0-py3 -RUN pip install rfc3339 grpcio googleapis-common-protos ADD . 
/usr/src/app/github.com/kubeflow/katib WORKDIR /usr/src/app/github.com/kubeflow/katib/cmd/metricscollector/v1beta1/tfevent-metricscollector/ RUN pip install --no-cache-dir -r requirements.txt diff --git a/examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml b/examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml index c95595543a7..af8822facbb 100644 --- a/examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml +++ b/examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml @@ -46,4 +46,5 @@ spec: - "/opt/tf-mnist-with-summaries/mnist.py" - "--log_dir=/train/metrics" - "--learning_rate=${trialParameters.learningRate}" + - "--max_steps=1" restartPolicy: Never diff --git a/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu b/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu index 524d08e2506..30af4f77020 100644 --- a/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu +++ b/examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu @@ -5,6 +5,7 @@ ENV TARGET_DIR /opt/enas-cnn-cifar10 ADD examples/v1beta1/trial-images/enas-cnn-cifar10 ${TARGET_DIR} WORKDIR ${TARGET_DIR} +RUN pip3 install --no-cache-dir -r requirements.txt ENV PYTHONPATH ${TARGET_DIR} RUN chgrp -R 0 ${TARGET_DIR} \ diff --git a/examples/v1beta1/trial-images/enas-cnn-cifar10/RunTrial.py b/examples/v1beta1/trial-images/enas-cnn-cifar10/RunTrial.py index 625f6174d62..4b5b8ab327e 100644 --- a/examples/v1beta1/trial-images/enas-cnn-cifar10/RunTrial.py +++ b/examples/v1beta1/trial-images/enas-cnn-cifar10/RunTrial.py @@ -1,12 +1,10 @@ -import keras -import numpy as np +from tensorflow import keras from keras.datasets import cifar10 from ModelConstructor import ModelConstructor from tensorflow.keras.utils import to_categorical from tensorflow.python.keras.utils.multi_gpu_utils import multi_gpu_model from keras.preprocessing.image import ImageDataGenerator import argparse -import time if __name__ == "__main__": parser = argparse.ArgumentParser(description='TrainingContainer') @@ -46,7 +44,7 @@ test_model.summary() test_model.compile(loss=keras.losses.categorical_crossentropy, - optimizer=keras.optimizers.Adam(lr=1e-3, decay=1e-4), + optimizer=keras.optimizers.Adam(learning_rate=1e-3, decay=1e-4), metrics=['accuracy']) (x_train, y_train), (x_test, y_test) = cifar10.load_data() @@ -67,12 +65,12 @@ print(">>> Data Loaded. 
Training starts.") for e in range(num_epochs): - print("\nTotal Epoch {}/{}".format(e+1, num_epochs)) - history = test_model.fit_generator(generator=aug_data_flow, - steps_per_epoch=int(len(x_train)/128)+1, - epochs=1, verbose=1, - validation_data=(x_test, y_test)) - print("Training-Accuracy={}".format(history.history['acc'][-1])) + print("\nTotal Epoch {}/{}".format(e + 1, num_epochs)) + history = test_model.fit(aug_data_flow, + steps_per_epoch=int(len(x_train) / 128) + 1, + epochs=1, verbose=1, + validation_data=(x_test, y_test)) + print("Training-Accuracy={}".format(history.history['accuracy'][-1])) print("Training-Loss={}".format(history.history['loss'][-1])) - print("Validation-Accuracy={}".format(history.history['val_acc'][-1])) + print("Validation-Accuracy={}".format(history.history['val_accuracy'][-1])) print("Validation-Loss={}".format(history.history['val_loss'][-1])) diff --git a/examples/v1beta1/trial-images/enas-cnn-cifar10/requirements.txt b/examples/v1beta1/trial-images/enas-cnn-cifar10/requirements.txt new file mode 100644 index 00000000000..497c40a9811 --- /dev/null +++ b/examples/v1beta1/trial-images/enas-cnn-cifar10/requirements.txt @@ -0,0 +1 @@ +scipy>=1.7.2 diff --git a/examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile b/examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile index 5f19be878a4..e54e4c80698 100644 --- a/examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile +++ b/examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile @@ -3,12 +3,7 @@ FROM tensorflow/tensorflow:2.7.0 ADD examples/v1beta1/trial-images/tf-mnist-with-summaries /opt/tf-mnist-with-summaries WORKDIR /opt/tf-mnist-with-summaries -# Add folder for the logs. -RUN mkdir /katib - RUN chgrp -R 0 /opt/tf-mnist-with-summaries \ - && chmod -R g+rwX /opt/tf-mnist-with-summaries \ - && chgrp -R 0 /katib \ - && chmod -R g+rwX /katib + && chmod -R g+rwX /opt/tf-mnist-with-summaries ENTRYPOINT ["python3", "/opt/tf-mnist-with-summaries/mnist.py"] diff --git a/examples/v1beta1/trial-images/tf-mnist-with-summaries/README.md b/examples/v1beta1/trial-images/tf-mnist-with-summaries/README.md index 56c75d68665..dfcf79f2b75 100644 --- a/examples/v1beta1/trial-images/tf-mnist-with-summaries/README.md +++ b/examples/v1beta1/trial-images/tf-mnist-with-summaries/README.md @@ -8,4 +8,4 @@ If you want to read more about this example, visit the official GitHub repository. Katib uses this training container in some Experiments, for instance in the -[TF Event Metrics Collector](../../metrics-collector/tfevent-metrics-collector.yaml#L55-L64). +[TF Event Metrics Collector](../../metrics-collector/tfevent-metrics-collector.yaml#L42-L49). diff --git a/examples/v1beta1/trial-images/tf-mnist-with-summaries/input_data.py b/examples/v1beta1/trial-images/tf-mnist-with-summaries/input_data.py new file mode 100644 index 00000000000..c203c7b5341 --- /dev/null +++ b/examples/v1beta1/trial-images/tf-mnist-with-summaries/input_data.py @@ -0,0 +1,333 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Functions for downloading and reading MNIST data (deprecated). + +This module and all its submodules are deprecated. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import gzip +import os + +import numpy +from six.moves import urllib +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import random_seed +from tensorflow.python.platform import gfile +from tensorflow.python.util.deprecation import deprecated + +_Datasets = collections.namedtuple('_Datasets', ['train', 'validation', 'test']) + +# CVDF mirror of http://yann.lecun.com/exdb/mnist/ +DEFAULT_SOURCE_URL = 'https://storage.googleapis.com/cvdf-datasets/mnist/' + + +def _read32(bytestream): + dt = numpy.dtype(numpy.uint32).newbyteorder('>') + return numpy.frombuffer(bytestream.read(4), dtype=dt)[0] + + +@deprecated(None, 'Please use tf.data to implement this functionality.') +def _extract_images(f): + """Extract the images into a 4D uint8 numpy array [index, y, x, depth]. + + Args: + f: A file object that can be passed into a gzip reader. + + Returns: + data: A 4D uint8 numpy array [index, y, x, depth]. + + Raises: + ValueError: If the bytestream does not start with 2051. + + """ + print('Extracting', f.name) + with gzip.GzipFile(fileobj=f) as bytestream: + magic = _read32(bytestream) + if magic != 2051: + raise ValueError('Invalid magic number %d in MNIST image file: %s' % + (magic, f.name)) + num_images = _read32(bytestream) + rows = _read32(bytestream) + cols = _read32(bytestream) + buf = bytestream.read(rows * cols * num_images) + data = numpy.frombuffer(buf, dtype=numpy.uint8) + data = data.reshape(num_images, rows, cols, 1) + return data + + +@deprecated(None, 'Please use tf.one_hot on tensors.') +def _dense_to_one_hot(labels_dense, num_classes): + """Convert class labels from scalars to one-hot vectors.""" + num_labels = labels_dense.shape[0] + index_offset = numpy.arange(num_labels) * num_classes + labels_one_hot = numpy.zeros((num_labels, num_classes)) + labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1 + return labels_one_hot + + +@deprecated(None, 'Please use tf.data to implement this functionality.') +def _extract_labels(f, one_hot=False, num_classes=10): + """Extract the labels into a 1D uint8 numpy array [index]. + + Args: + f: A file object that can be passed into a gzip reader. + one_hot: Does one hot encoding for the result. + num_classes: Number of classes for the one hot encoding. + + Returns: + labels: a 1D uint8 numpy array. + + Raises: + ValueError: If the bystream doesn't start with 2049. + """ + print('Extracting', f.name) + with gzip.GzipFile(fileobj=f) as bytestream: + magic = _read32(bytestream) + if magic != 2049: + raise ValueError('Invalid magic number %d in MNIST label file: %s' % + (magic, f.name)) + num_items = _read32(bytestream) + buf = bytestream.read(num_items) + labels = numpy.frombuffer(buf, dtype=numpy.uint8) + if one_hot: + return _dense_to_one_hot(labels, num_classes) + return labels + + +class _DataSet(object): + """Container class for a _DataSet (deprecated). + + THIS CLASS IS DEPRECATED. 
+ """ + + @deprecated(None, 'Please use alternatives such as official/mnist/_DataSet.py' + ' from tensorflow/models.') + def __init__(self, + images, + labels, + fake_data=False, + one_hot=False, + dtype=dtypes.float32, + reshape=True, + seed=None): + """Construct a _DataSet. + + one_hot arg is used only if fake_data is true. `dtype` can be either + `uint8` to leave the input as `[0, 255]`, or `float32` to rescale into + `[0, 1]`. Seed arg provides for convenient deterministic testing. + + Args: + images: The images + labels: The labels + fake_data: Ignore inages and labels, use fake data. + one_hot: Bool, return the labels as one hot vectors (if True) or ints (if + False). + dtype: Output image dtype. One of [uint8, float32]. `uint8` output has + range [0,255]. float32 output has range [0,1]. + reshape: Bool. If True returned images are returned flattened to vectors. + seed: The random seed to use. + """ + seed1, seed2 = random_seed.get_seed(seed) + # If op level seed is not set, use whatever graph level seed is returned + numpy.random.seed(seed1 if seed is None else seed2) + dtype = dtypes.as_dtype(dtype).base_dtype + if dtype not in (dtypes.uint8, dtypes.float32): + raise TypeError('Invalid image dtype %r, expected uint8 or float32' % + dtype) + if fake_data: + self._num_examples = 10000 + self.one_hot = one_hot + else: + assert images.shape[0] == labels.shape[0], ( + 'images.shape: %s labels.shape: %s' % (images.shape, labels.shape)) + self._num_examples = images.shape[0] + + # Convert shape from [num examples, rows, columns, depth] + # to [num examples, rows*columns] (assuming depth == 1) + if reshape: + assert images.shape[3] == 1 + images = images.reshape(images.shape[0], + images.shape[1] * images.shape[2]) + if dtype == dtypes.float32: + # Convert from [0, 255] -> [0.0, 1.0]. 
+ images = images.astype(numpy.float32) + images = numpy.multiply(images, 1.0 / 255.0) + self._images = images + self._labels = labels + self._epochs_completed = 0 + self._index_in_epoch = 0 + + @property + def images(self): + return self._images + + @property + def labels(self): + return self._labels + + @property + def num_examples(self): + return self._num_examples + + @property + def epochs_completed(self): + return self._epochs_completed + + def next_batch(self, batch_size, fake_data=False, shuffle=True): + """Return the next `batch_size` examples from this data set.""" + if fake_data: + fake_image = [1] * 784 + if self.one_hot: + fake_label = [1] + [0] * 9 + else: + fake_label = 0 + return [fake_image for _ in xrange(batch_size) + ], [fake_label for _ in xrange(batch_size)] + start = self._index_in_epoch + # Shuffle for the first epoch + if self._epochs_completed == 0 and start == 0 and shuffle: + perm0 = numpy.arange(self._num_examples) + numpy.random.shuffle(perm0) + self._images = self.images[perm0] + self._labels = self.labels[perm0] + # Go to the next epoch + if start + batch_size > self._num_examples: + # Finished epoch + self._epochs_completed += 1 + # Get the rest examples in this epoch + rest_num_examples = self._num_examples - start + images_rest_part = self._images[start:self._num_examples] + labels_rest_part = self._labels[start:self._num_examples] + # Shuffle the data + if shuffle: + perm = numpy.arange(self._num_examples) + numpy.random.shuffle(perm) + self._images = self.images[perm] + self._labels = self.labels[perm] + # Start next epoch + start = 0 + self._index_in_epoch = batch_size - rest_num_examples + end = self._index_in_epoch + images_new_part = self._images[start:end] + labels_new_part = self._labels[start:end] + return numpy.concatenate((images_rest_part, images_new_part), + axis=0), numpy.concatenate( + (labels_rest_part, labels_new_part), axis=0) + else: + self._index_in_epoch += batch_size + end = self._index_in_epoch + return self._images[start:end], self._labels[start:end] + + +@deprecated(None, 'Please write your own downloading logic.') +def _maybe_download(filename, work_directory, source_url): + """Download the data from source url, unless it's already here. + + Args: + filename: string, name of the file in the directory. + work_directory: string, path to working directory. + source_url: url to download from if file doesn't exist. + + Returns: + Path to resulting file. 
+ """ + if not gfile.Exists(work_directory): + gfile.MakeDirs(work_directory) + filepath = os.path.join(work_directory, filename) + if not gfile.Exists(filepath): + urllib.request.urlretrieve(source_url, filepath) + with gfile.GFile(filepath) as f: + size = f.size() + print('Successfully downloaded', filename, size, 'bytes.') + return filepath + + +@deprecated(None, 'Please use alternatives such as:' + ' tensorflow_datasets.load(\'mnist\')') +def read_data_sets(train_dir, + fake_data=False, + one_hot=False, + dtype=dtypes.float32, + reshape=True, + validation_size=5000, + seed=None, + source_url=DEFAULT_SOURCE_URL): + if fake_data: + + def fake(): + return _DataSet([], [], + fake_data=True, + one_hot=one_hot, + dtype=dtype, + seed=seed) + + train = fake() + validation = fake() + test = fake() + return _Datasets(train=train, validation=validation, test=test) + + if not source_url: # empty string check + source_url = DEFAULT_SOURCE_URL + + train_images_file = 'train-images-idx3-ubyte.gz' + train_labels_file = 'train-labels-idx1-ubyte.gz' + test_images_file = 't10k-images-idx3-ubyte.gz' + test_labels_file = 't10k-labels-idx1-ubyte.gz' + + local_file = _maybe_download(train_images_file, train_dir, + source_url + train_images_file) + with gfile.Open(local_file, 'rb') as f: + train_images = _extract_images(f) + + local_file = _maybe_download(train_labels_file, train_dir, + source_url + train_labels_file) + with gfile.Open(local_file, 'rb') as f: + train_labels = _extract_labels(f, one_hot=one_hot) + + local_file = _maybe_download(test_images_file, train_dir, + source_url + test_images_file) + with gfile.Open(local_file, 'rb') as f: + test_images = _extract_images(f) + + local_file = _maybe_download(test_labels_file, train_dir, + source_url + test_labels_file) + with gfile.Open(local_file, 'rb') as f: + test_labels = _extract_labels(f, one_hot=one_hot) + + if not 0 <= validation_size <= len(train_images): + raise ValueError( + 'Validation size should be between 0 and {}. Received: {}.'.format( + len(train_images), validation_size)) + + validation_images = train_images[:validation_size] + validation_labels = train_labels[:validation_size] + train_images = train_images[validation_size:] + train_labels = train_labels[validation_size:] + + options = dict(dtype=dtype, reshape=reshape, seed=seed) + + train = _DataSet(train_images, train_labels, **options) + validation = _DataSet(validation_images, validation_labels, **options) + test = _DataSet(test_images, test_labels, **options) + + return _Datasets(train=train, validation=validation, test=test) + diff --git a/examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py b/examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py index 04315ad8a3f..6cd093d9029 100644 --- a/examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py +++ b/examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py @@ -30,7 +30,8 @@ import tensorflow as tf -from tensorflow.examples.tutorials.mnist import input_data +# from tensorflow.examples.tutorials.mnist import input_data +import input_data FLAGS = None diff --git a/scripts/v1beta1/build.sh b/scripts/v1beta1/build.sh index 780903ba384..94c63b5e7a1 100755 --- a/scripts/v1beta1/build.sh +++ b/scripts/v1beta1/build.sh @@ -123,6 +123,9 @@ else \ echo -e "\nBuilding mxnet mnist training container example...\n" docker buildx build --platform linux/amd64 -t "${REGISTRY}/mxnet-mnist:${TAG}" -f examples/${VERSION}/trial-images/mxnet-mnist/Dockerfile . 
+ echo -e "\nBuilding Tensorflow with summaries mnist training container example...\n" + docker buildx build --platform linux/amd64 -t "${REGISTRY}/tf-mnist-with-summaries:${TAG}" -f examples/${VERSION}/trial-images/tf-mnist-with-summaries/Dockerfile . + echo -e "\nBuilding PyTorch mnist training container example...\n" docker buildx build --platform linux/amd64 -t "${REGISTRY}/pytorch-mnist:${TAG}" -f examples/${VERSION}/trial-images/pytorch-mnist/Dockerfile . diff --git a/test/e2e/v1beta1/argo_workflow.py b/test/e2e/v1beta1/argo_workflow.py index 444f73c8036..f7227b7ea60 100644 --- a/test/e2e/v1beta1/argo_workflow.py +++ b/test/e2e/v1beta1/argo_workflow.py @@ -59,7 +59,7 @@ "earlystopping-medianstop": "cmd/earlystopping/medianstop/v1beta1/Dockerfile", "trial-mxnet-mnist": "examples/v1beta1/trial-images/mxnet-mnist/Dockerfile", "trial-pytorch-mnist": "examples/v1beta1/trial-images/pytorch-mnist/Dockerfile", -# "trial-tf-mnist-with-summaries": "examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile", + "trial-tf-mnist-with-summaries": "examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile", "trial-enas-cnn-cifar10-gpu": "examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu", "trial-enas-cnn-cifar10-cpu": "examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.cpu", "trial-darts-cnn-cifar10": "examples/v1beta1/trial-images/darts-cnn-cifar10/Dockerfile", @@ -80,7 +80,7 @@ "pytorchjob": "examples/v1beta1/kubeflow-training-operator/pytorchjob-mnist.yaml", "tfjob": "examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml", "file-metricscollector": "examples/v1beta1/metrics-collector/file-metrics-collector.yaml", -# "tfevent-metricscollector": "examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml", + "tfevent-metricscollector": "examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml", "never-resume": "examples/v1beta1/resume-experiment/never-resume.yaml", "from-volume-resume": "examples/v1beta1/resume-experiment/from-volume-resume.yaml", "median-stop": "examples/v1beta1/early-stopping/median-stop.yaml" From f77b63bb392df821976f74701dfe0113f4205fe1 Mon Sep 17 00:00:00 2001 From: tenzen-y Date: Sun, 14 Nov 2021 04:26:15 +0900 Subject: [PATCH 04/17] update scripts to modify image name in ci --- .../tfevent-metrics-collector.yaml | 1 - scripts/v1beta1/build.sh | 2 +- scripts/v1beta1/push.sh | 43 ++++++++++--------- test/e2e/v1beta1/argo_workflow.py | 2 +- 4 files changed, 25 insertions(+), 23 deletions(-) diff --git a/examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml b/examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml index af8822facbb..c95595543a7 100644 --- a/examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml +++ b/examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml @@ -46,5 +46,4 @@ spec: - "/opt/tf-mnist-with-summaries/mnist.py" - "--log_dir=/train/metrics" - "--learning_rate=${trialParameters.learningRate}" - - "--max_steps=1" restartPolicy: Never diff --git a/scripts/v1beta1/build.sh b/scripts/v1beta1/build.sh index 94c63b5e7a1..97fbee9cdd4 100755 --- a/scripts/v1beta1/build.sh +++ b/scripts/v1beta1/build.sh @@ -24,7 +24,7 @@ TAG=$2 ARCH=$3 if [[ -z "$REGISTRY" || -z "$TAG" || -z "$ARCH" ]]; then - echo "Image registry, tag and architecture must be set" + echo "Image registry, tag and cpu-architecture must be set" echo "Usage: $0 " 1>&2 exit 1 fi diff --git a/scripts/v1beta1/push.sh b/scripts/v1beta1/push.sh index 321e3ad76ee..7f906399b1b 100755 --- 
a/scripts/v1beta1/push.sh +++ b/scripts/v1beta1/push.sh @@ -36,72 +36,75 @@ echo "Image tag: ${TAG}" # Katib core images echo -e "\nPushing Katib controller image...\n" -docker push ${REGISTRY}/katib-controller:${TAG} +docker push "${REGISTRY}/katib-controller:${TAG}" echo -e "\nPushing Katib DB manager image...\n" -docker push ${REGISTRY}/katib-db-manager:${TAG} +docker push "${REGISTRY}/katib-db-manager:${TAG}" echo -e "\nPushing Katib UI image...\n" -docker push ${REGISTRY}/katib-ui:${TAG} +docker push "${REGISTRY}/katib-ui:${TAG}" echo -e "\nPushing Katib cert generator image...\n" -docker push ${REGISTRY}/cert-generator:${TAG} +docker push "${REGISTRY}/cert-generator:${TAG}" echo -e "\nPushing file metrics collector image...\n" -docker push ${REGISTRY}/file-metrics-collector:${TAG} +docker push "${REGISTRY}/file-metrics-collector:${TAG}" echo -e "\nPushing TF Event metrics collector image...\n" -docker push ${REGISTRY}/tfevent-metrics-collector:${TAG} +docker push "${REGISTRY}/tfevent-metrics-collector:${TAG}" # Suggestion images echo -e "\nPushing suggestion images..." echo -e "\nPushing hyperopt suggestion...\n" -docker push ${REGISTRY}/suggestion-hyperopt:${TAG} +docker push "${REGISTRY}/suggestion-hyperopt:${TAG}" echo -e "\nPushing chocolate suggestion...\n" -docker push ${REGISTRY}/suggestion-chocolate:${TAG} +docker push "${REGISTRY}/suggestion-chocolate:${TAG}" echo -e "\nPushing hyperband suggestion...\n" -docker push ${REGISTRY}/suggestion-hyperband:${TAG} +docker push "${REGISTRY}/suggestion-hyperband:${TAG}" echo -e "\nPushing skopt suggestion...\n" -docker push ${REGISTRY}/suggestion-skopt:${TAG} +docker push "${REGISTRY}/suggestion-skopt:${TAG}" echo -e "\nPushing goptuna suggestion...\n" -docker push ${REGISTRY}/suggestion-goptuna:${TAG} +docker push "${REGISTRY}/suggestion-goptuna:${TAG}" echo -e "\nPushing optuna suggestion...\n" -docker push ${REGISTRY}/suggestion-optuna:${TAG} +docker push "${REGISTRY}/suggestion-optuna:${TAG}" echo -e "\nPushing ENAS suggestion...\n" -docker push ${REGISTRY}/suggestion-enas:${TAG} +docker push "${REGISTRY}/suggestion-enas:${TAG}" echo -e "\nPushing DARTS suggestion...\n" -docker push ${REGISTRY}/suggestion-darts:${TAG} +docker push "${REGISTRY}/suggestion-darts:${TAG}" # Early stopping images echo -e "\nPushing early stopping images...\n" echo -e "\nPushing median stopping rule...\n" -docker push ${REGISTRY}/earlystopping-medianstop:${TAG} +docker push "${REGISTRY}/earlystopping-medianstop:${TAG}" # Training container images echo -e "\nPushing training container images..." 
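
# (build.sh builds the trial images below only for amd64, so pushing them
# assumes an amd64 build was run first.)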
echo -e "\nPushing mxnet mnist training container example...\n" -docker push ${REGISTRY}/mxnet-mnist:${TAG} +docker push "${REGISTRY}/mxnet-mnist:${TAG}" + +echo -e "\nPushing Tensorflow with summaries mnist training container example...\n" +docker push "${REGISTRY}/tf-mnist-with-summaries:${TAG}" echo -e "\nPushing PyTorch mnist training container example...\n" -docker push ${REGISTRY}/pytorch-mnist:${TAG} +docker push "${REGISTRY}/pytorch-mnist:${TAG}" echo -e "\nPushing Keras CIFAR-10 CNN training container example for ENAS with GPU support...\n" -docker push ${REGISTRY}/enas-cnn-cifar10-gpu:${TAG} +docker push "${REGISTRY}/enas-cnn-cifar10-gpu:${TAG}" echo -e "\nPushing Keras CIFAR-10 CNN training container example for ENAS with CPU support...\n" -docker push ${REGISTRY}/enas-cnn-cifar10-cpu:${TAG} +docker push "${REGISTRY}/enas-cnn-cifar10-cpu:${TAG}" echo -e "\nPushing PyTorch CIFAR-10 CNN training container example for DARTS...\n" -docker push ${REGISTRY}/darts-cnn-cifar10:${TAG} +docker push "${REGISTRY}/darts-cnn-cifar10:${TAG}" echo -e "\nAll Katib images with ${TAG} tag have been pushed successfully!\n" diff --git a/test/e2e/v1beta1/argo_workflow.py b/test/e2e/v1beta1/argo_workflow.py index f7227b7ea60..88c738ef28b 100644 --- a/test/e2e/v1beta1/argo_workflow.py +++ b/test/e2e/v1beta1/argo_workflow.py @@ -47,7 +47,7 @@ "katib-ui": "cmd/new-ui/v1beta1/Dockerfile", "cert-generator": "cmd/cert-generator/v1beta1/Dockerfile", "file-metrics-collector": "cmd/metricscollector/v1beta1/file-metricscollector/Dockerfile", - "tfevent-metrics-collector": "/cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile", + "tfevent-metrics-collector": "cmd/metricscollector/v1beta1/tfevent-metricscollector/Dockerfile", "suggestion-hyperopt": "cmd/suggestion/hyperopt/v1beta1/Dockerfile", "suggestion-chocolate": "cmd/suggestion/chocolate/v1beta1/Dockerfile", "suggestion-skopt": "cmd/suggestion/skopt/v1beta1/Dockerfile", From 5cd70a69f00b2eea5d15ca0a65c864ab47b23897 Mon Sep 17 00:00:00 2001 From: tenzen-y Date: Mon, 15 Nov 2021 19:12:25 +0900 Subject: [PATCH 05/17] review: change docker build command --- scripts/v1beta1/build.sh | 46 ++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/scripts/v1beta1/build.sh b/scripts/v1beta1/build.sh index 97fbee9cdd4..41a1bdf6b07 100755 --- a/scripts/v1beta1/build.sh +++ b/scripts/v1beta1/build.sh @@ -54,64 +54,64 @@ cd "${SCRIPT_ROOT}" # Katib core images echo -e "\nBuilding Katib controller image...\n" -docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/katib-controller:${TAG}" -f ${CMD_PREFIX}/katib-controller/${VERSION}/Dockerfile . +docker build --platform "linux/$ARCH" -t "${REGISTRY}/katib-controller:${TAG}" -f ${CMD_PREFIX}/katib-controller/${VERSION}/Dockerfile . echo -e "\nBuilding Katib DB manager image...\n" -docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/katib-db-manager:${TAG}" -f ${CMD_PREFIX}/db-manager/${VERSION}/Dockerfile . +docker build --platform "linux/$ARCH" -t "${REGISTRY}/katib-db-manager:${TAG}" -f ${CMD_PREFIX}/db-manager/${VERSION}/Dockerfile . # TODO (andreyvelich): Switch to ${CMD_PREFIX}/ui/${VERSION}/Dockerfile once old UI is deprecated. echo -e "\nBuilding Katib UI image...\n" if [ "$ARCH" == "ppc64le" ]; then - docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/katib-ui:${TAG}" -f ${CMD_PREFIX}/new-ui/${VERSION}/Dockerfile.ppc64le . 
+ docker build --platform "linux/$ARCH" -t "${REGISTRY}/katib-ui:${TAG}" -f ${CMD_PREFIX}/new-ui/${VERSION}/Dockerfile.ppc64le . else \ - docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/katib-ui:${TAG}" -f ${CMD_PREFIX}/new-ui/${VERSION}/Dockerfile . + docker build --platform "linux/$ARCH" -t "${REGISTRY}/katib-ui:${TAG}" -f ${CMD_PREFIX}/new-ui/${VERSION}/Dockerfile . fi echo -e "\nBuilding Katib cert generator image...\n" -docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/cert-generator:${TAG}" -f ${CMD_PREFIX}/cert-generator/${VERSION}/Dockerfile . +docker build --platform "linux/$ARCH" -t "${REGISTRY}/cert-generator:${TAG}" -f ${CMD_PREFIX}/cert-generator/${VERSION}/Dockerfile . echo -e "\nBuilding file metrics collector image...\n" -docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/file-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/file-metricscollector/Dockerfile . +docker build --platform "linux/$ARCH" -t "${REGISTRY}/file-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/file-metricscollector/Dockerfile . echo -e "\nBuilding TF Event metrics collector image...\n" if [ "$ARCH" == "ppc64le" ]; then - docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/tfevent-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/tfevent-metricscollector/Dockerfile.ppc64le . + docker build --platform "linux/$ARCH" -t "${REGISTRY}/tfevent-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/tfevent-metricscollector/Dockerfile.ppc64le . else \ - docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/tfevent-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/tfevent-metricscollector/Dockerfile . + docker build --platform "linux/$ARCH" -t "${REGISTRY}/tfevent-metrics-collector:${TAG}" -f ${CMD_PREFIX}/metricscollector/${VERSION}/tfevent-metricscollector/Dockerfile . fi # Suggestion images echo -e "\nBuilding suggestion images..." echo -e "\nBuilding hyperopt suggestion...\n" -docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-hyperopt:${TAG}" -f ${CMD_PREFIX}/suggestion/hyperopt/${VERSION}/Dockerfile . +docker build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-hyperopt:${TAG}" -f ${CMD_PREFIX}/suggestion/hyperopt/${VERSION}/Dockerfile . echo -e "\nBuilding chocolate suggestion...\n" -docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-chocolate:${TAG}" -f ${CMD_PREFIX}/suggestion/chocolate/${VERSION}/Dockerfile . +docker build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-chocolate:${TAG}" -f ${CMD_PREFIX}/suggestion/chocolate/${VERSION}/Dockerfile . echo -e "\nBuilding hyperband suggestion...\n" -docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-hyperband:${TAG}" -f ${CMD_PREFIX}/suggestion/hyperband/${VERSION}/Dockerfile . +docker build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-hyperband:${TAG}" -f ${CMD_PREFIX}/suggestion/hyperband/${VERSION}/Dockerfile . echo -e "\nBuilding skopt suggestion...\n" -docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-skopt:${TAG}" -f ${CMD_PREFIX}/suggestion/skopt/${VERSION}/Dockerfile . +docker build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-skopt:${TAG}" -f ${CMD_PREFIX}/suggestion/skopt/${VERSION}/Dockerfile . echo -e "\nBuilding goptuna suggestion...\n" -docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-goptuna:${TAG}" -f ${CMD_PREFIX}/suggestion/goptuna/${VERSION}/Dockerfile . 
+docker build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-goptuna:${TAG}" -f ${CMD_PREFIX}/suggestion/goptuna/${VERSION}/Dockerfile . echo -e "\nBuilding optuna suggestion...\n" -docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-optuna:${TAG}" -f ${CMD_PREFIX}/suggestion/optuna/${VERSION}/Dockerfile . +docker build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-optuna:${TAG}" -f ${CMD_PREFIX}/suggestion/optuna/${VERSION}/Dockerfile . echo -e "\nBuilding ENAS suggestion...\n" -docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-enas:${TAG}" -f ${CMD_PREFIX}/suggestion/nas/enas/${VERSION}/Dockerfile . +docker build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-enas:${TAG}" -f ${CMD_PREFIX}/suggestion/nas/enas/${VERSION}/Dockerfile . echo -e "\nBuilding DARTS suggestion...\n" -docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-darts:${TAG}" -f ${CMD_PREFIX}/suggestion/nas/darts/${VERSION}/Dockerfile . +docker build --platform "linux/$ARCH" -t "${REGISTRY}/suggestion-darts:${TAG}" -f ${CMD_PREFIX}/suggestion/nas/darts/${VERSION}/Dockerfile . # Early stopping images echo -e "\nBuilding early stopping images...\n" echo -e "\nBuilding median stopping rule...\n" -docker buildx build --platform "linux/$ARCH" -t "${REGISTRY}/earlystopping-medianstop:${TAG}" -f ${CMD_PREFIX}/earlystopping/medianstop/${VERSION}/Dockerfile . +docker build --platform "linux/$ARCH" -t "${REGISTRY}/earlystopping-medianstop:${TAG}" -f ${CMD_PREFIX}/earlystopping/medianstop/${VERSION}/Dockerfile . # Training container images if [ ! "$ARCH" = "amd64" ]; then \ @@ -121,22 +121,22 @@ else \ echo -e "\nBuilding training container images..." echo -e "\nBuilding mxnet mnist training container example...\n" - docker buildx build --platform linux/amd64 -t "${REGISTRY}/mxnet-mnist:${TAG}" -f examples/${VERSION}/trial-images/mxnet-mnist/Dockerfile . + docker build --platform linux/amd64 -t "${REGISTRY}/mxnet-mnist:${TAG}" -f examples/${VERSION}/trial-images/mxnet-mnist/Dockerfile . echo -e "\nBuilding Tensorflow with summaries mnist training container example...\n" - docker buildx build --platform linux/amd64 -t "${REGISTRY}/tf-mnist-with-summaries:${TAG}" -f examples/${VERSION}/trial-images/tf-mnist-with-summaries/Dockerfile . + docker build --platform linux/amd64 -t "${REGISTRY}/tf-mnist-with-summaries:${TAG}" -f examples/${VERSION}/trial-images/tf-mnist-with-summaries/Dockerfile . echo -e "\nBuilding PyTorch mnist training container example...\n" - docker buildx build --platform linux/amd64 -t "${REGISTRY}/pytorch-mnist:${TAG}" -f examples/${VERSION}/trial-images/pytorch-mnist/Dockerfile . + docker build --platform linux/amd64 -t "${REGISTRY}/pytorch-mnist:${TAG}" -f examples/${VERSION}/trial-images/pytorch-mnist/Dockerfile . echo -e "\nBuilding Keras CIFAR-10 CNN training container example for ENAS with GPU support...\n" - docker buildx build --platform linux/amd64 -t "${REGISTRY}/enas-cnn-cifar10-gpu:${TAG}" -f examples/${VERSION}/trial-images/enas-cnn-cifar10/Dockerfile.gpu . + docker build --platform linux/amd64 -t "${REGISTRY}/enas-cnn-cifar10-gpu:${TAG}" -f examples/${VERSION}/trial-images/enas-cnn-cifar10/Dockerfile.gpu . echo -e "\nBuilding Keras CIFAR-10 CNN training container example for ENAS with CPU support...\n" - docker buildx build --platform linux/amd64 -t "${REGISTRY}/enas-cnn-cifar10-cpu:${TAG}" -f examples/${VERSION}/trial-images/enas-cnn-cifar10/Dockerfile.cpu . 
+ docker build --platform linux/amd64 -t "${REGISTRY}/enas-cnn-cifar10-cpu:${TAG}" -f examples/${VERSION}/trial-images/enas-cnn-cifar10/Dockerfile.cpu . echo -e "\nBuilding PyTorch CIFAR-10 CNN training container example for DARTS...\n" - docker buildx build --platform linux/amd64 -t "${REGISTRY}/darts-cnn-cifar10:${TAG}" -f examples/${VERSION}/trial-images/darts-cnn-cifar10/Dockerfile . + docker build --platform linux/amd64 -t "${REGISTRY}/darts-cnn-cifar10:${TAG}" -f examples/${VERSION}/trial-images/darts-cnn-cifar10/Dockerfile . fi From 45f12dad0405118d51b3714882cbaff9e5d26e42 Mon Sep 17 00:00:00 2001 From: tenzen-y Date: Wed, 17 Nov 2021 23:24:24 +0900 Subject: [PATCH 06/17] review: use new tf-mnist-with-example in Ci for tfjob --- .../tfjob-mnist-with-summaries.yaml | 13 +---- .../tfevent-metrics-collector.yaml | 49 ------------------- test/e2e/v1beta1/argo_workflow.py | 1 - 3 files changed, 2 insertions(+), 61 deletions(-) delete mode 100644 examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml diff --git a/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml b/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml index f68668da657..c64435d8cce 100644 --- a/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml +++ b/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml @@ -26,20 +26,12 @@ spec: feasibleSpace: min: "0.01" max: "0.05" - - name: batch_size - parameterType: int - feasibleSpace: - min: "100" - max: "200" trialTemplate: primaryContainerName: tensorflow trialParameters: - name: learningRate description: Learning rate for the training model reference: learning_rate - - name: batchSize - description: Batch Size - reference: batch_size trialSpec: apiVersion: kubeflow.org/v1 kind: TFJob @@ -52,10 +44,9 @@ spec: spec: containers: - name: tensorflow - image: gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0 + image: docker.io/kubeflowkatib/tf-mnist-with-summaries:latest command: - "python" - - "/var/tf_mnist/mnist_with_summaries.py" + - "/opt/tf-mnist-with-summaries/mnist.py" - "--log_dir=/train/metrics" - "--learning_rate=${trialParameters.learningRate}" - - "--batch_size=${trialParameters.batchSize}" diff --git a/examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml b/examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml deleted file mode 100644 index c95595543a7..00000000000 --- a/examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml +++ /dev/null @@ -1,49 +0,0 @@ -apiVersion: kubeflow.org/v1beta1 -kind: Experiment -metadata: - namespace: kubeflow - name: tfevent-metrics-collector -spec: - parallelTrialCount: 3 - maxTrialCount: 12 - maxFailedTrialCount: 3 - objective: - type: maximize - goal: 0.99 - objectiveMetricName: accuracy_1 - algorithm: - algorithmName: random - metricsCollectorSpec: - source: - fileSystemPath: - path: /train - kind: Directory - collector: - kind: TensorFlowEvent - parameters: - - name: learning_rate - parameterType: double - feasibleSpace: - min: "0.01" - max: "0.05" - trialTemplate: - primaryContainerName: training-container - trialParameters: - - name: learningRate - description: Learning rate for the training model - reference: learning_rate - trialSpec: - apiVersion: batch/v1 - kind: Job - spec: - template: - spec: - containers: - - name: training-container - image: docker.io/kubeflowkatib/tf-mnist-with-summaries:latest - command: - - "python3" - - "/opt/tf-mnist-with-summaries/mnist.py" - - 
"--log_dir=/train/metrics" - - "--learning_rate=${trialParameters.learningRate}" - restartPolicy: Never diff --git a/test/e2e/v1beta1/argo_workflow.py b/test/e2e/v1beta1/argo_workflow.py index 88c738ef28b..097515c14ef 100644 --- a/test/e2e/v1beta1/argo_workflow.py +++ b/test/e2e/v1beta1/argo_workflow.py @@ -80,7 +80,6 @@ "pytorchjob": "examples/v1beta1/kubeflow-training-operator/pytorchjob-mnist.yaml", "tfjob": "examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml", "file-metricscollector": "examples/v1beta1/metrics-collector/file-metrics-collector.yaml", - "tfevent-metricscollector": "examples/v1beta1/metrics-collector/tfevent-metrics-collector.yaml", "never-resume": "examples/v1beta1/resume-experiment/never-resume.yaml", "from-volume-resume": "examples/v1beta1/resume-experiment/from-volume-resume.yaml", "median-stop": "examples/v1beta1/early-stopping/median-stop.yaml" From 68765f461dd28c9cc166ff4a2169e2691dad83b8 Mon Sep 17 00:00:00 2001 From: tenzen-y Date: Sun, 21 Nov 2021 00:20:23 +0900 Subject: [PATCH 07/17] review: refactor tf-mnist-with-summaries --- .../tfevent-metricscollector/requirements.txt | 2 +- .../tfjob-mnist-with-summaries.yaml | 18 +- .../tf-mnist-with-summaries/README.md | 4 +- .../tf-mnist-with-summaries/input_data.py | 333 ------------------ .../tf-mnist-with-summaries/mnist.py | 327 +++++++---------- .../tfevent_loader.py | 2 + 6 files changed, 142 insertions(+), 544 deletions(-) delete mode 100644 examples/v1beta1/trial-images/tf-mnist-with-summaries/input_data.py diff --git a/cmd/metricscollector/v1beta1/tfevent-metricscollector/requirements.txt b/cmd/metricscollector/v1beta1/tfevent-metricscollector/requirements.txt index 09f4e56eafb..ab8a014d528 100644 --- a/cmd/metricscollector/v1beta1/tfevent-metricscollector/requirements.txt +++ b/cmd/metricscollector/v1beta1/tfevent-metricscollector/requirements.txt @@ -1,4 +1,4 @@ -psutil==5.6.6 +psutil==5.8.0 rfc3339>=6.2 grpcio==1.41.1 googleapis-common-protos==1.6.0 diff --git a/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml b/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml index c64435d8cce..e4d4ba72008 100644 --- a/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml +++ b/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml @@ -10,13 +10,13 @@ spec: objective: type: maximize goal: 0.99 - objectiveMetricName: accuracy_1 + objectiveMetricName: accuracy algorithm: algorithmName: random metricsCollectorSpec: source: fileSystemPath: - path: /train + path: /mnist-with-summaries-logs/test kind: Directory collector: kind: TensorFlowEvent @@ -26,12 +26,20 @@ spec: feasibleSpace: min: "0.01" max: "0.05" + - name: batch_size + parameterType: int + feasibleSpace: + min: "100" + max: "200" trialTemplate: primaryContainerName: tensorflow trialParameters: - name: learningRate description: Learning rate for the training model reference: learning_rate + - name: batchSize + description: Batch Size + reference: batch_size trialSpec: apiVersion: kubeflow.org/v1 kind: TFJob @@ -48,5 +56,7 @@ spec: command: - "python" - "/opt/tf-mnist-with-summaries/mnist.py" - - "--log_dir=/train/metrics" - - "--learning_rate=${trialParameters.learningRate}" + - "--epochs=1" + - "--learning-rate=${trialParameters.learningRate}" + - "--batch-size=${trialParameters.batchSize}" + - "--log-path=/mnist-with-summaries-logs" diff --git a/examples/v1beta1/trial-images/tf-mnist-with-summaries/README.md 
b/examples/v1beta1/trial-images/tf-mnist-with-summaries/README.md index dfcf79f2b75..dd20f6dccc7 100644 --- a/examples/v1beta1/trial-images/tf-mnist-with-summaries/README.md +++ b/examples/v1beta1/trial-images/tf-mnist-with-summaries/README.md @@ -4,8 +4,8 @@ This is Tensorflow MNIST image classification training container that outputs TF It uses convolutional neural network to train the model. If you want to read more about this example, visit the official -[tensorflow](https://github.com/tensorflow/tensorflow/blob/7462dcaae1e8cfe1dfd0c62dd6083f9749a9d827/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py) +[tensorflow](https://www.tensorflow.org/tutorials/quickstart/advanced) GitHub repository. Katib uses this training container in some Experiments, for instance in the -[TF Event Metrics Collector](../../metrics-collector/tfevent-metrics-collector.yaml#L42-L49). +[TF Event Metrics Collector](../../kubeflow-training-operator/tfjob-mnist-with-summaries.yaml#L46-L53). diff --git a/examples/v1beta1/trial-images/tf-mnist-with-summaries/input_data.py b/examples/v1beta1/trial-images/tf-mnist-with-summaries/input_data.py deleted file mode 100644 index c203c7b5341..00000000000 --- a/examples/v1beta1/trial-images/tf-mnist-with-summaries/input_data.py +++ /dev/null @@ -1,333 +0,0 @@ -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Functions for downloading and reading MNIST data (deprecated). - -This module and all its submodules are deprecated. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import collections -import gzip -import os - -import numpy -from six.moves import urllib -from six.moves import xrange # pylint: disable=redefined-builtin - -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import random_seed -from tensorflow.python.platform import gfile -from tensorflow.python.util.deprecation import deprecated - -_Datasets = collections.namedtuple('_Datasets', ['train', 'validation', 'test']) - -# CVDF mirror of http://yann.lecun.com/exdb/mnist/ -DEFAULT_SOURCE_URL = 'https://storage.googleapis.com/cvdf-datasets/mnist/' - - -def _read32(bytestream): - dt = numpy.dtype(numpy.uint32).newbyteorder('>') - return numpy.frombuffer(bytestream.read(4), dtype=dt)[0] - - -@deprecated(None, 'Please use tf.data to implement this functionality.') -def _extract_images(f): - """Extract the images into a 4D uint8 numpy array [index, y, x, depth]. - - Args: - f: A file object that can be passed into a gzip reader. - - Returns: - data: A 4D uint8 numpy array [index, y, x, depth]. - - Raises: - ValueError: If the bytestream does not start with 2051. 
- - """ - print('Extracting', f.name) - with gzip.GzipFile(fileobj=f) as bytestream: - magic = _read32(bytestream) - if magic != 2051: - raise ValueError('Invalid magic number %d in MNIST image file: %s' % - (magic, f.name)) - num_images = _read32(bytestream) - rows = _read32(bytestream) - cols = _read32(bytestream) - buf = bytestream.read(rows * cols * num_images) - data = numpy.frombuffer(buf, dtype=numpy.uint8) - data = data.reshape(num_images, rows, cols, 1) - return data - - -@deprecated(None, 'Please use tf.one_hot on tensors.') -def _dense_to_one_hot(labels_dense, num_classes): - """Convert class labels from scalars to one-hot vectors.""" - num_labels = labels_dense.shape[0] - index_offset = numpy.arange(num_labels) * num_classes - labels_one_hot = numpy.zeros((num_labels, num_classes)) - labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1 - return labels_one_hot - - -@deprecated(None, 'Please use tf.data to implement this functionality.') -def _extract_labels(f, one_hot=False, num_classes=10): - """Extract the labels into a 1D uint8 numpy array [index]. - - Args: - f: A file object that can be passed into a gzip reader. - one_hot: Does one hot encoding for the result. - num_classes: Number of classes for the one hot encoding. - - Returns: - labels: a 1D uint8 numpy array. - - Raises: - ValueError: If the bystream doesn't start with 2049. - """ - print('Extracting', f.name) - with gzip.GzipFile(fileobj=f) as bytestream: - magic = _read32(bytestream) - if magic != 2049: - raise ValueError('Invalid magic number %d in MNIST label file: %s' % - (magic, f.name)) - num_items = _read32(bytestream) - buf = bytestream.read(num_items) - labels = numpy.frombuffer(buf, dtype=numpy.uint8) - if one_hot: - return _dense_to_one_hot(labels, num_classes) - return labels - - -class _DataSet(object): - """Container class for a _DataSet (deprecated). - - THIS CLASS IS DEPRECATED. - """ - - @deprecated(None, 'Please use alternatives such as official/mnist/_DataSet.py' - ' from tensorflow/models.') - def __init__(self, - images, - labels, - fake_data=False, - one_hot=False, - dtype=dtypes.float32, - reshape=True, - seed=None): - """Construct a _DataSet. - - one_hot arg is used only if fake_data is true. `dtype` can be either - `uint8` to leave the input as `[0, 255]`, or `float32` to rescale into - `[0, 1]`. Seed arg provides for convenient deterministic testing. - - Args: - images: The images - labels: The labels - fake_data: Ignore inages and labels, use fake data. - one_hot: Bool, return the labels as one hot vectors (if True) or ints (if - False). - dtype: Output image dtype. One of [uint8, float32]. `uint8` output has - range [0,255]. float32 output has range [0,1]. - reshape: Bool. If True returned images are returned flattened to vectors. - seed: The random seed to use. 
- """ - seed1, seed2 = random_seed.get_seed(seed) - # If op level seed is not set, use whatever graph level seed is returned - numpy.random.seed(seed1 if seed is None else seed2) - dtype = dtypes.as_dtype(dtype).base_dtype - if dtype not in (dtypes.uint8, dtypes.float32): - raise TypeError('Invalid image dtype %r, expected uint8 or float32' % - dtype) - if fake_data: - self._num_examples = 10000 - self.one_hot = one_hot - else: - assert images.shape[0] == labels.shape[0], ( - 'images.shape: %s labels.shape: %s' % (images.shape, labels.shape)) - self._num_examples = images.shape[0] - - # Convert shape from [num examples, rows, columns, depth] - # to [num examples, rows*columns] (assuming depth == 1) - if reshape: - assert images.shape[3] == 1 - images = images.reshape(images.shape[0], - images.shape[1] * images.shape[2]) - if dtype == dtypes.float32: - # Convert from [0, 255] -> [0.0, 1.0]. - images = images.astype(numpy.float32) - images = numpy.multiply(images, 1.0 / 255.0) - self._images = images - self._labels = labels - self._epochs_completed = 0 - self._index_in_epoch = 0 - - @property - def images(self): - return self._images - - @property - def labels(self): - return self._labels - - @property - def num_examples(self): - return self._num_examples - - @property - def epochs_completed(self): - return self._epochs_completed - - def next_batch(self, batch_size, fake_data=False, shuffle=True): - """Return the next `batch_size` examples from this data set.""" - if fake_data: - fake_image = [1] * 784 - if self.one_hot: - fake_label = [1] + [0] * 9 - else: - fake_label = 0 - return [fake_image for _ in xrange(batch_size) - ], [fake_label for _ in xrange(batch_size)] - start = self._index_in_epoch - # Shuffle for the first epoch - if self._epochs_completed == 0 and start == 0 and shuffle: - perm0 = numpy.arange(self._num_examples) - numpy.random.shuffle(perm0) - self._images = self.images[perm0] - self._labels = self.labels[perm0] - # Go to the next epoch - if start + batch_size > self._num_examples: - # Finished epoch - self._epochs_completed += 1 - # Get the rest examples in this epoch - rest_num_examples = self._num_examples - start - images_rest_part = self._images[start:self._num_examples] - labels_rest_part = self._labels[start:self._num_examples] - # Shuffle the data - if shuffle: - perm = numpy.arange(self._num_examples) - numpy.random.shuffle(perm) - self._images = self.images[perm] - self._labels = self.labels[perm] - # Start next epoch - start = 0 - self._index_in_epoch = batch_size - rest_num_examples - end = self._index_in_epoch - images_new_part = self._images[start:end] - labels_new_part = self._labels[start:end] - return numpy.concatenate((images_rest_part, images_new_part), - axis=0), numpy.concatenate( - (labels_rest_part, labels_new_part), axis=0) - else: - self._index_in_epoch += batch_size - end = self._index_in_epoch - return self._images[start:end], self._labels[start:end] - - -@deprecated(None, 'Please write your own downloading logic.') -def _maybe_download(filename, work_directory, source_url): - """Download the data from source url, unless it's already here. - - Args: - filename: string, name of the file in the directory. - work_directory: string, path to working directory. - source_url: url to download from if file doesn't exist. - - Returns: - Path to resulting file. 
- """ - if not gfile.Exists(work_directory): - gfile.MakeDirs(work_directory) - filepath = os.path.join(work_directory, filename) - if not gfile.Exists(filepath): - urllib.request.urlretrieve(source_url, filepath) - with gfile.GFile(filepath) as f: - size = f.size() - print('Successfully downloaded', filename, size, 'bytes.') - return filepath - - -@deprecated(None, 'Please use alternatives such as:' - ' tensorflow_datasets.load(\'mnist\')') -def read_data_sets(train_dir, - fake_data=False, - one_hot=False, - dtype=dtypes.float32, - reshape=True, - validation_size=5000, - seed=None, - source_url=DEFAULT_SOURCE_URL): - if fake_data: - - def fake(): - return _DataSet([], [], - fake_data=True, - one_hot=one_hot, - dtype=dtype, - seed=seed) - - train = fake() - validation = fake() - test = fake() - return _Datasets(train=train, validation=validation, test=test) - - if not source_url: # empty string check - source_url = DEFAULT_SOURCE_URL - - train_images_file = 'train-images-idx3-ubyte.gz' - train_labels_file = 'train-labels-idx1-ubyte.gz' - test_images_file = 't10k-images-idx3-ubyte.gz' - test_labels_file = 't10k-labels-idx1-ubyte.gz' - - local_file = _maybe_download(train_images_file, train_dir, - source_url + train_images_file) - with gfile.Open(local_file, 'rb') as f: - train_images = _extract_images(f) - - local_file = _maybe_download(train_labels_file, train_dir, - source_url + train_labels_file) - with gfile.Open(local_file, 'rb') as f: - train_labels = _extract_labels(f, one_hot=one_hot) - - local_file = _maybe_download(test_images_file, train_dir, - source_url + test_images_file) - with gfile.Open(local_file, 'rb') as f: - test_images = _extract_images(f) - - local_file = _maybe_download(test_labels_file, train_dir, - source_url + test_labels_file) - with gfile.Open(local_file, 'rb') as f: - test_labels = _extract_labels(f, one_hot=one_hot) - - if not 0 <= validation_size <= len(train_images): - raise ValueError( - 'Validation size should be between 0 and {}. Received: {}.'.format( - len(train_images), validation_size)) - - validation_images = train_images[:validation_size] - validation_labels = train_labels[:validation_size] - train_images = train_images[validation_size:] - train_labels = train_labels[validation_size:] - - options = dict(dtype=dtype, reshape=reshape, seed=seed) - - train = _DataSet(train_images, train_labels, **options) - validation = _DataSet(validation_images, validation_labels, **options) - test = _DataSet(test_images, test_labels, **options) - - return _Datasets(train=train, validation=validation, test=test) - diff --git a/examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py b/examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py index 6cd093d9029..9795aef1e92 100644 --- a/examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py +++ b/examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py @@ -1,218 +1,137 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# Copyright 2021 The Kubeflow Authors. # -# Licensed under the Apache License, Version 2.0 (the 'License'); +# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, +# distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# ============================================================================== -"""A simple MNIST classifier which displays summaries in TensorBoard. - -This is an unimpressive MNIST model, but it is a good example of using -tf.name_scope to make a graph legible in the TensorBoard graph explorer, and of -naming summary tags so that they are grouped meaningfully in TensorBoard. - -It demonstrates the functionality of every TensorBoard dashboard. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function import argparse import os -import sys - import tensorflow as tf -# from tensorflow.examples.tutorials.mnist import input_data -import input_data - -FLAGS = None - - -def train(): - # Import data - mnist = input_data.read_data_sets(FLAGS.data_dir, - fake_data=FLAGS.fake_data) - - sess = tf.compat.v1.InteractiveSession() - # Create a multilayer model. - - # Input placeholders - with tf.compat.v1.name_scope('input'): - x = tf.compat.v1.placeholder(tf.float32, [None, 784], name='x-input') - y_ = tf.compat.v1.placeholder(tf.int64, [None], name='y-input') - - with tf.compat.v1.name_scope('input_reshape'): - image_shaped_input = tf.reshape(x, [-1, 28, 28, 1]) - tf.compat.v1.summary.image('input', image_shaped_input, 10) - - # We can't initialize these variables to 0 - the network will get stuck. - def weight_variable(shape): - """Create a weight variable with appropriate initialization.""" - initial = tf.random.truncated_normal(shape, stddev=0.1) - return tf.Variable(initial) - - def bias_variable(shape): - """Create a bias variable with appropriate initialization.""" - initial = tf.constant(0.1, shape=shape) - return tf.Variable(initial) - - def variable_summaries(var): - """Attach a lot of summaries to a Tensor (for TensorBoard visualization).""" - with tf.compat.v1.name_scope('summaries'): - mean = tf.reduce_mean(input_tensor=var) - tf.compat.v1.summary.scalar('mean', mean) - with tf.compat.v1.name_scope('stddev'): - stddev = tf.sqrt(tf.reduce_mean(input_tensor=tf.square(var - mean))) - tf.compat.v1.summary.scalar('stddev', stddev) - tf.compat.v1.summary.scalar('max', tf.reduce_max(input_tensor=var)) - tf.compat.v1.summary.scalar('min', tf.reduce_min(input_tensor=var)) - tf.compat.v1.summary.histogram('histogram', var) - - def nn_layer(input_tensor, input_dim, output_dim, layer_name, act=tf.nn.relu): - """Reusable code for making a simple neural net layer. - - It does a matrix multiply, bias add, and then uses ReLU to nonlinearize. - It also sets up name scoping so that the resultant graph is easy to read, - and adds a number of summary ops. - """ - # Adding a name scope ensures logical grouping of the layers in the graph. 
- with tf.compat.v1.name_scope(layer_name): - # This Variable will hold the state of the weights for the layer - with tf.compat.v1.name_scope('weights'): - weights = weight_variable([input_dim, output_dim]) - variable_summaries(weights) - with tf.compat.v1.name_scope('biases'): - biases = bias_variable([output_dim]) - variable_summaries(biases) - with tf.compat.v1.name_scope('Wx_plus_b'): - preactivate = tf.matmul(input_tensor, weights) + biases - tf.compat.v1.summary.histogram('pre_activations', preactivate) - activations = act(preactivate, name='activation') - tf.compat.v1.summary.histogram('activations', activations) - return activations - - hidden1 = nn_layer(x, 784, 500, 'layer1') - - with tf.compat.v1.name_scope('dropout'): - keep_prob = tf.compat.v1.placeholder(tf.float32) - tf.compat.v1.summary.scalar('dropout_keep_probability', keep_prob) - dropped = tf.nn.dropout(hidden1, rate=(1 - keep_prob)) - - # Do not apply softmax activation yet, see below. - y = nn_layer(dropped, 500, 10, 'layer2', act=tf.identity) - - with tf.compat.v1.name_scope('cross_entropy'): - # The raw formulation of cross-entropy, - # - # tf.reduce_mean(-tf.reduce_sum(y_ * tf.math.log(tf.softmax(y)), - # reduction_indices=[1])) - # - # can be numerically unstable. - # - # So here we use tf.compat.v1.losses.sparse_softmax_cross_entropy on the - # raw logit outputs of the nn_layer above, and then average across - # the batch. - with tf.compat.v1.name_scope('total'): - cross_entropy = tf.compat.v1.losses.sparse_softmax_cross_entropy( - labels=y_, logits=y) - tf.compat.v1.summary.scalar('cross_entropy', cross_entropy) - - with tf.compat.v1.name_scope('train'): - train_step = tf.compat.v1.train.AdamOptimizer(FLAGS.learning_rate).minimize( - cross_entropy) - - with tf.compat.v1.name_scope('accuracy'): - with tf.compat.v1.name_scope('correct_prediction'): - correct_prediction = tf.equal(tf.argmax(input=y, axis=1), y_) - with tf.compat.v1.name_scope('accuracy'): - accuracy = tf.reduce_mean(input_tensor=tf.cast(correct_prediction, - tf.float32)) - tf.compat.v1.summary.scalar('accuracy', accuracy) - - # Merge all the summaries and write them out to - # /tmp/tensorflow/mnist/logs/mnist_with_summaries (by default) - merged = tf.compat.v1.summary.merge_all() - train_writer = tf.compat.v1.summary.FileWriter(FLAGS.log_dir + '/train', - sess.graph) - test_writer = tf.compat.v1.summary.FileWriter(FLAGS.log_dir + '/test') - tf.compat.v1.global_variables_initializer().run() - - # Train the model, and also write summaries. 
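The hunk above strips out the TF1 graph-mode summary plumbing: a merged summary op, session-bound FileWriters, and explicit run calls. The replacement later in this diff writes scalars through the TF2 eager API instead. A minimal sketch of that idiom, with an illustrative path and values that are not taken from the patch:

    import tensorflow as tf

    # One writer per log directory; TensorBoard and Katib's TFEvent
    # metrics collector both discover event files under this path.
    writer = tf.summary.create_file_writer("/tmp/logs/train")

    for epoch in range(3):
        loss = 1.0 / (epoch + 1)  # placeholder metric value
        with writer.as_default():
            # Appends one scalar event tagged "loss" at the given step.
            tf.summary.scalar("loss", loss, step=epoch)

    writer.flush()  # make sure buffered events reach disk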
- # Every 10th step, measure test-set accuracy, and write test summaries - # All other steps, run train_step on training data, & add training summaries - - def feed_dict(train): - """Make a TensorFlow feed_dict: maps data onto Tensor placeholders.""" - if train or FLAGS.fake_data: - xs, ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data) - k = FLAGS.dropout - else: - xs, ys = mnist.test.images, mnist.test.labels - k = 1.0 - return {x: xs, y_: ys, keep_prob: k} - - for i in range(FLAGS.max_steps): - if i % 10 == 0: # Record summaries and test-set accuracy - summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False)) - test_writer.add_summary(summary, i) - print('Accuracy at step %s: %s' % (i, acc)) - else: # Record train set summaries, and train - if i % 100 == 99: # Record execution stats - run_options = tf.compat.v1.RunOptions( - trace_level=tf.compat.v1.RunOptions.FULL_TRACE) - run_metadata = tf.compat.v1.RunMetadata() - summary, _ = sess.run([merged, train_step], - feed_dict=feed_dict(True), - options=run_options, - run_metadata=run_metadata) - train_writer.add_run_metadata(run_metadata, 'step%03d' % i) - train_writer.add_summary(summary, i) - print('Adding run metadata for', i) - else: # Record a summary - summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True)) - train_writer.add_summary(summary, i) - train_writer.close() - test_writer.close() - - -def main(_): - if tf.io.gfile.exists(FLAGS.log_dir): - tf.io.gfile.rmtree(FLAGS.log_dir) - tf.io.gfile.makedirs(FLAGS.log_dir) - with tf.Graph().as_default(): - train() - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--fake_data', nargs='?', const=True, type=bool, - default=False, - help='If true, uses fake data for unit testing.') - parser.add_argument('--max_steps', type=int, default=1000, - help='Number of steps to run trainer.') - parser.add_argument('--learning_rate', type=float, default=0.001, - help='Initial learning rate') - parser.add_argument('--dropout', type=float, default=0.9, - help='Keep probability for training dropout.') - parser.add_argument( - '--data_dir', - type=str, - default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'), - 'tensorflow/mnist/input_data'), - help='Directory for storing input data') - parser.add_argument( - '--log_dir', - type=str, - default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'), - 'tensorflow/mnist/logs/mnist_with_summaries'), - help='Summaries log directory') - FLAGS, unparsed = parser.parse_known_args() - tf.compat.v1.app.run(main=main, argv=[sys.argv[0]] + unparsed) +from tensorflow.keras.layers import Dense, Flatten, Conv2D +from tensorflow.keras import Model + + +class MyModel(Model): + def __init__(self): + super(MyModel, self).__init__() + self.conv1 = Conv2D(32, 3, activation='relu') + self.flatten = Flatten() + self.d1 = Dense(128, activation='relu') + self.d2 = Dense(10) + + def call(self, x): + x = self.conv1(x) + x = self.flatten(x) + x = self.d1(x) + return self.d2(x) + + +def train_step(args, model, optimizer, train_ds, epoch, loss_object, train_summary_writer, train_loss, train_accuracy): + for step, (images, labels) in enumerate(train_ds): + with tf.GradientTape() as tape: + # training=True is only needed if there are layers with different + # behavior during training versus inference (e.g. Dropout). 
+ predictions = model(images, training=True) + loss = loss_object(labels, predictions) + gradients = tape.gradient(loss, model.trainable_variables) + optimizer.apply_gradients(zip(gradients, model.trainable_variables)) + + train_loss(loss) + train_accuracy(labels, predictions) + + if step % args.log_interval == 0: + print("Train Epoch: {} [{}/60000 ({:.0f}%)]\tloss={:.4f}, accuracy={:.4f}".format( + epoch + 1, step * args.batch_size, 100. * step * args.batch_size / 60000, + train_loss.result(), train_accuracy.result() * 100) + ) + + with train_summary_writer.as_default(): + tf.summary.scalar('loss', train_loss.result(), step=epoch) + tf.summary.scalar('accuracy', train_accuracy.result(), step=epoch) + + +def test_step(model, test_ds, epoch, loss_object, test_summary_writer, test_loss, test_accuracy): + for (images, labels) in test_ds: + # training=False is only needed if there are layers with different + # behavior during training versus inference (e.g. Dropout). + predictions = model(images, training=False) + t_loss = loss_object(labels, predictions) + + test_loss(t_loss) + test_accuracy(labels, predictions) + + with test_summary_writer.as_default(): + tf.summary.scalar('loss', test_loss.result(), step=epoch) + tf.summary.scalar('accuracy', test_accuracy.result(), step=epoch) + + print("Test Loss: {:.4f}, Test Accuracy: {:.4f}\n".format( + test_loss.result(), test_accuracy.result() * 100) + ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--batch-size', type=int, default=64, + help='input batch size for training (default: 64)') + parser.add_argument('--learning-rate', type=float, default=0.001, + help='learning rate (default: 0.001)') + parser.add_argument("--epochs", type=int, default=10, metavar="N", + help="number of epochs to train (default: 10)") + parser.add_argument("--log-interval", type=int, default=100, metavar="N", + help="how many batches to wait before logging training status (default: 100)") + parser.add_argument( + '--log-path', + type=str, + default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'), + 'tensorflow/mnist/logs/mnist_with_summaries'), + help='Summaries log PATH') + args = parser.parse_args() + + # Setup dataset + mnist = tf.keras.datasets.mnist + (x_train, y_train), (x_test, y_test) = mnist.load_data() + x_train, x_test = x_train / 255.0, x_test / 255.0 + # Add a channels dimension + x_train = x_train[..., tf.newaxis].astype("float32") + x_test = x_test[..., tf.newaxis].astype("float32") + train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(10000).batch(args.batch_size) + test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(args.batch_size) + + # Setup tensorflow summaries + train_log_dir = os.path.join(args.log_path, 'train') + test_log_dir = os.path.join(args.log_path, 'test') + train_summary_writer = tf.summary.create_file_writer(train_log_dir) + test_summary_writer = tf.summary.create_file_writer(test_log_dir) + + # Create an instance of the model + model = MyModel() + loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) + optimizer = tf.keras.optimizers.Adam(learning_rate=args.learning_rate) + + train_loss = tf.keras.metrics.Mean(name='train_loss') + train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy') + + test_loss = tf.keras.metrics.Mean(name='test_loss') + test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy') + + for epoch in range(args.epochs): + # Reset the metrics at the start of the next epoch + 
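One thing to flag here: the comment above promises that the metrics are reset at the start of each epoch, but the two calls that follow only flush the summary writers. Keras metric objects such as train_loss and train_accuracy keep accumulating until their state is cleared, so as written each epoch reports a running average over every epoch seen so far. If strictly per-epoch values are intended, the reset would look roughly like this (a sketch, not part of the patch; reset_states() is the TF 2.x spelling):

    # Clear accumulated state so each epoch reports its own average
    # rather than a running average over all epochs so far.
    train_loss.reset_states()
    train_accuracy.reset_states()
    test_loss.reset_states()
    test_accuracy.reset_states()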
train_summary_writer.flush() + test_summary_writer.flush() + + train_step(args, model, optimizer, train_ds, epoch, loss_object, train_summary_writer, + train_loss, train_accuracy) + test_step(model, test_ds, epoch, loss_object, test_summary_writer, test_loss, test_accuracy) + + +if __name__ == "__main__": + main() diff --git a/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py b/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py index 25478e56846..53140a3cce8 100644 --- a/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py +++ b/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py @@ -30,12 +30,14 @@ import const class TFEventFileParser: + @staticmethod def find_all_files(self, directory): for root, dirs, files in os.walk(directory): yield root for f in files: yield os.path.join(root, f) + @staticmethod def parse_summary(self, tfefile, metrics): metric_logs = [] for summary in tf.compat.v1.train.summary_iterator(tfefile): From f81f9bad21803566d5a02d6b963fd2fc394b4479 Mon Sep 17 00:00:00 2001 From: tenzen-y Date: Sun, 21 Nov 2021 02:53:26 +0900 Subject: [PATCH 08/17] review: remove Dockerfile.ppc64le for new-ui --- cmd/new-ui/v1beta1/Dockerfile.ppc64le | 63 --------------------------- scripts/v1beta1/build.sh | 6 +-- 2 files changed, 1 insertion(+), 68 deletions(-) delete mode 100644 cmd/new-ui/v1beta1/Dockerfile.ppc64le diff --git a/cmd/new-ui/v1beta1/Dockerfile.ppc64le b/cmd/new-ui/v1beta1/Dockerfile.ppc64le deleted file mode 100644 index 1902becd801..00000000000 --- a/cmd/new-ui/v1beta1/Dockerfile.ppc64le +++ /dev/null @@ -1,63 +0,0 @@ -# --- Clone the kubeflow/kubeflow code --- -FROM ubuntu AS fetch-kubeflow-kubeflow - -RUN apt-get update && apt-get install git -y - -WORKDIR /kf -RUN git clone https://github.com/kubeflow/kubeflow.git && \ - cd kubeflow && \ - git checkout 24bcb8e - -# --- Build the frontend kubeflow library --- -FROM ppc64le/node:12 AS frontend-kubeflow-lib - -WORKDIR /src - -ARG LIB=/kf/kubeflow/components/crud-web-apps/common/frontend/kubeflow-common-lib -COPY --from=fetch-kubeflow-kubeflow $LIB/package*.json ./ -RUN npm ci - -COPY --from=fetch-kubeflow-kubeflow $LIB/ ./ -RUN npm run build - -# --- Build the frontend --- -FROM ppc64le/node:12 AS frontend - -WORKDIR /src -COPY ./pkg/new-ui/v1beta1/frontend/package*.json ./ -RUN npm ci - -COPY ./pkg/new-ui/v1beta1/frontend/ . -COPY --from=frontend-kubeflow-lib /src/dist/kubeflow/ ./node_modules/kubeflow/ - -RUN npm run build:prod - -# --- Build the backend --- -FROM golang:alpine AS go-build - -WORKDIR /go/src/github.com/kubeflow/katib - -# Download packages. -COPY go.mod . -COPY go.sum . -RUN go mod download -x - -# Copy sources. -COPY cmd/ cmd/ -COPY pkg/ pkg/ - -# Build the binary. 
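A detail of patch 07's tfevent_loader.py hunk above is worth flagging: the @staticmethod decorators were added while both methods kept their explicit self parameter. With @staticmethod no instance is bound, so the first positional argument fills self and the real parameters shift by one. A small reproduction with a hypothetical class, for illustration only:

    class Parser:
        @staticmethod
        def find_all_files(self, directory):
            return directory

    try:
        Parser.find_all_files("/logs")  # "/logs" binds to `self`
    except TypeError as e:
        print(e)  # missing 1 required positional argument: 'directory'

Patch 10 below resolves this by dropping self from find_all_files() and turning parse_summary() back into a regular instance method that reads self.metric_names.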
-RUN if [ "$(uname -m)" = "ppc64le" ]; then \ - CGO_ENABLED=0 GOOS=linux GOARCH=ppc64le go build -a -o katib-ui ./cmd/new-ui/v1beta1; \ - elif [ "$(uname -m)" = "aarch64" ]; then \ - CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build -a -o katib-ui ./cmd/new-ui/v1beta1; \ - else \ - CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -a -o katib-ui ./cmd/new-ui/v1beta1; \ - fi - -# --- Compose the web app --- -FROM alpine:3.7 -WORKDIR /app -COPY --from=go-build /go/src/github.com/kubeflow/katib/katib-ui /app/ -COPY --from=frontend /src/dist/static /app/build/static/ -ENTRYPOINT ["./katib-ui"] diff --git a/scripts/v1beta1/build.sh b/scripts/v1beta1/build.sh index 41a1bdf6b07..1d4678fbcc4 100755 --- a/scripts/v1beta1/build.sh +++ b/scripts/v1beta1/build.sh @@ -61,11 +61,7 @@ docker build --platform "linux/$ARCH" -t "${REGISTRY}/katib-db-manager:${TAG}" - # TODO (andreyvelich): Switch to ${CMD_PREFIX}/ui/${VERSION}/Dockerfile once old UI is deprecated. echo -e "\nBuilding Katib UI image...\n" -if [ "$ARCH" == "ppc64le" ]; then - docker build --platform "linux/$ARCH" -t "${REGISTRY}/katib-ui:${TAG}" -f ${CMD_PREFIX}/new-ui/${VERSION}/Dockerfile.ppc64le . -else \ - docker build --platform "linux/$ARCH" -t "${REGISTRY}/katib-ui:${TAG}" -f ${CMD_PREFIX}/new-ui/${VERSION}/Dockerfile . -fi +docker build --platform "linux/$ARCH" -t "${REGISTRY}/katib-ui:${TAG}" -f ${CMD_PREFIX}/new-ui/${VERSION}/Dockerfile . echo -e "\nBuilding Katib cert generator image...\n" docker build --platform "linux/$ARCH" -t "${REGISTRY}/cert-generator:${TAG}" -f ${CMD_PREFIX}/cert-generator/${VERSION}/Dockerfile . From c521c4028e028cd411be3e18d6917e2ef1ad3db4 Mon Sep 17 00:00:00 2001 From: tenzen-y Date: Sun, 21 Nov 2021 03:11:14 +0900 Subject: [PATCH 09/17] review: update docs related tf-mnist-with-summaries --- docs/images-location.md | 4 ++-- examples/v1beta1/README.md | 2 ++ .../v1beta1/trial-images/tf-mnist-with-summaries/README.md | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/images-location.md b/docs/images-location.md index f96e8ffc1b5..d6c9eb82ea8 100644 --- a/docs/images-location.md +++ b/docs/images-location.md @@ -284,13 +284,13 @@ The following table shows images for training containers which are used in the - gcr.io/kubeflow-ci/tf-mnist-with-summaries + docker.io/kubeflowkatib/tf-mnist-with-summaries Tensorflow MNIST example with saving metrics in the summaries - Dockerfile + Dockerfile diff --git a/examples/v1beta1/README.md b/examples/v1beta1/README.md index 2b3a6804f45..11e3508efa3 100644 --- a/examples/v1beta1/README.md +++ b/examples/v1beta1/README.md @@ -100,6 +100,8 @@ Check the following examples: Check the following images for the Trial containers: +- [Tensorflow MNIST with summaries](./trial-images/tf-mnist-with-summaries) + - [MXNet MNIST](./trial-images/mxnet-mnist) - [PyTorch MNIST](./trial-images/pytorch-mnist) diff --git a/examples/v1beta1/trial-images/tf-mnist-with-summaries/README.md b/examples/v1beta1/trial-images/tf-mnist-with-summaries/README.md index dd20f6dccc7..8f8fb4e5182 100644 --- a/examples/v1beta1/trial-images/tf-mnist-with-summaries/README.md +++ b/examples/v1beta1/trial-images/tf-mnist-with-summaries/README.md @@ -5,7 +5,7 @@ It uses convolutional neural network to train the model. If you want to read more about this example, visit the official [tensorflow](https://www.tensorflow.org/tutorials/quickstart/advanced) -GitHub repository. +documentation. 
Katib uses this training container in some Experiments, for instance in the -[TF Event Metrics Collector](../../kubeflow-training-operator/tfjob-mnist-with-summaries.yaml#L46-L53). +[TFJob example](../../kubeflow-training-operator/tfjob-mnist-with-summaries.yaml#L54-L62). From d125f3754387f59001e1dc732e2473e37a9d3e78 Mon Sep 17 00:00:00 2001 From: tenzen-y Date: Wed, 24 Nov 2021 01:02:25 +0900 Subject: [PATCH 10/17] TFEventMetricsCollector supports TF>=2.0 and stop supporting TF <=1.x --- .github/workflows/test-python.yaml | 3 + .../tfevent_loader.py | 57 ++++++++++-------- .../test_tfevent_metricscollector.py | 46 ++++++++++++++ ...t.tfevents.1637681485.2d662933d616.19.1.v2 | Bin 0 -> 1316 bytes ...t.tfevents.1637681485.2d662933d616.19.0.v2 | Bin 0 -> 1316 bytes test/unit/v1beta1/metricscollector/utils.py | 23 +++++++ test/unit/v1beta1/suggestion/utils.py | 13 ++++ 7 files changed, 117 insertions(+), 25 deletions(-) create mode 100644 test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py create mode 100644 test/unit/v1beta1/metricscollector/testdata/tfevent-metricscollector/logs/test/events.out.tfevents.1637681485.2d662933d616.19.1.v2 create mode 100644 test/unit/v1beta1/metricscollector/testdata/tfevent-metricscollector/logs/train/events.out.tfevents.1637681485.2d662933d616.19.0.v2 create mode 100644 test/unit/v1beta1/metricscollector/utils.py diff --git a/.github/workflows/test-python.yaml b/.github/workflows/test-python.yaml index 4b497734e73..47760bda39d 100644 --- a/.github/workflows/test-python.yaml +++ b/.github/workflows/test-python.yaml @@ -34,6 +34,9 @@ jobs: - name: Run Python test run: | + # for gRPC API export PYTHONPATH=$(pwd):$(pwd)/pkg/apis/manager/v1beta1/python:$(pwd)/pkg/apis/manager/health/python + # for tfevent-metricscollector + export PYTHONPATH=$PYTHONPATH:$(pwd)/pkg/metricscollector/v1beta1/common:$(pwd)/pkg/metricscollector/v1beta1/tfevent-metricscollector pytest ./test/unit/v1beta1/suggestion pytest ./test/unit/v1beta1/earlystopping diff --git a/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py b/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py index 53140a3cce8..5377f854d62 100644 --- a/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py +++ b/pkg/metricscollector/v1beta1/tfevent-metricscollector/tfevent_loader.py @@ -14,48 +14,55 @@ # TFEventFileParser parses tfevent files and returns an ObservationLog of the metrics specified. # When the event file is under a directory(e.g. test dir), please specify "{{dirname}}/{{metrics name}}" -# For example, in the Kubeflow Training Operator TFJob tutorial for mnist with summary: -# https://github.com/kubeflow/training-operator/blob/master/examples/tensorflow/mnist_with_summaries/mnist_with_summaries.py. -# The "accuracy" metric is saved under "train" and "test" directories. +# For example, in the Tensorflow MNIST Classification With Summaries: +# https://github.com/kubeflow/katib/blob/master/examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py. +# The "accuracy" and "loss" metric is saved under "train" and "test" directories. # So in the Metrics Collector specification, please specify name of "train" or "test" directory. 
# Check TFJob example for more information: # https://github.com/kubeflow/katib/blob/master/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml#L16-L22 import tensorflow as tf +from tensorboard.backend.event_processing.event_accumulator import EventAccumulator import os from datetime import datetime import rfc3339 import api_pb2 from logging import getLogger, StreamHandler, INFO -import const +from pkg.metricscollector.v1beta1.common import const + class TFEventFileParser: + def __init__(self, metric_names): + self.metric_names = metric_names + @staticmethod - def find_all_files(self, directory): + def find_all_files(directory): for root, dirs, files in os.walk(directory): - yield root for f in files: yield os.path.join(root, f) - @staticmethod - def parse_summary(self, tfefile, metrics): + def parse_summary(self, tfefile): metric_logs = [] - for summary in tf.compat.v1.train.summary_iterator(tfefile): - paths = tfefile.split("/") - for v in summary.summary.value: - for m in metrics: - tag = str(v.tag) - if len(paths) >= 2 and len(m.split("/")) >= 2: - tag = str(paths[-2]+"/" + v.tag) - if tag.startswith(m): - ml = api_pb2.MetricLog( - time_stamp=rfc3339.rfc3339(datetime.fromtimestamp(summary.wall_time)), - metric=api_pb2.Metric( - name=m, - value=str(v.simple_value) - ) + event_accumulator = EventAccumulator(tfefile, size_guidance={'tensors': 0}) + event_accumulator.Reload() + for tag in event_accumulator.Tags()['tensors']: + for m in self.metric_names: + + tfefile_parent_dir = os.path.dirname(m) if len(m.split("/")) >= 2 else os.path.dirname(tfefile) + basedir_name = os.path.dirname(tfefile) + if not tag.startswith(m.split("/")[-1]) or not basedir_name.endswith(tfefile_parent_dir): + continue + + for wall_time, step, tensor in event_accumulator.Tensors(tag): + ml = api_pb2.MetricLog( + time_stamp=rfc3339.rfc3339(datetime.fromtimestamp(wall_time)), + metric=api_pb2.Metric( + name=m, + value=str(tf.make_ndarray(tensor)) ) - metric_logs.append(ml) + ) + metric_logs.append(ml) + return metric_logs @@ -68,7 +75,7 @@ def __init__(self, metric_names): self.logger.addHandler(handler) self.logger.propagate = False self.metrics = metric_names - self.parser = TFEventFileParser() + self.parser = TFEventFileParser(self.metrics) def parse_file(self, directory): mls = [] @@ -77,7 +84,7 @@ def parse_file(self, directory): continue try: self.logger.info(f + " will be parsed.") - mls.extend(self.parser.parse_summary(f, self.metrics)) + mls.extend(self.parser.parse_summary(f)) except Exception as e: self.logger.warning("Unexpected error: " + str(e)) continue diff --git a/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py b/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py new file mode 100644 index 00000000000..81d9952a77d --- /dev/null +++ b/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py @@ -0,0 +1,46 @@ +# Copyright 2021 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
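The rewritten parser above switches from tf.compat.v1.train.summary_iterator to TensorBoard's EventAccumulator because tf.summary.scalar in TF 2.x records tensor events rather than simple_value summaries. Reading one of the generated log directories back by hand works roughly like this (the path is illustrative):

    import tensorflow as tf
    from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

    # size_guidance={'tensors': 0} keeps every tensor event instead of
    # EventAccumulator's default downsampling.
    acc = EventAccumulator("/tmp/logs/train", size_guidance={'tensors': 0})
    acc.Reload()  # scan the event files on disk

    for tag in acc.Tags()['tensors']:  # e.g. 'accuracy', 'loss'
        for wall_time, step, tensor_proto in acc.Tensors(tag):
            print(tag, step, float(tf.make_ndarray(tensor_proto)))

This is also why the "{{dirname}}/{{metrics name}}" convention matters: the experiment's objective can point at "test/accuracy" even though the same tag exists under train/.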
+ +import os +import unittest +import utils + + +class TestTFEventMetricsCollector(unittest.TestCase): + def test_parse_file(self): + + logs_dir = "./test/unit/v1beta1/metricscollector/testdata/tfevent-metricscollector/logs" + + # Metric format is "{{dirname}}/{{metrics name}}" + metric_names = ["train/accuracy", "train/loss", "test/loss", "test/accuracy"] + metric_logs = utils.get_metric_logs(logs_dir, metric_names) + self.assertEqual(40, len(metric_logs)) + + for log in metric_logs: + metric_name = log["metric"]["name"] + self.assertIn(metric_name, metric_name) + + # Metric format is "{{metrics name}}" + metric_names = ["accuracy", "loss"] + metrics_file_dir = os.path.join(logs_dir, "train") + metric_logs = utils.get_metric_logs(metrics_file_dir, metric_names) + self.assertEqual(20, len(metric_logs)) + + for log in metric_logs: + metric_name = log["metric"]["name"] + self.assertIn(metric_name, metric_name) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/unit/v1beta1/metricscollector/testdata/tfevent-metricscollector/logs/test/events.out.tfevents.1637681485.2d662933d616.19.1.v2 b/test/unit/v1beta1/metricscollector/testdata/tfevent-metricscollector/logs/test/events.out.tfevents.1637681485.2d662933d616.19.1.v2 new file mode 100644 index 0000000000000000000000000000000000000000..bf18e9744ccc63ade22e6a71ec01aa86ae5415c6 GIT binary patch literal 1316 zcmb1OfPlsI-b$Pd3=Y9g={Fpuc$10}GxPLZ%Tn`7tc=R-ndEh#>b|Ty<#BfFSu^3_V2n?g=#+o->Io6_6xgNCxGo|!n8lXD+ks7RCfU#6#D}L z;kvQff77ZO)&7+eU(H9c-%RmkGT44*O#3CWQc&%`P+jbfVt<+*TsKzx7p9}xe>wSN zG>ZMVMdzi1?PtNXzhrR&s{Jpl8f;PQ=TLy_#%jMoBbxp5t2ySO*uRWFAscKzE2jOL zg>k6%Gko9w8^!*Ua&X;P?N^(NX8+0mcJ(OsbN=JW2iwnvX+QgwG*tVSZeY5PV*f8r zUYKsI_DfAcvw!}fBi<&&e++_OrRYs|4H6iD|#4 dD4PFwU$r$tvA;VIt{bcUm-nFA?^x<01pqLzS6~1D literal 0 HcmV?d00001 diff --git a/test/unit/v1beta1/metricscollector/testdata/tfevent-metricscollector/logs/train/events.out.tfevents.1637681485.2d662933d616.19.0.v2 b/test/unit/v1beta1/metricscollector/testdata/tfevent-metricscollector/logs/train/events.out.tfevents.1637681485.2d662933d616.19.0.v2 new file mode 100644 index 0000000000000000000000000000000000000000..9f1f4bbd9556ba0c2731ce384646a7528d1e77e9 GIT binary patch literal 1316 zcmb1OfPlsI-b$Pd3=Y9g={Fpuc$10}GxPLZ%Tn`7tc=R-ndEh#>b|Ty!L5W3Vd!U^cHy0-tdvS7NPGV7UxW81q0aWkQuNUGuQ(~cd zRk@V8I1-bSON$bdD`EQi_7&Kp=#TvVnMV(*|LE!PU!13M9b!Os3ov5ZZlKJLYWuOS z->ZzEx>Y2@-f%vvgzLs?{~DPxRQoU11+74_zts9yJlK9FO#6QZp0-7CJ)`CK?^ zb%*Q5YJbm^3RL?qXv^55*#E<1dNSC4W=#8UFJFvmzsUJ%_9*tBae(W_YQHCE6{`I? ze8y8z>}P6>P6yl1f@y!yvJO=HE&kevq1YeN3)hX+elE^xRQnIfTr@zjf8jT|Y_R>T znD(2z$V9c@Z04n1DE7-T%ENSHwO{%Wn*EQji;={az-fj@AAng0^84yY5yT_Q&jsCZ`5+4*zX((*NxTwxBSoeX7Q`#CV}7dWYeYQL9h{5cf+1CGOWW3^vWs20`#N0gUpqSzmOeq$xr jeojpL=kF3mwVyR0wH3vF_1kdWSnW@VK(qh<$sb<;iGEe@ literal 0 HcmV?d00001 diff --git a/test/unit/v1beta1/metricscollector/utils.py b/test/unit/v1beta1/metricscollector/utils.py new file mode 100644 index 00000000000..228130c88fe --- /dev/null +++ b/test/unit/v1beta1/metricscollector/utils.py @@ -0,0 +1,23 @@ +# Copyright 2021 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from tfevent_loader import MetricsCollector +from google.protobuf import json_format + + +def get_metric_logs(logs_dir, metric_names): + mc = MetricsCollector(metric_names) + observation_log = mc.parse_file(logs_dir) + dict_observation_log = json_format.MessageToDict(observation_log) + return dict_observation_log["metricLogs"] diff --git a/test/unit/v1beta1/suggestion/utils.py b/test/unit/v1beta1/suggestion/utils.py index 99f02b76149..14f31ef80e0 100644 --- a/test/unit/v1beta1/suggestion/utils.py +++ b/test/unit/v1beta1/suggestion/utils.py @@ -1,3 +1,16 @@ +# Copyright 2021 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from pkg.apis.manager.v1beta1.python import api_pb2 From 8522be03e4903b38de7358ff4d0ead23e3d7d707 Mon Sep 17 00:00:00 2001 From: Yuki Iwai Date: Wed, 24 Nov 2021 05:23:37 +0900 Subject: [PATCH 11/17] review: add help command to scripts/v1beta1/build.sh Co-authored-by: Andrey Velichkevich --- scripts/v1beta1/build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/v1beta1/build.sh b/scripts/v1beta1/build.sh index 1d4678fbcc4..f657843eae0 100755 --- a/scripts/v1beta1/build.sh +++ b/scripts/v1beta1/build.sh @@ -38,6 +38,7 @@ function check_specified_cpu_arch() { done echo "CPU architecture '$ARCH' is not supported" echo "You can use '${SUPPORTED_CPU_ARCHS[*]}'" + echo "To get machine architecture run: uname -m" return 1 } check_specified_cpu_arch From 7e86dcb5a47e2f94434a00f70b14d2f6a8efc2a3 Mon Sep 17 00:00:00 2001 From: tenzen-y Date: Wed, 24 Nov 2021 06:02:37 +0900 Subject: [PATCH 12/17] fix unit test for tfevent-metricscollector --- .../metricscollector/test_tfevent_metricscollector.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py b/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py index 81d9952a77d..f77c3cd7d6e 100644 --- a/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py +++ b/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py @@ -28,8 +28,8 @@ def test_parse_file(self): self.assertEqual(40, len(metric_logs)) for log in metric_logs: - metric_name = log["metric"]["name"] - self.assertIn(metric_name, metric_name) + actual = log["metric"]["name"] + self.assertIn(actual, metric_names) # Metric format is "{{metrics name}}" metric_names = ["accuracy", "loss"] @@ -38,8 +38,8 @@ def test_parse_file(self): self.assertEqual(20, len(metric_logs)) for log in metric_logs: - metric_name = log["metric"]["name"] - self.assertIn(metric_name, metric_name) + actual = log["metric"]["name"] + self.assertIn(actual, metric_names) if __name__ == '__main__': From 6bdda5df607eb0fe1e554cbe7cccbdbc2020e9aa Mon Sep 17 00:00:00 2001 From: tenzen-y Date: Wed, 24 Nov 2021 15:18:26 +0900 Subject: [PATCH 13/17] review: generate tf event files on CI --- .github/workflows/test-python.yaml | 5 +++++ 
.../test_tfevent_metricscollector.py | 3 ++- ....out.tfevents.1637681485.2d662933d616.19.1.v2 | Bin 1316 -> 0 bytes ....out.tfevents.1637681485.2d662933d616.19.0.v2 | Bin 1316 -> 0 bytes 4 files changed, 7 insertions(+), 1 deletion(-) delete mode 100644 test/unit/v1beta1/metricscollector/testdata/tfevent-metricscollector/logs/test/events.out.tfevents.1637681485.2d662933d616.19.1.v2 delete mode 100644 test/unit/v1beta1/metricscollector/testdata/tfevent-metricscollector/logs/train/events.out.tfevents.1637681485.2d662933d616.19.0.v2 diff --git a/.github/workflows/test-python.yaml b/.github/workflows/test-python.yaml index 47760bda39d..7b0c539cf32 100644 --- a/.github/workflows/test-python.yaml +++ b/.github/workflows/test-python.yaml @@ -32,6 +32,11 @@ jobs: pip install -r cmd/earlystopping/medianstop/v1beta1/requirements.txt + - name: Generate Tensorflow Event file + run: | + LOG_PATH=test/unit/v1beta1/metricscollector/testdata/tfevent-metricscollector/logs + python examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py --epochs 10 --batch-size 200 --log-path $LOG_PATH + - name: Run Python test run: | # for gRPC API diff --git a/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py b/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py index f77c3cd7d6e..aab766968e5 100644 --- a/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py +++ b/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py @@ -20,7 +20,8 @@ class TestTFEventMetricsCollector(unittest.TestCase): def test_parse_file(self): - logs_dir = "./test/unit/v1beta1/metricscollector/testdata/tfevent-metricscollector/logs" + current_dir = os.getcwd() + logs_dir = os.path.join(current_dir, "testdata/tfevent-metricscollector/logs") # Metric format is "{{dirname}}/{{metrics name}}" metric_names = ["train/accuracy", "train/loss", "test/loss", "test/accuracy"] diff --git a/test/unit/v1beta1/metricscollector/testdata/tfevent-metricscollector/logs/test/events.out.tfevents.1637681485.2d662933d616.19.1.v2 b/test/unit/v1beta1/metricscollector/testdata/tfevent-metricscollector/logs/test/events.out.tfevents.1637681485.2d662933d616.19.1.v2 deleted file mode 100644 index bf18e9744ccc63ade22e6a71ec01aa86ae5415c6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1316 zcmb1OfPlsI-b$Pd3=Y9g={Fpuc$10}GxPLZ%Tn`7tc=R-ndEh#>b|Ty<#BfFSu^3_V2n?g=#+o->Io6_6xgNCxGo|!n8lXD+ks7RCfU#6#D}L z;kvQff77ZO)&7+eU(H9c-%RmkGT44*O#3CWQc&%`P+jbfVt<+*TsKzx7p9}xe>wSN zG>ZMVMdzi1?PtNXzhrR&s{Jpl8f;PQ=TLy_#%jMoBbxp5t2ySO*uRWFAscKzE2jOL zg>k6%Gko9w8^!*Ua&X;P?N^(NX8+0mcJ(OsbN=JW2iwnvX+QgwG*tVSZeY5PV*f8r zUYKsI_DfAcvw!}fBi<&&e++_OrRYs|4H6iD|#4 dD4PFwU$r$tvA;VIt{bcUm-nFA?^x<01pqLzS6~1D diff --git a/test/unit/v1beta1/metricscollector/testdata/tfevent-metricscollector/logs/train/events.out.tfevents.1637681485.2d662933d616.19.0.v2 b/test/unit/v1beta1/metricscollector/testdata/tfevent-metricscollector/logs/train/events.out.tfevents.1637681485.2d662933d616.19.0.v2 deleted file mode 100644 index 9f1f4bbd9556ba0c2731ce384646a7528d1e77e9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1316 zcmb1OfPlsI-b$Pd3=Y9g={Fpuc$10}GxPLZ%Tn`7tc=R-ndEh#>b|Ty!L5W3Vd!U^cHy0-tdvS7NPGV7UxW81q0aWkQuNUGuQ(~cd zRk@V8I1-bSON$bdD`EQi_7&Kp=#TvVnMV(*|LE!PU!13M9b!Os3ov5ZZlKJLYWuOS z->ZzEx>Y2@-f%vvgzLs?{~DPxRQoU11+74_zts9yJlK9FO#6QZp0-7CJ)`CK?^ zb%*Q5YJbm^3RL?qXv^55*#E<1dNSC4W=#8UFJFvmzsUJ%_9*tBae(W_YQHCE6{`I? 
ze8y8z>}P6>P6yl1f@y!yvJO=HE&kevq1YeN3)hX+elE^xRQnIfTr@zjf8jT|Y_R>T znD(2z$V9c@Z04n1DE7-T%ENSHwO{%Wn*EQji;={az-fj@AAng0^84yY5yT_Q&jsCZ`5+4*zX((*NxTwxBSoeX7Q`#CV}7dWYeYQL9h{5cf+1CGOWW3^vWs20`#N0gUpqSzmOeq$xr jeojpL=kF3mwVyR0wH3vF_1kdWSnW@VK(qh<$sb<;iGEe@ From a362894ab26be313c2757329b62a444eb4e3256f Mon Sep 17 00:00:00 2001 From: tenzen-y Date: Fri, 26 Nov 2021 20:49:17 +0900 Subject: [PATCH 14/17] add test command to Makefile --- .github/workflows/test-python.yaml | 27 +------------------ .gitignore | 1 + Makefile | 27 +++++++++++++++++++ .../test_tfevent_metricscollector.py | 6 ++--- .../suggestion/test_chocolate_service.py | 14 ++++++++++ .../v1beta1/suggestion/test_enas_service.py | 14 ++++++++++ 6 files changed, 60 insertions(+), 29 deletions(-) diff --git a/.github/workflows/test-python.yaml b/.github/workflows/test-python.yaml index 7b0c539cf32..c6e4ac41b76 100644 --- a/.github/workflows/test-python.yaml +++ b/.github/workflows/test-python.yaml @@ -18,30 +18,5 @@ jobs: with: python-version: 3.9 - - name: Install Packages - run: | - pip install -r test/unit/v1beta1/requirements.txt - - pip install -r cmd/suggestion/chocolate/v1beta1/requirements.txt - pip install -r cmd/suggestion/hyperopt/v1beta1/requirements.txt - pip install -r cmd/suggestion/skopt/v1beta1/requirements.txt - pip install -r cmd/suggestion/optuna/v1beta1/requirements.txt - pip install -r cmd/suggestion/nas/enas/v1beta1/requirements.txt - pip install -r cmd/suggestion/hyperband/v1beta1/requirements.txt - pip install -r cmd/suggestion/nas/darts/v1beta1/requirements.txt - - pip install -r cmd/earlystopping/medianstop/v1beta1/requirements.txt - - - name: Generate Tensorflow Event file - run: | - LOG_PATH=test/unit/v1beta1/metricscollector/testdata/tfevent-metricscollector/logs - python examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py --epochs 10 --batch-size 200 --log-path $LOG_PATH - - name: Run Python test - run: | - # for gRPC API - export PYTHONPATH=$(pwd):$(pwd)/pkg/apis/manager/v1beta1/python:$(pwd)/pkg/apis/manager/health/python - # for tfevent-metricscollector - export PYTHONPATH=$PYTHONPATH:$(pwd)/pkg/metricscollector/v1beta1/common:$(pwd)/pkg/metricscollector/v1beta1/tfevent-metricscollector - pytest ./test/unit/v1beta1/suggestion - pytest ./test/unit/v1beta1/earlystopping + run: make pytest diff --git a/.gitignore b/.gitignore index d90d0d215c8..6241a98bb08 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ __pycache__/ *.egg-info build/ *.charm +test/unit/v1beta1/metricscollector/testdata # SDK generator JAR file hack/gen-python-sdk/openapi-generator-cli.jar diff --git a/Makefile b/Makefile index 1cc4dd85ef6..315aa1b23e7 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,11 @@ COMMIT := v1beta1-$(shell git rev-parse --short=7 HEAD) KATIB_REGISTRY := docker.io/kubeflowkatib CPU_ARCH ?= amd64 +# for pytest +PYTHONPATH := $(PYTHONPATH):$(CURDIR)/pkg/apis/manager/v1beta1/python:$(CURDIR)/pkg/apis/manager/health/python +PYTHONPATH := $(PYTHONPATH):$(CURDIR)/pkg/metricscollector/v1beta1/common:$(CURDIR)/pkg/metricscollector/v1beta1/tfevent-metricscollector +TEST_TENSORFLOW_EVENT_FILE_PATH ?= $(CURDIR)/test/unit/v1beta1/metricscollector/testdata/tfevent-metricscollector/logs + # Run tests .PHONY: test test: @@ -95,3 +100,25 @@ prettier-check: # Update boilerplate for the source code. 
update-boilerplate: ./hack/boilerplate/update-boilerplate.sh + +prepare-pytest: + pip install -r test/unit/v1beta1/requirements.txt + pip install -r cmd/suggestion/chocolate/v1beta1/requirements.txt + pip install -r cmd/suggestion/hyperopt/v1beta1/requirements.txt + pip install -r cmd/suggestion/skopt/v1beta1/requirements.txt + pip install -r cmd/suggestion/optuna/v1beta1/requirements.txt + pip install -r cmd/suggestion/hyperband/v1beta1/requirements.txt + pip install -r cmd/suggestion/nas/enas/v1beta1/requirements.txt + pip install -r cmd/suggestion/nas/darts/v1beta1/requirements.txt + pip install -r cmd/earlystopping/medianstop/v1beta1/requirements.txt + pip install -r cmd/metricscollector/v1beta1/tfevent-metricscollector/requirements.txt + +prepare-pytest-testdata: +ifeq ("$(wildcard $(TEST_TENSORFLOW_EVENT_FILE_PATH))", "") + python examples/v1beta1/trial-images/tf-mnist-with-summaries/mnist.py --epochs 5 --batch-size 200 --log-path $(TEST_TENSORFLOW_EVENT_FILE_PATH) +endif + +pytest: prepare-pytest prepare-pytest-testdata + PYTHONPATH=$(PYTHONPATH) pytest ./test/unit/v1beta1/suggestion + PYTHONPATH=$(PYTHONPATH) pytest ./test/unit/v1beta1/earlystopping + PYTHONPATH=$(PYTHONPATH) pytest ./test/unit/v1beta1/metricscollector diff --git a/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py b/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py index aab766968e5..b694cbc9adb 100644 --- a/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py +++ b/test/unit/v1beta1/metricscollector/test_tfevent_metricscollector.py @@ -20,13 +20,13 @@ class TestTFEventMetricsCollector(unittest.TestCase): def test_parse_file(self): - current_dir = os.getcwd() + current_dir = os.path.dirname(os.path.abspath(__file__)) logs_dir = os.path.join(current_dir, "testdata/tfevent-metricscollector/logs") # Metric format is "{{dirname}}/{{metrics name}}" metric_names = ["train/accuracy", "train/loss", "test/loss", "test/accuracy"] metric_logs = utils.get_metric_logs(logs_dir, metric_names) - self.assertEqual(40, len(metric_logs)) + self.assertEqual(20, len(metric_logs)) for log in metric_logs: actual = log["metric"]["name"] @@ -36,7 +36,7 @@ def test_parse_file(self): metric_names = ["accuracy", "loss"] metrics_file_dir = os.path.join(logs_dir, "train") metric_logs = utils.get_metric_logs(metrics_file_dir, metric_names) - self.assertEqual(20, len(metric_logs)) + self.assertEqual(10, len(metric_logs)) for log in metric_logs: actual = log["metric"]["name"] diff --git a/test/unit/v1beta1/suggestion/test_chocolate_service.py b/test/unit/v1beta1/suggestion/test_chocolate_service.py index 75151e92668..0d811c322f8 100644 --- a/test/unit/v1beta1/suggestion/test_chocolate_service.py +++ b/test/unit/v1beta1/suggestion/test_chocolate_service.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
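The updated expectations a few hunks above follow directly from the generator settings in prepare-pytest-testdata: mnist.py logs one 'accuracy' and one 'loss' scalar per epoch into each of the train/ and test/ writers, so with --epochs 5 the counts work out as below (a quick sanity check mirroring the two assertions):

    epochs = 5
    tags_per_dir = 2  # 'accuracy' and 'loss'
    dirs = 2          # 'train' and 'test'

    # ["train/accuracy", "train/loss", "test/loss", "test/accuracy"]
    assert epochs * tags_per_dir * dirs == 20

    # ["accuracy", "loss"] parsed against the train/ directory alone
    assert epochs * tags_per_dir == 10

The earlier CI-only step used --epochs 10, which is why the first version of the test expected 40 and 20.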
+import os + import grpc import grpc_testing import unittest +import pytest from pkg.apis.manager.v1beta1.python import api_pb2 @@ -282,5 +285,16 @@ def test_validate_algorithm_settings(self): self.assertEqual(details, 'Max Trial Count: 15 > all possible search space combinations: 12') +@pytest.fixture(scope='function', autouse=True) +def tear_down(): + yield + working_dir = os.getcwd() + db_file = ["my_db.db", "my_db.db?check_same_thread=False.lock", "my_db.db-shm", "my_db.db-wal"] + for fname in db_file: + target_path = os.path.join(working_dir, fname) + if os.path.isfile(target_path): + os.remove(target_path) + + if __name__ == '__main__': unittest.main() diff --git a/test/unit/v1beta1/suggestion/test_enas_service.py b/test/unit/v1beta1/suggestion/test_enas_service.py index e7e8a087e41..e8c51451dcf 100644 --- a/test/unit/v1beta1/suggestion/test_enas_service.py +++ b/test/unit/v1beta1/suggestion/test_enas_service.py @@ -12,10 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os +import shutil + import grpc import grpc_testing import unittest +import pytest + from pkg.apis.manager.v1beta1.python import api_pb2 from pkg.suggestion.v1beta1.nas.enas.service import EnasService @@ -191,5 +196,14 @@ def test_get_suggestion(self): self.assertEqual(2, len(response.parameter_assignments)) +@pytest.fixture(scope='function', autouse=True) +def tear_down(): + yield + working_dir = os.getcwd() + target_path = os.path.join(working_dir, "ctrl_cache") + if os.path.isdir(target_path): + shutil.rmtree(target_path) + + if __name__ == '__main__': unittest.main() From a05214910f4bd6f7808ba18cd1b23a2a8739040c Mon Sep 17 00:00:00 2001 From: tenzen-y Date: Tue, 7 Dec 2021 04:16:18 +0900 Subject: [PATCH 15/17] update publish-trial-images --- .github/workflows/publish-trial-images.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/publish-trial-images.yaml b/.github/workflows/publish-trial-images.yaml index 6ade7aafb7a..9b400bfa8d9 100644 --- a/.github/workflows/publish-trial-images.yaml +++ b/.github/workflows/publish-trial-images.yaml @@ -32,6 +32,8 @@ jobs: dockerfile: examples/v1beta1/trial-images/mxnet-mnist/Dockerfile - trial-name: pytorch-mnist dockerfile: examples/v1beta1/trial-images/pytorch-mnist/Dockerfile + - trial-name: tf-mnist-with-summaries + dockerfile: examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile - trial-name: enas-cnn-cifar10-gpu dockerfile: examples/v1beta1/trial-images/enas-cnn-cifar10/Dockerfile.gpu - trial-name: enas-cnn-cifar10-cpu From 855331288f1dcc6f5f30edf769727b7fcd4aafee Mon Sep 17 00:00:00 2001 From: tenzen-y Date: Fri, 10 Dec 2021 17:33:22 +0900 Subject: [PATCH 16/17] update update-images.sh --- scripts/v1beta1/update-images.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/v1beta1/update-images.sh b/scripts/v1beta1/update-images.sh index ed6d6c22d61..e9a6ea55f98 100755 --- a/scripts/v1beta1/update-images.sh +++ b/scripts/v1beta1/update-images.sh @@ -80,6 +80,7 @@ update_yaml_files "${CONFIG_PATH}" ":[^[:space:]].*\"" ":${TAG}\"" # Postfixes for the each Trial image. 
MXNET_MNIST="mxnet-mnist" PYTORCH_MNIST="pytorch-mnist" +TF_MNIST_WITH_SUMMARIES="tf-mnist-with-summaries" ENAS_GPU="enas-cnn-cifar10-gpu" ENAS_CPU="enas-cnn-cifar10-cpu" DARTS="darts-cnn-cifar10" @@ -87,6 +88,7 @@ DARTS="darts-cnn-cifar10" echo -e "Update Katib Trial training container images\n" update_yaml_files "./" "${OLD_PREFIX}${MXNET_MNIST}:.*" "${NEW_PREFIX}${MXNET_MNIST}:${TAG}" update_yaml_files "./" "${OLD_PREFIX}${PYTORCH_MNIST}:.*" "${NEW_PREFIX}${PYTORCH_MNIST}:${TAG}" +update_yaml_files "./" "${OLD_PREFIX}${TF_MNIST_WITH_SUMMARIES}:.*" "${NEW_PREFIX}${TF_MNIST_WITH_SUMMARIES}:${TAG}" update_yaml_files "./" "${OLD_PREFIX}${ENAS_GPU}:.*" "${NEW_PREFIX}${ENAS_GPU}:${TAG}" update_yaml_files "./" "${OLD_PREFIX}${ENAS_CPU}:.*" "${NEW_PREFIX}${ENAS_CPU}:${TAG}" update_yaml_files "./" "${OLD_PREFIX}${DARTS}:.*" "${NEW_PREFIX}${DARTS}:${TAG}" From d428ddcf85922fa4cb7d9cd887e77f4019d3cc47 Mon Sep 17 00:00:00 2001 From: tenzen-y Date: Fri, 10 Dec 2021 20:11:37 +0900 Subject: [PATCH 17/17] reduce batch size --- .../tfjob-mnist-with-summaries.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml b/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml index e4d4ba72008..90c3dc81a2e 100644 --- a/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml +++ b/examples/v1beta1/kubeflow-training-operator/tfjob-mnist-with-summaries.yaml @@ -29,8 +29,8 @@ spec: - name: batch_size parameterType: int feasibleSpace: - min: "100" - max: "200" + min: "10" + max: "20" trialTemplate: primaryContainerName: tensorflow trialParameters:
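Taken together with the trial template from patch 07, the narrower feasible space means every sampled batch size now falls between 10 and 20. For a concrete picture of the substituted container command, here is a small stand-in for the expansion that Katib's controller performs server-side (the assignment values are illustrative, chosen inside the new ranges):

    template = [
        "python", "/opt/tf-mnist-with-summaries/mnist.py",
        "--epochs=1",
        "--learning-rate=${trialParameters.learningRate}",
        "--batch-size=${trialParameters.batchSize}",
        "--log-path=/mnist-with-summaries-logs",
    ]
    assignments = {"learningRate": "0.03", "batchSize": "15"}

    def substitute(arg, params):
        # Mimics the ${trialParameters.<name>} expansion for illustration;
        # the real expansion happens inside the Katib controller.
        for name, value in params.items():
            arg = arg.replace("${trialParameters." + name + "}", value)
        return arg

    print([substitute(arg, assignments) for arg in template])
    # ['python', '/opt/tf-mnist-with-summaries/mnist.py', '--epochs=1',
    #  '--learning-rate=0.03', '--batch-size=15',
    #  '--log-path=/mnist-with-summaries-logs']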