diff --git a/.github/workflows/publish-trial-images.yaml b/.github/workflows/publish-trial-images.yaml
index 7d3119b9fa9..949afbe4229 100644
--- a/.github/workflows/publish-trial-images.yaml
+++ b/.github/workflows/publish-trial-images.yaml
@@ -31,8 +31,10 @@ jobs:
include:
- trial-name: mxnet-mnist
dockerfile: examples/v1beta1/trial-images/mxnet-mnist/Dockerfile
- - trial-name: pytorch-mnist
- dockerfile: examples/v1beta1/trial-images/pytorch-mnist/Dockerfile
+ - trial-name: pytorch-mnist-cpu
+ dockerfile: examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.cpu
+ - trial-name: pytorch-mnist-gpu
+ dockerfile: examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.gpu
- trial-name: tf-mnist-with-summaries
dockerfile: examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile
- trial-name: enas-cnn-cifar10-gpu
diff --git a/.github/workflows/pytorch-mnist-e2e-test.yaml b/.github/workflows/pytorch-mnist-e2e-test.yaml
index 62fa9e73e81..65a1a38d54c 100644
--- a/.github/workflows/pytorch-mnist-e2e-test.yaml
+++ b/.github/workflows/pytorch-mnist-e2e-test.yaml
@@ -24,7 +24,7 @@ jobs:
experiments: ${{ matrix.experiments }}
training-operator: true
# Comma Delimited
- trial-images: pytorch-mnist
+ trial-images: pytorch-mnist-cpu
strategy:
fail-fast: false
diff --git a/docs/images-location.md b/docs/images-location.md
index e390fb4ebdb..ed01bb15016 100644
--- a/docs/images-location.md
+++ b/docs/images-location.md
@@ -273,13 +273,24 @@ The following table shows images for training containers which are used in the
- docker.io/kubeflowkatib/pytorch-mnist
+ docker.io/kubeflowkatib/pytorch-mnist-cpu
|
- PyTorch MNIST example with printing metrics to the file or StdOut
+ PyTorch MNIST example with CPU support that prints metrics to the file or StdOut
|
- Dockerfile
+ Dockerfile
+ |
+
+
+
+ docker.io/kubeflowkatib/pytorch-mnist-gpu
+ |
+
+ PyTorch MNIST example with GPU support that prints metrics to the file or StdOut
+ |
+
+ Dockerfile
|
diff --git a/examples/v1beta1/early-stopping/median-stop-with-json-format.yaml b/examples/v1beta1/early-stopping/median-stop-with-json-format.yaml
index 9fe869cc327..b3b6be79e93 100644
--- a/examples/v1beta1/early-stopping/median-stop-with-json-format.yaml
+++ b/examples/v1beta1/early-stopping/median-stop-with-json-format.yaml
@@ -62,7 +62,7 @@ spec:
spec:
containers:
- name: training-container
- image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"
diff --git a/examples/v1beta1/kubeflow-training-operator/pytorchjob-mnist.yaml b/examples/v1beta1/kubeflow-training-operator/pytorchjob-mnist.yaml
index 02a7163b8b6..98cfdde92f8 100644
--- a/examples/v1beta1/kubeflow-training-operator/pytorchjob-mnist.yaml
+++ b/examples/v1beta1/kubeflow-training-operator/pytorchjob-mnist.yaml
@@ -46,7 +46,7 @@ spec:
spec:
containers:
- name: pytorch
- image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"
@@ -61,7 +61,7 @@ spec:
spec:
containers:
- name: pytorch
- image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"
diff --git a/examples/v1beta1/metrics-collector/custom-metrics-collector.yaml b/examples/v1beta1/metrics-collector/custom-metrics-collector.yaml
index 23e75de96b5..3a656943c91 100644
--- a/examples/v1beta1/metrics-collector/custom-metrics-collector.yaml
+++ b/examples/v1beta1/metrics-collector/custom-metrics-collector.yaml
@@ -67,7 +67,7 @@ spec:
spec:
containers:
- name: training-container
- image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"
diff --git a/examples/v1beta1/metrics-collector/file-metrics-collector-with-json-format.yaml b/examples/v1beta1/metrics-collector/file-metrics-collector-with-json-format.yaml
index 0a966c61611..e0f8d07d844 100644
--- a/examples/v1beta1/metrics-collector/file-metrics-collector-with-json-format.yaml
+++ b/examples/v1beta1/metrics-collector/file-metrics-collector-with-json-format.yaml
@@ -52,7 +52,7 @@ spec:
spec:
containers:
- name: training-container
- image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"
diff --git a/examples/v1beta1/metrics-collector/file-metrics-collector.yaml b/examples/v1beta1/metrics-collector/file-metrics-collector.yaml
index e22eb262a24..de596abfce6 100644
--- a/examples/v1beta1/metrics-collector/file-metrics-collector.yaml
+++ b/examples/v1beta1/metrics-collector/file-metrics-collector.yaml
@@ -54,7 +54,7 @@ spec:
spec:
containers:
- name: training-container
- image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"
diff --git a/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile b/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.cpu
similarity index 100%
rename from examples/v1beta1/trial-images/pytorch-mnist/Dockerfile
rename to examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.cpu
diff --git a/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.gpu b/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.gpu
new file mode 100644
index 00000000000..cdb6190f247
--- /dev/null
+++ b/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.gpu
@@ -0,0 +1,15 @@
+FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime
+
+# Plain local directory copy (path is relative to the build context): COPY, not ADD.
+COPY examples/v1beta1/trial-images/pytorch-mnist /opt/pytorch-mnist
+WORKDIR /opt/pytorch-mnist
+
+# Add folder for the logs.
+RUN mkdir /katib
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Hand the code and log folders to group 0 and make them group-writable.
+RUN chgrp -R 0 /opt/pytorch-mnist /katib \
+    && chmod -R g+rwX /opt/pytorch-mnist /katib
+
+ENTRYPOINT ["python3", "/opt/pytorch-mnist/mnist.py"]
diff --git a/manifests/v1beta1/components/controller/trial-templates.yaml b/manifests/v1beta1/components/controller/trial-templates.yaml
index d7a4820d2d8..6a81fbcf77f 100644
--- a/manifests/v1beta1/components/controller/trial-templates.yaml
+++ b/manifests/v1beta1/components/controller/trial-templates.yaml
@@ -54,7 +54,7 @@ data:
spec:
containers:
- name: pytorch
- image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"
@@ -68,7 +68,7 @@ data:
spec:
containers:
- name: pytorch
- image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
+ image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"
diff --git a/scripts/v1beta1/build.sh b/scripts/v1beta1/build.sh
index 3c4a7020f58..da6f9723731 100755
--- a/scripts/v1beta1/build.sh
+++ b/scripts/v1beta1/build.sh
@@ -123,8 +123,11 @@ else
echo -e "\nBuilding mxnet mnist training container example...\n"
docker build --platform linux/amd64 -t "${REGISTRY}/mxnet-mnist:${TAG}" -f examples/${VERSION}/trial-images/mxnet-mnist/Dockerfile .
- echo -e "\nBuilding PyTorch mnist training container example...\n"
- docker build --platform linux/amd64 -t "${REGISTRY}/pytorch-mnist:${TAG}" -f examples/${VERSION}/trial-images/pytorch-mnist/Dockerfile .
+ echo -e "\nBuilding PyTorch mnist training container example with CPU support...\n"
+ docker build --platform linux/amd64 -t "${REGISTRY}/pytorch-mnist-cpu:${TAG}" -f examples/${VERSION}/trial-images/pytorch-mnist/Dockerfile.cpu .
+
+ echo -e "\nBuilding PyTorch mnist training container example with GPU support...\n"
+ docker build --platform linux/amd64 -t "${REGISTRY}/pytorch-mnist-gpu:${TAG}" -f examples/${VERSION}/trial-images/pytorch-mnist/Dockerfile.gpu .
echo -e "\nBuilding Keras CIFAR-10 CNN training container example for ENAS with GPU support...\n"
docker build --platform linux/amd64 -t "${REGISTRY}/enas-cnn-cifar10-gpu:${TAG}" -f examples/${VERSION}/trial-images/enas-cnn-cifar10/Dockerfile.gpu .
diff --git a/scripts/v1beta1/push.sh b/scripts/v1beta1/push.sh
index 1d098815933..95f5d3c98eb 100755
--- a/scripts/v1beta1/push.sh
+++ b/scripts/v1beta1/push.sh
@@ -98,8 +98,11 @@ docker push "${REGISTRY}/mxnet-mnist:${TAG}"
echo -e "\nPushing Tensorflow with summaries mnist training container example...\n"
docker push "${REGISTRY}/tf-mnist-with-summaries:${TAG}"
-echo -e "\nPushing PyTorch mnist training container example...\n"
-docker push "${REGISTRY}/pytorch-mnist:${TAG}"
+echo -e "\nPushing PyTorch mnist training container example with CPU support...\n"
+docker push "${REGISTRY}/pytorch-mnist-cpu:${TAG}"
+
+echo -e "\nPushing PyTorch mnist training container example with GPU support...\n"
+docker push "${REGISTRY}/pytorch-mnist-gpu:${TAG}"
echo -e "\nPushing Keras CIFAR-10 CNN training container example for ENAS with GPU support...\n"
docker push "${REGISTRY}/enas-cnn-cifar10-gpu:${TAG}"
diff --git a/scripts/v1beta1/update-images.sh b/scripts/v1beta1/update-images.sh
index d532d150dda..b35892ebc5b 100755
--- a/scripts/v1beta1/update-images.sh
+++ b/scripts/v1beta1/update-images.sh
@@ -83,7 +83,8 @@ update_yaml_files "${CONFIG_PATH}" ":[^[:space:]].*\"" ":${TAG}\""
# Postfixes for the each Trial image.
MXNET_MNIST="mxnet-mnist"
-PYTORCH_MNIST="pytorch-mnist"
+PYTORCH_MNIST_CPU="pytorch-mnist-cpu"
+PYTORCH_MNIST_GPU="pytorch-mnist-gpu"
TF_MNIST_WITH_SUMMARIES="tf-mnist-with-summaries"
ENAS_GPU="enas-cnn-cifar10-gpu"
ENAS_CPU="enas-cnn-cifar10-cpu"
@@ -93,7 +94,8 @@ SIMPLE_PBT="simple-pbt"
echo -e "Update Katib Trial training container images\n"
update_yaml_files "./" "${OLD_PREFIX}${MXNET_MNIST}:.*" "${NEW_PREFIX}${MXNET_MNIST}:${TAG}"
-update_yaml_files "./" "${OLD_PREFIX}${PYTORCH_MNIST}:.*" "${NEW_PREFIX}${PYTORCH_MNIST}:${TAG}"
+update_yaml_files "./" "${OLD_PREFIX}${PYTORCH_MNIST_CPU}:.*" "${NEW_PREFIX}${PYTORCH_MNIST_CPU}:${TAG}"
+update_yaml_files "./" "${OLD_PREFIX}${PYTORCH_MNIST_GPU}:.*" "${NEW_PREFIX}${PYTORCH_MNIST_GPU}:${TAG}"
update_yaml_files "./" "${OLD_PREFIX}${TF_MNIST_WITH_SUMMARIES}:.*" "${NEW_PREFIX}${TF_MNIST_WITH_SUMMARIES}:${TAG}"
update_yaml_files "./" "${OLD_PREFIX}${ENAS_GPU}:.*" "${NEW_PREFIX}${ENAS_GPU}:${TAG}"
update_yaml_files "./" "${OLD_PREFIX}${ENAS_CPU}:.*" "${NEW_PREFIX}${ENAS_CPU}:${TAG}"
diff --git a/test/e2e/v1beta1/scripts/gh-actions/build-load.sh b/test/e2e/v1beta1/scripts/gh-actions/build-load.sh
index a16b4407db3..30fae899236 100755
--- a/test/e2e/v1beta1/scripts/gh-actions/build-load.sh
+++ b/test/e2e/v1beta1/scripts/gh-actions/build-load.sh
@@ -30,7 +30,7 @@ REGISTRY="docker.io/kubeflowkatib"
TAG="e2e-test"
VERSION="v1beta1"
CMD_PREFIX="cmd"
-SPECIFIED_DEVICE_TYPE_IMAGES=("enas-cnn-cifar10-cpu" "darts-cnn-cifar10-cpu")
+SPECIFIED_DEVICE_TYPE_IMAGES=("enas-cnn-cifar10-cpu" "darts-cnn-cifar10-cpu" "pytorch-mnist-cpu")
IFS="," read -r -a TRIAL_IMAGE_ARRAY <<< "$TRIAL_IMAGES"
IFS="," read -r -a EXPERIMENT_ARRAY <<< "$EXPERIMENTS"
@@ -51,7 +51,7 @@ _build_containers() {
docker build --platform "$(uname -m)" -t "$REGISTRY/$CONTAINER_NAME:$TAG" -f "../../../../../$DOCKERFILE" ../../../../../
}
-_load_kind_cluster() {
+_load_minikube_cluster() {
CONTAINER_NAME=${1:-"katib-controller"}
echo -e "\n\nLoading $CONTAINER_NAME image...\n\n"
@@ -99,7 +99,7 @@ run() {
for s in "${suggestions[@]}"; do
if [ "$s" == "$CONTAINER_NAME" ]; then
_build_containers "$CONTAINER_NAME" "$DOCKERFILE"
- _load_kind_cluster "$CONTAINER_NAME"
+ _load_minikube_cluster "$CONTAINER_NAME"
break
fi
done
@@ -126,7 +126,7 @@ run() {
for e in "${earlystoppings[@]}"; do
if [ "$e" == "$CONTAINER_NAME" ]; then
_build_containers "$CONTAINER_NAME" "$DOCKERFILE"
- _load_kind_cluster "$CONTAINER_NAME"
+ _load_minikube_cluster "$CONTAINER_NAME"
break
fi
done
@@ -134,7 +134,7 @@ run() {
# Others
else
_build_containers "$CONTAINER_NAME" "$DOCKERFILE"
- _load_kind_cluster "$CONTAINER_NAME"
+ _load_minikube_cluster "$CONTAINER_NAME"
fi
}