forked from kubeflow/training-operator
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Upgrade PyTorchJob examples to PyTorch v2 (kubeflow#2024)
* refactor: upgrade pytorch job examples to pytorch v2 Signed-off-by: champon1020 <nagatelu1020@gmail.com> * fix: remove torch.compile and update base image of Dockerfile Signed-off-by: champon1020 <nagatelu1020@gmail.com> * fix: comment out pytorch mnist Dockerfiles in the config of CI Signed-off-by: champon1020 <nagatelu1020@gmail.com> * fix: minor changes * add Dockerfile context to github workflow yaml * add comments to the head of Dockerfile Signed-off-by: champon1020 <nagatelu1020@gmail.com> --------- Signed-off-by: champon1020 <nagatelu1020@gmail.com>
- Loading branch information
1 parent
fb35949
commit 21f25ce
Showing
10 changed files
with
42 additions
and
133 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,15 @@ | ||
FROM pytorch/pytorch:1.0-cuda10.0-cudnn7-runtime | ||
# We need to use the nvcr.io/nvidia/pytorch image as a base image to support both linux/amd64 and linux/arm64 platforms. | ||
# PyTorch=2.2.0, cuda=12.3.2 | ||
# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-01.html#rel-24-01 | ||
FROM nvcr.io/nvidia/pytorch:24.01-py3 | ||
|
||
RUN pip install tensorboardX==1.6.0 | ||
RUN pip install tensorboardX==2.6.2 | ||
RUN mkdir -p /opt/mnist | ||
|
||
WORKDIR /opt/mnist/src | ||
ADD mnist.py /opt/mnist/src/mnist.py | ||
|
||
RUN chgrp -R 0 /opt/mnist \ | ||
&& chmod -R g+rwX /opt/mnist | ||
RUN chgrp -R 0 /opt/mnist \ | ||
&& chmod -R g+rwX /opt/mnist | ||
|
||
ENTRYPOINT ["python", "/opt/mnist/src/mnist.py"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,52 +1,15 @@ | ||
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 | ||
ARG PYTHON_VERSION=3.6 | ||
# We need to use the nvcr.io/nvidia/pytorch image as a base image to support both linux/amd64 and linux/arm64 platforms. | ||
# PyTorch=2.2.0, cuda=12.3.2 | ||
# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-01.html#rel-24-01 | ||
FROM nvcr.io/nvidia/pytorch:24.01-py3 | ||
|
||
RUN apt-get update && apt-get install -y --no-install-recommends \ | ||
build-essential \ | ||
cmake \ | ||
git \ | ||
curl \ | ||
vim \ | ||
wget \ | ||
ca-certificates \ | ||
openssh-client \ | ||
libjpeg-dev \ | ||
libpng-dev &&\ | ||
rm -rf /var/lib/apt/lists/* | ||
RUN pip install tensorboardX==2.6.2 | ||
RUN mkdir -p /opt/mnist | ||
|
||
RUN wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz && \ | ||
gunzip -c openmpi-3.0.0.tar.gz | tar xf - && \ | ||
cd openmpi-3.0.0 && \ | ||
./configure --prefix=/home/.openmpi --with-cuda && \ | ||
make all install | ||
WORKDIR /opt/mnist/src | ||
ADD mnist.py /opt/mnist/src/mnist.py | ||
|
||
ENV PATH="$PATH:/home/.openmpi/bin" | ||
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/" | ||
RUN chgrp -R 0 /opt/mnist \ | ||
&& chmod -R g+rwX /opt/mnist | ||
|
||
RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value | ||
RUN wget -O ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ | ||
chmod +x ~/miniconda.sh && \ | ||
~/miniconda.sh -b -p /opt/conda && \ | ||
rm ~/miniconda.sh && \ | ||
/opt/conda/bin/conda update conda && \ | ||
/opt/conda/bin/conda install -y python=$PYTHON_VERSION numpy pyyaml scipy ipython mkl mkl-include cython typing && \ | ||
/opt/conda/bin/conda clean -ya | ||
ENV PATH /opt/conda/bin:$PATH | ||
# This must be done before pip so that requirements.txt is available | ||
WORKDIR /opt/pytorch | ||
|
||
RUN git clone --recursive https://github.com/pytorch/pytorch | ||
|
||
RUN TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ | ||
CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \ | ||
cd pytorch/ && \ | ||
pip install -v . | ||
|
||
RUN /opt/conda/bin/conda config --set ssl_verify False | ||
RUN pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org | ||
RUN pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org torchvision tensorboardX==1.6.0 | ||
|
||
WORKDIR /var | ||
ADD mnist.py /var | ||
|
||
ENTRYPOINT ["mpirun", "-n", "1", "--allow-run-as-root", "python", "/var/mnist.py"] | ||
ENTRYPOINT ["mpirun", "-n", "1", "--allow-run-as-root", "python", "/opt/mnist/src/mnist.py"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters