From a708103ebb1230510804f7b13b55f359da545ebf Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 17 Jan 2024 14:42:49 -0800 Subject: [PATCH 01/70] image version update --- docker/Dockerfile | 184 +++++++++++++++++++++++----------------------- setup.py | 1 + 2 files changed, 93 insertions(+), 92 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index ea72ebc7b4..559d4a391d 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -15,15 +15,15 @@ ARG BASE_IMAGE=${CUDA_VERSION:+"nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu2 ARG BASE_IMAGE=${BASE_IMAGE:-"ubuntu:20.04"} # The Python version to install -ARG PYTHON_VERSION=3.10 +ARG PYTHON_VERSION=3.11 # The Pytorch Version to install -ARG PYTORCH_VERSION=1.13.1 +ARG PYTORCH_VERSION=2.1.2 # The Torchvision version to install. # Reference https://github.com/pytorch/vision#installation to determine the Torchvision # version that corresponds to the PyTorch version -ARG TORCHVISION_VERSION=0.14.1 +ARG TORCHVISION_VERSION=0.16.2 # In the Dockerimage, Pillow-SIMD is installed instead of Pillow. To trick pip into thinking that # Pillow is also installed (so it won't override it with a future pip install), a Pillow stub is included @@ -77,55 +77,55 @@ ARG CUDA_VERSION # If this file is present after the first command, kaniko # won't be able to build the docker image. RUN if [ -n "$CUDA_VERSION" ]; then \ - rm -f /usr/local/cuda-$(echo $CUDA_VERSION | cut -c -4)/cuda-$(echo $CUDA_VERSION | cut -c -4); \ + rm -f /usr/local/cuda-$(echo $CUDA_VERSION | cut -c -4)/cuda-$(echo $CUDA_VERSION | cut -c -4); \ fi # update repository keys # https://developer.nvidia.com/blog/updating-the-cuda-linux-gpg-repository-key/ RUN if [ -n "$CUDA_VERSION" ] ; then \ - rm -f /etc/apt/sources.list.d/cuda.list && \ - rm -f /etc/apt/sources.list.d/nvidia-ml.list && \ - apt-get update && \ - apt-get install -y --no-install-recommends wget && \ - apt-get autoclean && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* \ - apt-key del 7fa2af80 && \ - mkdir -p /tmp/cuda-keyring && \ - wget -P /tmp/cuda-keyring https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb && \ - dpkg -i /tmp/cuda-keyring/cuda-keyring_1.0-1_all.deb && \ - rm -rf /tmp/cuda-keyring ; \ + rm -f /etc/apt/sources.list.d/cuda.list && \ + rm -f /etc/apt/sources.list.d/nvidia-ml.list && \ + apt-get update && \ + apt-get install -y --no-install-recommends wget && \ + apt-get autoclean && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* \ + apt-key del 7fa2af80 && \ + mkdir -p /tmp/cuda-keyring && \ + wget -P /tmp/cuda-keyring https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb && \ + dpkg -i /tmp/cuda-keyring/cuda-keyring_1.0-1_all.deb && \ + rm -rf /tmp/cuda-keyring ; \ fi RUN apt-get update && \ apt-get install -y --no-install-recommends \ - libgomp1 \ - curl \ - wget \ - sudo \ - build-essential \ - software-properties-common \ - dirmngr \ - apt-utils \ - gpg-agent \ - openssh-client \ - # For PILLOW: - zlib1g-dev \ - libtiff-dev \ - libfreetype6-dev \ - liblcms2-dev \ - tcl \ - libjpeg8-dev \ - less \ - # For AWS EFA: - autoconf \ - autotools-dev \ - automake \ - libtool \ - # Development tools - tmux \ - htop && \ + libgomp1 \ + curl \ + wget \ + sudo \ + build-essential \ + software-properties-common \ + dirmngr \ + apt-utils \ + gpg-agent \ + openssh-client \ + # For PILLOW: + zlib1g-dev \ + libtiff-dev \ + libfreetype6-dev \ + liblcms2-dev \ + tcl \ + libjpeg8-dev \ + less \ + # For AWS EFA: + autoconf \ + autotools-dev \ + automake \ + libtool \ + # Development tools + tmux \ + htop && \ apt-get autoclean && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -135,7 +135,7 @@ RUN apt-get update && \ ############################### RUN add-apt-repository ppa:git-core/ppa && \ apt-get install -y --no-install-recommends \ - git && \ + git && \ apt-get autoclean && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -201,14 +201,14 @@ ENV PYTORCH_NIGHTLY_URL=${PYTORCH_NIGHTLY_URL} ENV PYTORCH_NIGHTLY_VERSION=${PYTORCH_NIGHTLY_VERSION} RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \ - CUDA_VERSION_TAG=$(python${PYTHON_VERSION} -c "print('cu' + ''.join('${CUDA_VERSION}'.split('.')[:2]) if '${CUDA_VERSION}' else 'cpu')") && \ - pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch_stable.html \ - torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} \ - torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} ; \ + CUDA_VERSION_TAG=$(python${PYTHON_VERSION} -c "print('cu' + ''.join('${CUDA_VERSION}'.split('.')[:2]) if '${CUDA_VERSION}' else 'cpu')") && \ + pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch_stable.html \ + torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} \ + torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} ; \ else \ - pip${PYTHON_VERSION} install --no-cache-dir --pre --index-url ${PYTORCH_NIGHTLY_URL} \ - torch==${PYTORCH_VERSION}.${PYTORCH_NIGHTLY_VERSION} \ - torchvision==${TORCHVISION_VERSION}.${PYTORCH_NIGHTLY_VERSION} ; \ + pip${PYTHON_VERSION} install --no-cache-dir --pre --index-url ${PYTORCH_NIGHTLY_URL} \ + torch==${PYTORCH_VERSION}.${PYTORCH_NIGHTLY_VERSION} \ + torchvision==${TORCHVISION_VERSION}.${PYTORCH_NIGHTLY_VERSION} ; \ fi ##################################### @@ -225,34 +225,34 @@ ENV FI_EFA_USE_DEVICE_RDMA=1 RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ apt-get update && \ apt-get install -y --no-install-recommends \ - hwloc \ - libhwloc-dev && \ + hwloc \ + libhwloc-dev && \ apt-get autoclean && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* ; \ fi RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ - cd /tmp && \ - curl -OsS https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \ - tar -xf /tmp/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \ - cd aws-efa-installer && \ - apt-get update && \ - ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify && \ - rm -rf /tmp/aws-efa-installer* ; \ + cd /tmp && \ + curl -OsS https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \ + tar -xf /tmp/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \ + cd aws-efa-installer && \ + apt-get update && \ + ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify && \ + rm -rf /tmp/aws-efa-installer* ; \ fi RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ - git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl && \ - cd /opt/aws-ofi-nccl && \ - git checkout ${AWS_OFI_NCCL_VERSION} && \ - ./autogen.sh && \ - ./configure --prefix=/opt/aws-ofi-nccl/install \ - --with-libfabric=/opt/amazon/efa/ \ - --with-cuda=/usr/local/cuda \ - --disable-tests \ - --enable-platform-aws && \ - make && make install ; \ + git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl && \ + cd /opt/aws-ofi-nccl && \ + git checkout ${AWS_OFI_NCCL_VERSION} && \ + ./autogen.sh && \ + ./configure --prefix=/opt/aws-ofi-nccl/install \ + --with-libfabric=/opt/amazon/efa/ \ + --with-cuda=/usr/local/cuda \ + --disable-tests \ + --enable-platform-aws && \ + make && make install ; \ fi ################################### @@ -262,11 +262,11 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ ARG MOFED_VERSION RUN if [ -n "$MOFED_VERSION" ] ; then \ - mkdir -p /tmp/mofed && \ - wget -nv -P /tmp/mofed http://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64.tgz && \ - tar -zxvf /tmp/mofed/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64.tgz -C /tmp/mofed && \ - /tmp/mofed/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force && \ - rm -rf /tmp/mofed ; \ + mkdir -p /tmp/mofed && \ + wget -nv -P /tmp/mofed http://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64.tgz && \ + tar -zxvf /tmp/mofed/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64.tgz -C /tmp/mofed && \ + /tmp/mofed/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force && \ + rm -rf /tmp/mofed ; \ fi ##################### @@ -274,27 +274,27 @@ RUN if [ -n "$MOFED_VERSION" ] ; then \ ##################### # skip if torch nightly is installed as there is incompatability RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ - mkdir -p /tmp/apex && \ - cd /tmp/apex && \ - git clone https://github.com/NVIDIA/apex && \ - cd apex && \ - git checkout 82ee367f3da74b4cd62a1fb47aa9806f0f47b58b && \ - pip${PYTHON_VERSION} install --no-cache-dir -r requirements.txt && \ - pip${PYTHON_VERSION} install --no-cache-dir \ - --global-option="--cpp_ext" \ - --global-option="--cuda_ext" \ - --target /usr/local/lib/python${PYTHON_VERSION}/dist-packages \ - ./ && \ - rm -rf /tmp/apex ; \ + mkdir -p /tmp/apex && \ + cd /tmp/apex && \ + git clone https://github.com/NVIDIA/apex && \ + cd apex && \ + git checkout 82ee367f3da74b4cd62a1fb47aa9806f0f47b58b && \ + pip${PYTHON_VERSION} install --no-cache-dir -r requirements.txt && \ + pip${PYTHON_VERSION} install --no-cache-dir \ + --global-option="--cpp_ext" \ + --global-option="--cuda_ext" \ + --target /usr/local/lib/python${PYTHON_VERSION}/dist-packages \ + ./ && \ + rm -rf /tmp/apex ; \ fi ########################## # Install Flash Attention ########################## RUN if [ -n "$CUDA_VERSION" ] ; then \ - pip${PYTHON_VERSION} install --upgrade --no-cache-dir ninja==1.11.1 && \ - pip${PYTHON_VERSION} install --upgrade --no-cache-dir --force-reinstall packaging==22.0 && \ - pip${PYTHON_VERSION} install --no-cache-dir flash-attn==1.0.9; \ + pip${PYTHON_VERSION} install --upgrade --no-cache-dir ninja==1.11.1 && \ + pip${PYTHON_VERSION} install --upgrade --no-cache-dir --force-reinstall packaging==22.0 && \ + pip${PYTHON_VERSION} install --no-cache-dir flash-attn==1.0.9; \ fi ############### @@ -351,9 +351,9 @@ RUN apt-get update && \ # Upgrade pip packages ######################### RUN pip install --no-cache-dir --upgrade \ - certifi${CERTIFI_VERSION} \ - ipython${IPYTHON_VERSION} \ - urllib3${URLLIB3_VERSION} + certifi${CERTIFI_VERSION} \ + ipython${IPYTHON_VERSION} \ + urllib3${URLLIB3_VERSION} ################################################## # Override NVIDIA mistaken env var for 11.8 images diff --git a/setup.py b/setup.py index 7322bdc49e..8b5733f8ab 100644 --- a/setup.py +++ b/setup.py @@ -261,6 +261,7 @@ def package_files(prefix: str, directory: str, extension: str): 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', ], install_requires=install_requires, entry_points={ From 3c516d29011da669c69d6f8d232214e0c881c491 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 17 Jan 2024 14:58:48 -0800 Subject: [PATCH 02/70] update builder --- docker/build_matrix.yaml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index cd2efc0e19..f924768a77 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -18,7 +18,7 @@ brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 - PYTHON_VERSION: '3.10' + PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.1.2 @@ -46,7 +46,7 @@ brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 - PYTHON_VERSION: '3.10' + PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.1.2 @@ -61,7 +61,7 @@ IMAGE_NAME: torch-2-1-2-cpu MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.10' + PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.1.2 @@ -81,7 +81,7 @@ brand=geforce,driver>=470,driver<471 brand=geforce,driver>=515,driver<516 brand=quadro,driver>=470,driver<471 brand=quadro,driver>=515,driver<516 brand=titan,driver>=470,driver<471 brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=470,driver<471 brand=titanrtx,driver>=515,driver<516 - PYTHON_VERSION: '3.10' + PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.0.1 @@ -100,7 +100,7 @@ brand=geforce,driver>=470,driver<471 brand=geforce,driver>=515,driver<516 brand=quadro,driver>=470,driver<471 brand=quadro,driver>=515,driver<516 brand=titan,driver>=470,driver<471 brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=470,driver<471 brand=titanrtx,driver>=515,driver<516 - PYTHON_VERSION: '3.10' + PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.0.1 @@ -114,7 +114,7 @@ IMAGE_NAME: torch-2-0-1-cpu MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.10' + PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.0.1 @@ -128,7 +128,7 @@ IMAGE_NAME: torch-1-13-1-cu117 MOFED_VERSION: 5.5-1.0.3.2 NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.10' + PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 1.13.1 @@ -142,7 +142,7 @@ IMAGE_NAME: torch-1-13-1-cu117-aws MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.10' + PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 1.13.1 @@ -156,7 +156,7 @@ IMAGE_NAME: torch-1-13-1-cpu MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.10' + PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 1.13.1 @@ -183,7 +183,7 @@ brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 - PYTHON_VERSION: '3.10' + PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: https://download.pytorch.org/whl/nightly/cu121 PYTORCH_NIGHTLY_VERSION: dev20231213+cu121 PYTORCH_VERSION: 2.2.0 @@ -211,7 +211,7 @@ brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 - PYTHON_VERSION: '3.10' + PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.1.2 @@ -227,7 +227,7 @@ IMAGE_NAME: composer-0-17-2-cpu MOFED_VERSION: 5.5-1.0.3.2 NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.10' + PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.1.2 From 21751de3240d9412ae877916c9c89d45c6ffb2a6 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 17 Jan 2024 15:07:53 -0800 Subject: [PATCH 03/70] remove torch 1.13 --- docker/build_matrix.yaml | 84 ++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index f924768a77..95c1784b17 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -122,48 +122,48 @@ - mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.15.2 -- AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 11.7.1 - IMAGE_NAME: torch-1-13-1-cu117 - MOFED_VERSION: 5.5-1.0.3.2 - NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.11' - PYTORCH_NIGHTLY_URL: '' - PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 1.13.1 - TAGS: - - mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.14.1 -- AWS_OFI_NCCL_VERSION: v1.7.4-aws - BASE_IMAGE: nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 11.7.1 - IMAGE_NAME: torch-1-13-1-cu117-aws - MOFED_VERSION: '' - NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.11' - PYTORCH_NIGHTLY_URL: '' - PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 1.13.1 - TAGS: - - mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04-aws - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.14.1 -- AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: ubuntu:20.04 - CUDA_VERSION: '' - IMAGE_NAME: torch-1-13-1-cpu - MOFED_VERSION: '' - NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.11' - PYTORCH_NIGHTLY_URL: '' - PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 1.13.1 - TAGS: - - mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04 - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.14.1 +# - AWS_OFI_NCCL_VERSION: '' +# BASE_IMAGE: nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 +# CUDA_VERSION: 11.7.1 +# IMAGE_NAME: torch-1-13-1-cu117 +# MOFED_VERSION: 5.5-1.0.3.2 +# NVIDIA_REQUIRE_CUDA_OVERRIDE: '' +# PYTHON_VERSION: '3.11' +# PYTORCH_NIGHTLY_URL: '' +# PYTORCH_NIGHTLY_VERSION: '' +# PYTORCH_VERSION: 1.13.1 +# TAGS: +# - mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 +# TARGET: pytorch_stage +# TORCHVISION_VERSION: 0.14.1 +# - AWS_OFI_NCCL_VERSION: v1.7.4-aws +# BASE_IMAGE: nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 +# CUDA_VERSION: 11.7.1 +# IMAGE_NAME: torch-1-13-1-cu117-aws +# MOFED_VERSION: '' +# NVIDIA_REQUIRE_CUDA_OVERRIDE: '' +# PYTHON_VERSION: '3.11' +# PYTORCH_NIGHTLY_URL: '' +# PYTORCH_NIGHTLY_VERSION: '' +# PYTORCH_VERSION: 1.13.1 +# TAGS: +# - mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04-aws +# TARGET: pytorch_stage +# TORCHVISION_VERSION: 0.14.1 +# - AWS_OFI_NCCL_VERSION: '' +# BASE_IMAGE: ubuntu:20.04 +# CUDA_VERSION: '' +# IMAGE_NAME: torch-1-13-1-cpu +# MOFED_VERSION: '' +# NVIDIA_REQUIRE_CUDA_OVERRIDE: '' +# PYTHON_VERSION: '3.11' +# PYTORCH_NIGHTLY_URL: '' +# PYTORCH_NIGHTLY_VERSION: '' +# PYTORCH_VERSION: 1.13.1 +# TAGS: +# - mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04 +# TARGET: pytorch_stage +# TORCHVISION_VERSION: 0.14.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.0 From 2a1dbf64f2558e2a82a59c295877499fbd2dace3 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 17 Jan 2024 15:23:22 -0800 Subject: [PATCH 04/70] possible snappy fix --- docker/Dockerfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 559d4a391d..7c4edd9667 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -345,6 +345,7 @@ RUN apt-get update && \ apt-get upgrade -y && \ apt-get autoclean && \ apt-get clean && \ + apt-get install -y libsnappy-dev && \ rm -rf /var/lib/apt/lists/* ######################### @@ -353,7 +354,8 @@ RUN apt-get update && \ RUN pip install --no-cache-dir --upgrade \ certifi${CERTIFI_VERSION} \ ipython${IPYTHON_VERSION} \ - urllib3${URLLIB3_VERSION} + urllib3${URLLIB3_VERSION} \ + python-snappy ################################################## # Override NVIDIA mistaken env var for 11.8 images From 456c82b6850c59bf971507aa4da83fea992b9ec7 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 19 Jan 2024 11:06:30 -0800 Subject: [PATCH 05/70] moved changes to generate_build_matrix --- docker/Dockerfile | 7 +++---- docker/generate_build_matrix.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 7c4edd9667..e413b0b2ca 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -15,15 +15,15 @@ ARG BASE_IMAGE=${CUDA_VERSION:+"nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu2 ARG BASE_IMAGE=${BASE_IMAGE:-"ubuntu:20.04"} # The Python version to install -ARG PYTHON_VERSION=3.11 +ARG PYTHON_VERSION=3.10 # The Pytorch Version to install -ARG PYTORCH_VERSION=2.1.2 +ARG PYTORCH_VERSION=1.13.1 # The Torchvision version to install. # Reference https://github.com/pytorch/vision#installation to determine the Torchvision # version that corresponds to the PyTorch version -ARG TORCHVISION_VERSION=0.16.2 +ARG TORCHVISION_VERSION=0.14.1 # In the Dockerimage, Pillow-SIMD is installed instead of Pillow. To trick pip into thinking that # Pillow is also installed (so it won't override it with a future pip install), a Pillow stub is included @@ -345,7 +345,6 @@ RUN apt-get update && \ apt-get upgrade -y && \ apt-get autoclean && \ apt-get clean && \ - apt-get install -y libsnappy-dev && \ rm -rf /var/lib/apt/lists/* ######################### diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index 3ae69f6d77..a208ff57a9 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -18,7 +18,7 @@ import tabulate import yaml -LATEST_PYTHON_VERSION = '3.10' +LATEST_PYTHON_VERSION = '3.11' PRODUCTION_PYTORCH_VERSION = '2.1.2' From 7d6d8dd2576700d1ae6da2065d6ef0e8ece424fe Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 19 Jan 2024 11:25:23 -0800 Subject: [PATCH 06/70] 3.11 support --- docker/README.md | 16 ++- docker/build_matrix.yaml | 200 ++++++++++++++++++++++++-------- docker/generate_build_matrix.py | 2 +- 3 files changed, 163 insertions(+), 55 deletions(-) diff --git a/docker/README.md b/docker/README.md index b05733bb1b..9a7ed3f52c 100644 --- a/docker/README.md +++ b/docker/README.md @@ -32,14 +32,20 @@ To install composer, once inside the image, run `pip install mosaicml`. | Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags | |----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------| | Ubuntu 20.04 | Base | 2.2.0 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.2.0_cu121-nightly20231213-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (EFA) | 3.10 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws` | -| Ubuntu 20.04 | Base | 2.1.2 | cpu | 3.10 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.1.2_cu121-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.1.2_cu121-python3.11-ubuntu20.04-aws` | +| Ubuntu 20.04 | Base | 2.1.2 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.1.2_cpu-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.1.2 | cpu | 3.10 | `mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.0.1_cu118-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (EFA) | 3.11 | `mosaicml/pytorch:2.0.1_cu118-python3.11-ubuntu20.04-aws` | +| Ubuntu 20.04 | Base | 2.0.1 | cpu | 3.11 | `mosaicml/pytorch:2.0.1_cpu-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (EFA) | 3.10 | `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04-aws` | | Ubuntu 20.04 | Base | 2.0.1 | cpu | 3.10 | `mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 1.13.1 | 11.7.1 (Infiniband) | 3.11 | `mosaicml/pytorch:1.13.1_cu117-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 1.13.1 | 11.7.1 (EFA) | 3.11 | `mosaicml/pytorch:1.13.1_cu117-python3.11-ubuntu20.04-aws` | +| Ubuntu 20.04 | Base | 1.13.1 | cpu | 3.11 | `mosaicml/pytorch:1.13.1_cpu-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 1.13.1 | 11.7.1 (Infiniband) | 3.10 | `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 1.13.1 | 11.7.1 (EFA) | 3.10 | `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04-aws` | | Ubuntu 20.04 | Base | 1.13.1 | cpu | 3.10 | `mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04` | diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 95c1784b17..233b6b6be3 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -18,12 +18,114 @@ brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 - PYTHON_VERSION: '3.11' + PYTHON_VERSION: '3.10' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.1.2 TAGS: - mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.16.2 +- AWS_OFI_NCCL_VERSION: '' + BASE_IMAGE: ubuntu:20.04 + CUDA_VERSION: '' + IMAGE_NAME: torch-2-1-2-cpu + MOFED_VERSION: '' + NVIDIA_REQUIRE_CUDA_OVERRIDE: '' + PYTHON_VERSION: '3.10' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 2.1.2 + TAGS: + - mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.16.2 +- AWS_OFI_NCCL_VERSION: '' + BASE_IMAGE: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 + CUDA_VERSION: 11.8.0 + IMAGE_NAME: torch-2-0-1-cu118 + MOFED_VERSION: 5.5-1.0.3.2 + NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=11.8 brand=tesla,driver>=470,driver<471 brand=tesla,driver>=515,driver<516 + brand=unknown,driver>=470,driver<471 brand=unknown,driver>=515,driver<516 brand=nvidia,driver>=470,driver<471 + brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=470,driver<471 brand=nvidiartx,driver>=515,driver<516 + brand=geforce,driver>=470,driver<471 brand=geforce,driver>=515,driver<516 brand=quadro,driver>=470,driver<471 + brand=quadro,driver>=515,driver<516 brand=titan,driver>=470,driver<471 brand=titan,driver>=515,driver<516 + brand=titanrtx,driver>=470,driver<471 brand=titanrtx,driver>=515,driver<516 + PYTHON_VERSION: '3.10' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 2.0.1 + TAGS: + - mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.15.2 +- AWS_OFI_NCCL_VERSION: '' + BASE_IMAGE: ubuntu:20.04 + CUDA_VERSION: '' + IMAGE_NAME: torch-2-0-1-cpu + MOFED_VERSION: '' + NVIDIA_REQUIRE_CUDA_OVERRIDE: '' + PYTHON_VERSION: '3.10' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 2.0.1 + TAGS: + - mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04 + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.15.2 +- AWS_OFI_NCCL_VERSION: '' + BASE_IMAGE: nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 + CUDA_VERSION: 11.7.1 + IMAGE_NAME: torch-1-13-1-cu117 + MOFED_VERSION: 5.5-1.0.3.2 + NVIDIA_REQUIRE_CUDA_OVERRIDE: '' + PYTHON_VERSION: '3.10' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 1.13.1 + TAGS: + - mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.14.1 +- AWS_OFI_NCCL_VERSION: '' + BASE_IMAGE: ubuntu:20.04 + CUDA_VERSION: '' + IMAGE_NAME: torch-1-13-1-cpu + MOFED_VERSION: '' + NVIDIA_REQUIRE_CUDA_OVERRIDE: '' + PYTHON_VERSION: '3.10' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 1.13.1 + TAGS: + - mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04 + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.14.1 +- AWS_OFI_NCCL_VERSION: '' + BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 + CUDA_VERSION: 12.1.0 + IMAGE_NAME: torch-2-1-2-cu121 + MOFED_VERSION: 5.5-1.0.3.2 + NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 + brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 + brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 + brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 + brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 + brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 + brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 + brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 + brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 + brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 + brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526 + brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 + brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 + brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 + PYTHON_VERSION: '3.11' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 2.1.2 + TAGS: + - mosaicml/pytorch:2.1.2_cu121-python3.11-ubuntu20.04 - mosaicml/pytorch:latest TARGET: pytorch_stage TORCHVISION_VERSION: 0.16.2 @@ -51,7 +153,7 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.1.2 TAGS: - - mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws + - mosaicml/pytorch:2.1.2_cu121-python3.11-ubuntu20.04-aws - mosaicml/pytorch:latest-aws TARGET: pytorch_stage TORCHVISION_VERSION: 0.16.2 @@ -66,7 +168,7 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.1.2 TAGS: - - mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 + - mosaicml/pytorch:2.1.2_cpu-python3.11-ubuntu20.04 - mosaicml/pytorch:latest_cpu TARGET: pytorch_stage TORCHVISION_VERSION: 0.16.2 @@ -86,7 +188,7 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.0.1 TAGS: - - mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 + - mosaicml/pytorch:2.0.1_cu118-python3.11-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.15.2 - AWS_OFI_NCCL_VERSION: v1.7.4-aws @@ -105,7 +207,7 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.0.1 TAGS: - - mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04-aws + - mosaicml/pytorch:2.0.1_cu118-python3.11-ubuntu20.04-aws TARGET: pytorch_stage TORCHVISION_VERSION: 0.15.2 - AWS_OFI_NCCL_VERSION: '' @@ -119,51 +221,51 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.0.1 TAGS: - - mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04 + - mosaicml/pytorch:2.0.1_cpu-python3.11-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.15.2 -# - AWS_OFI_NCCL_VERSION: '' -# BASE_IMAGE: nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 -# CUDA_VERSION: 11.7.1 -# IMAGE_NAME: torch-1-13-1-cu117 -# MOFED_VERSION: 5.5-1.0.3.2 -# NVIDIA_REQUIRE_CUDA_OVERRIDE: '' -# PYTHON_VERSION: '3.11' -# PYTORCH_NIGHTLY_URL: '' -# PYTORCH_NIGHTLY_VERSION: '' -# PYTORCH_VERSION: 1.13.1 -# TAGS: -# - mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 -# TARGET: pytorch_stage -# TORCHVISION_VERSION: 0.14.1 -# - AWS_OFI_NCCL_VERSION: v1.7.4-aws -# BASE_IMAGE: nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 -# CUDA_VERSION: 11.7.1 -# IMAGE_NAME: torch-1-13-1-cu117-aws -# MOFED_VERSION: '' -# NVIDIA_REQUIRE_CUDA_OVERRIDE: '' -# PYTHON_VERSION: '3.11' -# PYTORCH_NIGHTLY_URL: '' -# PYTORCH_NIGHTLY_VERSION: '' -# PYTORCH_VERSION: 1.13.1 -# TAGS: -# - mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04-aws -# TARGET: pytorch_stage -# TORCHVISION_VERSION: 0.14.1 -# - AWS_OFI_NCCL_VERSION: '' -# BASE_IMAGE: ubuntu:20.04 -# CUDA_VERSION: '' -# IMAGE_NAME: torch-1-13-1-cpu -# MOFED_VERSION: '' -# NVIDIA_REQUIRE_CUDA_OVERRIDE: '' -# PYTHON_VERSION: '3.11' -# PYTORCH_NIGHTLY_URL: '' -# PYTORCH_NIGHTLY_VERSION: '' -# PYTORCH_VERSION: 1.13.1 -# TAGS: -# - mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04 -# TARGET: pytorch_stage -# TORCHVISION_VERSION: 0.14.1 +- AWS_OFI_NCCL_VERSION: '' + BASE_IMAGE: nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 + CUDA_VERSION: 11.7.1 + IMAGE_NAME: torch-1-13-1-cu117 + MOFED_VERSION: 5.5-1.0.3.2 + NVIDIA_REQUIRE_CUDA_OVERRIDE: '' + PYTHON_VERSION: '3.11' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 1.13.1 + TAGS: + - mosaicml/pytorch:1.13.1_cu117-python3.11-ubuntu20.04 + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.14.1 +- AWS_OFI_NCCL_VERSION: v1.7.4-aws + BASE_IMAGE: nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 + CUDA_VERSION: 11.7.1 + IMAGE_NAME: torch-1-13-1-cu117-aws + MOFED_VERSION: '' + NVIDIA_REQUIRE_CUDA_OVERRIDE: '' + PYTHON_VERSION: '3.11' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 1.13.1 + TAGS: + - mosaicml/pytorch:1.13.1_cu117-python3.11-ubuntu20.04-aws + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.14.1 +- AWS_OFI_NCCL_VERSION: '' + BASE_IMAGE: ubuntu:20.04 + CUDA_VERSION: '' + IMAGE_NAME: torch-1-13-1-cpu + MOFED_VERSION: '' + NVIDIA_REQUIRE_CUDA_OVERRIDE: '' + PYTHON_VERSION: '3.11' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 1.13.1 + TAGS: + - mosaicml/pytorch:1.13.1_cpu-python3.11-ubuntu20.04 + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.14.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.0 @@ -183,7 +285,7 @@ brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 - PYTHON_VERSION: '3.11' + PYTHON_VERSION: '3.10' PYTORCH_NIGHTLY_URL: https://download.pytorch.org/whl/nightly/cu121 PYTORCH_NIGHTLY_VERSION: dev20231213+cu121 PYTORCH_VERSION: 2.2.0 diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index a208ff57a9..bcd82a3a65 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -165,7 +165,7 @@ def _write_table(table_tag: str, table_contents: str): def _main(): - python_versions = ['3.10'] + python_versions = ['3.10', '3.11'] pytorch_versions = ['2.1.2', '2.0.1', '1.13.1'] cuda_options = [True, False] stages = ['pytorch_stage'] From 2ba8b3a208d1c2a10809119f0eef76fc48e9eb53 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 19 Jan 2024 11:32:22 -0800 Subject: [PATCH 07/70] test --- docker/Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index e413b0b2ca..3af243c5e7 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -353,8 +353,7 @@ RUN apt-get update && \ RUN pip install --no-cache-dir --upgrade \ certifi${CERTIFI_VERSION} \ ipython${IPYTHON_VERSION} \ - urllib3${URLLIB3_VERSION} \ - python-snappy + urllib3${URLLIB3_VERSION} ################################################## # Override NVIDIA mistaken env var for 11.8 images From d89890a4bf86ef15062fef783ca6261dee8e9335 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 19 Jan 2024 11:38:04 -0800 Subject: [PATCH 08/70] version test --- docker/Dockerfile | 3 +- docker/README.md | 6 -- docker/build_matrix.yaml | 102 -------------------------------- docker/generate_build_matrix.py | 2 +- 4 files changed, 3 insertions(+), 110 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 3af243c5e7..e413b0b2ca 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -353,7 +353,8 @@ RUN apt-get update && \ RUN pip install --no-cache-dir --upgrade \ certifi${CERTIFI_VERSION} \ ipython${IPYTHON_VERSION} \ - urllib3${URLLIB3_VERSION} + urllib3${URLLIB3_VERSION} \ + python-snappy ################################################## # Override NVIDIA mistaken env var for 11.8 images diff --git a/docker/README.md b/docker/README.md index 9a7ed3f52c..54b3fe0d7c 100644 --- a/docker/README.md +++ b/docker/README.md @@ -35,18 +35,12 @@ To install composer, once inside the image, run `pip install mosaicml`. | Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.1.2_cu121-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.1.2_cu121-python3.11-ubuntu20.04-aws` | | Ubuntu 20.04 | Base | 2.1.2 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.1.2_cpu-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.1.2 | cpu | 3.10 | `mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.0.1_cu118-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (EFA) | 3.11 | `mosaicml/pytorch:2.0.1_cu118-python3.11-ubuntu20.04-aws` | | Ubuntu 20.04 | Base | 2.0.1 | cpu | 3.11 | `mosaicml/pytorch:2.0.1_cpu-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.0.1 | cpu | 3.10 | `mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04` | | Ubuntu 20.04 | Base | 1.13.1 | 11.7.1 (Infiniband) | 3.11 | `mosaicml/pytorch:1.13.1_cu117-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 1.13.1 | 11.7.1 (EFA) | 3.11 | `mosaicml/pytorch:1.13.1_cu117-python3.11-ubuntu20.04-aws` | | Ubuntu 20.04 | Base | 1.13.1 | cpu | 3.11 | `mosaicml/pytorch:1.13.1_cpu-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 1.13.1 | 11.7.1 (Infiniband) | 3.10 | `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 1.13.1 | cpu | 3.10 | `mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04` | **Note**: The `mosaicml/pytorch:latest`, `mosaicml/pytorch:latest_cpu`, and `mosaicml/pytorch:latest-aws` diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 233b6b6be3..964f3b03ab 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -1,106 +1,4 @@ # This file is automatically generated by generate_build_matrix.py. DO NOT EDIT! -- AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 12.1.0 - IMAGE_NAME: torch-2-1-2-cu121 - MOFED_VERSION: 5.5-1.0.3.2 - NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 - brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 - brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 - brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 - brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 - brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 - brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 - brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 - brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 - brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 - brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526 - brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 - brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 - brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 - PYTHON_VERSION: '3.10' - PYTORCH_NIGHTLY_URL: '' - PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.1.2 - TAGS: - - mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.16.2 -- AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: ubuntu:20.04 - CUDA_VERSION: '' - IMAGE_NAME: torch-2-1-2-cpu - MOFED_VERSION: '' - NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.10' - PYTORCH_NIGHTLY_URL: '' - PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.1.2 - TAGS: - - mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.16.2 -- AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 11.8.0 - IMAGE_NAME: torch-2-0-1-cu118 - MOFED_VERSION: 5.5-1.0.3.2 - NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=11.8 brand=tesla,driver>=470,driver<471 brand=tesla,driver>=515,driver<516 - brand=unknown,driver>=470,driver<471 brand=unknown,driver>=515,driver<516 brand=nvidia,driver>=470,driver<471 - brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=470,driver<471 brand=nvidiartx,driver>=515,driver<516 - brand=geforce,driver>=470,driver<471 brand=geforce,driver>=515,driver<516 brand=quadro,driver>=470,driver<471 - brand=quadro,driver>=515,driver<516 brand=titan,driver>=470,driver<471 brand=titan,driver>=515,driver<516 - brand=titanrtx,driver>=470,driver<471 brand=titanrtx,driver>=515,driver<516 - PYTHON_VERSION: '3.10' - PYTORCH_NIGHTLY_URL: '' - PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.0.1 - TAGS: - - mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.15.2 -- AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: ubuntu:20.04 - CUDA_VERSION: '' - IMAGE_NAME: torch-2-0-1-cpu - MOFED_VERSION: '' - NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.10' - PYTORCH_NIGHTLY_URL: '' - PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.0.1 - TAGS: - - mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04 - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.15.2 -- AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 11.7.1 - IMAGE_NAME: torch-1-13-1-cu117 - MOFED_VERSION: 5.5-1.0.3.2 - NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.10' - PYTORCH_NIGHTLY_URL: '' - PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 1.13.1 - TAGS: - - mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.14.1 -- AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: ubuntu:20.04 - CUDA_VERSION: '' - IMAGE_NAME: torch-1-13-1-cpu - MOFED_VERSION: '' - NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.10' - PYTORCH_NIGHTLY_URL: '' - PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 1.13.1 - TAGS: - - mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04 - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.14.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.0 diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index bcd82a3a65..1060646642 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -165,7 +165,7 @@ def _write_table(table_tag: str, table_contents: str): def _main(): - python_versions = ['3.10', '3.11'] + python_versions = ['3.11'] pytorch_versions = ['2.1.2', '2.0.1', '1.13.1'] cuda_options = [True, False] stages = ['pytorch_stage'] From 4d82663f5e78fe0542373712f4bd167fcb8134eb Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 19 Jan 2024 11:41:24 -0800 Subject: [PATCH 09/70] remove snappy test --- docker/Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index e413b0b2ca..3af243c5e7 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -353,8 +353,7 @@ RUN apt-get update && \ RUN pip install --no-cache-dir --upgrade \ certifi${CERTIFI_VERSION} \ ipython${IPYTHON_VERSION} \ - urllib3${URLLIB3_VERSION} \ - python-snappy + urllib3${URLLIB3_VERSION} ################################################## # Override NVIDIA mistaken env var for 11.8 images From 7f26e187d1c37804041278140e639edcf900b894 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 19 Jan 2024 11:52:45 -0800 Subject: [PATCH 10/70] add 3.10 + 3.11 --- docker/README.md | 6 ++ docker/build_matrix.yaml | 102 ++++++++++++++++++++++++++++++++ docker/generate_build_matrix.py | 2 +- 3 files changed, 109 insertions(+), 1 deletion(-) diff --git a/docker/README.md b/docker/README.md index 54b3fe0d7c..9a7ed3f52c 100644 --- a/docker/README.md +++ b/docker/README.md @@ -35,12 +35,18 @@ To install composer, once inside the image, run `pip install mosaicml`. | Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.1.2_cu121-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.1.2_cu121-python3.11-ubuntu20.04-aws` | | Ubuntu 20.04 | Base | 2.1.2 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.1.2_cpu-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.1.2 | cpu | 3.10 | `mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.0.1_cu118-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (EFA) | 3.11 | `mosaicml/pytorch:2.0.1_cu118-python3.11-ubuntu20.04-aws` | | Ubuntu 20.04 | Base | 2.0.1 | cpu | 3.11 | `mosaicml/pytorch:2.0.1_cpu-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.0.1 | cpu | 3.10 | `mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04` | | Ubuntu 20.04 | Base | 1.13.1 | 11.7.1 (Infiniband) | 3.11 | `mosaicml/pytorch:1.13.1_cu117-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 1.13.1 | 11.7.1 (EFA) | 3.11 | `mosaicml/pytorch:1.13.1_cu117-python3.11-ubuntu20.04-aws` | | Ubuntu 20.04 | Base | 1.13.1 | cpu | 3.11 | `mosaicml/pytorch:1.13.1_cpu-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 1.13.1 | 11.7.1 (Infiniband) | 3.10 | `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 1.13.1 | cpu | 3.10 | `mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04` | **Note**: The `mosaicml/pytorch:latest`, `mosaicml/pytorch:latest_cpu`, and `mosaicml/pytorch:latest-aws` diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 964f3b03ab..e8dcf5a292 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -164,6 +164,108 @@ - mosaicml/pytorch:1.13.1_cpu-python3.11-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.14.1 +- AWS_OFI_NCCL_VERSION: '' + BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 + CUDA_VERSION: 12.1.0 + IMAGE_NAME: torch-2-1-2-cu121 + MOFED_VERSION: 5.5-1.0.3.2 + NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 + brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 + brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 + brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 + brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 + brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 + brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 + brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 + brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 + brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 + brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526 + brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 + brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 + brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 + PYTHON_VERSION: '3.10' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 2.1.2 + TAGS: + - mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.16.2 +- AWS_OFI_NCCL_VERSION: '' + BASE_IMAGE: ubuntu:20.04 + CUDA_VERSION: '' + IMAGE_NAME: torch-2-1-2-cpu + MOFED_VERSION: '' + NVIDIA_REQUIRE_CUDA_OVERRIDE: '' + PYTHON_VERSION: '3.10' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 2.1.2 + TAGS: + - mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.16.2 +- AWS_OFI_NCCL_VERSION: '' + BASE_IMAGE: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 + CUDA_VERSION: 11.8.0 + IMAGE_NAME: torch-2-0-1-cu118 + MOFED_VERSION: 5.5-1.0.3.2 + NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=11.8 brand=tesla,driver>=470,driver<471 brand=tesla,driver>=515,driver<516 + brand=unknown,driver>=470,driver<471 brand=unknown,driver>=515,driver<516 brand=nvidia,driver>=470,driver<471 + brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=470,driver<471 brand=nvidiartx,driver>=515,driver<516 + brand=geforce,driver>=470,driver<471 brand=geforce,driver>=515,driver<516 brand=quadro,driver>=470,driver<471 + brand=quadro,driver>=515,driver<516 brand=titan,driver>=470,driver<471 brand=titan,driver>=515,driver<516 + brand=titanrtx,driver>=470,driver<471 brand=titanrtx,driver>=515,driver<516 + PYTHON_VERSION: '3.10' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 2.0.1 + TAGS: + - mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.15.2 +- AWS_OFI_NCCL_VERSION: '' + BASE_IMAGE: ubuntu:20.04 + CUDA_VERSION: '' + IMAGE_NAME: torch-2-0-1-cpu + MOFED_VERSION: '' + NVIDIA_REQUIRE_CUDA_OVERRIDE: '' + PYTHON_VERSION: '3.10' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 2.0.1 + TAGS: + - mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04 + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.15.2 +- AWS_OFI_NCCL_VERSION: '' + BASE_IMAGE: nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 + CUDA_VERSION: 11.7.1 + IMAGE_NAME: torch-1-13-1-cu117 + MOFED_VERSION: 5.5-1.0.3.2 + NVIDIA_REQUIRE_CUDA_OVERRIDE: '' + PYTHON_VERSION: '3.10' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 1.13.1 + TAGS: + - mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.14.1 +- AWS_OFI_NCCL_VERSION: '' + BASE_IMAGE: ubuntu:20.04 + CUDA_VERSION: '' + IMAGE_NAME: torch-1-13-1-cpu + MOFED_VERSION: '' + NVIDIA_REQUIRE_CUDA_OVERRIDE: '' + PYTHON_VERSION: '3.10' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 1.13.1 + TAGS: + - mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04 + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.14.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.0 diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index 1060646642..377f32b168 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -165,7 +165,7 @@ def _write_table(table_tag: str, table_contents: str): def _main(): - python_versions = ['3.11'] + python_versions = ['3.11', '3.10'] pytorch_versions = ['2.1.2', '2.0.1', '1.13.1'] cuda_options = [True, False] stages = ['pytorch_stage'] From f9c83a1a297164d9763e62e8890539414b44c462 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 19 Jan 2024 13:23:05 -0800 Subject: [PATCH 11/70] potential snappy fix --- docker/Dockerfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 3af243c5e7..01d3ebbcb8 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -118,6 +118,7 @@ RUN apt-get update && \ tcl \ libjpeg8-dev \ less \ + libsnappy-dev \ # For AWS EFA: autoconf \ autotools-dev \ @@ -353,7 +354,8 @@ RUN apt-get update && \ RUN pip install --no-cache-dir --upgrade \ certifi${CERTIFI_VERSION} \ ipython${IPYTHON_VERSION} \ - urllib3${URLLIB3_VERSION} + urllib3${URLLIB3_VERSION} \ + python-snappy ################################################## # Override NVIDIA mistaken env var for 11.8 images From 775cb451477002c4a3c692fd7cdcaffa6ab106b4 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 19 Jan 2024 13:34:50 -0800 Subject: [PATCH 12/70] nightly patch --- docker/generate_build_matrix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index 377f32b168..bd7107a1ec 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -18,7 +18,7 @@ import tabulate import yaml -LATEST_PYTHON_VERSION = '3.11' +LATEST_PYTHON_VERSION = '3.10' PRODUCTION_PYTORCH_VERSION = '2.1.2' From 1824dd35475982a3989ddfd40f465575d3a38a1a Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 19 Jan 2024 13:57:01 -0800 Subject: [PATCH 13/70] debug --- docker/generate_build_matrix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index bd7107a1ec..3ae69f6d77 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -165,7 +165,7 @@ def _write_table(table_tag: str, table_contents: str): def _main(): - python_versions = ['3.11', '3.10'] + python_versions = ['3.10'] pytorch_versions = ['2.1.2', '2.0.1', '1.13.1'] cuda_options = [True, False] stages = ['pytorch_stage'] From e491a2a092a4a4c95dcb8cdd49c10d8988b7eece Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 19 Jan 2024 13:57:22 -0800 Subject: [PATCH 14/70] debug --- docker/README.md | 16 ++--- docker/build_matrix.yaml | 136 +++++---------------------------------- 2 files changed, 22 insertions(+), 130 deletions(-) diff --git a/docker/README.md b/docker/README.md index 9a7ed3f52c..b05733bb1b 100644 --- a/docker/README.md +++ b/docker/README.md @@ -32,20 +32,14 @@ To install composer, once inside the image, run `pip install mosaicml`. | Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags | |----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------| | Ubuntu 20.04 | Base | 2.2.0 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.2.0_cu121-nightly20231213-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.1.2_cu121-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.1.2_cu121-python3.11-ubuntu20.04-aws` | -| Ubuntu 20.04 | Base | 2.1.2 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.1.2_cpu-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.1.2 | cpu | 3.10 | `mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.0.1_cu118-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (EFA) | 3.11 | `mosaicml/pytorch:2.0.1_cu118-python3.11-ubuntu20.04-aws` | -| Ubuntu 20.04 | Base | 2.0.1 | cpu | 3.11 | `mosaicml/pytorch:2.0.1_cpu-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (EFA) | 3.10 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws` | +| Ubuntu 20.04 | Base | 2.1.2 | cpu | 3.10 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (EFA) | 3.10 | `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04-aws` | | Ubuntu 20.04 | Base | 2.0.1 | cpu | 3.10 | `mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 1.13.1 | 11.7.1 (Infiniband) | 3.11 | `mosaicml/pytorch:1.13.1_cu117-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 1.13.1 | 11.7.1 (EFA) | 3.11 | `mosaicml/pytorch:1.13.1_cu117-python3.11-ubuntu20.04-aws` | -| Ubuntu 20.04 | Base | 1.13.1 | cpu | 3.11 | `mosaicml/pytorch:1.13.1_cpu-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 1.13.1 | 11.7.1 (Infiniband) | 3.10 | `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 1.13.1 | 11.7.1 (EFA) | 3.10 | `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04-aws` | | Ubuntu 20.04 | Base | 1.13.1 | cpu | 3.10 | `mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04` | diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index e8dcf5a292..cd2efc0e19 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -18,12 +18,12 @@ brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 - PYTHON_VERSION: '3.11' + PYTHON_VERSION: '3.10' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.1.2 TAGS: - - mosaicml/pytorch:2.1.2_cu121-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 - mosaicml/pytorch:latest TARGET: pytorch_stage TORCHVISION_VERSION: 0.16.2 @@ -46,12 +46,12 @@ brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 - PYTHON_VERSION: '3.11' + PYTHON_VERSION: '3.10' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.1.2 TAGS: - - mosaicml/pytorch:2.1.2_cu121-python3.11-ubuntu20.04-aws + - mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws - mosaicml/pytorch:latest-aws TARGET: pytorch_stage TORCHVISION_VERSION: 0.16.2 @@ -61,12 +61,12 @@ IMAGE_NAME: torch-2-1-2-cpu MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.11' + PYTHON_VERSION: '3.10' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.1.2 TAGS: - - mosaicml/pytorch:2.1.2_cpu-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - mosaicml/pytorch:latest_cpu TARGET: pytorch_stage TORCHVISION_VERSION: 0.16.2 @@ -81,12 +81,12 @@ brand=geforce,driver>=470,driver<471 brand=geforce,driver>=515,driver<516 brand=quadro,driver>=470,driver<471 brand=quadro,driver>=515,driver<516 brand=titan,driver>=470,driver<471 brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=470,driver<471 brand=titanrtx,driver>=515,driver<516 - PYTHON_VERSION: '3.11' + PYTHON_VERSION: '3.10' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.0.1 TAGS: - - mosaicml/pytorch:2.0.1_cu118-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.15.2 - AWS_OFI_NCCL_VERSION: v1.7.4-aws @@ -100,12 +100,12 @@ brand=geforce,driver>=470,driver<471 brand=geforce,driver>=515,driver<516 brand=quadro,driver>=470,driver<471 brand=quadro,driver>=515,driver<516 brand=titan,driver>=470,driver<471 brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=470,driver<471 brand=titanrtx,driver>=515,driver<516 - PYTHON_VERSION: '3.11' + PYTHON_VERSION: '3.10' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.0.1 TAGS: - - mosaicml/pytorch:2.0.1_cu118-python3.11-ubuntu20.04-aws + - mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04-aws TARGET: pytorch_stage TORCHVISION_VERSION: 0.15.2 - AWS_OFI_NCCL_VERSION: '' @@ -114,12 +114,12 @@ IMAGE_NAME: torch-2-0-1-cpu MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.11' + PYTHON_VERSION: '3.10' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.0.1 TAGS: - - mosaicml/pytorch:2.0.1_cpu-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.15.2 - AWS_OFI_NCCL_VERSION: '' @@ -128,12 +128,12 @@ IMAGE_NAME: torch-1-13-1-cu117 MOFED_VERSION: 5.5-1.0.3.2 NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.11' + PYTHON_VERSION: '3.10' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 1.13.1 TAGS: - - mosaicml/pytorch:1.13.1_cu117-python3.11-ubuntu20.04 + - mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.14.1 - AWS_OFI_NCCL_VERSION: v1.7.4-aws @@ -142,114 +142,12 @@ IMAGE_NAME: torch-1-13-1-cu117-aws MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.11' - PYTORCH_NIGHTLY_URL: '' - PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 1.13.1 - TAGS: - - mosaicml/pytorch:1.13.1_cu117-python3.11-ubuntu20.04-aws - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.14.1 -- AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: ubuntu:20.04 - CUDA_VERSION: '' - IMAGE_NAME: torch-1-13-1-cpu - MOFED_VERSION: '' - NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.11' - PYTORCH_NIGHTLY_URL: '' - PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 1.13.1 - TAGS: - - mosaicml/pytorch:1.13.1_cpu-python3.11-ubuntu20.04 - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.14.1 -- AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 12.1.0 - IMAGE_NAME: torch-2-1-2-cu121 - MOFED_VERSION: 5.5-1.0.3.2 - NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 - brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 - brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 - brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 - brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 - brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 - brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 - brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 - brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 - brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 - brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526 - brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 - brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 - brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 - PYTHON_VERSION: '3.10' - PYTORCH_NIGHTLY_URL: '' - PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.1.2 - TAGS: - - mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.16.2 -- AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: ubuntu:20.04 - CUDA_VERSION: '' - IMAGE_NAME: torch-2-1-2-cpu - MOFED_VERSION: '' - NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.10' - PYTORCH_NIGHTLY_URL: '' - PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.1.2 - TAGS: - - mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.16.2 -- AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 11.8.0 - IMAGE_NAME: torch-2-0-1-cu118 - MOFED_VERSION: 5.5-1.0.3.2 - NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=11.8 brand=tesla,driver>=470,driver<471 brand=tesla,driver>=515,driver<516 - brand=unknown,driver>=470,driver<471 brand=unknown,driver>=515,driver<516 brand=nvidia,driver>=470,driver<471 - brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=470,driver<471 brand=nvidiartx,driver>=515,driver<516 - brand=geforce,driver>=470,driver<471 brand=geforce,driver>=515,driver<516 brand=quadro,driver>=470,driver<471 - brand=quadro,driver>=515,driver<516 brand=titan,driver>=470,driver<471 brand=titan,driver>=515,driver<516 - brand=titanrtx,driver>=470,driver<471 brand=titanrtx,driver>=515,driver<516 - PYTHON_VERSION: '3.10' - PYTORCH_NIGHTLY_URL: '' - PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.0.1 - TAGS: - - mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.15.2 -- AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: ubuntu:20.04 - CUDA_VERSION: '' - IMAGE_NAME: torch-2-0-1-cpu - MOFED_VERSION: '' - NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.10' - PYTORCH_NIGHTLY_URL: '' - PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.0.1 - TAGS: - - mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04 - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.15.2 -- AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 11.7.1 - IMAGE_NAME: torch-1-13-1-cu117 - MOFED_VERSION: 5.5-1.0.3.2 - NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.10' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 1.13.1 TAGS: - - mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 + - mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04-aws TARGET: pytorch_stage TORCHVISION_VERSION: 0.14.1 - AWS_OFI_NCCL_VERSION: '' @@ -313,7 +211,7 @@ brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 - PYTHON_VERSION: '3.11' + PYTHON_VERSION: '3.10' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.1.2 @@ -329,7 +227,7 @@ IMAGE_NAME: composer-0-17-2-cpu MOFED_VERSION: 5.5-1.0.3.2 NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.11' + PYTHON_VERSION: '3.10' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.1.2 From ecce586e57f968df741b03b412bc07eede19433d Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 19 Jan 2024 16:41:57 -0800 Subject: [PATCH 15/70] extrapolated pytorch to depend on python version --- docker/README.md | 13 ++- docker/build_matrix.yaml | 190 +++++++++++++++++++++----------- docker/generate_build_matrix.py | 115 ++++++++++--------- 3 files changed, 193 insertions(+), 125 deletions(-) diff --git a/docker/README.md b/docker/README.md index b05733bb1b..3a44d57396 100644 --- a/docker/README.md +++ b/docker/README.md @@ -32,14 +32,17 @@ To install composer, once inside the image, run `pip install mosaicml`. | Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags | |----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------| | Ubuntu 20.04 | Base | 2.2.0 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.2.0_cu121-nightly20231213-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (EFA) | 3.10 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws` | -| Ubuntu 20.04 | Base | 2.1.2 | cpu | 3.10 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.1.2_cu121-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.1.2_cu121-python3.11-ubuntu20.04-aws` | +| Ubuntu 20.04 | Base | 2.1.2 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.1.2_cpu-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.1.2 | cpu | 3.10 | `mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.0.1_cu118-python3.11-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (EFA) | 3.11 | `mosaicml/pytorch:2.0.1_cu118-python3.11-ubuntu20.04-aws` | +| Ubuntu 20.04 | Base | 2.0.1 | cpu | 3.11 | `mosaicml/pytorch:2.0.1_cpu-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (EFA) | 3.10 | `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04-aws` | | Ubuntu 20.04 | Base | 2.0.1 | cpu | 3.10 | `mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04` | | Ubuntu 20.04 | Base | 1.13.1 | 11.7.1 (Infiniband) | 3.10 | `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 1.13.1 | 11.7.1 (EFA) | 3.10 | `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04-aws` | | Ubuntu 20.04 | Base | 1.13.1 | cpu | 3.10 | `mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04` | diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index cd2efc0e19..4726911cb7 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -1,4 +1,37 @@ # This file is automatically generated by generate_build_matrix.py. DO NOT EDIT! +- AWS_OFI_NCCL_VERSION: '' + BASE_IMAGE: nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 + CUDA_VERSION: 11.7.1 + IMAGE_NAME: torch-1-13-1-cu117 + MOFED_VERSION: 5.5-1.0.3.2 + NVIDIA_REQUIRE_CUDA_OVERRIDE: '' + PYTHON_VERSION: '3.10' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 1.13.1 + TAGS: + - mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.14.1 +- AWS_OFI_NCCL_VERSION: '' + BASE_IMAGE: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 + CUDA_VERSION: 11.8.0 + IMAGE_NAME: torch-2-0-1-cu118 + MOFED_VERSION: 5.5-1.0.3.2 + NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=11.8 brand=tesla,driver>=470,driver<471 brand=tesla,driver>=515,driver<516 + brand=unknown,driver>=470,driver<471 brand=unknown,driver>=515,driver<516 brand=nvidia,driver>=470,driver<471 + brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=470,driver<471 brand=nvidiartx,driver>=515,driver<516 + brand=geforce,driver>=470,driver<471 brand=geforce,driver>=515,driver<516 brand=quadro,driver>=470,driver<471 + brand=quadro,driver>=515,driver<516 brand=titan,driver>=470,driver<471 brand=titan,driver>=515,driver<516 + brand=titanrtx,driver>=470,driver<471 brand=titanrtx,driver>=515,driver<516 + PYTHON_VERSION: '3.10' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 2.0.1 + TAGS: + - mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.15.2 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.0 @@ -24,37 +57,36 @@ PYTORCH_VERSION: 2.1.2 TAGS: - mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 - - mosaicml/pytorch:latest TARGET: pytorch_stage TORCHVISION_VERSION: 0.16.2 -- AWS_OFI_NCCL_VERSION: v1.7.4-aws - BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 12.1.0 - IMAGE_NAME: torch-2-1-2-cu121-aws +- AWS_OFI_NCCL_VERSION: '' + BASE_IMAGE: ubuntu:20.04 + CUDA_VERSION: '' + IMAGE_NAME: torch-1-13-1-cpu MOFED_VERSION: '' - NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 - brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 - brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 - brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 - brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 - brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 - brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 - brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 - brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 - brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 - brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526 - brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 - brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 - brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 + NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.10' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.1.2 + PYTORCH_VERSION: 1.13.1 TAGS: - - mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws - - mosaicml/pytorch:latest-aws + - mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04 TARGET: pytorch_stage - TORCHVISION_VERSION: 0.16.2 + TORCHVISION_VERSION: 0.14.1 +- AWS_OFI_NCCL_VERSION: '' + BASE_IMAGE: ubuntu:20.04 + CUDA_VERSION: '' + IMAGE_NAME: torch-2-0-1-cpu + MOFED_VERSION: '' + NVIDIA_REQUIRE_CUDA_OVERRIDE: '' + PYTHON_VERSION: '3.10' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 2.0.1 + TAGS: + - mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04 + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.15.2 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 CUDA_VERSION: '' @@ -67,7 +99,6 @@ PYTORCH_VERSION: 2.1.2 TAGS: - mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - - mosaicml/pytorch:latest_cpu TARGET: pytorch_stage TORCHVISION_VERSION: 0.16.2 - AWS_OFI_NCCL_VERSION: '' @@ -81,14 +112,42 @@ brand=geforce,driver>=470,driver<471 brand=geforce,driver>=515,driver<516 brand=quadro,driver>=470,driver<471 brand=quadro,driver>=515,driver<516 brand=titan,driver>=470,driver<471 brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=470,driver<471 brand=titanrtx,driver>=515,driver<516 - PYTHON_VERSION: '3.10' + PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.0.1 TAGS: - - mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 + - mosaicml/pytorch:2.0.1_cu118-python3.11-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.15.2 +- AWS_OFI_NCCL_VERSION: '' + BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 + CUDA_VERSION: 12.1.0 + IMAGE_NAME: torch-2-1-2-cu121 + MOFED_VERSION: 5.5-1.0.3.2 + NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 + brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 + brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 + brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 + brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 + brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 + brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 + brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 + brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 + brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 + brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526 + brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 + brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 + brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 + PYTHON_VERSION: '3.11' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 2.1.2 + TAGS: + - mosaicml/pytorch:2.1.2_cu121-python3.11-ubuntu20.04 + - mosaicml/pytorch:latest + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.16.2 - AWS_OFI_NCCL_VERSION: v1.7.4-aws BASE_IMAGE: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 11.8.0 @@ -100,70 +159,71 @@ brand=geforce,driver>=470,driver<471 brand=geforce,driver>=515,driver<516 brand=quadro,driver>=470,driver<471 brand=quadro,driver>=515,driver<516 brand=titan,driver>=470,driver<471 brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=470,driver<471 brand=titanrtx,driver>=515,driver<516 - PYTHON_VERSION: '3.10' + PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.0.1 TAGS: - - mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04-aws + - mosaicml/pytorch:2.0.1_cu118-python3.11-ubuntu20.04-aws TARGET: pytorch_stage TORCHVISION_VERSION: 0.15.2 -- AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: ubuntu:20.04 - CUDA_VERSION: '' - IMAGE_NAME: torch-2-0-1-cpu +- AWS_OFI_NCCL_VERSION: v1.7.4-aws + BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 + CUDA_VERSION: 12.1.0 + IMAGE_NAME: torch-2-1-2-cu121-aws MOFED_VERSION: '' - NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.10' + NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 + brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 + brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 + brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 + brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 + brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 + brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 + brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 + brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 + brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 + brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526 + brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 + brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 + brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 + PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.0.1 + PYTORCH_VERSION: 2.1.2 TAGS: - - mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04 + - mosaicml/pytorch:2.1.2_cu121-python3.11-ubuntu20.04-aws + - mosaicml/pytorch:latest-aws TARGET: pytorch_stage - TORCHVISION_VERSION: 0.15.2 + TORCHVISION_VERSION: 0.16.2 - AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 11.7.1 - IMAGE_NAME: torch-1-13-1-cu117 - MOFED_VERSION: 5.5-1.0.3.2 - NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.10' - PYTORCH_NIGHTLY_URL: '' - PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 1.13.1 - TAGS: - - mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.14.1 -- AWS_OFI_NCCL_VERSION: v1.7.4-aws - BASE_IMAGE: nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 11.7.1 - IMAGE_NAME: torch-1-13-1-cu117-aws + BASE_IMAGE: ubuntu:20.04 + CUDA_VERSION: '' + IMAGE_NAME: torch-2-0-1-cpu MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.10' + PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 1.13.1 + PYTORCH_VERSION: 2.0.1 TAGS: - - mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04-aws + - mosaicml/pytorch:2.0.1_cpu-python3.11-ubuntu20.04 TARGET: pytorch_stage - TORCHVISION_VERSION: 0.14.1 + TORCHVISION_VERSION: 0.15.2 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 CUDA_VERSION: '' - IMAGE_NAME: torch-1-13-1-cpu + IMAGE_NAME: torch-2-1-2-cpu MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.10' + PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 1.13.1 + PYTORCH_VERSION: 2.1.2 TAGS: - - mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04 + - mosaicml/pytorch:2.1.2_cpu-python3.11-ubuntu20.04 + - mosaicml/pytorch:latest_cpu TARGET: pytorch_stage - TORCHVISION_VERSION: 0.14.1 + TORCHVISION_VERSION: 0.16.2 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.0 @@ -211,7 +271,7 @@ brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 - PYTHON_VERSION: '3.10' + PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.1.2 @@ -227,7 +287,7 @@ IMAGE_NAME: composer-0-17-2-cpu MOFED_VERSION: 5.5-1.0.3.2 NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.10' + PYTHON_VERSION: '3.11' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.1.2 diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index 3ae69f6d77..f1fa58c598 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -18,7 +18,7 @@ import tabulate import yaml -LATEST_PYTHON_VERSION = '3.10' +LATEST_PYTHON_VERSION = '3.11' PRODUCTION_PYTORCH_VERSION = '2.1.2' @@ -31,6 +31,13 @@ def _get_torchvision_version(pytorch_version: str): return '0.14.1' raise ValueError(f'Invalid pytorch_version: {pytorch_version}') +def _get_pytorch_version(python_version: str): + if python_version == '3.10': + return ['1.13.1', '2.0.1', '2.1.2'] + if python_version == '3.11': + return ['2.0.1', '2.1.2'] + raise ValueError(f'Invalid python_version: {python_version}') + def _get_base_image(cuda_version: str): if not cuda_version: @@ -165,68 +172,66 @@ def _write_table(table_tag: str, table_contents: str): def _main(): - python_versions = ['3.10'] - pytorch_versions = ['2.1.2', '2.0.1', '1.13.1'] + python_versions = ['3.10', '3.11'] cuda_options = [True, False] stages = ['pytorch_stage'] interconnects = ['mellanox', 'EFA'] # mellanox is default, EFA needed for AWS pytorch_entries = [] - for product in itertools.product(python_versions, pytorch_versions, cuda_options, stages, interconnects): - python_version, pytorch_version, use_cuda, stage, interconnect = product - - cuda_version = _get_cuda_version(pytorch_version=pytorch_version, use_cuda=use_cuda) - - entry = { - 'IMAGE_NAME': - _get_image_name(pytorch_version, cuda_version, stage, interconnect), - 'BASE_IMAGE': - _get_base_image(cuda_version), - 'CUDA_VERSION': - cuda_version, - 'PYTHON_VERSION': - python_version, - 'PYTORCH_VERSION': - pytorch_version, - 'TARGET': - stage, - 'TORCHVISION_VERSION': - _get_torchvision_version(pytorch_version), - 'TAGS': - _get_pytorch_tags( - python_version=python_version, - pytorch_version=pytorch_version, - cuda_version=cuda_version, - stage=stage, - interconnect=interconnect, - ), - 'PYTORCH_NIGHTLY_URL': - '', - 'PYTORCH_NIGHTLY_VERSION': - '', - 'NVIDIA_REQUIRE_CUDA_OVERRIDE': - _get_cuda_override(cuda_version), - } - - # Only build EFA image on latest python with cuda on pytorch_stage - if interconnect == 'EFA' and not (python_version == LATEST_PYTHON_VERSION and use_cuda and - stage == 'pytorch_stage'): - continue - - # Skip the mellanox drivers if not in the cuda images or using EFA - if not cuda_version or interconnect == 'EFA': - entry['MOFED_VERSION'] = '' - else: - entry['MOFED_VERSION'] = '5.5-1.0.3.2' + for product in itertools.product(python_versions, cuda_options, stages, interconnects): + python_version, use_cuda, stage, interconnect = product + for pytorch_version in _get_pytorch_version(python_version): + cuda_version = _get_cuda_version(pytorch_version=pytorch_version, use_cuda=use_cuda) + entry = { + 'IMAGE_NAME': + _get_image_name(pytorch_version, cuda_version, stage, interconnect), + 'BASE_IMAGE': + _get_base_image(cuda_version), + 'CUDA_VERSION': + cuda_version, + 'PYTHON_VERSION': + python_version, + 'PYTORCH_VERSION': + pytorch_version, + 'TARGET': + stage, + 'TORCHVISION_VERSION': + _get_torchvision_version(pytorch_version), + 'TAGS': + _get_pytorch_tags( + python_version=python_version, + pytorch_version=pytorch_version, + cuda_version=cuda_version, + stage=stage, + interconnect=interconnect, + ), + 'PYTORCH_NIGHTLY_URL': + '', + 'PYTORCH_NIGHTLY_VERSION': + '', + 'NVIDIA_REQUIRE_CUDA_OVERRIDE': + _get_cuda_override(cuda_version), + } + + # Only build EFA image on latest python with cuda on pytorch_stage + if interconnect == 'EFA' and not (python_version == LATEST_PYTHON_VERSION and use_cuda and + stage == 'pytorch_stage'): + continue + + # Skip the mellanox drivers if not in the cuda images or using EFA + if not cuda_version or interconnect == 'EFA': + entry['MOFED_VERSION'] = '' + else: + entry['MOFED_VERSION'] = '5.5-1.0.3.2' - # Skip EFA drivers if not using EFA - if interconnect != 'EFA': - entry['AWS_OFI_NCCL_VERSION'] = '' - else: - entry['AWS_OFI_NCCL_VERSION'] = 'v1.7.4-aws' + # Skip EFA drivers if not using EFA + if interconnect != 'EFA': + entry['AWS_OFI_NCCL_VERSION'] = '' + else: + entry['AWS_OFI_NCCL_VERSION'] = 'v1.7.4-aws' - pytorch_entries.append(entry) + pytorch_entries.append(entry) nightly_entry = { 'AWS_OFI_NCCL_VERSION': '', 'BASE_IMAGE': 'nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04', From 851bc405de18e117f2fa4187dc19dfe43518b4e1 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 22 Jan 2024 11:28:36 -0800 Subject: [PATCH 16/70] python 3.8 deprecation assertion --- composer/trainer/trainer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index 2b9c9731a5..c965a82f91 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -13,6 +13,7 @@ import os import random import re +import sys import tempfile import textwrap import time @@ -922,6 +923,10 @@ def __init__( # compile config for PyTorch 2.0 or higher compile_config: Optional[Dict[str, Any]] = None, ): + + # Check if the current Python version is compatible + major, minor = sys.version_info[0], sys.version_info[1] + assert (major == 3 and minor <= 8) or (major < 3), f"Python version {major}.{minor} is not supported. Please use Python 3.9 or higher." self.auto_log_hparams = auto_log_hparams self.python_log_level = python_log_level From 4d01cabe3d9851334e93d53b6d0e18cdac8035ad Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 22 Jan 2024 11:55:32 -0800 Subject: [PATCH 17/70] removed deprecation --- composer/trainer/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index c965a82f91..c84f24441a 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -925,8 +925,8 @@ def __init__( ): # Check if the current Python version is compatible - major, minor = sys.version_info[0], sys.version_info[1] - assert (major == 3 and minor <= 8) or (major < 3), f"Python version {major}.{minor} is not supported. Please use Python 3.9 or higher." + # major, minor = sys.version_info[0], sys.version_info[1] + # assert (major == 3 and minor <= 8) or (major < 3), f"Python version {major}.{minor} is not supported. Please use Python 3.9 or higher." self.auto_log_hparams = auto_log_hparams self.python_log_level = python_log_level From 7be59d46e49c5c510d54d7b3e200c33639faeb50 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 22 Jan 2024 12:02:20 -0800 Subject: [PATCH 18/70] removing import for test --- composer/trainer/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index c84f24441a..9ebec6c373 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -13,7 +13,7 @@ import os import random import re -import sys +#import sys import tempfile import textwrap import time From 9e31fa663d3ada2cc98d6cc644a9178b419ea66b Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 22 Jan 2024 12:59:24 -0800 Subject: [PATCH 19/70] lint --- docker/generate_build_matrix.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index e32f74fa29..fd3d152f2f 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -31,6 +31,7 @@ def _get_torchvision_version(pytorch_version: str): return '0.14.1' raise ValueError(f'Invalid pytorch_version: {pytorch_version}') + def _get_pytorch_version(python_version: str): if python_version == '3.10': return ['1.13.1', '2.0.1', '2.1.2'] @@ -216,7 +217,7 @@ def _main(): # Only build EFA image on latest python with cuda on pytorch_stage if interconnect == 'EFA' and not (python_version == LATEST_PYTHON_VERSION and use_cuda and - stage == 'pytorch_stage'): + stage == 'pytorch_stage'): continue # Skip the mellanox drivers if not in the cuda images or using EFA From 34cd00a1c03b5b4496cd908f964160f0dcc7dd39 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 22 Jan 2024 14:15:25 -0800 Subject: [PATCH 20/70] lint --- composer/datasets/utils.py | 4 ++-- composer/trainer/trainer.py | 5 ----- tests/datasets/test_in_context_learning_datasets.py | 4 ++-- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/composer/datasets/utils.py b/composer/datasets/utils.py index 431a860900..44186ac58e 100644 --- a/composer/datasets/utils.py +++ b/composer/datasets/utils.py @@ -179,7 +179,7 @@ class MultiTokenEOSCriteria(transformers.StoppingCriteria): def __init__( self, stop_sequence: str, - tokenizer: transformers.PreTrainedTokenizer, + tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], batch_size: int, ) -> None: self.done_tracker = [False] * batch_size @@ -213,7 +213,7 @@ def __call__(self, input_ids, scores: Optional[torch.FloatTensor] = None, **kwar return False not in self.done_tracker def stop_sequences_criteria( - tokenizer: transformers.PreTrainedTokenizer, + tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], stop_sequences: List[str], batch_size: int, ) -> transformers.StoppingCriteriaList: diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index 9ebec6c373..2b9c9731a5 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -13,7 +13,6 @@ import os import random import re -#import sys import tempfile import textwrap import time @@ -923,10 +922,6 @@ def __init__( # compile config for PyTorch 2.0 or higher compile_config: Optional[Dict[str, Any]] = None, ): - - # Check if the current Python version is compatible - # major, minor = sys.version_info[0], sys.version_info[1] - # assert (major == 3 and minor <= 8) or (major < 3), f"Python version {major}.{minor} is not supported. Please use Python 3.9 or higher." self.auto_log_hparams = auto_log_hparams self.python_log_level = python_log_level diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index ec7df306d6..2a3ff87884 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -73,13 +73,13 @@ def test_stop_sequences_criteria(tiny_gpt2_tokenizer): seq1 = tiny_gpt2_tokenizer('Dogs are furry')['input_ids'] seq2 = tiny_gpt2_tokenizer('Dogs are furry\n\n')['input_ids'] seq1 = [50257] * (len(seq2) - len(seq1)) + seq1 - input_ids = torch.tensor([seq1, seq2]) + input_ids = torch.LongTensor([seq1, seq2]) assert not eos_criteria(input_ids, None) eos_criteria = MultiTokenEOSCriteria('\n\n', tiny_gpt2_tokenizer, 2) seq1 = tiny_gpt2_tokenizer('Dogs are furry\n\n')['input_ids'] seq2 = tiny_gpt2_tokenizer('Dogs are furry\n\n')['input_ids'] - input_ids = torch.tensor([seq1, seq2]) + input_ids = torch.LongTensor([seq1, seq2]) assert eos_criteria(input_ids, None) From 245107d3b99a67ee32d94f21fafaa2978dfdd6d2 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 00:07:04 -0800 Subject: [PATCH 21/70] pr review changes --- docker/Dockerfile | 189 ++++++++++++++++---------------- docker/generate_build_matrix.py | 4 +- 2 files changed, 96 insertions(+), 97 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index c7d1d6665f..4b30c29d4f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -77,56 +77,55 @@ ARG CUDA_VERSION # If this file is present after the first command, kaniko # won't be able to build the docker image. RUN if [ -n "$CUDA_VERSION" ]; then \ - rm -f /usr/local/cuda-$(echo $CUDA_VERSION | cut -c -4)/cuda-$(echo $CUDA_VERSION | cut -c -4); \ + rm -f /usr/local/cuda-$(echo $CUDA_VERSION | cut -c -4)/cuda-$(echo $CUDA_VERSION | cut -c -4); \ fi # update repository keys # https://developer.nvidia.com/blog/updating-the-cuda-linux-gpg-repository-key/ RUN if [ -n "$CUDA_VERSION" ] ; then \ - rm -f /etc/apt/sources.list.d/cuda.list && \ - rm -f /etc/apt/sources.list.d/nvidia-ml.list && \ - apt-get update && \ - apt-get install -y --no-install-recommends wget && \ - apt-get autoclean && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* \ - apt-key del 7fa2af80 && \ - mkdir -p /tmp/cuda-keyring && \ - wget -P /tmp/cuda-keyring https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb && \ - dpkg -i /tmp/cuda-keyring/cuda-keyring_1.0-1_all.deb && \ - rm -rf /tmp/cuda-keyring ; \ + rm -f /etc/apt/sources.list.d/cuda.list && \ + rm -f /etc/apt/sources.list.d/nvidia-ml.list && \ + apt-get update && \ + apt-get install -y --no-install-recommends wget && \ + apt-get autoclean && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* \ + apt-key del 7fa2af80 && \ + mkdir -p /tmp/cuda-keyring && \ + wget -P /tmp/cuda-keyring https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb && \ + dpkg -i /tmp/cuda-keyring/cuda-keyring_1.0-1_all.deb && \ + rm -rf /tmp/cuda-keyring ; \ fi RUN apt-get update && \ apt-get install -y --no-install-recommends \ - libgomp1 \ - curl \ - wget \ - sudo \ - build-essential \ - software-properties-common \ - dirmngr \ - apt-utils \ - gpg-agent \ - openssh-client \ - # For PILLOW: - zlib1g-dev \ - libtiff-dev \ - libfreetype6-dev \ - liblcms2-dev \ - tcl \ - libjpeg8-dev \ - less \ - libsnappy-dev \ - # For AWS EFA: - autoconf \ - autotools-dev \ - automake \ - libtool \ - # Development tools - tmux \ - htop && \ + libgomp1 \ + curl \ + wget \ + sudo \ + build-essential \ + software-properties-common \ + dirmngr \ + apt-utils \ + gpg-agent \ + openssh-client \ + # For PILLOW: + zlib1g-dev \ + libtiff-dev \ + libfreetype6-dev \ + liblcms2-dev \ + tcl \ + libjpeg8-dev \ + less \ + # For AWS EFA: + autoconf \ + autotools-dev \ + automake \ + libtool \ + # Development tools + tmux \ + htop && \ apt-get autoclean && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -136,7 +135,7 @@ RUN apt-get update && \ ############################### RUN add-apt-repository ppa:git-core/ppa && \ apt-get install -y --no-install-recommends \ - git && \ + git && \ apt-get autoclean && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -202,14 +201,14 @@ ENV PYTORCH_NIGHTLY_URL=${PYTORCH_NIGHTLY_URL} ENV PYTORCH_NIGHTLY_VERSION=${PYTORCH_NIGHTLY_VERSION} RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \ - CUDA_VERSION_TAG=$(python${PYTHON_VERSION} -c "print('cu' + ''.join('${CUDA_VERSION}'.split('.')[:2]) if '${CUDA_VERSION}' else 'cpu')") && \ - pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch_stable.html \ - torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} \ - torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} ; \ + CUDA_VERSION_TAG=$(python${PYTHON_VERSION} -c "print('cu' + ''.join('${CUDA_VERSION}'.split('.')[:2]) if '${CUDA_VERSION}' else 'cpu')") && \ + pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch_stable.html \ + torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} \ + torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} ; \ else \ - pip${PYTHON_VERSION} install --no-cache-dir --pre --index-url ${PYTORCH_NIGHTLY_URL} \ - torch==${PYTORCH_VERSION}.${PYTORCH_NIGHTLY_VERSION} \ - torchvision==${TORCHVISION_VERSION}.${PYTORCH_NIGHTLY_VERSION} ; \ + pip${PYTHON_VERSION} install --no-cache-dir --pre --index-url ${PYTORCH_NIGHTLY_URL} \ + torch==${PYTORCH_VERSION}.${PYTORCH_NIGHTLY_VERSION} \ + torchvision==${TORCHVISION_VERSION}.${PYTORCH_NIGHTLY_VERSION} ; \ fi ##################################### @@ -226,34 +225,34 @@ ENV FI_EFA_USE_DEVICE_RDMA=1 RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ apt-get update && \ apt-get install -y --no-install-recommends \ - hwloc \ - libhwloc-dev && \ + hwloc \ + libhwloc-dev && \ apt-get autoclean && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* ; \ fi RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ - cd /tmp && \ - curl -OsS https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \ - tar -xf /tmp/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \ - cd aws-efa-installer && \ - apt-get update && \ - ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify && \ - rm -rf /tmp/aws-efa-installer* ; \ + cd /tmp && \ + curl -OsS https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \ + tar -xf /tmp/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \ + cd aws-efa-installer && \ + apt-get update && \ + ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify && \ + rm -rf /tmp/aws-efa-installer* ; \ fi RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ - git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl && \ - cd /opt/aws-ofi-nccl && \ - git checkout ${AWS_OFI_NCCL_VERSION} && \ - ./autogen.sh && \ - ./configure --prefix=/opt/aws-ofi-nccl/install \ - --with-libfabric=/opt/amazon/efa/ \ - --with-cuda=/usr/local/cuda \ - --disable-tests \ - --enable-platform-aws && \ - make && make install ; \ + git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl && \ + cd /opt/aws-ofi-nccl && \ + git checkout ${AWS_OFI_NCCL_VERSION} && \ + ./autogen.sh && \ + ./configure --prefix=/opt/aws-ofi-nccl/install \ + --with-libfabric=/opt/amazon/efa/ \ + --with-cuda=/usr/local/cuda \ + --disable-tests \ + --enable-platform-aws && \ + make && make install ; \ fi ################################### @@ -263,11 +262,11 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ ARG MOFED_VERSION RUN if [ -n "$MOFED_VERSION" ] ; then \ - mkdir -p /tmp/mofed && \ - wget -nv -P /tmp/mofed http://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64.tgz && \ - tar -zxvf /tmp/mofed/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64.tgz -C /tmp/mofed && \ - /tmp/mofed/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force && \ - rm -rf /tmp/mofed ; \ + mkdir -p /tmp/mofed && \ + wget -nv -P /tmp/mofed http://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64.tgz && \ + tar -zxvf /tmp/mofed/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64.tgz -C /tmp/mofed && \ + /tmp/mofed/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force && \ + rm -rf /tmp/mofed ; \ fi ##################### @@ -275,30 +274,30 @@ RUN if [ -n "$MOFED_VERSION" ] ; then \ ##################### # skip if torch nightly is installed as there is incompatability RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ - mkdir -p /tmp/apex && \ - cd /tmp/apex && \ - git clone https://github.com/NVIDIA/apex && \ - cd apex && \ - git checkout 82ee367f3da74b4cd62a1fb47aa9806f0f47b58b && \ - pip${PYTHON_VERSION} install --no-cache-dir -r requirements.txt && \ - pip${PYTHON_VERSION} install --no-cache-dir \ - --global-option="--cpp_ext" \ - --global-option="--cuda_ext" \ - --target /usr/local/lib/python${PYTHON_VERSION}/dist-packages \ - ./ && \ - rm -rf /tmp/apex ; \ + mkdir -p /tmp/apex && \ + cd /tmp/apex && \ + git clone https://github.com/NVIDIA/apex && \ + cd apex && \ + git checkout 82ee367f3da74b4cd62a1fb47aa9806f0f47b58b && \ + pip${PYTHON_VERSION} install --no-cache-dir -r requirements.txt && \ + pip${PYTHON_VERSION} install --no-cache-dir \ + --global-option="--cpp_ext" \ + --global-option="--cuda_ext" \ + --target /usr/local/lib/python${PYTHON_VERSION}/dist-packages \ + ./ && \ + rm -rf /tmp/apex ; \ fi ########################## # Install Flash Attention ########################## RUN if [ -n "$CUDA_VERSION" ] ; then \ - pip${PYTHON_VERSION} install --upgrade --no-cache-dir ninja==1.11.1 && \ - pip${PYTHON_VERSION} install --upgrade --no-cache-dir --force-reinstall packaging==22.0 && \ - git clone --branch v2.4.2 https://github.com/Dao-AILab/flash-attention.git && \ - cd flash-attention && \ - MAX_JOBS=1 python${PYTHON_VERSION} setup.py install && \ - cd .. ; \ + pip${PYTHON_VERSION} install --upgrade --no-cache-dir ninja==1.11.1 && \ + pip${PYTHON_VERSION} install --upgrade --no-cache-dir --force-reinstall packaging==22.0 && \ + git clone --branch v2.4.2 https://github.com/Dao-AILab/flash-attention.git && \ + cd flash-attention && \ + MAX_JOBS=1 python${PYTHON_VERSION} setup.py install && \ + cd .. ; \ fi ############### @@ -355,10 +354,10 @@ RUN apt-get update && \ # Upgrade pip packages ######################### RUN pip install --no-cache-dir --upgrade \ - certifi${CERTIFI_VERSION} \ - ipython${IPYTHON_VERSION} \ - urllib3${URLLIB3_VERSION} \ - python-snappy + certifi${CERTIFI_VERSION} \ + ipython${IPYTHON_VERSION} \ + urllib3${URLLIB3_VERSION} \ + python-snappy ################################################## # Override NVIDIA mistaken env var for 11.8 images @@ -380,4 +379,4 @@ ARG DEBIAN_FRONTEND=noninteractive ARG COMPOSER_INSTALL_COMMAND -RUN pip install "${COMPOSER_INSTALL_COMMAND}" +RUN pip install "${COMPOSER_INSTALL_COMMAND}" \ No newline at end of file diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index fd3d152f2f..fd2f193c04 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -32,7 +32,7 @@ def _get_torchvision_version(pytorch_version: str): raise ValueError(f'Invalid pytorch_version: {pytorch_version}') -def _get_pytorch_version(python_version: str): +def _get_pytorch_versions(python_version: str): if python_version == '3.10': return ['1.13.1', '2.0.1', '2.1.2'] if python_version == '3.11': @@ -182,7 +182,7 @@ def _main(): for product in itertools.product(python_versions, cuda_options, stages, interconnects): python_version, use_cuda, stage, interconnect = product - for pytorch_version in _get_pytorch_version(python_version): + for pytorch_version in _get_pytorch_versions(python_version): cuda_version = _get_cuda_version(pytorch_version=pytorch_version, use_cuda=use_cuda) entry = { 'IMAGE_NAME': From 1760a6aa3015132067649d94e303a442990e56a5 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 00:14:34 -0800 Subject: [PATCH 22/70] apt install snappy before pip install --- docker/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/Dockerfile b/docker/Dockerfile index 4b30c29d4f..fee5737190 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -118,6 +118,7 @@ RUN apt-get update && \ tcl \ libjpeg8-dev \ less \ + libsnappy-dev \ # For AWS EFA: autoconf \ autotools-dev \ From d7e8956d46ee71d8f5907678e72f5db785d24309 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 00:25:38 -0800 Subject: [PATCH 23/70] lint --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index fee5737190..e7ef0ecef3 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -380,4 +380,4 @@ ARG DEBIAN_FRONTEND=noninteractive ARG COMPOSER_INSTALL_COMMAND -RUN pip install "${COMPOSER_INSTALL_COMMAND}" \ No newline at end of file +RUN pip install "${COMPOSER_INSTALL_COMMAND}" From a5fdcf756e5bfcb07852f446eaba4aa88f145a4d Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 12:21:10 -0800 Subject: [PATCH 24/70] disk usage print logs --- docker/Dockerfile | 46 +++++++++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index e7ef0ecef3..48bfcb7ad2 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -96,8 +96,8 @@ RUN if [ -n "$CUDA_VERSION" ] ; then \ wget -P /tmp/cuda-keyring https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb && \ dpkg -i /tmp/cuda-keyring/cuda-keyring_1.0-1_all.deb && \ rm -rf /tmp/cuda-keyring ; \ + du -hs /var/lib/apt/lists/* /usr/lib/python3/* \ fi - RUN apt-get update && \ apt-get install -y --no-install-recommends \ libgomp1 \ @@ -129,7 +129,8 @@ RUN apt-get update && \ htop && \ apt-get autoclean && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* + rm -rf /var/lib/apt/lists/* \ + du -hs /var/lib/apt/lists/* /usr/lib/python3/* ############################### # Install latest version of git @@ -139,7 +140,8 @@ RUN add-apt-repository ppa:git-core/ppa && \ git && \ apt-get autoclean && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* + rm -rf /var/lib/apt/lists/* \ + du -hs /var/lib/apt/lists/* /usr/lib/python3/* ############################## # Install NodeJS (for Pyright) @@ -149,7 +151,8 @@ RUN \ apt-get install -y --no-install-recommends nodejs && \ apt-get autoclean && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* + rm -rf /var/lib/apt/lists/* \ + du -hs /var/lib/apt/lists/* /usr/lib/python3/* ################ # Install Python @@ -169,10 +172,12 @@ RUN add-apt-repository ppa:deadsnakes/ppa && \ python${PYTHON_VERSION}-venv && \ apt-get autoclean && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* + rm -rf /var/lib/apt/lists/* \ + du -hs /var/lib/apt/lists/* /usr/lib/python3/* RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} - && \ - pip${PYTHON_VERSION} install --no-cache-dir --upgrade 'pip<23' setuptools + pip${PYTHON_VERSION} install --no-cache-dir --upgrade 'pip<23' setuptools \ + du -hs /var/lib/apt/lists/* /usr/lib/python3/* ##################### # Install pillow-simd @@ -186,7 +191,8 @@ COPY pillow_stub /tmp/pillow_stub RUN pip${PYTHON_VERSION} install --no-cache-dir --upgrade /tmp/pillow_stub && \ pip${PYTHON_VERSION} install --no-cache-dir --upgrade pillow_simd==${PILLOW_SIMD_VERSION} && \ - rm -rf /tmp/pillow_stub + rm -rf /tmp/pillow_stub \ + du -hs /var/lib/apt/lists/* /usr/lib/python3/* ################# # Install Pytorch @@ -206,10 +212,12 @@ RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \ pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch_stable.html \ torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} \ torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} ; \ + du -hs /var/lib/apt/lists/* /usr/lib/python3/* \ else \ pip${PYTHON_VERSION} install --no-cache-dir --pre --index-url ${PYTORCH_NIGHTLY_URL} \ torch==${PYTORCH_VERSION}.${PYTORCH_NIGHTLY_VERSION} \ torchvision==${TORCHVISION_VERSION}.${PYTORCH_NIGHTLY_VERSION} ; \ + du -hs /var/lib/apt/lists/* /usr/lib/python3/* \ fi ##################################### @@ -231,6 +239,7 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ apt-get autoclean && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* ; \ + du -hs /var/lib/apt/lists/* /usr/lib/python3/* \ fi RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ @@ -241,6 +250,7 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ apt-get update && \ ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify && \ rm -rf /tmp/aws-efa-installer* ; \ + du -hs /var/lib/apt/lists/* /usr/lib/python3/* \ fi RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ @@ -254,6 +264,7 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ --disable-tests \ --enable-platform-aws && \ make && make install ; \ + du -hs /var/lib/apt/lists/* /usr/lib/python3/* \ fi ################################### @@ -268,6 +279,7 @@ RUN if [ -n "$MOFED_VERSION" ] ; then \ tar -zxvf /tmp/mofed/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64.tgz -C /tmp/mofed && \ /tmp/mofed/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force && \ rm -rf /tmp/mofed ; \ + du -hs /var/lib/apt/lists/* /usr/lib/python3/* \ fi ##################### @@ -287,6 +299,7 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ --target /usr/local/lib/python${PYTHON_VERSION}/dist-packages \ ./ && \ rm -rf /tmp/apex ; \ + du -hs /var/lib/apt/lists/* /usr/lib/python3/* \ fi ########################## @@ -299,17 +312,20 @@ RUN if [ -n "$CUDA_VERSION" ] ; then \ cd flash-attention && \ MAX_JOBS=1 python${PYTHON_VERSION} setup.py install && \ cd .. ; \ + du -hs /var/lib/apt/lists/* /usr/lib/python3/* \ fi ############### # Install cmake ############### -RUN pip${PYTHON_VERSION} install --no-cache-dir cmake==3.26.3 +RUN pip${PYTHON_VERSION} install --no-cache-dir cmake==3.26.3 \ + du -hs /var/lib/apt/lists/* /usr/lib/python3/* ########################### # Install Pandoc Dependency ########################### -RUN pip${PYTHON_VERSION} install --no-cache-dir pandoc==2.3 +RUN pip${PYTHON_VERSION} install --no-cache-dir pandoc==2.3 \ + du -hs /var/lib/apt/lists/* /usr/lib/python3/* ################################ # Use the correct python version @@ -330,7 +346,8 @@ RUN mkdir -p ${COMPOSER_PYTHON_BIN} && \ # Include this folder, and the local bin folder, on the path echo "export PATH=~/.local/bin:$COMPOSER_PYTHON_BIN:$PATH" >> /etc/profile && \ echo "export PATH=~/.local/bin:$COMPOSER_PYTHON_BIN:$PATH" >> /etc/bash.bashrc && \ - echo "export PATH=~/.local/bin:$COMPOSER_PYTHON_BIN:$PATH" >> /etc/zshenv + echo "export PATH=~/.local/bin:$COMPOSER_PYTHON_BIN:$PATH" >> /etc/zshenv \ + du -hs /var/lib/apt/lists/* /usr/lib/python3/* # Ensure that non-interactive shells load /etc/profile ENV BASH_ENV=/etc/profile @@ -340,7 +357,8 @@ ENV BASH_ENV=/etc/profile ######################### RUN useradd -rm -d /home/mosaicml -s /bin/bash -u 1000 -U -s /bin/bash mosaicml && \ usermod -a -G sudo mosaicml && \ - echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers + echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers \ + du -hs /var/lib/apt/lists/* /usr/lib/python3/* ######################### # Upgrade apt packages @@ -349,7 +367,8 @@ RUN apt-get update && \ apt-get upgrade -y && \ apt-get autoclean && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* + rm -rf /var/lib/apt/lists/* \ + du -hs /var/lib/apt/lists/* /usr/lib/python3/* ######################### # Upgrade pip packages @@ -358,7 +377,8 @@ RUN pip install --no-cache-dir --upgrade \ certifi${CERTIFI_VERSION} \ ipython${IPYTHON_VERSION} \ urllib3${URLLIB3_VERSION} \ - python-snappy + python-snappy \ + du -hs /var/lib/apt/lists/* /usr/lib/python3/* ################################################## # Override NVIDIA mistaken env var for 11.8 images From 742587176220f8ddb006bb8555f68742a195703d Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 12:29:37 -0800 Subject: [PATCH 25/70] du depth 3 --- docker/Dockerfile | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 48bfcb7ad2..f016f67578 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -96,7 +96,7 @@ RUN if [ -n "$CUDA_VERSION" ] ; then \ wget -P /tmp/cuda-keyring https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb && \ dpkg -i /tmp/cuda-keyring/cuda-keyring_1.0-1_all.deb && \ rm -rf /tmp/cuda-keyring ; \ - du -hs /var/lib/apt/lists/* /usr/lib/python3/* \ + du -h --max-depth=3 \ fi RUN apt-get update && \ apt-get install -y --no-install-recommends \ @@ -130,7 +130,7 @@ RUN apt-get update && \ apt-get autoclean && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* \ - du -hs /var/lib/apt/lists/* /usr/lib/python3/* + du -h --max-depth=3 ############################### # Install latest version of git @@ -141,7 +141,7 @@ RUN add-apt-repository ppa:git-core/ppa && \ apt-get autoclean && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* \ - du -hs /var/lib/apt/lists/* /usr/lib/python3/* + du -h --max-depth=3 ############################## # Install NodeJS (for Pyright) @@ -152,7 +152,7 @@ RUN \ apt-get autoclean && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* \ - du -hs /var/lib/apt/lists/* /usr/lib/python3/* + du -h --max-depth=3 ################ # Install Python @@ -173,11 +173,11 @@ RUN add-apt-repository ppa:deadsnakes/ppa && \ apt-get autoclean && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* \ - du -hs /var/lib/apt/lists/* /usr/lib/python3/* + du -h --max-depth=3 RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} - && \ pip${PYTHON_VERSION} install --no-cache-dir --upgrade 'pip<23' setuptools \ - du -hs /var/lib/apt/lists/* /usr/lib/python3/* + du -h --max-depth=3 ##################### # Install pillow-simd @@ -192,7 +192,7 @@ COPY pillow_stub /tmp/pillow_stub RUN pip${PYTHON_VERSION} install --no-cache-dir --upgrade /tmp/pillow_stub && \ pip${PYTHON_VERSION} install --no-cache-dir --upgrade pillow_simd==${PILLOW_SIMD_VERSION} && \ rm -rf /tmp/pillow_stub \ - du -hs /var/lib/apt/lists/* /usr/lib/python3/* + du -h --max-depth=3 ################# # Install Pytorch @@ -212,12 +212,12 @@ RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \ pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch_stable.html \ torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} \ torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} ; \ - du -hs /var/lib/apt/lists/* /usr/lib/python3/* \ + du -h --max-depth=3 \ else \ pip${PYTHON_VERSION} install --no-cache-dir --pre --index-url ${PYTORCH_NIGHTLY_URL} \ torch==${PYTORCH_VERSION}.${PYTORCH_NIGHTLY_VERSION} \ torchvision==${TORCHVISION_VERSION}.${PYTORCH_NIGHTLY_VERSION} ; \ - du -hs /var/lib/apt/lists/* /usr/lib/python3/* \ + du -h --max-depth=3 \ fi ##################################### @@ -239,7 +239,7 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ apt-get autoclean && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* ; \ - du -hs /var/lib/apt/lists/* /usr/lib/python3/* \ + du -h --max-depth=3 \ fi RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ @@ -250,7 +250,7 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ apt-get update && \ ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify && \ rm -rf /tmp/aws-efa-installer* ; \ - du -hs /var/lib/apt/lists/* /usr/lib/python3/* \ + du -h --max-depth=3 \ fi RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ @@ -264,7 +264,7 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ --disable-tests \ --enable-platform-aws && \ make && make install ; \ - du -hs /var/lib/apt/lists/* /usr/lib/python3/* \ + du -h --max-depth=3 \ fi ################################### @@ -279,7 +279,7 @@ RUN if [ -n "$MOFED_VERSION" ] ; then \ tar -zxvf /tmp/mofed/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64.tgz -C /tmp/mofed && \ /tmp/mofed/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force && \ rm -rf /tmp/mofed ; \ - du -hs /var/lib/apt/lists/* /usr/lib/python3/* \ + du -h --max-depth=3 \ fi ##################### @@ -299,7 +299,7 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ --target /usr/local/lib/python${PYTHON_VERSION}/dist-packages \ ./ && \ rm -rf /tmp/apex ; \ - du -hs /var/lib/apt/lists/* /usr/lib/python3/* \ + du -h --max-depth=3 \ fi ########################## @@ -312,20 +312,20 @@ RUN if [ -n "$CUDA_VERSION" ] ; then \ cd flash-attention && \ MAX_JOBS=1 python${PYTHON_VERSION} setup.py install && \ cd .. ; \ - du -hs /var/lib/apt/lists/* /usr/lib/python3/* \ + du -h --max-depth=3 \ fi ############### # Install cmake ############### RUN pip${PYTHON_VERSION} install --no-cache-dir cmake==3.26.3 \ - du -hs /var/lib/apt/lists/* /usr/lib/python3/* + du -h --max-depth=3 ########################### # Install Pandoc Dependency ########################### RUN pip${PYTHON_VERSION} install --no-cache-dir pandoc==2.3 \ - du -hs /var/lib/apt/lists/* /usr/lib/python3/* + du -h --max-depth=3 / ################################ # Use the correct python version @@ -347,7 +347,7 @@ RUN mkdir -p ${COMPOSER_PYTHON_BIN} && \ echo "export PATH=~/.local/bin:$COMPOSER_PYTHON_BIN:$PATH" >> /etc/profile && \ echo "export PATH=~/.local/bin:$COMPOSER_PYTHON_BIN:$PATH" >> /etc/bash.bashrc && \ echo "export PATH=~/.local/bin:$COMPOSER_PYTHON_BIN:$PATH" >> /etc/zshenv \ - du -hs /var/lib/apt/lists/* /usr/lib/python3/* + du -h --max-depth=3 # Ensure that non-interactive shells load /etc/profile ENV BASH_ENV=/etc/profile @@ -358,7 +358,7 @@ ENV BASH_ENV=/etc/profile RUN useradd -rm -d /home/mosaicml -s /bin/bash -u 1000 -U -s /bin/bash mosaicml && \ usermod -a -G sudo mosaicml && \ echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers \ - du -hs /var/lib/apt/lists/* /usr/lib/python3/* + du -h --max-depth=3 ######################### # Upgrade apt packages @@ -368,7 +368,7 @@ RUN apt-get update && \ apt-get autoclean && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* \ - du -hs /var/lib/apt/lists/* /usr/lib/python3/* + du -h --max-depth=3 ######################### # Upgrade pip packages @@ -378,7 +378,7 @@ RUN pip install --no-cache-dir --upgrade \ ipython${IPYTHON_VERSION} \ urllib3${URLLIB3_VERSION} \ python-snappy \ - du -hs /var/lib/apt/lists/* /usr/lib/python3/* + du -h --max-depth=3 ################################################## # Override NVIDIA mistaken env var for 11.8 images From c98196c62ea24c420736d2bb4ef51d0d3d951375 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 12:34:24 -0800 Subject: [PATCH 26/70] syntax --- docker/Dockerfile | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index f016f67578..3780049524 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -95,8 +95,8 @@ RUN if [ -n "$CUDA_VERSION" ] ; then \ mkdir -p /tmp/cuda-keyring && \ wget -P /tmp/cuda-keyring https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb && \ dpkg -i /tmp/cuda-keyring/cuda-keyring_1.0-1_all.deb && \ - rm -rf /tmp/cuda-keyring ; \ - du -h --max-depth=3 \ + rm -rf /tmp/cuda-keyring \ + du -h --max-depth=3 ; \ fi RUN apt-get update && \ apt-get install -y --no-install-recommends \ @@ -211,13 +211,13 @@ RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \ CUDA_VERSION_TAG=$(python${PYTHON_VERSION} -c "print('cu' + ''.join('${CUDA_VERSION}'.split('.')[:2]) if '${CUDA_VERSION}' else 'cpu')") && \ pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch_stable.html \ torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} \ - torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} ; \ - du -h --max-depth=3 \ + torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} \ + du -h --max-depth=3 ; \ else \ pip${PYTHON_VERSION} install --no-cache-dir --pre --index-url ${PYTORCH_NIGHTLY_URL} \ torch==${PYTORCH_VERSION}.${PYTORCH_NIGHTLY_VERSION} \ - torchvision==${TORCHVISION_VERSION}.${PYTORCH_NIGHTLY_VERSION} ; \ - du -h --max-depth=3 \ + torchvision==${TORCHVISION_VERSION}.${PYTORCH_NIGHTLY_VERSION} \ + du -h --max-depth=3 ; \ fi ##################################### @@ -238,8 +238,8 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ libhwloc-dev && \ apt-get autoclean && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* ; \ - du -h --max-depth=3 \ + rm -rf /var/lib/apt/lists/* \ + du -h --max-depth=3 ; \ fi RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ @@ -249,8 +249,8 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ cd aws-efa-installer && \ apt-get update && \ ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify && \ - rm -rf /tmp/aws-efa-installer* ; \ - du -h --max-depth=3 \ + rm -rf /tmp/aws-efa-installer* \ + du -h --max-depth=3 ; \ fi RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ @@ -263,8 +263,8 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ --with-cuda=/usr/local/cuda \ --disable-tests \ --enable-platform-aws && \ - make && make install ; \ - du -h --max-depth=3 \ + make && make install \ + du -h --max-depth=3 ; \ fi ################################### @@ -278,8 +278,8 @@ RUN if [ -n "$MOFED_VERSION" ] ; then \ wget -nv -P /tmp/mofed http://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64.tgz && \ tar -zxvf /tmp/mofed/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64.tgz -C /tmp/mofed && \ /tmp/mofed/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force && \ - rm -rf /tmp/mofed ; \ - du -h --max-depth=3 \ + rm -rf /tmp/mofed \ + du -h --max-depth=3 ; \ fi ##################### @@ -298,8 +298,8 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ --global-option="--cuda_ext" \ --target /usr/local/lib/python${PYTHON_VERSION}/dist-packages \ ./ && \ - rm -rf /tmp/apex ; \ - du -h --max-depth=3 \ + rm -rf /tmp/apex \ + du -h --max-depth=3 ; \ fi ########################## @@ -311,8 +311,8 @@ RUN if [ -n "$CUDA_VERSION" ] ; then \ git clone --branch v2.4.2 https://github.com/Dao-AILab/flash-attention.git && \ cd flash-attention && \ MAX_JOBS=1 python${PYTHON_VERSION} setup.py install && \ - cd .. ; \ - du -h --max-depth=3 \ + cd .. \ + du -h --max-depth=3 ; \ fi ############### From 8854fa622d6a033fbb62b69fea6decd6f3b6d1bc Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 12:42:40 -0800 Subject: [PATCH 27/70] more syntax --- docker/Dockerfile | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 3780049524..6c2f3387d5 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -95,7 +95,7 @@ RUN if [ -n "$CUDA_VERSION" ] ; then \ mkdir -p /tmp/cuda-keyring && \ wget -P /tmp/cuda-keyring https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb && \ dpkg -i /tmp/cuda-keyring/cuda-keyring_1.0-1_all.deb && \ - rm -rf /tmp/cuda-keyring \ + rm -rf /tmp/cuda-keyring && \ du -h --max-depth=3 ; \ fi RUN apt-get update && \ @@ -129,7 +129,7 @@ RUN apt-get update && \ htop && \ apt-get autoclean && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* \ + rm -rf /var/lib/apt/lists/* && \ du -h --max-depth=3 ############################### @@ -140,7 +140,7 @@ RUN add-apt-repository ppa:git-core/ppa && \ git && \ apt-get autoclean && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* \ + rm -rf /var/lib/apt/lists/* && \ du -h --max-depth=3 ############################## @@ -151,7 +151,7 @@ RUN \ apt-get install -y --no-install-recommends nodejs && \ apt-get autoclean && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* \ + rm -rf /var/lib/apt/lists/* && \ du -h --max-depth=3 ################ @@ -172,11 +172,11 @@ RUN add-apt-repository ppa:deadsnakes/ppa && \ python${PYTHON_VERSION}-venv && \ apt-get autoclean && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* \ + rm -rf /var/lib/apt/lists/* && \ du -h --max-depth=3 RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} - && \ - pip${PYTHON_VERSION} install --no-cache-dir --upgrade 'pip<23' setuptools \ + pip${PYTHON_VERSION} install --no-cache-dir --upgrade 'pip<23' setuptools && \ du -h --max-depth=3 ##################### @@ -191,7 +191,7 @@ COPY pillow_stub /tmp/pillow_stub RUN pip${PYTHON_VERSION} install --no-cache-dir --upgrade /tmp/pillow_stub && \ pip${PYTHON_VERSION} install --no-cache-dir --upgrade pillow_simd==${PILLOW_SIMD_VERSION} && \ - rm -rf /tmp/pillow_stub \ + rm -rf /tmp/pillow_stub && \ du -h --max-depth=3 ################# @@ -211,12 +211,12 @@ RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \ CUDA_VERSION_TAG=$(python${PYTHON_VERSION} -c "print('cu' + ''.join('${CUDA_VERSION}'.split('.')[:2]) if '${CUDA_VERSION}' else 'cpu')") && \ pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch_stable.html \ torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} \ - torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} \ + torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} && \ du -h --max-depth=3 ; \ else \ pip${PYTHON_VERSION} install --no-cache-dir --pre --index-url ${PYTORCH_NIGHTLY_URL} \ torch==${PYTORCH_VERSION}.${PYTORCH_NIGHTLY_VERSION} \ - torchvision==${TORCHVISION_VERSION}.${PYTORCH_NIGHTLY_VERSION} \ + torchvision==${TORCHVISION_VERSION}.${PYTORCH_NIGHTLY_VERSION} && \ du -h --max-depth=3 ; \ fi @@ -238,7 +238,7 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ libhwloc-dev && \ apt-get autoclean && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* \ + rm -rf /var/lib/apt/lists/* && \ du -h --max-depth=3 ; \ fi @@ -249,7 +249,7 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ cd aws-efa-installer && \ apt-get update && \ ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify && \ - rm -rf /tmp/aws-efa-installer* \ + rm -rf /tmp/aws-efa-installer* && \ du -h --max-depth=3 ; \ fi @@ -263,7 +263,7 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ --with-cuda=/usr/local/cuda \ --disable-tests \ --enable-platform-aws && \ - make && make install \ + make && make install && \ du -h --max-depth=3 ; \ fi @@ -278,7 +278,7 @@ RUN if [ -n "$MOFED_VERSION" ] ; then \ wget -nv -P /tmp/mofed http://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64.tgz && \ tar -zxvf /tmp/mofed/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64.tgz -C /tmp/mofed && \ /tmp/mofed/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force && \ - rm -rf /tmp/mofed \ + rm -rf /tmp/mofed && \ du -h --max-depth=3 ; \ fi @@ -298,7 +298,7 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ --global-option="--cuda_ext" \ --target /usr/local/lib/python${PYTHON_VERSION}/dist-packages \ ./ && \ - rm -rf /tmp/apex \ + rm -rf /tmp/apex && \ du -h --max-depth=3 ; \ fi @@ -311,20 +311,20 @@ RUN if [ -n "$CUDA_VERSION" ] ; then \ git clone --branch v2.4.2 https://github.com/Dao-AILab/flash-attention.git && \ cd flash-attention && \ MAX_JOBS=1 python${PYTHON_VERSION} setup.py install && \ - cd .. \ + cd .. && \ du -h --max-depth=3 ; \ fi ############### # Install cmake ############### -RUN pip${PYTHON_VERSION} install --no-cache-dir cmake==3.26.3 \ +RUN pip${PYTHON_VERSION} install --no-cache-dir cmake==3.26.3 && \ du -h --max-depth=3 ########################### # Install Pandoc Dependency ########################### -RUN pip${PYTHON_VERSION} install --no-cache-dir pandoc==2.3 \ +RUN pip${PYTHON_VERSION} install --no-cache-dir pandoc==2.3 && \ du -h --max-depth=3 / ################################ @@ -346,7 +346,7 @@ RUN mkdir -p ${COMPOSER_PYTHON_BIN} && \ # Include this folder, and the local bin folder, on the path echo "export PATH=~/.local/bin:$COMPOSER_PYTHON_BIN:$PATH" >> /etc/profile && \ echo "export PATH=~/.local/bin:$COMPOSER_PYTHON_BIN:$PATH" >> /etc/bash.bashrc && \ - echo "export PATH=~/.local/bin:$COMPOSER_PYTHON_BIN:$PATH" >> /etc/zshenv \ + echo "export PATH=~/.local/bin:$COMPOSER_PYTHON_BIN:$PATH" >> /etc/zshenv && \ du -h --max-depth=3 # Ensure that non-interactive shells load /etc/profile @@ -357,7 +357,7 @@ ENV BASH_ENV=/etc/profile ######################### RUN useradd -rm -d /home/mosaicml -s /bin/bash -u 1000 -U -s /bin/bash mosaicml && \ usermod -a -G sudo mosaicml && \ - echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers \ + echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers && \ du -h --max-depth=3 ######################### @@ -367,7 +367,7 @@ RUN apt-get update && \ apt-get upgrade -y && \ apt-get autoclean && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* \ + rm -rf /var/lib/apt/lists/* && \ du -h --max-depth=3 ######################### @@ -377,8 +377,8 @@ RUN pip install --no-cache-dir --upgrade \ certifi${CERTIFI_VERSION} \ ipython${IPYTHON_VERSION} \ urllib3${URLLIB3_VERSION} \ - python-snappy \ - du -h --max-depth=3 + python-snappy +RUN du -h --max-depth=3 ################################################## # Override NVIDIA mistaken env var for 11.8 images From 37d3b677f823a42da7512429a973a4b6842dba0a Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 12:48:27 -0800 Subject: [PATCH 28/70] syntax --- docker/Dockerfile | 88 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 61 insertions(+), 27 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 6c2f3387d5..7e31f6d41e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -95,9 +95,12 @@ RUN if [ -n "$CUDA_VERSION" ] ; then \ mkdir -p /tmp/cuda-keyring && \ wget -P /tmp/cuda-keyring https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb && \ dpkg -i /tmp/cuda-keyring/cuda-keyring_1.0-1_all.deb && \ - rm -rf /tmp/cuda-keyring && \ - du -h --max-depth=3 ; \ + rm -rf /tmp/cuda-keyring \ fi + +RUN set -eux; \ + du -h --max-depth=3 + RUN apt-get update && \ apt-get install -y --no-install-recommends \ libgomp1 \ @@ -129,7 +132,9 @@ RUN apt-get update && \ htop && \ apt-get autoclean && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ + rm -rf /var/lib/apt/lists/* + +RUN set -eux; \ du -h --max-depth=3 ############################### @@ -140,7 +145,9 @@ RUN add-apt-repository ppa:git-core/ppa && \ git && \ apt-get autoclean && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ + rm -rf /var/lib/apt/lists/* + +RUN set -eux; \ du -h --max-depth=3 ############################## @@ -151,7 +158,9 @@ RUN \ apt-get install -y --no-install-recommends nodejs && \ apt-get autoclean && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ + rm -rf /var/lib/apt/lists/* + +RUN set -eux; \ du -h --max-depth=3 ################ @@ -172,11 +181,15 @@ RUN add-apt-repository ppa:deadsnakes/ppa && \ python${PYTHON_VERSION}-venv && \ apt-get autoclean && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ + rm -rf /var/lib/apt/lists/* + +RUN set -eux; \ du -h --max-depth=3 RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} - && \ - pip${PYTHON_VERSION} install --no-cache-dir --upgrade 'pip<23' setuptools && \ + pip${PYTHON_VERSION} install --no-cache-dir --upgrade 'pip<23' setuptools + +RUN set -eux; \ du -h --max-depth=3 ##################### @@ -191,7 +204,9 @@ COPY pillow_stub /tmp/pillow_stub RUN pip${PYTHON_VERSION} install --no-cache-dir --upgrade /tmp/pillow_stub && \ pip${PYTHON_VERSION} install --no-cache-dir --upgrade pillow_simd==${PILLOW_SIMD_VERSION} && \ - rm -rf /tmp/pillow_stub && \ + rm -rf /tmp/pillow_stub + +RUN set -eux; \ du -h --max-depth=3 ################# @@ -211,15 +226,16 @@ RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \ CUDA_VERSION_TAG=$(python${PYTHON_VERSION} -c "print('cu' + ''.join('${CUDA_VERSION}'.split('.')[:2]) if '${CUDA_VERSION}' else 'cpu')") && \ pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch_stable.html \ torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} \ - torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} && \ - du -h --max-depth=3 ; \ + torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} \ else \ pip${PYTHON_VERSION} install --no-cache-dir --pre --index-url ${PYTORCH_NIGHTLY_URL} \ torch==${PYTORCH_VERSION}.${PYTORCH_NIGHTLY_VERSION} \ - torchvision==${TORCHVISION_VERSION}.${PYTORCH_NIGHTLY_VERSION} && \ - du -h --max-depth=3 ; \ + torchvision==${TORCHVISION_VERSION}.${PYTORCH_NIGHTLY_VERSION} \ fi +RUN set -eux; \ + du -h --max-depth=3 + ##################################### # Install EFA and AWS-OFI-NCCL plugin ##################################### @@ -238,10 +254,12 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ libhwloc-dev && \ apt-get autoclean && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ - du -h --max-depth=3 ; \ + rm -rf /var/lib/apt/lists/* ; \ fi +RUN set -eux; \ + du -h --max-depth=3 + RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ cd /tmp && \ curl -OsS https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \ @@ -249,10 +267,12 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ cd aws-efa-installer && \ apt-get update && \ ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify && \ - rm -rf /tmp/aws-efa-installer* && \ - du -h --max-depth=3 ; \ + rm -rf /tmp/aws-efa-installer* ; \ fi +RUN set -eux; \ + du -h --max-depth=3 + RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl && \ cd /opt/aws-ofi-nccl && \ @@ -263,10 +283,12 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ --with-cuda=/usr/local/cuda \ --disable-tests \ --enable-platform-aws && \ - make && make install && \ - du -h --max-depth=3 ; \ + make && make install ; \ fi +RUN set -eux; \ + du -h --max-depth=3 + ################################### # Mellanox OFED driver installation ################################### @@ -278,10 +300,12 @@ RUN if [ -n "$MOFED_VERSION" ] ; then \ wget -nv -P /tmp/mofed http://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64.tgz && \ tar -zxvf /tmp/mofed/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64.tgz -C /tmp/mofed && \ /tmp/mofed/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force && \ - rm -rf /tmp/mofed && \ - du -h --max-depth=3 ; \ + rm -rf /tmp/mofed && ; \ fi +RUN set -eux; \ + du -h --max-depth=3 + ##################### # Install NVIDIA Apex ##################### @@ -298,10 +322,12 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ --global-option="--cuda_ext" \ --target /usr/local/lib/python${PYTHON_VERSION}/dist-packages \ ./ && \ - rm -rf /tmp/apex && \ - du -h --max-depth=3 ; \ + rm -rf /tmp/apex && ; \ fi +RUN set -eux; \ + du -h --max-depth=3 + ########################## # Install Flash Attention ########################## @@ -311,10 +337,12 @@ RUN if [ -n "$CUDA_VERSION" ] ; then \ git clone --branch v2.4.2 https://github.com/Dao-AILab/flash-attention.git && \ cd flash-attention && \ MAX_JOBS=1 python${PYTHON_VERSION} setup.py install && \ - cd .. && \ - du -h --max-depth=3 ; \ + cd .. ; \ fi +RUN set -eux; \ + du -h --max-depth=3 + ############### # Install cmake ############### @@ -357,7 +385,9 @@ ENV BASH_ENV=/etc/profile ######################### RUN useradd -rm -d /home/mosaicml -s /bin/bash -u 1000 -U -s /bin/bash mosaicml && \ usermod -a -G sudo mosaicml && \ - echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers && \ + echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers + +RUN set -eux; \ du -h --max-depth=3 ######################### @@ -367,7 +397,9 @@ RUN apt-get update && \ apt-get upgrade -y && \ apt-get autoclean && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ + rm -rf /var/lib/apt/lists/* + +RUN set -eux; \ du -h --max-depth=3 ######################### @@ -378,7 +410,9 @@ RUN pip install --no-cache-dir --upgrade \ ipython${IPYTHON_VERSION} \ urllib3${URLLIB3_VERSION} \ python-snappy -RUN du -h --max-depth=3 + +RUN set -eux; \ + du -h --max-depth=3 ################################################## # Override NVIDIA mistaken env var for 11.8 images From 3515141c565035ea2b6643c8538ff241956f0cc1 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 12:53:36 -0800 Subject: [PATCH 29/70] syntax --- docker/Dockerfile | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 7e31f6d41e..7215d7f464 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -95,7 +95,7 @@ RUN if [ -n "$CUDA_VERSION" ] ; then \ mkdir -p /tmp/cuda-keyring && \ wget -P /tmp/cuda-keyring https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb && \ dpkg -i /tmp/cuda-keyring/cuda-keyring_1.0-1_all.deb && \ - rm -rf /tmp/cuda-keyring \ + rm -rf /tmp/cuda-keyring ; \ fi RUN set -eux; \ @@ -181,7 +181,7 @@ RUN add-apt-repository ppa:deadsnakes/ppa && \ python${PYTHON_VERSION}-venv && \ apt-get autoclean && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* + rm -rf /var/lib/apt/lists/* RUN set -eux; \ du -h --max-depth=3 @@ -189,9 +189,6 @@ RUN set -eux; \ RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} - && \ pip${PYTHON_VERSION} install --no-cache-dir --upgrade 'pip<23' setuptools -RUN set -eux; \ - du -h --max-depth=3 - ##################### # Install pillow-simd ##################### @@ -226,11 +223,11 @@ RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \ CUDA_VERSION_TAG=$(python${PYTHON_VERSION} -c "print('cu' + ''.join('${CUDA_VERSION}'.split('.')[:2]) if '${CUDA_VERSION}' else 'cpu')") && \ pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch_stable.html \ torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} \ - torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} \ + torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} ; \ else \ pip${PYTHON_VERSION} install --no-cache-dir --pre --index-url ${PYTORCH_NIGHTLY_URL} \ torch==${PYTORCH_VERSION}.${PYTORCH_NIGHTLY_VERSION} \ - torchvision==${TORCHVISION_VERSION}.${PYTORCH_NIGHTLY_VERSION} \ + torchvision==${TORCHVISION_VERSION}.${PYTORCH_NIGHTLY_VERSION} ; \ fi RUN set -eux; \ @@ -300,7 +297,7 @@ RUN if [ -n "$MOFED_VERSION" ] ; then \ wget -nv -P /tmp/mofed http://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64.tgz && \ tar -zxvf /tmp/mofed/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64.tgz -C /tmp/mofed && \ /tmp/mofed/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force && \ - rm -rf /tmp/mofed && ; \ + rm -rf /tmp/mofed ; \ fi RUN set -eux; \ @@ -322,7 +319,7 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ --global-option="--cuda_ext" \ --target /usr/local/lib/python${PYTHON_VERSION}/dist-packages \ ./ && \ - rm -rf /tmp/apex && ; \ + rm -rf /tmp/apex ; \ fi RUN set -eux; \ @@ -346,14 +343,18 @@ RUN set -eux; \ ############### # Install cmake ############### -RUN pip${PYTHON_VERSION} install --no-cache-dir cmake==3.26.3 && \ +RUN pip${PYTHON_VERSION} install --no-cache-dir cmake==3.26.3 + +RUN set -eux; \ du -h --max-depth=3 ########################### # Install Pandoc Dependency ########################### -RUN pip${PYTHON_VERSION} install --no-cache-dir pandoc==2.3 && \ - du -h --max-depth=3 / +RUN pip${PYTHON_VERSION} install --no-cache-dir pandoc==2.3 + +RUN set -eux; \ + du -h --max-depth=3 ################################ # Use the correct python version @@ -374,7 +375,9 @@ RUN mkdir -p ${COMPOSER_PYTHON_BIN} && \ # Include this folder, and the local bin folder, on the path echo "export PATH=~/.local/bin:$COMPOSER_PYTHON_BIN:$PATH" >> /etc/profile && \ echo "export PATH=~/.local/bin:$COMPOSER_PYTHON_BIN:$PATH" >> /etc/bash.bashrc && \ - echo "export PATH=~/.local/bin:$COMPOSER_PYTHON_BIN:$PATH" >> /etc/zshenv && \ + echo "export PATH=~/.local/bin:$COMPOSER_PYTHON_BIN:$PATH" >> /etc/zshenv + +RUN set -eux; \ du -h --max-depth=3 # Ensure that non-interactive shells load /etc/profile @@ -385,7 +388,7 @@ ENV BASH_ENV=/etc/profile ######################### RUN useradd -rm -d /home/mosaicml -s /bin/bash -u 1000 -U -s /bin/bash mosaicml && \ usermod -a -G sudo mosaicml && \ - echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers + echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers RUN set -eux; \ du -h --max-depth=3 @@ -410,7 +413,7 @@ RUN pip install --no-cache-dir --upgrade \ ipython${IPYTHON_VERSION} \ urllib3${URLLIB3_VERSION} \ python-snappy - + RUN set -eux; \ du -h --max-depth=3 @@ -434,4 +437,4 @@ ARG DEBIAN_FRONTEND=noninteractive ARG COMPOSER_INSTALL_COMMAND -RUN pip install "${COMPOSER_INSTALL_COMMAND}" +RUN pip install "${COMPOSER_INSTALL_COMMAND}" \ No newline at end of file From 573ed705d3218382ab1800c03e8391f6ca1744d8 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 12:56:52 -0800 Subject: [PATCH 30/70] inspect root --- docker/Dockerfile | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 7215d7f464..53e473519f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -99,7 +99,7 @@ RUN if [ -n "$CUDA_VERSION" ] ; then \ fi RUN set -eux; \ - du -h --max-depth=3 + du -h --max-depth=3 / RUN apt-get update && \ apt-get install -y --no-install-recommends \ @@ -135,7 +135,7 @@ RUN apt-get update && \ rm -rf /var/lib/apt/lists/* RUN set -eux; \ - du -h --max-depth=3 + du -h --max-depth=3 / ############################### # Install latest version of git @@ -148,7 +148,7 @@ RUN add-apt-repository ppa:git-core/ppa && \ rm -rf /var/lib/apt/lists/* RUN set -eux; \ - du -h --max-depth=3 + du -h --max-depth=3 / ############################## # Install NodeJS (for Pyright) @@ -161,7 +161,7 @@ RUN \ rm -rf /var/lib/apt/lists/* RUN set -eux; \ - du -h --max-depth=3 + du -h --max-depth=3 / ################ # Install Python @@ -184,7 +184,7 @@ RUN add-apt-repository ppa:deadsnakes/ppa && \ rm -rf /var/lib/apt/lists/* RUN set -eux; \ - du -h --max-depth=3 + du -h --max-depth=3 / RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} - && \ pip${PYTHON_VERSION} install --no-cache-dir --upgrade 'pip<23' setuptools @@ -204,7 +204,7 @@ RUN pip${PYTHON_VERSION} install --no-cache-dir --upgrade /tmp/pillow_stub && \ rm -rf /tmp/pillow_stub RUN set -eux; \ - du -h --max-depth=3 + du -h --max-depth=3 / ################# # Install Pytorch @@ -231,7 +231,7 @@ RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \ fi RUN set -eux; \ - du -h --max-depth=3 + du -h --max-depth=3 / ##################################### # Install EFA and AWS-OFI-NCCL plugin @@ -255,7 +255,7 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ fi RUN set -eux; \ - du -h --max-depth=3 + du -h --max-depth=3 / RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ cd /tmp && \ @@ -268,7 +268,7 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ fi RUN set -eux; \ - du -h --max-depth=3 + du -h --max-depth=3 / RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl && \ @@ -284,7 +284,7 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ fi RUN set -eux; \ - du -h --max-depth=3 + du -h --max-depth=3 / ################################### # Mellanox OFED driver installation @@ -301,7 +301,7 @@ RUN if [ -n "$MOFED_VERSION" ] ; then \ fi RUN set -eux; \ - du -h --max-depth=3 + du -h --max-depth=3 / ##################### # Install NVIDIA Apex @@ -323,7 +323,7 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ fi RUN set -eux; \ - du -h --max-depth=3 + du -h --max-depth=3 / ########################## # Install Flash Attention @@ -338,7 +338,7 @@ RUN if [ -n "$CUDA_VERSION" ] ; then \ fi RUN set -eux; \ - du -h --max-depth=3 + du -h --max-depth=3 / ############### # Install cmake @@ -346,7 +346,7 @@ RUN set -eux; \ RUN pip${PYTHON_VERSION} install --no-cache-dir cmake==3.26.3 RUN set -eux; \ - du -h --max-depth=3 + du -h --max-depth=3 / ########################### # Install Pandoc Dependency @@ -354,7 +354,7 @@ RUN set -eux; \ RUN pip${PYTHON_VERSION} install --no-cache-dir pandoc==2.3 RUN set -eux; \ - du -h --max-depth=3 + du -h --max-depth=3 / ################################ # Use the correct python version @@ -378,7 +378,7 @@ RUN mkdir -p ${COMPOSER_PYTHON_BIN} && \ echo "export PATH=~/.local/bin:$COMPOSER_PYTHON_BIN:$PATH" >> /etc/zshenv RUN set -eux; \ - du -h --max-depth=3 + du -h --max-depth=3 / # Ensure that non-interactive shells load /etc/profile ENV BASH_ENV=/etc/profile @@ -391,7 +391,7 @@ RUN useradd -rm -d /home/mosaicml -s /bin/bash -u 1000 -U -s /bin/bash mosaicml echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers RUN set -eux; \ - du -h --max-depth=3 + du -h --max-depth=3 / ######################### # Upgrade apt packages @@ -403,7 +403,7 @@ RUN apt-get update && \ rm -rf /var/lib/apt/lists/* RUN set -eux; \ - du -h --max-depth=3 + du -h --max-depth=3 / ######################### # Upgrade pip packages @@ -415,7 +415,7 @@ RUN pip install --no-cache-dir --upgrade \ python-snappy RUN set -eux; \ - du -h --max-depth=3 + du -h --max-depth=3 / ################################################## # Override NVIDIA mistaken env var for 11.8 images From 234bed037a4f10ca39c62f9df1e9133ada2ce4ed Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 12:59:02 -0800 Subject: [PATCH 31/70] depth 1 --- docker/Dockerfile | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 53e473519f..43b8f45b99 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -99,7 +99,7 @@ RUN if [ -n "$CUDA_VERSION" ] ; then \ fi RUN set -eux; \ - du -h --max-depth=3 / + du -h --max-depth=1 / RUN apt-get update && \ apt-get install -y --no-install-recommends \ @@ -135,7 +135,7 @@ RUN apt-get update && \ rm -rf /var/lib/apt/lists/* RUN set -eux; \ - du -h --max-depth=3 / + du -h --max-depth=1 / ############################### # Install latest version of git @@ -148,7 +148,7 @@ RUN add-apt-repository ppa:git-core/ppa && \ rm -rf /var/lib/apt/lists/* RUN set -eux; \ - du -h --max-depth=3 / + du -h --max-depth=1 / ############################## # Install NodeJS (for Pyright) @@ -161,7 +161,7 @@ RUN \ rm -rf /var/lib/apt/lists/* RUN set -eux; \ - du -h --max-depth=3 / + du -h --max-depth=1 / ################ # Install Python @@ -184,7 +184,7 @@ RUN add-apt-repository ppa:deadsnakes/ppa && \ rm -rf /var/lib/apt/lists/* RUN set -eux; \ - du -h --max-depth=3 / + du -h --max-depth=1 / RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} - && \ pip${PYTHON_VERSION} install --no-cache-dir --upgrade 'pip<23' setuptools @@ -204,7 +204,7 @@ RUN pip${PYTHON_VERSION} install --no-cache-dir --upgrade /tmp/pillow_stub && \ rm -rf /tmp/pillow_stub RUN set -eux; \ - du -h --max-depth=3 / + du -h --max-depth=1 / ################# # Install Pytorch @@ -231,7 +231,7 @@ RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \ fi RUN set -eux; \ - du -h --max-depth=3 / + du -h --max-depth=1 / ##################################### # Install EFA and AWS-OFI-NCCL plugin @@ -255,7 +255,7 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ fi RUN set -eux; \ - du -h --max-depth=3 / + du -h --max-depth=1 / RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ cd /tmp && \ @@ -268,7 +268,7 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ fi RUN set -eux; \ - du -h --max-depth=3 / + du -h --max-depth=1 / RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl && \ @@ -284,7 +284,7 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ fi RUN set -eux; \ - du -h --max-depth=3 / + du -h --max-depth=1 / ################################### # Mellanox OFED driver installation @@ -301,7 +301,7 @@ RUN if [ -n "$MOFED_VERSION" ] ; then \ fi RUN set -eux; \ - du -h --max-depth=3 / + du -h --max-depth=1 / ##################### # Install NVIDIA Apex @@ -323,7 +323,7 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ fi RUN set -eux; \ - du -h --max-depth=3 / + du -h --max-depth=1 / ########################## # Install Flash Attention @@ -338,7 +338,7 @@ RUN if [ -n "$CUDA_VERSION" ] ; then \ fi RUN set -eux; \ - du -h --max-depth=3 / + du -h --max-depth=1 / ############### # Install cmake @@ -346,7 +346,7 @@ RUN set -eux; \ RUN pip${PYTHON_VERSION} install --no-cache-dir cmake==3.26.3 RUN set -eux; \ - du -h --max-depth=3 / + du -h --max-depth=1 / ########################### # Install Pandoc Dependency @@ -354,7 +354,7 @@ RUN set -eux; \ RUN pip${PYTHON_VERSION} install --no-cache-dir pandoc==2.3 RUN set -eux; \ - du -h --max-depth=3 / + du -h --max-depth=1 / ################################ # Use the correct python version @@ -378,7 +378,7 @@ RUN mkdir -p ${COMPOSER_PYTHON_BIN} && \ echo "export PATH=~/.local/bin:$COMPOSER_PYTHON_BIN:$PATH" >> /etc/zshenv RUN set -eux; \ - du -h --max-depth=3 / + du -h --max-depth=1 / # Ensure that non-interactive shells load /etc/profile ENV BASH_ENV=/etc/profile @@ -391,7 +391,7 @@ RUN useradd -rm -d /home/mosaicml -s /bin/bash -u 1000 -U -s /bin/bash mosaicml echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers RUN set -eux; \ - du -h --max-depth=3 / + du -h --max-depth=1 / ######################### # Upgrade apt packages @@ -403,7 +403,7 @@ RUN apt-get update && \ rm -rf /var/lib/apt/lists/* RUN set -eux; \ - du -h --max-depth=3 / + du -h --max-depth=1 / ######################### # Upgrade pip packages @@ -415,7 +415,7 @@ RUN pip install --no-cache-dir --upgrade \ python-snappy RUN set -eux; \ - du -h --max-depth=3 / + du -h --max-depth=1 / ################################################## # Override NVIDIA mistaken env var for 11.8 images From c5dcfa5faeeb26abbf3a816580df39b1383d53b2 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 13:03:11 -0800 Subject: [PATCH 32/70] debug --- docker/Dockerfile | 57 ++++++++++++++++------------------------------- 1 file changed, 19 insertions(+), 38 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 43b8f45b99..4d35b6f42f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -98,8 +98,7 @@ RUN if [ -n "$CUDA_VERSION" ] ; then \ rm -rf /tmp/cuda-keyring ; \ fi -RUN set -eux; \ - du -h --max-depth=1 / +RUN du -h --max-depth=1 / RUN apt-get update && \ apt-get install -y --no-install-recommends \ @@ -134,8 +133,7 @@ RUN apt-get update && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN set -eux; \ - du -h --max-depth=1 / +RUN du -h --max-depth=1 / ############################### # Install latest version of git @@ -147,8 +145,7 @@ RUN add-apt-repository ppa:git-core/ppa && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN set -eux; \ - du -h --max-depth=1 / +RUN du -h --max-depth=1 / ############################## # Install NodeJS (for Pyright) @@ -160,8 +157,7 @@ RUN \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN set -eux; \ - du -h --max-depth=1 / +RUN du -h --max-depth=1 / ################ # Install Python @@ -183,8 +179,7 @@ RUN add-apt-repository ppa:deadsnakes/ppa && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN set -eux; \ - du -h --max-depth=1 / +RUN du -h --max-depth=1 / RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} - && \ pip${PYTHON_VERSION} install --no-cache-dir --upgrade 'pip<23' setuptools @@ -203,8 +198,7 @@ RUN pip${PYTHON_VERSION} install --no-cache-dir --upgrade /tmp/pillow_stub && \ pip${PYTHON_VERSION} install --no-cache-dir --upgrade pillow_simd==${PILLOW_SIMD_VERSION} && \ rm -rf /tmp/pillow_stub -RUN set -eux; \ - du -h --max-depth=1 / +RUN du -h --max-depth=1 / ################# # Install Pytorch @@ -230,8 +224,7 @@ RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \ torchvision==${TORCHVISION_VERSION}.${PYTORCH_NIGHTLY_VERSION} ; \ fi -RUN set -eux; \ - du -h --max-depth=1 / +RUN du -h --max-depth=1 / ##################################### # Install EFA and AWS-OFI-NCCL plugin @@ -254,8 +247,7 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ rm -rf /var/lib/apt/lists/* ; \ fi -RUN set -eux; \ - du -h --max-depth=1 / +RUN du -h --max-depth=1 / RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ cd /tmp && \ @@ -267,8 +259,7 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ rm -rf /tmp/aws-efa-installer* ; \ fi -RUN set -eux; \ - du -h --max-depth=1 / +RUN du -h --max-depth=1 / RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl && \ @@ -283,8 +274,7 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ make && make install ; \ fi -RUN set -eux; \ - du -h --max-depth=1 / +RUN du -h --max-depth=1 / ################################### # Mellanox OFED driver installation @@ -300,8 +290,7 @@ RUN if [ -n "$MOFED_VERSION" ] ; then \ rm -rf /tmp/mofed ; \ fi -RUN set -eux; \ - du -h --max-depth=1 / +RUN du -h --max-depth=1 / ##################### # Install NVIDIA Apex @@ -322,8 +311,7 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ rm -rf /tmp/apex ; \ fi -RUN set -eux; \ - du -h --max-depth=1 / +RUN du -h --max-depth=1 / ########################## # Install Flash Attention @@ -337,24 +325,21 @@ RUN if [ -n "$CUDA_VERSION" ] ; then \ cd .. ; \ fi -RUN set -eux; \ - du -h --max-depth=1 / +RUN du -h --max-depth=1 / ############### # Install cmake ############### RUN pip${PYTHON_VERSION} install --no-cache-dir cmake==3.26.3 -RUN set -eux; \ - du -h --max-depth=1 / +RUN du -h --max-depth=1 / ########################### # Install Pandoc Dependency ########################### RUN pip${PYTHON_VERSION} install --no-cache-dir pandoc==2.3 -RUN set -eux; \ - du -h --max-depth=1 / +RUN du -h --max-depth=1 / ################################ # Use the correct python version @@ -377,8 +362,7 @@ RUN mkdir -p ${COMPOSER_PYTHON_BIN} && \ echo "export PATH=~/.local/bin:$COMPOSER_PYTHON_BIN:$PATH" >> /etc/bash.bashrc && \ echo "export PATH=~/.local/bin:$COMPOSER_PYTHON_BIN:$PATH" >> /etc/zshenv -RUN set -eux; \ - du -h --max-depth=1 / +RUN du -h --max-depth=1 / # Ensure that non-interactive shells load /etc/profile ENV BASH_ENV=/etc/profile @@ -390,8 +374,7 @@ RUN useradd -rm -d /home/mosaicml -s /bin/bash -u 1000 -U -s /bin/bash mosaicml usermod -a -G sudo mosaicml && \ echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers -RUN set -eux; \ - du -h --max-depth=1 / +RUN du -h --max-depth=1 / ######################### # Upgrade apt packages @@ -402,8 +385,7 @@ RUN apt-get update && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN set -eux; \ - du -h --max-depth=1 / +RUN du -h --max-depth=1 / ######################### # Upgrade pip packages @@ -414,8 +396,7 @@ RUN pip install --no-cache-dir --upgrade \ urllib3${URLLIB3_VERSION} \ python-snappy -RUN set -eux; \ - du -h --max-depth=1 / +RUN du -h --max-depth=1 / ################################################## # Override NVIDIA mistaken env var for 11.8 images From b15467635079b7b839411ca299acc423e724e991 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 13:07:25 -0800 Subject: [PATCH 33/70] remove sys and proc from du --- docker/Dockerfile | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 4d35b6f42f..6f6817888d 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -98,7 +98,7 @@ RUN if [ -n "$CUDA_VERSION" ] ; then \ rm -rf /tmp/cuda-keyring ; \ fi -RUN du -h --max-depth=1 / +RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys RUN apt-get update && \ apt-get install -y --no-install-recommends \ @@ -133,7 +133,7 @@ RUN apt-get update && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN du -h --max-depth=1 / +RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys ############################### # Install latest version of git @@ -145,7 +145,7 @@ RUN add-apt-repository ppa:git-core/ppa && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN du -h --max-depth=1 / +RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys ############################## # Install NodeJS (for Pyright) @@ -157,7 +157,7 @@ RUN \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN du -h --max-depth=1 / +RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys ################ # Install Python @@ -179,7 +179,7 @@ RUN add-apt-repository ppa:deadsnakes/ppa && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN du -h --max-depth=1 / +RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} - && \ pip${PYTHON_VERSION} install --no-cache-dir --upgrade 'pip<23' setuptools @@ -198,7 +198,7 @@ RUN pip${PYTHON_VERSION} install --no-cache-dir --upgrade /tmp/pillow_stub && \ pip${PYTHON_VERSION} install --no-cache-dir --upgrade pillow_simd==${PILLOW_SIMD_VERSION} && \ rm -rf /tmp/pillow_stub -RUN du -h --max-depth=1 / +RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys ################# # Install Pytorch @@ -224,7 +224,7 @@ RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \ torchvision==${TORCHVISION_VERSION}.${PYTORCH_NIGHTLY_VERSION} ; \ fi -RUN du -h --max-depth=1 / +RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys ##################################### # Install EFA and AWS-OFI-NCCL plugin @@ -247,7 +247,7 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ rm -rf /var/lib/apt/lists/* ; \ fi -RUN du -h --max-depth=1 / +RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ cd /tmp && \ @@ -259,7 +259,7 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ rm -rf /tmp/aws-efa-installer* ; \ fi -RUN du -h --max-depth=1 / +RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl && \ @@ -274,7 +274,7 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ make && make install ; \ fi -RUN du -h --max-depth=1 / +RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys ################################### # Mellanox OFED driver installation @@ -290,7 +290,7 @@ RUN if [ -n "$MOFED_VERSION" ] ; then \ rm -rf /tmp/mofed ; \ fi -RUN du -h --max-depth=1 / +RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys ##################### # Install NVIDIA Apex @@ -311,7 +311,7 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ rm -rf /tmp/apex ; \ fi -RUN du -h --max-depth=1 / +RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys ########################## # Install Flash Attention @@ -325,21 +325,21 @@ RUN if [ -n "$CUDA_VERSION" ] ; then \ cd .. ; \ fi -RUN du -h --max-depth=1 / +RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys ############### # Install cmake ############### RUN pip${PYTHON_VERSION} install --no-cache-dir cmake==3.26.3 -RUN du -h --max-depth=1 / +RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys ########################### # Install Pandoc Dependency ########################### RUN pip${PYTHON_VERSION} install --no-cache-dir pandoc==2.3 -RUN du -h --max-depth=1 / +RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys ################################ # Use the correct python version @@ -362,7 +362,7 @@ RUN mkdir -p ${COMPOSER_PYTHON_BIN} && \ echo "export PATH=~/.local/bin:$COMPOSER_PYTHON_BIN:$PATH" >> /etc/bash.bashrc && \ echo "export PATH=~/.local/bin:$COMPOSER_PYTHON_BIN:$PATH" >> /etc/zshenv -RUN du -h --max-depth=1 / +RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys # Ensure that non-interactive shells load /etc/profile ENV BASH_ENV=/etc/profile @@ -374,7 +374,7 @@ RUN useradd -rm -d /home/mosaicml -s /bin/bash -u 1000 -U -s /bin/bash mosaicml usermod -a -G sudo mosaicml && \ echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers -RUN du -h --max-depth=1 / +RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys ######################### # Upgrade apt packages @@ -385,7 +385,7 @@ RUN apt-get update && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN du -h --max-depth=1 / +RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys ######################### # Upgrade pip packages @@ -396,7 +396,7 @@ RUN pip install --no-cache-dir --upgrade \ urllib3${URLLIB3_VERSION} \ python-snappy -RUN du -h --max-depth=1 / +RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys ################################################## # Override NVIDIA mistaken env var for 11.8 images From 8b91b3c962f58337739a8268da649e239913fc82 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 14:08:28 -0800 Subject: [PATCH 34/70] install fa2 through pip --- docker/Dockerfile | 49 +++-------------------------------------------- 1 file changed, 3 insertions(+), 46 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 6f6817888d..00ed41c9c0 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -98,8 +98,6 @@ RUN if [ -n "$CUDA_VERSION" ] ; then \ rm -rf /tmp/cuda-keyring ; \ fi -RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys - RUN apt-get update && \ apt-get install -y --no-install-recommends \ libgomp1 \ @@ -133,8 +131,6 @@ RUN apt-get update && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys - ############################### # Install latest version of git ############################### @@ -145,8 +141,6 @@ RUN add-apt-repository ppa:git-core/ppa && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys - ############################## # Install NodeJS (for Pyright) ############################## @@ -157,8 +151,6 @@ RUN \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys - ################ # Install Python ################ @@ -179,8 +171,6 @@ RUN add-apt-repository ppa:deadsnakes/ppa && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys - RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} - && \ pip${PYTHON_VERSION} install --no-cache-dir --upgrade 'pip<23' setuptools @@ -198,8 +188,6 @@ RUN pip${PYTHON_VERSION} install --no-cache-dir --upgrade /tmp/pillow_stub && \ pip${PYTHON_VERSION} install --no-cache-dir --upgrade pillow_simd==${PILLOW_SIMD_VERSION} && \ rm -rf /tmp/pillow_stub -RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys - ################# # Install Pytorch ################# @@ -224,8 +212,6 @@ RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \ torchvision==${TORCHVISION_VERSION}.${PYTORCH_NIGHTLY_VERSION} ; \ fi -RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys - ##################################### # Install EFA and AWS-OFI-NCCL plugin ##################################### @@ -247,8 +233,6 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ rm -rf /var/lib/apt/lists/* ; \ fi -RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys - RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ cd /tmp && \ curl -OsS https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \ @@ -259,8 +243,6 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ rm -rf /tmp/aws-efa-installer* ; \ fi -RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys - RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl && \ cd /opt/aws-ofi-nccl && \ @@ -274,8 +256,6 @@ RUN if [ -n "$AWS_OFI_NCCL_VERSION" ] ; then \ make && make install ; \ fi -RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys - ################################### # Mellanox OFED driver installation ################################### @@ -290,8 +270,7 @@ RUN if [ -n "$MOFED_VERSION" ] ; then \ rm -rf /tmp/mofed ; \ fi -RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys - + ##################### # Install NVIDIA Apex ##################### @@ -311,36 +290,22 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ rm -rf /tmp/apex ; \ fi -RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys - + ########################## # Install Flash Attention ########################## -RUN if [ -n "$CUDA_VERSION" ] ; then \ - pip${PYTHON_VERSION} install --upgrade --no-cache-dir ninja==1.11.1 && \ - pip${PYTHON_VERSION} install --upgrade --no-cache-dir --force-reinstall packaging==22.0 && \ - git clone --branch v2.4.2 https://github.com/Dao-AILab/flash-attention.git && \ - cd flash-attention && \ - MAX_JOBS=1 python${PYTHON_VERSION} setup.py install && \ - cd .. ; \ - fi - -RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys +RUN pip install flash-attn==2.5.0 ############### # Install cmake ############### RUN pip${PYTHON_VERSION} install --no-cache-dir cmake==3.26.3 -RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys - ########################### # Install Pandoc Dependency ########################### RUN pip${PYTHON_VERSION} install --no-cache-dir pandoc==2.3 -RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys - ################################ # Use the correct python version ################################ @@ -362,8 +327,6 @@ RUN mkdir -p ${COMPOSER_PYTHON_BIN} && \ echo "export PATH=~/.local/bin:$COMPOSER_PYTHON_BIN:$PATH" >> /etc/bash.bashrc && \ echo "export PATH=~/.local/bin:$COMPOSER_PYTHON_BIN:$PATH" >> /etc/zshenv -RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys - # Ensure that non-interactive shells load /etc/profile ENV BASH_ENV=/etc/profile @@ -374,8 +337,6 @@ RUN useradd -rm -d /home/mosaicml -s /bin/bash -u 1000 -U -s /bin/bash mosaicml usermod -a -G sudo mosaicml && \ echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers -RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys - ######################### # Upgrade apt packages ######################### @@ -385,8 +346,6 @@ RUN apt-get update && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys - ######################### # Upgrade pip packages ######################### @@ -396,8 +355,6 @@ RUN pip install --no-cache-dir --upgrade \ urllib3${URLLIB3_VERSION} \ python-snappy -RUN du -h --max-depth=1 / --exclude=/proc --exclude=/sys - ################################################## # Override NVIDIA mistaken env var for 11.8 images ################################################## From 228c6df25b94f93396c49a10cbd306609eae3317 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 14:13:20 -0800 Subject: [PATCH 35/70] install dependancy --- docker/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 00ed41c9c0..f760690284 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -294,7 +294,8 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ ########################## # Install Flash Attention ########################## -RUN pip install flash-attn==2.5.0 +RUN pip install packaging==22.0 && \ + pip install flash-attn==2.5.0 ############### # Install cmake From cb121c29ae7f2941afc0e1566a9d7af15c050a4f Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 14:22:01 -0800 Subject: [PATCH 36/70] no build isolation --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index f760690284..dfa45171a3 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -295,7 +295,7 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ # Install Flash Attention ########################## RUN pip install packaging==22.0 && \ - pip install flash-attn==2.5.0 + pip install flash-attn==2.5.0 --no-build-isolation ############### # Install cmake From c9946eda6c2e648c0c1a4907329860d88c2350ff Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 14:30:22 -0800 Subject: [PATCH 37/70] setuptools --- docker/Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index dfa45171a3..b2c6c70a08 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -294,8 +294,9 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ ########################## # Install Flash Attention ########################## -RUN pip install packaging==22.0 && \ - pip install flash-attn==2.5.0 --no-build-isolation +RUN pip install --upgrade pip setuptools +RUN pip install packaging==22.0 +RUN pip install flash-attn==2.5.0 --no-build-isolation ############### # Install cmake From 4d9472529f52d847540b48607a26521ba507f933 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 14:46:26 -0800 Subject: [PATCH 38/70] downgrade to 2.3.6 --- docker/Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index b2c6c70a08..69b24ce55e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -294,9 +294,8 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ ########################## # Install Flash Attention ########################## -RUN pip install --upgrade pip setuptools RUN pip install packaging==22.0 -RUN pip install flash-attn==2.5.0 --no-build-isolation +RUN pip install flash-attn==2.3.6 --no-build-isolation ############### # Install cmake From 308d5a8d7bb48c20c63ccde23cb48baacae04af2 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 15:07:30 -0800 Subject: [PATCH 39/70] 2 workers --- docker/Dockerfile | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 69b24ce55e..48001b72d1 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -294,8 +294,14 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ ########################## # Install Flash Attention ########################## -RUN pip install packaging==22.0 -RUN pip install flash-attn==2.3.6 --no-build-isolation +RUN if [ -n "$CUDA_VERSION" ] ; then \ + pip${PYTHON_VERSION} install --upgrade --no-cache-dir ninja==1.11.1 && \ + pip${PYTHON_VERSION} install --upgrade --no-cache-dir --force-reinstall packaging==22.0 && \ + git clone --branch v2.4.2 https://github.com/Dao-AILab/flash-attention.git && \ + cd flash-attention && \ + MAX_JOBS=2 python${PYTHON_VERSION} setup.py install && \ + cd .. ; \ + fi ############### # Install cmake From c9c84aa8d19b00e00f0300e7aa46761f584b1148 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 15:34:34 -0800 Subject: [PATCH 40/70] revert --- docker/Dockerfile | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 48001b72d1..a775cee367 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -294,14 +294,9 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ ########################## # Install Flash Attention ########################## -RUN if [ -n "$CUDA_VERSION" ] ; then \ - pip${PYTHON_VERSION} install --upgrade --no-cache-dir ninja==1.11.1 && \ - pip${PYTHON_VERSION} install --upgrade --no-cache-dir --force-reinstall packaging==22.0 && \ - git clone --branch v2.4.2 https://github.com/Dao-AILab/flash-attention.git && \ - cd flash-attention && \ - MAX_JOBS=2 python${PYTHON_VERSION} setup.py install && \ - cd .. ; \ - fi +RUN pip install --update setuptools && \ + pip install packaging==22.0 && \ + pip flash-attn==2.3.6 ############### # Install cmake From e11718eb55eeb15766f6ad4731420dff802a22e0 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 15:37:59 -0800 Subject: [PATCH 41/70] flash 1.0.9 --- docker/Dockerfile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index a775cee367..920d1e6cd6 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -294,9 +294,11 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ ########################## # Install Flash Attention ########################## -RUN pip install --update setuptools && \ - pip install packaging==22.0 && \ - pip flash-attn==2.3.6 +RUN if [ -n "$CUDA_VERSION" ] ; then \ + pip${PYTHON_VERSION} install --upgrade --no-cache-dir ninja==1.11.1 && \ + pip${PYTHON_VERSION} install --upgrade --no-cache-dir --force-reinstall packaging==22.0 && \ + pip${PYTHON_VERSION} install --no-cache-dir flash-attn==1.0.9; \ + fi ############### # Install cmake From a6887f40dfb65c8ae6f7a4ae1d41fe2e3b1122bc Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 15:41:26 -0800 Subject: [PATCH 42/70] flash 2.3.6 --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 920d1e6cd6..f7351e8f6d 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -297,7 +297,7 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ RUN if [ -n "$CUDA_VERSION" ] ; then \ pip${PYTHON_VERSION} install --upgrade --no-cache-dir ninja==1.11.1 && \ pip${PYTHON_VERSION} install --upgrade --no-cache-dir --force-reinstall packaging==22.0 && \ - pip${PYTHON_VERSION} install --no-cache-dir flash-attn==1.0.9; \ + pip${PYTHON_VERSION} install --no-cache-dir flash-attn==2.3.6; \ fi ############### From 5a27e42b26c62891bb6dc689ffb0fc3df6b1a8ee Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 15:51:23 -0800 Subject: [PATCH 43/70] lint --- .github/workflows/pr-docker.yaml | 1 + docker/Dockerfile | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pr-docker.yaml b/.github/workflows/pr-docker.yaml index 52e009e68f..19922cdcfb 100644 --- a/.github/workflows/pr-docker.yaml +++ b/.github/workflows/pr-docker.yaml @@ -38,6 +38,7 @@ jobs: BUILD_MATRIX=$(python .github/bin/gen_docker_matrix.py docker/build_matrix.yaml -b COMPOSER_INSTALL_COMMAND=$COMPOSER_INSTALL_COMMAND) echo $BUILD_MATRIX >> $GITHUB_OUTPUT stage-docker-build: + timeout-minutes: 1440 needs: build-image-matrix uses: ./.github/workflows/docker-configure-build-push.yaml strategy: diff --git a/docker/Dockerfile b/docker/Dockerfile index f7351e8f6d..0d1100590e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -270,7 +270,7 @@ RUN if [ -n "$MOFED_VERSION" ] ; then \ rm -rf /tmp/mofed ; \ fi - + ##################### # Install NVIDIA Apex ##################### @@ -290,7 +290,7 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ rm -rf /tmp/apex ; \ fi - + ########################## # Install Flash Attention ########################## @@ -379,4 +379,4 @@ ARG DEBIAN_FRONTEND=noninteractive ARG COMPOSER_INSTALL_COMMAND -RUN pip install "${COMPOSER_INSTALL_COMMAND}" \ No newline at end of file +RUN pip install "${COMPOSER_INSTALL_COMMAND}" From 38dd794c46284c8e1f9ebea62a88e8c10448d816 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 23 Jan 2024 16:06:35 -0800 Subject: [PATCH 44/70] fa 2.5.0 --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 0d1100590e..15a393cd12 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -297,7 +297,7 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ RUN if [ -n "$CUDA_VERSION" ] ; then \ pip${PYTHON_VERSION} install --upgrade --no-cache-dir ninja==1.11.1 && \ pip${PYTHON_VERSION} install --upgrade --no-cache-dir --force-reinstall packaging==22.0 && \ - pip${PYTHON_VERSION} install --no-cache-dir flash-attn==2.3.6; \ + pip${PYTHON_VERSION} install --no-cache-dir flash-attn==2.5.0; \ fi ############### From 1b16dddae4e9ba9ef7444c051513ce671b30e9e3 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 24 Jan 2024 10:34:01 -0800 Subject: [PATCH 45/70] nightly 3.11 --- docker/generate_build_matrix.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index fd2f193c04..1b889e3f73 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -233,7 +233,7 @@ def _main(): entry['AWS_OFI_NCCL_VERSION'] = 'v1.7.4-aws' pytorch_entries.append(entry) - nightly_entry = { + nightly_entry_310 = { 'AWS_OFI_NCCL_VERSION': '', 'BASE_IMAGE': 'nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04', 'CUDA_VERSION': '12.1.0', @@ -248,7 +248,25 @@ def _main(): 'TARGET': 'pytorch_stage', 'TORCHVISION_VERSION': '0.18.0' } - pytorch_entries.append(nightly_entry) + pytorch_entries.append(nightly_entry_10) + + nightly_entry_311 = { + 'AWS_OFI_NCCL_VERSION': '', + 'BASE_IMAGE': 'nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04', + 'CUDA_VERSION': '12.1.0', + 'IMAGE_NAME': 'torch-nightly-2-3-0-20240110-cu121', + 'MOFED_VERSION': '5.5-1.0.3.2', + 'NVIDIA_REQUIRE_CUDA_OVERRIDE': _get_cuda_override('12.1.0'), + 'PYTHON_VERSION': '3.11', + 'PYTORCH_VERSION': '2.3.0', + 'PYTORCH_NIGHTLY_URL': 'https://download.pytorch.org/whl/nightly/cu121', + 'PYTORCH_NIGHTLY_VERSION': 'dev20240110+cu121', + 'TAGS': ['mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04'], + 'TARGET': 'pytorch_stage', + 'TORCHVISION_VERSION': '0.18.0' + } + pytorch_entries.append(nightly_entry_11) + composer_entries = [] # The `GIT_COMMIT` is a placeholder and Jenkins will substitute it with the actual git commit for the `composer_staging` images From eba1c94cc7c131cf728e5ab86391e4d936d455ad Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 24 Jan 2024 10:42:50 -0800 Subject: [PATCH 46/70] type --- docker/README.md | 1 + docker/build_matrix.yaml | 27 +++++++++++++++++++++++++++ docker/generate_build_matrix.py | 4 ++-- 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/docker/README.md b/docker/README.md index a632c1681e..4d30025425 100644 --- a/docker/README.md +++ b/docker/README.md @@ -31,6 +31,7 @@ To install composer, once inside the image, run `pip install mosaicml`. | Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags | |----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------| +| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.1.2_cu121-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.1.2_cu121-python3.11-ubuntu20.04-aws` | diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 1a7f4ac65b..bfa12e0090 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -199,6 +199,33 @@ - mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.18.0 +- AWS_OFI_NCCL_VERSION: '' + BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 + CUDA_VERSION: 12.1.0 + IMAGE_NAME: torch-nightly-2-3-0-20240110-cu121 + MOFED_VERSION: 5.5-1.0.3.2 + NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 + brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 + brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 + brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 + brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 + brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 + brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 + brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 + brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 + brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 + brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526 + brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 + brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 + brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 + PYTHON_VERSION: '3.11' + PYTORCH_NIGHTLY_URL: https://download.pytorch.org/whl/nightly/cu121 + PYTORCH_NIGHTLY_VERSION: dev20240110+cu121 + PYTORCH_VERSION: 2.3.0 + TAGS: + - mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04 + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.18.0 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.18.0 diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index 1b889e3f73..7662ea11af 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -248,7 +248,7 @@ def _main(): 'TARGET': 'pytorch_stage', 'TORCHVISION_VERSION': '0.18.0' } - pytorch_entries.append(nightly_entry_10) + pytorch_entries.append(nightly_entry_310) nightly_entry_311 = { 'AWS_OFI_NCCL_VERSION': '', @@ -265,7 +265,7 @@ def _main(): 'TARGET': 'pytorch_stage', 'TORCHVISION_VERSION': '0.18.0' } - pytorch_entries.append(nightly_entry_11) + pytorch_entries.append(nightly_entry_311) composer_entries = [] From c402a68f21f2aa9c0d2530d0ef247710c3250f2b Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 24 Jan 2024 11:03:04 -0800 Subject: [PATCH 47/70] remove python 3.11 and torch 2.1.2 --- docker/README.md | 23 +++++------ docker/build_matrix.yaml | 71 --------------------------------- docker/generate_build_matrix.py | 2 +- 3 files changed, 11 insertions(+), 85 deletions(-) diff --git a/docker/README.md b/docker/README.md index 4d30025425..f546eac9d9 100644 --- a/docker/README.md +++ b/docker/README.md @@ -29,19 +29,16 @@ The base flavor contains PyTorch pre-installed; the vision flavor also includes To install composer, once inside the image, run `pip install mosaicml`. -| Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags | -|----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------| -| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.1.2_cu121-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.1.2_cu121-python3.11-ubuntu20.04-aws` | -| Ubuntu 20.04 | Base | 2.1.2 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.1.2_cpu-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.1.2 | cpu | 3.10 | `mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.0.1 | cpu | 3.10 | `mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 1.13.1 | 11.7.1 (Infiniband) | 3.10 | `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 1.13.1 | cpu | 3.10 | `mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04` | +| Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags | +|----------------|----------|-------------------|---------------------|------------------|-----------------------------------------------------------------------| +| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.1.2 | cpu | 3.10 | `mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.0.1 | cpu | 3.10 | `mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 1.13.1 | 11.7.1 (Infiniband) | 3.10 | `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 1.13.1 | cpu | 3.10 | `mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04` | **Note**: The `mosaicml/pytorch:latest`, `mosaicml/pytorch:latest_cpu`, and `mosaicml/pytorch:latest-aws` diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index bfa12e0090..ec6160db75 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -101,77 +101,6 @@ - mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.16.2 -- AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 12.1.0 - IMAGE_NAME: torch-2-1-2-cu121 - MOFED_VERSION: 5.5-1.0.3.2 - NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 - brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 - brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 - brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 - brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 - brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 - brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 - brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 - brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 - brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 - brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526 - brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 - brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 - brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 - PYTHON_VERSION: '3.11' - PYTORCH_NIGHTLY_URL: '' - PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.1.2 - TAGS: - - mosaicml/pytorch:2.1.2_cu121-python3.11-ubuntu20.04 - - mosaicml/pytorch:latest - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.16.2 -- AWS_OFI_NCCL_VERSION: v1.7.4-aws - BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 12.1.0 - IMAGE_NAME: torch-2-1-2-cu121-aws - MOFED_VERSION: '' - NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 - brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 - brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 - brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 - brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 - brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 - brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 - brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 - brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 - brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 - brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526 - brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 - brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 - brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 - PYTHON_VERSION: '3.11' - PYTORCH_NIGHTLY_URL: '' - PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.1.2 - TAGS: - - mosaicml/pytorch:2.1.2_cu121-python3.11-ubuntu20.04-aws - - mosaicml/pytorch:latest-aws - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.16.2 -- AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: ubuntu:20.04 - CUDA_VERSION: '' - IMAGE_NAME: torch-2-1-2-cpu - MOFED_VERSION: '' - NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.11' - PYTORCH_NIGHTLY_URL: '' - PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.1.2 - TAGS: - - mosaicml/pytorch:2.1.2_cpu-python3.11-ubuntu20.04 - - mosaicml/pytorch:latest_cpu - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.16.2 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.0 diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index 7662ea11af..04ca56cbe4 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -36,7 +36,7 @@ def _get_pytorch_versions(python_version: str): if python_version == '3.10': return ['1.13.1', '2.0.1', '2.1.2'] if python_version == '3.11': - return ['2.1.2'] + return [] raise ValueError(f'Invalid python_version: {python_version}') From cedb433ae86b0dfc441363b3338a175286c290d6 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 24 Jan 2024 13:20:09 -0800 Subject: [PATCH 48/70] remove timeout --- .github/workflows/pr-docker.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/pr-docker.yaml b/.github/workflows/pr-docker.yaml index 19922cdcfb..52e009e68f 100644 --- a/.github/workflows/pr-docker.yaml +++ b/.github/workflows/pr-docker.yaml @@ -38,7 +38,6 @@ jobs: BUILD_MATRIX=$(python .github/bin/gen_docker_matrix.py docker/build_matrix.yaml -b COMPOSER_INSTALL_COMMAND=$COMPOSER_INSTALL_COMMAND) echo $BUILD_MATRIX >> $GITHUB_OUTPUT stage-docker-build: - timeout-minutes: 1440 needs: build-image-matrix uses: ./.github/workflows/docker-configure-build-push.yaml strategy: From 5964c197adef70033b45439c6ba3bcf578649677 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 24 Jan 2024 13:25:17 -0800 Subject: [PATCH 49/70] reset latest version --- docker/generate_build_matrix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index 04ca56cbe4..3fff12d9d0 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -18,7 +18,7 @@ import tabulate import yaml -LATEST_PYTHON_VERSION = '3.11' +LATEST_PYTHON_VERSION = '3.10' PRODUCTION_PYTORCH_VERSION = '2.1.2' From 438cfb8291d67ddf9a494a27b65fc2668a9eb810 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 24 Jan 2024 13:30:40 -0800 Subject: [PATCH 50/70] smoke test update --- .github/workflows/smoketest.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/smoketest.yaml b/.github/workflows/smoketest.yaml index 00121f935b..901ac30fe6 100644 --- a/.github/workflows/smoketest.yaml +++ b/.github/workflows/smoketest.yaml @@ -25,6 +25,7 @@ jobs: - "3.8" - "3.9" - "3.10" + - "3.11" steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 From 5b880ad1caef1e3b04dc33a3ca4734086c3eb70c Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 24 Jan 2024 13:31:30 -0800 Subject: [PATCH 51/70] lint --- docker/Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 15a393cd12..f92e307e1a 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -290,7 +290,6 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ rm -rf /tmp/apex ; \ fi - ########################## # Install Flash Attention ########################## From d9c3550e64c12e63d662d273677fdb24d7c0d9c5 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 24 Jan 2024 13:32:41 -0800 Subject: [PATCH 52/70] update yaml --- docker/README.md | 23 ++++++++------ docker/build_matrix.yaml | 67 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 78 insertions(+), 12 deletions(-) diff --git a/docker/README.md b/docker/README.md index f546eac9d9..7c6fc08856 100644 --- a/docker/README.md +++ b/docker/README.md @@ -29,16 +29,19 @@ The base flavor contains PyTorch pre-installed; the vision flavor also includes To install composer, once inside the image, run `pip install mosaicml`. -| Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags | -|----------------|----------|-------------------|---------------------|------------------|-----------------------------------------------------------------------| -| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.1.2 | cpu | 3.10 | `mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.0.1 | cpu | 3.10 | `mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 1.13.1 | 11.7.1 (Infiniband) | 3.10 | `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04` | -| Ubuntu 20.04 | Base | 1.13.1 | cpu | 3.10 | `mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04` | +| Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags | +|----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------| +| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (EFA) | 3.10 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws` | +| Ubuntu 20.04 | Base | 2.1.2 | cpu | 3.10 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.0.1 | 11.8.0 (EFA) | 3.10 | `mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04-aws` | +| Ubuntu 20.04 | Base | 2.0.1 | cpu | 3.10 | `mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 1.13.1 | 11.7.1 (Infiniband) | 3.10 | `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 1.13.1 | 11.7.1 (EFA) | 3.10 | `mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04-aws` | +| Ubuntu 20.04 | Base | 1.13.1 | cpu | 3.10 | `mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04` | **Note**: The `mosaicml/pytorch:latest`, `mosaicml/pytorch:latest_cpu`, and `mosaicml/pytorch:latest-aws` diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index ec6160db75..fad6635938 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -57,6 +57,68 @@ PYTORCH_VERSION: 2.1.2 TAGS: - mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 + - mosaicml/pytorch:latest + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.16.2 +- AWS_OFI_NCCL_VERSION: v1.7.4-aws + BASE_IMAGE: nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 + CUDA_VERSION: 11.7.1 + IMAGE_NAME: torch-1-13-1-cu117-aws + MOFED_VERSION: '' + NVIDIA_REQUIRE_CUDA_OVERRIDE: '' + PYTHON_VERSION: '3.10' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 1.13.1 + TAGS: + - mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04-aws + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.14.1 +- AWS_OFI_NCCL_VERSION: v1.7.4-aws + BASE_IMAGE: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 + CUDA_VERSION: 11.8.0 + IMAGE_NAME: torch-2-0-1-cu118-aws + MOFED_VERSION: '' + NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=11.8 brand=tesla,driver>=470,driver<471 brand=tesla,driver>=515,driver<516 + brand=unknown,driver>=470,driver<471 brand=unknown,driver>=515,driver<516 brand=nvidia,driver>=470,driver<471 + brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=470,driver<471 brand=nvidiartx,driver>=515,driver<516 + brand=geforce,driver>=470,driver<471 brand=geforce,driver>=515,driver<516 brand=quadro,driver>=470,driver<471 + brand=quadro,driver>=515,driver<516 brand=titan,driver>=470,driver<471 brand=titan,driver>=515,driver<516 + brand=titanrtx,driver>=470,driver<471 brand=titanrtx,driver>=515,driver<516 + PYTHON_VERSION: '3.10' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 2.0.1 + TAGS: + - mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04-aws + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.15.2 +- AWS_OFI_NCCL_VERSION: v1.7.4-aws + BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 + CUDA_VERSION: 12.1.0 + IMAGE_NAME: torch-2-1-2-cu121-aws + MOFED_VERSION: '' + NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 + brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 + brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 + brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 + brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 + brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 + brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 + brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 + brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 + brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 + brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526 + brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 + brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 + brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 + PYTHON_VERSION: '3.10' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 2.1.2 + TAGS: + - mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws + - mosaicml/pytorch:latest-aws TARGET: pytorch_stage TORCHVISION_VERSION: 0.16.2 - AWS_OFI_NCCL_VERSION: '' @@ -99,6 +161,7 @@ PYTORCH_VERSION: 2.1.2 TAGS: - mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 + - mosaicml/pytorch:latest_cpu TARGET: pytorch_stage TORCHVISION_VERSION: 0.16.2 - AWS_OFI_NCCL_VERSION: '' @@ -175,7 +238,7 @@ brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 - PYTHON_VERSION: '3.11' + PYTHON_VERSION: '3.10' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.1.2 @@ -191,7 +254,7 @@ IMAGE_NAME: composer-0-18-0-cpu MOFED_VERSION: 5.5-1.0.3.2 NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.11' + PYTHON_VERSION: '3.10' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.1.2 From 773f3b568e770020224d652b68c235b0df180566 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 24 Jan 2024 14:27:00 -0800 Subject: [PATCH 53/70] 2.3.6 test --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index f92e307e1a..ee7331cb02 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -296,7 +296,7 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ RUN if [ -n "$CUDA_VERSION" ] ; then \ pip${PYTHON_VERSION} install --upgrade --no-cache-dir ninja==1.11.1 && \ pip${PYTHON_VERSION} install --upgrade --no-cache-dir --force-reinstall packaging==22.0 && \ - pip${PYTHON_VERSION} install --no-cache-dir flash-attn==2.5.0; \ + pip${PYTHON_VERSION} install --no-cache-dir flash-attn==2.3.6; \ fi ############### From f1ee751c840cf6b12ec0a2a77e821b66a688a13f Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 24 Jan 2024 14:44:37 -0800 Subject: [PATCH 54/70] revert test --- docker/generate_build_matrix.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index 3fff12d9d0..21f710f225 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -250,22 +250,22 @@ def _main(): } pytorch_entries.append(nightly_entry_310) - nightly_entry_311 = { - 'AWS_OFI_NCCL_VERSION': '', - 'BASE_IMAGE': 'nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04', - 'CUDA_VERSION': '12.1.0', - 'IMAGE_NAME': 'torch-nightly-2-3-0-20240110-cu121', - 'MOFED_VERSION': '5.5-1.0.3.2', - 'NVIDIA_REQUIRE_CUDA_OVERRIDE': _get_cuda_override('12.1.0'), - 'PYTHON_VERSION': '3.11', - 'PYTORCH_VERSION': '2.3.0', - 'PYTORCH_NIGHTLY_URL': 'https://download.pytorch.org/whl/nightly/cu121', - 'PYTORCH_NIGHTLY_VERSION': 'dev20240110+cu121', - 'TAGS': ['mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04'], - 'TARGET': 'pytorch_stage', - 'TORCHVISION_VERSION': '0.18.0' - } - pytorch_entries.append(nightly_entry_311) + # nightly_entry_311 = { + # 'AWS_OFI_NCCL_VERSION': '', + # 'BASE_IMAGE': 'nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04', + # 'CUDA_VERSION': '12.1.0', + # 'IMAGE_NAME': 'torch-nightly-2-3-0-20240110-cu121', + # 'MOFED_VERSION': '5.5-1.0.3.2', + # 'NVIDIA_REQUIRE_CUDA_OVERRIDE': _get_cuda_override('12.1.0'), + # 'PYTHON_VERSION': '3.11', + # 'PYTORCH_VERSION': '2.3.0', + # 'PYTORCH_NIGHTLY_URL': 'https://download.pytorch.org/whl/nightly/cu121', + # 'PYTORCH_NIGHTLY_VERSION': 'dev20240110+cu121', + # 'TAGS': ['mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04'], + # 'TARGET': 'pytorch_stage', + # 'TORCHVISION_VERSION': '0.18.0' + # } + # pytorch_entries.append(nightly_entry_311) composer_entries = [] From 196999520b1cf1ada1d9c558ab6a9bd725fab531 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 24 Jan 2024 14:45:15 -0800 Subject: [PATCH 55/70] reversion continued --- docker/README.md | 1 - docker/build_matrix.yaml | 27 --------------------------- 2 files changed, 28 deletions(-) diff --git a/docker/README.md b/docker/README.md index 7c6fc08856..e3bab86b5c 100644 --- a/docker/README.md +++ b/docker/README.md @@ -31,7 +31,6 @@ To install composer, once inside the image, run `pip install mosaicml`. | Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags | |----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------| -| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (EFA) | 3.10 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws` | diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index fad6635938..b7e0400151 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -191,33 +191,6 @@ - mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.18.0 -- AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 12.1.0 - IMAGE_NAME: torch-nightly-2-3-0-20240110-cu121 - MOFED_VERSION: 5.5-1.0.3.2 - NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 - brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 - brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 - brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 - brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 - brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 - brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 - brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 - brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 - brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 - brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526 - brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 - brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 - brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 - PYTHON_VERSION: '3.11' - PYTORCH_NIGHTLY_URL: https://download.pytorch.org/whl/nightly/cu121 - PYTORCH_NIGHTLY_VERSION: dev20240110+cu121 - PYTORCH_VERSION: 2.3.0 - TAGS: - - mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04 - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.18.0 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.18.0 From 38102538f4e5a0332240c1c2deb71378f505cd29 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 24 Jan 2024 14:53:53 -0800 Subject: [PATCH 56/70] restoring from before reversion --- docker/README.md | 1 + docker/build_matrix.yaml | 27 +++++++++++++++++++++++++++ docker/generate_build_matrix.py | 32 ++++++++++++++++---------------- 3 files changed, 44 insertions(+), 16 deletions(-) diff --git a/docker/README.md b/docker/README.md index e3bab86b5c..7c6fc08856 100644 --- a/docker/README.md +++ b/docker/README.md @@ -31,6 +31,7 @@ To install composer, once inside the image, run `pip install mosaicml`. | Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags | |----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------| +| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (EFA) | 3.10 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws` | diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index b7e0400151..fad6635938 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -191,6 +191,33 @@ - mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.18.0 +- AWS_OFI_NCCL_VERSION: '' + BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 + CUDA_VERSION: 12.1.0 + IMAGE_NAME: torch-nightly-2-3-0-20240110-cu121 + MOFED_VERSION: 5.5-1.0.3.2 + NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 + brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 + brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 + brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 + brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 + brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 + brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 + brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 + brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 + brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 + brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526 + brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 + brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 + brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 + PYTHON_VERSION: '3.11' + PYTORCH_NIGHTLY_URL: https://download.pytorch.org/whl/nightly/cu121 + PYTORCH_NIGHTLY_VERSION: dev20240110+cu121 + PYTORCH_VERSION: 2.3.0 + TAGS: + - mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04 + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.18.0 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.18.0 diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index 21f710f225..3fff12d9d0 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -250,22 +250,22 @@ def _main(): } pytorch_entries.append(nightly_entry_310) - # nightly_entry_311 = { - # 'AWS_OFI_NCCL_VERSION': '', - # 'BASE_IMAGE': 'nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04', - # 'CUDA_VERSION': '12.1.0', - # 'IMAGE_NAME': 'torch-nightly-2-3-0-20240110-cu121', - # 'MOFED_VERSION': '5.5-1.0.3.2', - # 'NVIDIA_REQUIRE_CUDA_OVERRIDE': _get_cuda_override('12.1.0'), - # 'PYTHON_VERSION': '3.11', - # 'PYTORCH_VERSION': '2.3.0', - # 'PYTORCH_NIGHTLY_URL': 'https://download.pytorch.org/whl/nightly/cu121', - # 'PYTORCH_NIGHTLY_VERSION': 'dev20240110+cu121', - # 'TAGS': ['mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04'], - # 'TARGET': 'pytorch_stage', - # 'TORCHVISION_VERSION': '0.18.0' - # } - # pytorch_entries.append(nightly_entry_311) + nightly_entry_311 = { + 'AWS_OFI_NCCL_VERSION': '', + 'BASE_IMAGE': 'nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04', + 'CUDA_VERSION': '12.1.0', + 'IMAGE_NAME': 'torch-nightly-2-3-0-20240110-cu121', + 'MOFED_VERSION': '5.5-1.0.3.2', + 'NVIDIA_REQUIRE_CUDA_OVERRIDE': _get_cuda_override('12.1.0'), + 'PYTHON_VERSION': '3.11', + 'PYTORCH_VERSION': '2.3.0', + 'PYTORCH_NIGHTLY_URL': 'https://download.pytorch.org/whl/nightly/cu121', + 'PYTORCH_NIGHTLY_VERSION': 'dev20240110+cu121', + 'TAGS': ['mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04'], + 'TARGET': 'pytorch_stage', + 'TORCHVISION_VERSION': '0.18.0' + } + pytorch_entries.append(nightly_entry_311) composer_entries = [] From ecc2a605922d2c3731cb31f721fde7664b97fd9b Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 24 Jan 2024 15:37:34 -0800 Subject: [PATCH 57/70] max jobs --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index ee7331cb02..e5ae9b9468 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -296,7 +296,7 @@ RUN if [[ -n "$CUDA_VERSION" ]] && [[ -z "${PYTORCH_NIGHTLY_URL}" ]]; then \ RUN if [ -n "$CUDA_VERSION" ] ; then \ pip${PYTHON_VERSION} install --upgrade --no-cache-dir ninja==1.11.1 && \ pip${PYTHON_VERSION} install --upgrade --no-cache-dir --force-reinstall packaging==22.0 && \ - pip${PYTHON_VERSION} install --no-cache-dir flash-attn==2.3.6; \ + MAX_JOBS=1 pip${PYTHON_VERSION} install --no-cache-dir flash-attn==2.5.0; \ fi ############### From 680a7024106243ebd5dd77123a5c6741c696db46 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 24 Jan 2024 16:29:30 -0800 Subject: [PATCH 58/70] increase timeout --- .github/workflows/pr-docker.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-docker.yaml b/.github/workflows/pr-docker.yaml index 52e009e68f..5cc75ff92c 100644 --- a/.github/workflows/pr-docker.yaml +++ b/.github/workflows/pr-docker.yaml @@ -18,7 +18,7 @@ jobs: build-image-matrix: if: github.repository_owner == 'mosaicml' runs-on: ubuntu-latest - timeout-minutes: 2 + timeout-minutes: 1440 outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} steps: From bc8023031ea927a4e8768990e468d9547511e65b Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Wed, 24 Jan 2024 16:32:02 -0800 Subject: [PATCH 59/70] increase timeout --- .github/workflows/docker-configure-build-push.yaml | 1 + .github/workflows/pr-docker.yaml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docker-configure-build-push.yaml b/.github/workflows/docker-configure-build-push.yaml index 8ae2705700..1a84f6d0b4 100644 --- a/.github/workflows/docker-configure-build-push.yaml +++ b/.github/workflows/docker-configure-build-push.yaml @@ -36,6 +36,7 @@ on: required: true jobs: configure-build-push: + timeout-minutes: 1440 runs-on: ubuntu-latest steps: - name: Maximize Build Space on Worker diff --git a/.github/workflows/pr-docker.yaml b/.github/workflows/pr-docker.yaml index 5cc75ff92c..52e009e68f 100644 --- a/.github/workflows/pr-docker.yaml +++ b/.github/workflows/pr-docker.yaml @@ -18,7 +18,7 @@ jobs: build-image-matrix: if: github.repository_owner == 'mosaicml' runs-on: ubuntu-latest - timeout-minutes: 1440 + timeout-minutes: 2 outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} steps: From 5bab5825a25ff59c73dded4aa3560a7b90394a0d Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Thu, 25 Jan 2024 13:20:04 -0800 Subject: [PATCH 60/70] revert to only include nightly change --- docker/README.md | 2 +- docker/build_matrix.yaml | 152 ++++++++++++++++---------------- docker/generate_build_matrix.py | 116 ++++++++++++------------ 3 files changed, 132 insertions(+), 138 deletions(-) diff --git a/docker/README.md b/docker/README.md index 7c6fc08856..a4bbb4b00d 100644 --- a/docker/README.md +++ b/docker/README.md @@ -31,7 +31,7 @@ To install composer, once inside the image, run `pip install mosaicml`. | Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags | |----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------| -| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04` | +| Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.3.0 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (Infiniband) | 3.10 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04` | | Ubuntu 20.04 | Base | 2.1.2 | 12.1.0 (EFA) | 3.10 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws` | diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index fad6635938..4d29fdae69 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -1,37 +1,4 @@ # This file is automatically generated by generate_build_matrix.py. DO NOT EDIT! -- AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 11.7.1 - IMAGE_NAME: torch-1-13-1-cu117 - MOFED_VERSION: 5.5-1.0.3.2 - NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.10' - PYTORCH_NIGHTLY_URL: '' - PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 1.13.1 - TAGS: - - mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.14.1 -- AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 11.8.0 - IMAGE_NAME: torch-2-0-1-cu118 - MOFED_VERSION: 5.5-1.0.3.2 - NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=11.8 brand=tesla,driver>=470,driver<471 brand=tesla,driver>=515,driver<516 - brand=unknown,driver>=470,driver<471 brand=unknown,driver>=515,driver<516 brand=nvidia,driver>=470,driver<471 - brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=470,driver<471 brand=nvidiartx,driver>=515,driver<516 - brand=geforce,driver>=470,driver<471 brand=geforce,driver>=515,driver<516 brand=quadro,driver>=470,driver<471 - brand=quadro,driver>=515,driver<516 brand=titan,driver>=470,driver<471 brand=titan,driver>=515,driver<516 - brand=titanrtx,driver>=470,driver<471 brand=titanrtx,driver>=515,driver<516 - PYTHON_VERSION: '3.10' - PYTORCH_NIGHTLY_URL: '' - PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.0.1 - TAGS: - - mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.15.2 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.0 @@ -60,39 +27,6 @@ - mosaicml/pytorch:latest TARGET: pytorch_stage TORCHVISION_VERSION: 0.16.2 -- AWS_OFI_NCCL_VERSION: v1.7.4-aws - BASE_IMAGE: nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 11.7.1 - IMAGE_NAME: torch-1-13-1-cu117-aws - MOFED_VERSION: '' - NVIDIA_REQUIRE_CUDA_OVERRIDE: '' - PYTHON_VERSION: '3.10' - PYTORCH_NIGHTLY_URL: '' - PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 1.13.1 - TAGS: - - mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04-aws - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.14.1 -- AWS_OFI_NCCL_VERSION: v1.7.4-aws - BASE_IMAGE: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 11.8.0 - IMAGE_NAME: torch-2-0-1-cu118-aws - MOFED_VERSION: '' - NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=11.8 brand=tesla,driver>=470,driver<471 brand=tesla,driver>=515,driver<516 - brand=unknown,driver>=470,driver<471 brand=unknown,driver>=515,driver<516 brand=nvidia,driver>=470,driver<471 - brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=470,driver<471 brand=nvidiartx,driver>=515,driver<516 - brand=geforce,driver>=470,driver<471 brand=geforce,driver>=515,driver<516 brand=quadro,driver>=470,driver<471 - brand=quadro,driver>=515,driver<516 brand=titan,driver>=470,driver<471 brand=titan,driver>=515,driver<516 - brand=titanrtx,driver>=470,driver<471 brand=titanrtx,driver>=515,driver<516 - PYTHON_VERSION: '3.10' - PYTORCH_NIGHTLY_URL: '' - PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.0.1 - TAGS: - - mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04-aws - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.15.2 - AWS_OFI_NCCL_VERSION: v1.7.4-aws BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.0 @@ -124,17 +58,56 @@ - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 CUDA_VERSION: '' - IMAGE_NAME: torch-1-13-1-cpu + IMAGE_NAME: torch-2-1-2-cpu MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.10' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 1.13.1 + PYTORCH_VERSION: 2.1.2 TAGS: - - mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04 + - mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 + - mosaicml/pytorch:latest_cpu TARGET: pytorch_stage - TORCHVISION_VERSION: 0.14.1 + TORCHVISION_VERSION: 0.16.2 +- AWS_OFI_NCCL_VERSION: '' + BASE_IMAGE: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 + CUDA_VERSION: 11.8.0 + IMAGE_NAME: torch-2-0-1-cu118 + MOFED_VERSION: 5.5-1.0.3.2 + NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=11.8 brand=tesla,driver>=470,driver<471 brand=tesla,driver>=515,driver<516 + brand=unknown,driver>=470,driver<471 brand=unknown,driver>=515,driver<516 brand=nvidia,driver>=470,driver<471 + brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=470,driver<471 brand=nvidiartx,driver>=515,driver<516 + brand=geforce,driver>=470,driver<471 brand=geforce,driver>=515,driver<516 brand=quadro,driver>=470,driver<471 + brand=quadro,driver>=515,driver<516 brand=titan,driver>=470,driver<471 brand=titan,driver>=515,driver<516 + brand=titanrtx,driver>=470,driver<471 brand=titanrtx,driver>=515,driver<516 + PYTHON_VERSION: '3.10' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 2.0.1 + TAGS: + - mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.15.2 +- AWS_OFI_NCCL_VERSION: v1.7.4-aws + BASE_IMAGE: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 + CUDA_VERSION: 11.8.0 + IMAGE_NAME: torch-2-0-1-cu118-aws + MOFED_VERSION: '' + NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=11.8 brand=tesla,driver>=470,driver<471 brand=tesla,driver>=515,driver<516 + brand=unknown,driver>=470,driver<471 brand=unknown,driver>=515,driver<516 brand=nvidia,driver>=470,driver<471 + brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=470,driver<471 brand=nvidiartx,driver>=515,driver<516 + brand=geforce,driver>=470,driver<471 brand=geforce,driver>=515,driver<516 brand=quadro,driver>=470,driver<471 + brand=quadro,driver>=515,driver<516 brand=titan,driver>=470,driver<471 brand=titan,driver>=515,driver<516 + brand=titanrtx,driver>=470,driver<471 brand=titanrtx,driver>=515,driver<516 + PYTHON_VERSION: '3.10' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 2.0.1 + TAGS: + - mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04-aws + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.15.2 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 CUDA_VERSION: '' @@ -149,21 +122,48 @@ - mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.15.2 +- AWS_OFI_NCCL_VERSION: '' + BASE_IMAGE: nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 + CUDA_VERSION: 11.7.1 + IMAGE_NAME: torch-1-13-1-cu117 + MOFED_VERSION: 5.5-1.0.3.2 + NVIDIA_REQUIRE_CUDA_OVERRIDE: '' + PYTHON_VERSION: '3.10' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 1.13.1 + TAGS: + - mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.14.1 +- AWS_OFI_NCCL_VERSION: v1.7.4-aws + BASE_IMAGE: nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 + CUDA_VERSION: 11.7.1 + IMAGE_NAME: torch-1-13-1-cu117-aws + MOFED_VERSION: '' + NVIDIA_REQUIRE_CUDA_OVERRIDE: '' + PYTHON_VERSION: '3.10' + PYTORCH_NIGHTLY_URL: '' + PYTORCH_NIGHTLY_VERSION: '' + PYTORCH_VERSION: 1.13.1 + TAGS: + - mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04-aws + TARGET: pytorch_stage + TORCHVISION_VERSION: 0.14.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 CUDA_VERSION: '' - IMAGE_NAME: torch-2-1-2-cpu + IMAGE_NAME: torch-1-13-1-cpu MOFED_VERSION: '' NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.10' PYTORCH_NIGHTLY_URL: '' PYTORCH_NIGHTLY_VERSION: '' - PYTORCH_VERSION: 2.1.2 + PYTORCH_VERSION: 1.13.1 TAGS: - - mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - - mosaicml/pytorch:latest_cpu + - mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04 TARGET: pytorch_stage - TORCHVISION_VERSION: 0.16.2 + TORCHVISION_VERSION: 0.14.1 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.0 @@ -215,7 +215,7 @@ PYTORCH_NIGHTLY_VERSION: dev20240110+cu121 PYTORCH_VERSION: 2.3.0 TAGS: - - mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04 + - mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.18.0 - AWS_OFI_NCCL_VERSION: '' diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index 3fff12d9d0..ee10ef96ff 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -32,14 +32,6 @@ def _get_torchvision_version(pytorch_version: str): raise ValueError(f'Invalid pytorch_version: {pytorch_version}') -def _get_pytorch_versions(python_version: str): - if python_version == '3.10': - return ['1.13.1', '2.0.1', '2.1.2'] - if python_version == '3.11': - return [] - raise ValueError(f'Invalid python_version: {python_version}') - - def _get_base_image(cuda_version: str): if not cuda_version: return 'ubuntu:20.04' @@ -173,66 +165,68 @@ def _write_table(table_tag: str, table_contents: str): def _main(): - python_versions = ['3.10', '3.11'] + python_versions = ['3.10'] + pytorch_versions = ['2.1.2', '2.0.1', '1.13.1'] cuda_options = [True, False] stages = ['pytorch_stage'] interconnects = ['mellanox', 'EFA'] # mellanox is default, EFA needed for AWS pytorch_entries = [] - for product in itertools.product(python_versions, cuda_options, stages, interconnects): - python_version, use_cuda, stage, interconnect = product - for pytorch_version in _get_pytorch_versions(python_version): - cuda_version = _get_cuda_version(pytorch_version=pytorch_version, use_cuda=use_cuda) - entry = { - 'IMAGE_NAME': - _get_image_name(pytorch_version, cuda_version, stage, interconnect), - 'BASE_IMAGE': - _get_base_image(cuda_version), - 'CUDA_VERSION': - cuda_version, - 'PYTHON_VERSION': - python_version, - 'PYTORCH_VERSION': - pytorch_version, - 'TARGET': - stage, - 'TORCHVISION_VERSION': - _get_torchvision_version(pytorch_version), - 'TAGS': - _get_pytorch_tags( - python_version=python_version, - pytorch_version=pytorch_version, - cuda_version=cuda_version, - stage=stage, - interconnect=interconnect, - ), - 'PYTORCH_NIGHTLY_URL': - '', - 'PYTORCH_NIGHTLY_VERSION': - '', - 'NVIDIA_REQUIRE_CUDA_OVERRIDE': - _get_cuda_override(cuda_version), - } - - # Only build EFA image on latest python with cuda on pytorch_stage - if interconnect == 'EFA' and not (python_version == LATEST_PYTHON_VERSION and use_cuda and - stage == 'pytorch_stage'): - continue - - # Skip the mellanox drivers if not in the cuda images or using EFA - if not cuda_version or interconnect == 'EFA': - entry['MOFED_VERSION'] = '' - else: - entry['MOFED_VERSION'] = '5.5-1.0.3.2' + for product in itertools.product(python_versions, pytorch_versions, cuda_options, stages, interconnects): + python_version, pytorch_version, use_cuda, stage, interconnect = product - # Skip EFA drivers if not using EFA - if interconnect != 'EFA': - entry['AWS_OFI_NCCL_VERSION'] = '' - else: - entry['AWS_OFI_NCCL_VERSION'] = 'v1.7.4-aws' + cuda_version = _get_cuda_version(pytorch_version=pytorch_version, use_cuda=use_cuda) + + entry = { + 'IMAGE_NAME': + _get_image_name(pytorch_version, cuda_version, stage, interconnect), + 'BASE_IMAGE': + _get_base_image(cuda_version), + 'CUDA_VERSION': + cuda_version, + 'PYTHON_VERSION': + python_version, + 'PYTORCH_VERSION': + pytorch_version, + 'TARGET': + stage, + 'TORCHVISION_VERSION': + _get_torchvision_version(pytorch_version), + 'TAGS': + _get_pytorch_tags( + python_version=python_version, + pytorch_version=pytorch_version, + cuda_version=cuda_version, + stage=stage, + interconnect=interconnect, + ), + 'PYTORCH_NIGHTLY_URL': + '', + 'PYTORCH_NIGHTLY_VERSION': + '', + 'NVIDIA_REQUIRE_CUDA_OVERRIDE': + _get_cuda_override(cuda_version), + } - pytorch_entries.append(entry) + # Only build EFA image on latest python with cuda on pytorch_stage + if interconnect == 'EFA' and not (python_version == LATEST_PYTHON_VERSION and use_cuda and + stage == 'pytorch_stage'): + continue + + # Skip the mellanox drivers if not in the cuda images or using EFA + if not cuda_version or interconnect == 'EFA': + entry['MOFED_VERSION'] = '' + else: + entry['MOFED_VERSION'] = '5.5-1.0.3.2' + + # Skip EFA drivers if not using EFA + if interconnect != 'EFA': + entry['AWS_OFI_NCCL_VERSION'] = '' + else: + entry['AWS_OFI_NCCL_VERSION'] = 'v1.7.4-aws' + + pytorch_entries.append(entry) nightly_entry_310 = { 'AWS_OFI_NCCL_VERSION': '', 'BASE_IMAGE': 'nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04', @@ -261,7 +255,7 @@ def _main(): 'PYTORCH_VERSION': '2.3.0', 'PYTORCH_NIGHTLY_URL': 'https://download.pytorch.org/whl/nightly/cu121', 'PYTORCH_NIGHTLY_VERSION': 'dev20240110+cu121', - 'TAGS': ['mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.10-ubuntu20.04'], + 'TAGS': ['mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04'], 'TARGET': 'pytorch_stage', 'TORCHVISION_VERSION': '0.18.0' } From 5f71bfb0c610d1d09e69232eb3d4302adaa00caf Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Thu, 25 Jan 2024 13:46:15 -0800 Subject: [PATCH 61/70] reset to default build time --- .github/workflows/docker-configure-build-push.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/docker-configure-build-push.yaml b/.github/workflows/docker-configure-build-push.yaml index 1a84f6d0b4..8ae2705700 100644 --- a/.github/workflows/docker-configure-build-push.yaml +++ b/.github/workflows/docker-configure-build-push.yaml @@ -36,7 +36,6 @@ on: required: true jobs: configure-build-push: - timeout-minutes: 1440 runs-on: ubuntu-latest steps: - name: Maximize Build Space on Worker From 7fb711fdd818f2e6351513c16ad7e1053c418d64 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Thu, 25 Jan 2024 14:43:30 -0800 Subject: [PATCH 62/70] update docker yaml --- docker/build_matrix.yaml | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 4957589cc8..9f96b8481d 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -218,33 +218,6 @@ - mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.18.0 -- AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 - CUDA_VERSION: 12.1.0 - IMAGE_NAME: torch-nightly-2-3-0-20240110-cu121 - MOFED_VERSION: 5.5-1.0.3.2 - NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 - brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 - brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 - brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 - brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 - brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 - brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 - brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 - brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516 - brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516 - brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526 - brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 - brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 - brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 - PYTHON_VERSION: '3.11' - PYTORCH_NIGHTLY_URL: https://download.pytorch.org/whl/nightly/cu121 - PYTORCH_NIGHTLY_VERSION: dev20240110+cu121 - PYTORCH_VERSION: 2.3.0 - TAGS: - - mosaicml/pytorch:2.3.0_cu121-nightly20240110-python3.11-ubuntu20.04 - TARGET: pytorch_stage - TORCHVISION_VERSION: 0.18.0 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.18.1 From 78c5ba868aa61a2cd5fe73ba485fdf807550eb14 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Thu, 25 Jan 2024 14:57:39 -0800 Subject: [PATCH 63/70] new names --- docker/generate_build_matrix.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index 6084055bf2..1e5c550fd5 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -231,7 +231,7 @@ def _main(): 'AWS_OFI_NCCL_VERSION': '', 'BASE_IMAGE': 'nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04', 'CUDA_VERSION': '12.1.0', - 'IMAGE_NAME': 'torch-nightly-2-3-0-20240110-cu121', + 'IMAGE_NAME': 'torch-nightly-2-3-0-20240110-cu121-python3-10', 'MOFED_VERSION': '5.5-1.0.3.2', 'NVIDIA_REQUIRE_CUDA_OVERRIDE': _get_cuda_override('12.1.0'), 'PYTHON_VERSION': '3.10', @@ -248,7 +248,7 @@ def _main(): 'AWS_OFI_NCCL_VERSION': '', 'BASE_IMAGE': 'nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04', 'CUDA_VERSION': '12.1.0', - 'IMAGE_NAME': 'torch-nightly-2-3-0-20240110-cu121', + 'IMAGE_NAME': 'torch-nightly-2-3-0-20240110-cu121-python3-11', 'MOFED_VERSION': '5.5-1.0.3.2', 'NVIDIA_REQUIRE_CUDA_OVERRIDE': _get_cuda_override('12.1.0'), 'PYTHON_VERSION': '3.11', From d6062f018c429a485d2be182efb55606a20ba9fd Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Thu, 25 Jan 2024 18:28:38 -0800 Subject: [PATCH 64/70] merge --- docker/build_matrix.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 9f96b8481d..5d1e1d25a5 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -167,7 +167,7 @@ - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.0 - IMAGE_NAME: torch-nightly-2-3-0-20240110-cu121 + IMAGE_NAME: torch-nightly-2-3-0-20240110-cu121-python3-10 MOFED_VERSION: 5.5-1.0.3.2 NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 @@ -194,7 +194,7 @@ - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 CUDA_VERSION: 12.1.0 - IMAGE_NAME: torch-nightly-2-3-0-20240110-cu121 + IMAGE_NAME: torch-nightly-2-3-0-20240110-cu121-python3-11 MOFED_VERSION: 5.5-1.0.3.2 NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 From 3027d270c3d62a44b909b28cdf6c723c2a78b3c3 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 26 Jan 2024 12:19:58 -0800 Subject: [PATCH 65/70] fix merge --- composer/datasets/utils.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/composer/datasets/utils.py b/composer/datasets/utils.py index 583d436198..5bcd9d9831 100644 --- a/composer/datasets/utils.py +++ b/composer/datasets/utils.py @@ -179,11 +179,7 @@ class MultiTokenEOSCriteria(transformers.StoppingCriteria): def __init__( self, stop_sequence: str, -<<<<<<< HEAD - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], -======= tokenizer: transformers.PreTrainedTokenizerBase, ->>>>>>> 39eb817355686e65345a2e0a302d121fa119f893 batch_size: int, ) -> None: self.done_tracker = [False] * batch_size @@ -217,11 +213,7 @@ def __call__(self, input_ids: torch.Tensor, scores: Optional[torch.FloatTensor] return False not in self.done_tracker def stop_sequences_criteria( -<<<<<<< HEAD - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], -======= tokenizer: transformers.PreTrainedTokenizerBase, ->>>>>>> 39eb817355686e65345a2e0a302d121fa119f893 stop_sequences: List[str], batch_size: int, ) -> transformers.StoppingCriteriaList: @@ -231,4 +223,4 @@ def stop_sequences_criteria( except ImportError as e: stop_sequences_criteria = None # pyright: ignore [reportGeneralTypeIssues] - MultiTokenEOSCriteria = None # pyright: ignore [reportGeneralTypeIssues] + MultiTokenEOSCriteria = None # pyright: ignore [reportGeneralTypeIssues] \ No newline at end of file From 8b0504ac58a0a97e43bb5a7fb0a5bea00e5b8161 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 26 Jan 2024 12:31:03 -0800 Subject: [PATCH 66/70] lint --- composer/datasets/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/datasets/utils.py b/composer/datasets/utils.py index 5bcd9d9831..b627ef8596 100644 --- a/composer/datasets/utils.py +++ b/composer/datasets/utils.py @@ -223,4 +223,4 @@ def stop_sequences_criteria( except ImportError as e: stop_sequences_criteria = None # pyright: ignore [reportGeneralTypeIssues] - MultiTokenEOSCriteria = None # pyright: ignore [reportGeneralTypeIssues] \ No newline at end of file + MultiTokenEOSCriteria = None # pyright: ignore [reportGeneralTypeIssues] From 1d928a3a8a179185890a1b5e4106fd117c37e222 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 26 Jan 2024 13:25:14 -0800 Subject: [PATCH 67/70] cpu-3.11-nightly test --- .github/workflows/pr-cpu.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 55fbefcfe6..a5c246297a 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -27,6 +27,11 @@ jobs: markers: 'not daily and not remote and not gpu and not vision and not doctest' pytest_command: 'coverage run -m pytest' composer_package_name: 'mosaicml' + - name: 'cpu-3.11-nightly' + container: mosaicml/ci-staging:ec3d7563-2684-4ad1-921d-163a60e240b8 + markers: 'not daily and not remote and not gpu and not vision and not doctest' + pytest_command: 'coverage run -m pytest' + composer_package_name: 'mosaicml' # - name: 'cpu-3.10-2.2' # container: mosaicml/pytorch:2.2.0_cu121-nightly20231213-python3.10-ubuntu20.04 # markers: 'not daily and not remote and not gpu and not vision and not doctest' From 4d888fd817e62c4de9c93a2304e0aaad02fa79da Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 26 Jan 2024 13:35:12 -0800 Subject: [PATCH 68/70] temp rm test --- .github/workflows/pr-cpu.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index a5c246297a..fd53ad6b4d 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -27,11 +27,11 @@ jobs: markers: 'not daily and not remote and not gpu and not vision and not doctest' pytest_command: 'coverage run -m pytest' composer_package_name: 'mosaicml' - - name: 'cpu-3.11-nightly' - container: mosaicml/ci-staging:ec3d7563-2684-4ad1-921d-163a60e240b8 - markers: 'not daily and not remote and not gpu and not vision and not doctest' - pytest_command: 'coverage run -m pytest' - composer_package_name: 'mosaicml' + # - name: 'cpu-3.11-nightly' + # container: mosaicml/ci-staging:ec3d7563-2684-4ad1-921d-163a60e240b8 + # markers: 'not daily and not remote and not gpu and not vision and not doctest' + # pytest_command: 'coverage run -m pytest' + # composer_package_name: 'mosaicml' # - name: 'cpu-3.10-2.2' # container: mosaicml/pytorch:2.2.0_cu121-nightly20231213-python3.10-ubuntu20.04 # markers: 'not daily and not remote and not gpu and not vision and not doctest' From 1b117d147c1e4a2a83e43d0fbd8f2069544af438 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 26 Jan 2024 13:41:10 -0800 Subject: [PATCH 69/70] cpu unit tst --- .github/workflows/pr-cpu.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index fd53ad6b4d..a5c246297a 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -27,11 +27,11 @@ jobs: markers: 'not daily and not remote and not gpu and not vision and not doctest' pytest_command: 'coverage run -m pytest' composer_package_name: 'mosaicml' - # - name: 'cpu-3.11-nightly' - # container: mosaicml/ci-staging:ec3d7563-2684-4ad1-921d-163a60e240b8 - # markers: 'not daily and not remote and not gpu and not vision and not doctest' - # pytest_command: 'coverage run -m pytest' - # composer_package_name: 'mosaicml' + - name: 'cpu-3.11-nightly' + container: mosaicml/ci-staging:ec3d7563-2684-4ad1-921d-163a60e240b8 + markers: 'not daily and not remote and not gpu and not vision and not doctest' + pytest_command: 'coverage run -m pytest' + composer_package_name: 'mosaicml' # - name: 'cpu-3.10-2.2' # container: mosaicml/pytorch:2.2.0_cu121-nightly20231213-python3.10-ubuntu20.04 # markers: 'not daily and not remote and not gpu and not vision and not doctest' From 1b574be3a99637a3a0b97261f9f1bef87dc0bc56 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Fri, 26 Jan 2024 13:44:53 -0800 Subject: [PATCH 70/70] rm test --- .github/workflows/pr-cpu.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index a5c246297a..55fbefcfe6 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -27,11 +27,6 @@ jobs: markers: 'not daily and not remote and not gpu and not vision and not doctest' pytest_command: 'coverage run -m pytest' composer_package_name: 'mosaicml' - - name: 'cpu-3.11-nightly' - container: mosaicml/ci-staging:ec3d7563-2684-4ad1-921d-163a60e240b8 - markers: 'not daily and not remote and not gpu and not vision and not doctest' - pytest_command: 'coverage run -m pytest' - composer_package_name: 'mosaicml' # - name: 'cpu-3.10-2.2' # container: mosaicml/pytorch:2.2.0_cu121-nightly20231213-python3.10-ubuntu20.04 # markers: 'not daily and not remote and not gpu and not vision and not doctest'