From f81961e6889a8e0da2db3f988221662d2791c308 Mon Sep 17 00:00:00 2001 From: pbialecki Date: Fri, 18 Mar 2022 01:59:54 -0700 Subject: [PATCH 1/5] ADD 11.6 workflow for docker image build --- .github/workflows/build-conda-images.yml | 2 +- .github/workflows/build-libtorch-images.yml | 2 +- .github/workflows/build-manywheel-images.yml | 2 +- common/install_cuda.sh | 54 ++++++++++++++++++++ conda/Dockerfile | 5 ++ conda/build_all_docker.sh | 2 +- libtorch/Dockerfile | 4 ++ libtorch/build_all_docker.sh | 2 +- 8 files changed, 68 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build-conda-images.yml b/.github/workflows/build-conda-images.yml index 4904cf619..6c37a4044 100644 --- a/.github/workflows/build-conda-images.yml +++ b/.github/workflows/build-conda-images.yml @@ -26,7 +26,7 @@ jobs: runs-on: linux.2xlarge strategy: matrix: - cuda_version: ["10.2", "11.3", "11.5", "cpu"] + cuda_version: ["10.2", "11.3", "11.5", "11.6", "cpu"] env: CUDA_VERSION: ${{ matrix.cuda_version }} steps: diff --git a/.github/workflows/build-libtorch-images.yml b/.github/workflows/build-libtorch-images.yml index 950d1fd1c..1045bd277 100644 --- a/.github/workflows/build-libtorch-images.yml +++ b/.github/workflows/build-libtorch-images.yml @@ -28,7 +28,7 @@ jobs: runs-on: ubuntu-18.04 strategy: matrix: - cuda_version: ["11.5", "11.3", "10.2"] + cuda_version: ["11.6", "11.5", "11.3", "10.2"] env: GPU_ARCH_TYPE: cuda GPU_ARCH_VERSION: ${{ matrix.cuda_version }} diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index 13882ea1f..ec755d0c2 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -28,7 +28,7 @@ jobs: runs-on: ubuntu-18.04 strategy: matrix: - cuda_version: ["11.5", "11.3", "10.2"] + cuda_version: ["11.6", "11.5", "11.3", "10.2"] env: GPU_ARCH_TYPE: cuda GPU_ARCH_VERSION: ${{ matrix.cuda_version }} diff --git a/common/install_cuda.sh b/common/install_cuda.sh index c18066d23..49de27f08 100644 --- a/common/install_cuda.sh +++ b/common/install_cuda.sh @@ -76,6 +76,27 @@ function install_115 { ldconfig } +function install_116 { + echo "Installing CUDA 11.6 and CuDNN 8.3" + rm -rf /usr/local/cuda-11.6 /usr/local/cuda + # install CUDA 11.6.1 in the same container + wget -q https://developer.download.nvidia.com/compute/cuda/11.6.1/local_installers/cuda_11.6.1_510.47.03_linux.run + chmod +x cuda_11.6.1_510.47.03_linux.run + ./cuda_11.6.1_510.47.03_linux.run --toolkit --silent + rm -f cuda_11.6.1_510.47.03_linux.run + rm -f /usr/local/cuda && ln -s /usr/local/cuda-11.6 /usr/local/cuda + + # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement + mkdir tmp_cudnn && cd tmp_cudnn + wget -q https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz -O cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz + tar xf cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz + cp -a cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive/include/* /usr/local/cuda/include/ + cp -a cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive/lib/* /usr/local/cuda/lib64/ + cd .. + rm -rf tmp_cudnn + ldconfig +} + function prune_102 { echo "Pruning CUDA 10.2 and CuDNN" ##################################################################################### @@ -172,6 +193,37 @@ function prune_115 { rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2021.3.0 $CUDA_BASE/nsight-systems-2021.3.3 } +function prune_116 { + echo "Pruning CUDA 11.6 and CuDNN" + ##################################################################################### + # CUDA 11.6 prune static libs + ##################################################################################### + export NVPRUNE="/usr/local/cuda-11.6/bin/nvprune" + export CUDA_LIB_DIR="/usr/local/cuda-11.6/lib64" + + export GENCODE="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86" + export GENCODE_CUDNN="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86" + + if [[ -n "$OVERRIDE_GENCODE" ]]; then + export GENCODE=$OVERRIDE_GENCODE + fi + + # all CUDA libs except CuDNN and CuBLAS (cudnn and cublas need arch 3.7 included) + ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \ + | xargs -I {} bash -c \ + "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}" + + # prune CuDNN and CuBLAS + $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a + $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a + + ##################################################################################### + # CUDA 11.6 prune visual tools + ##################################################################################### + export CUDA_BASE="/usr/local/cuda-11.6/" + rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2022.1.1 $CUDA_BASE/nsight-systems-2021.5.2 +} + # idiomatic parameter and option handling in sh while test $# -gt 0 do @@ -182,6 +234,8 @@ do ;; 11.5) install_115; prune_115 ;; + 11.6) install_116; prune_116 + ;; *) echo "bad argument $1"; exit 1 ;; esac diff --git a/conda/Dockerfile b/conda/Dockerfile index 349aa2120..84705c784 100644 --- a/conda/Dockerfile +++ b/conda/Dockerfile @@ -55,6 +55,10 @@ FROM cuda as cuda11.5 RUN bash ./install_cuda.sh 11.5 ENV DESIRED_CUDA=11.5 +FROM cuda as cuda11.6 +RUN bash ./install_cuda.sh 11.6 +ENV DESIRED_CUDA=11.6 + # Install MNIST test data FROM base as mnist ADD ./common/install_mnist.sh install_mnist.sh @@ -64,6 +68,7 @@ FROM base as all_cuda COPY --from=cuda10.2 /usr/local/cuda-10.2 /usr/local/cuda-10.2 COPY --from=cuda11.3 /usr/local/cuda-11.3 /usr/local/cuda-11.3 COPY --from=cuda11.5 /usr/local/cuda-11.5 /usr/local/cuda-11.5 +COPY --from=cuda11.6 /usr/local/cuda-11.6 /usr/local/cuda-11.6 FROM ${BASE_TARGET} as final # Install LLVM diff --git a/conda/build_all_docker.sh b/conda/build_all_docker.sh index 3b24cb5d5..d906bc370 100755 --- a/conda/build_all_docker.sh +++ b/conda/build_all_docker.sh @@ -4,6 +4,6 @@ set -eou pipefail TOPDIR=$(git rev-parse --show-toplevel) -for CUDA_VERSION in 11.5 11.3 10.2 cpu; do +for CUDA_VERSION in 11.6 11.5 11.3 10.2 cpu; do CUDA_VERSION="${CUDA_VERSION}" conda/build_docker.sh done diff --git a/libtorch/Dockerfile b/libtorch/Dockerfile index 86eef5602..546a3181b 100644 --- a/libtorch/Dockerfile +++ b/libtorch/Dockerfile @@ -56,6 +56,10 @@ FROM cuda as cuda11.5 RUN bash ./install_cuda.sh 11.5 RUN bash ./install_magma.sh 11.5 +FROM cuda as cuda11.6 +RUN bash ./install_cuda.sh 11.6 +RUN bash ./install_magma.sh 11.6 + FROM cpu as rocm ARG PYTORCH_ROCM_ARCH ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} diff --git a/libtorch/build_all_docker.sh b/libtorch/build_all_docker.sh index 57f26bc8c..9d85fba84 100755 --- a/libtorch/build_all_docker.sh +++ b/libtorch/build_all_docker.sh @@ -4,7 +4,7 @@ set -eou pipefail TOPDIR=$(git rev-parse --show-toplevel) -for cuda_version in 11.5 11.3 10.2; do +for cuda_version in 11.6 11.5 11.3 10.2; do GPU_ARCH_TYPE=cuda GPU_ARCH_VERSION="${cuda_version}" "${TOPDIR}/libtorch/build_docker.sh" done From 232478fc38f3d4de717f81b520e2e34106391c7f Mon Sep 17 00:00:00 2001 From: pbialecki Date: Mon, 21 Mar 2022 11:49:34 -0700 Subject: [PATCH 2/5] use CUDA 11.6.0 --- common/install_cuda.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/common/install_cuda.sh b/common/install_cuda.sh index 49de27f08..2b85c1ed1 100644 --- a/common/install_cuda.sh +++ b/common/install_cuda.sh @@ -79,12 +79,12 @@ function install_115 { function install_116 { echo "Installing CUDA 11.6 and CuDNN 8.3" rm -rf /usr/local/cuda-11.6 /usr/local/cuda - # install CUDA 11.6.1 in the same container - wget -q https://developer.download.nvidia.com/compute/cuda/11.6.1/local_installers/cuda_11.6.1_510.47.03_linux.run - chmod +x cuda_11.6.1_510.47.03_linux.run - ./cuda_11.6.1_510.47.03_linux.run --toolkit --silent - rm -f cuda_11.6.1_510.47.03_linux.run - rm -f /usr/local/cuda && ln -s /usr/local/cuda-11.6 /usr/local/cuda + # install CUDA 11.6.0 in the same container + wget -q https://developer.download.nvidia.com/compute/cuda/11.6.0/local_installers/cuda_11.6.0_510.39.01_linux.run + chmod +x cuda_11.6.0_510.39.01_linux.run + ./cuda_11.6.0_510.39.01_linux.run --toolkit --silent + rm -f cuda_11.6.0_510.39.01_linux.run + rm -f /usr/local/cuda && ln -s /usr/local/cuda11.6 /usr/local/cuda # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement mkdir tmp_cudnn && cd tmp_cudnn From b117441a75ad45fbf2eeac78f11080ef8984bef3 Mon Sep 17 00:00:00 2001 From: pbialecki Date: Mon, 21 Mar 2022 11:53:26 -0700 Subject: [PATCH 3/5] remove libtorch changes --- .github/workflows/build-libtorch-images.yml | 2 +- libtorch/Dockerfile | 4 ---- libtorch/build_all_docker.sh | 2 +- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build-libtorch-images.yml b/.github/workflows/build-libtorch-images.yml index 1045bd277..950d1fd1c 100644 --- a/.github/workflows/build-libtorch-images.yml +++ b/.github/workflows/build-libtorch-images.yml @@ -28,7 +28,7 @@ jobs: runs-on: ubuntu-18.04 strategy: matrix: - cuda_version: ["11.6", "11.5", "11.3", "10.2"] + cuda_version: ["11.5", "11.3", "10.2"] env: GPU_ARCH_TYPE: cuda GPU_ARCH_VERSION: ${{ matrix.cuda_version }} diff --git a/libtorch/Dockerfile b/libtorch/Dockerfile index 546a3181b..86eef5602 100644 --- a/libtorch/Dockerfile +++ b/libtorch/Dockerfile @@ -56,10 +56,6 @@ FROM cuda as cuda11.5 RUN bash ./install_cuda.sh 11.5 RUN bash ./install_magma.sh 11.5 -FROM cuda as cuda11.6 -RUN bash ./install_cuda.sh 11.6 -RUN bash ./install_magma.sh 11.6 - FROM cpu as rocm ARG PYTORCH_ROCM_ARCH ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} diff --git a/libtorch/build_all_docker.sh b/libtorch/build_all_docker.sh index 9d85fba84..57f26bc8c 100755 --- a/libtorch/build_all_docker.sh +++ b/libtorch/build_all_docker.sh @@ -4,7 +4,7 @@ set -eou pipefail TOPDIR=$(git rev-parse --show-toplevel) -for cuda_version in 11.6 11.5 11.3 10.2; do +for cuda_version in 11.5 11.3 10.2; do GPU_ARCH_TYPE=cuda GPU_ARCH_VERSION="${cuda_version}" "${TOPDIR}/libtorch/build_docker.sh" done From d7218ede4d477a81f84da8a166c718206f2d3aaa Mon Sep 17 00:00:00 2001 From: ptrblck Date: Mon, 21 Mar 2022 12:26:53 -0700 Subject: [PATCH 4/5] remove 11.6 from .github/workflows/build-manywheel-images.yml Co-authored-by: Andrey Talman --- .github/workflows/build-manywheel-images.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index ec755d0c2..13882ea1f 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -28,7 +28,7 @@ jobs: runs-on: ubuntu-18.04 strategy: matrix: - cuda_version: ["11.6", "11.5", "11.3", "10.2"] + cuda_version: ["11.5", "11.3", "10.2"] env: GPU_ARCH_TYPE: cuda GPU_ARCH_VERSION: ${{ matrix.cuda_version }} From 631111d3e362b305e53e97e0a2a370a817a21935 Mon Sep 17 00:00:00 2001 From: ptrblck Date: Mon, 21 Mar 2022 12:28:34 -0700 Subject: [PATCH 5/5] fix ln for 11.6 in common/install_cuda.sh Co-authored-by: Andrey Talman --- common/install_cuda.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/install_cuda.sh b/common/install_cuda.sh index 2b85c1ed1..1fe00213d 100644 --- a/common/install_cuda.sh +++ b/common/install_cuda.sh @@ -84,7 +84,7 @@ function install_116 { chmod +x cuda_11.6.0_510.39.01_linux.run ./cuda_11.6.0_510.39.01_linux.run --toolkit --silent rm -f cuda_11.6.0_510.39.01_linux.run - rm -f /usr/local/cuda && ln -s /usr/local/cuda11.6 /usr/local/cuda + rm -f /usr/local/cuda && ln -s /usr/local/cuda-11.6 /usr/local/cuda # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement mkdir tmp_cudnn && cd tmp_cudnn