Python 3.13 work - rebase on main (#1868)
* Remove triton constraint for py312 (#1846)

* Cache OpenBLAS to docker image for SBSA builds (#1842)

* apply openblas cache for cpu-aarch64

* reapply for cuda-aarch64

* [MacOS] Don't build wheel while building libtorch

Not sure why this was ever done twice

* Allow validate docker images to be called from a different workflow (#1850)

* Allow validate docker images to be called from a different workflow

* Revert "[MacOS] Don't build wheel while building libtorch"

This reverts commit d88495a.

* [MacOS] Don't build libtorch twice (take 2)

By not invoking `tools/build_libtorch.py`, as it's not done on Linux

* [MacOS][LibTorch] Copy libomp.dylib into libtorch package

* Update cudnn from v8 to v9 across CUDA versions and x86/arm (#1847)

* Update cudnn to v9.1.0.70 for cuda11.8, cuda12.1, and cuda12.4

* Add CUDNN_VERSION variable

* Remove 2 spaces for install_cu124

* trivial fix

* Fix DEPS_LIST and DEPS_SONAME for x86
Update cudnn to v9 for arm cuda binary as well

* libcudnn_adv_infer/libcudnn_adv_train becomes libcudnn_adv

* Change DEPS due to cudnn v9 libraries name changes (and additions)

* Fix lint

* Add missing changes to cu121/cu124

* Change OpenSSL URL (#1854)

* Change OpenSSL URL

* Change to use openssl URL (but no longer ftp!)

* Update build-manywheel-images.yml - Add a note about manylinux_2_28 state

* Revert "Update cudnn from v8 to v9 across CUDA versions and x86/arm" (#1855)

This reverts commit 5783bcc.

* Don't run torch.compile on runtime images in docker validations (#1858)

* Don't run torch.compile on runtime images

* test

* Don't run torch.compile on runtime images in docker validations

* Update cudnn from v8 to v9 across CUDA versions and x86/arm (#1857)

* Update cudnn to v9.1.0.70 for cuda11.8, cuda12.1, and cuda12.4

* Add CUDNN_VERSION variable

* Remove 2 spaces for install_cu124

* trivial fix

* Fix DEPS_LIST and DEPS_SONAME for x86
Update cudnn to v9 for arm cuda binary as well

* libcudnn_adv_infer/libcudnn_adv_train becomes libcudnn_adv

* Change DEPS due to cudnn v9 libraries name changes (and additions)

* Fix lint

* Add missing changes to cu121/cu124

* Fix aarch64 cuda typos

* Update validate-docker-images.yml - disable runtime error check for now

* Update validate-docker-images.yml - use validation_runner rather than the hardcoded one

* Update validate-docker-images.yml - fix MATRIX_GPU_ARCH_TYPE setting for cpu only workflows

* [aarch64 cuda cudnn] Add RUNPATH to libcudnn_graph.so.9 (#1859)

* Add executorch to pypi prep, promotion and validation scripts (#1860)

* Add AOTriton install step for ROCm manylinux images (#1862)

* Add AOTriton install step for ROCm

* No common_utils.sh needed

* temporarily disable runtime error check

* Add python 3.13 builder (#1845)

---------

Co-authored-by: Ting Lu <92425201+tinglvv@users.noreply.github.com>
Co-authored-by: Nikita Shulga <nikita.shulga@gmail.com>
Co-authored-by: Wei Wang <143543872+nWEIdia@users.noreply.github.com>
Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
5 people committed Jun 17, 2024
1 parent 5d8c7af commit 01b8210
Showing 24 changed files with 224 additions and 131 deletions.
2 changes: 1 addition & 1 deletion .github/scripts/validate_binaries.sh
@@ -62,7 +62,7 @@ else
if [[ ${TARGET_OS} == 'windows' ]]; then
python ./test/smoke_test/smoke_test.py ${TEST_SUFFIX}
else
python3 ./test/smoke_test/smoke_test.py ${TEST_SUFFIX}
python3 ./test/smoke_test/smoke_test.py ${TEST_SUFFIX} --runtime-error-check "disabled"
fi

if [[ ${TARGET_OS} == 'macos-arm64' ]]; then
1 change: 1 addition & 0 deletions .github/workflows/build-manywheel-images.yml
@@ -60,6 +60,7 @@ jobs:
- name: Build Docker Image
run: |
manywheel/build_docker.sh
# NOTE: manylinux_2_28 are still experimental, see https://github.com/pytorch/pytorch/issues/123649
build-docker-cuda-manylinux_2_28:
runs-on: linux.12xlarge
strategy:
.github/workflows/validate-docker-images.yml
@@ -1,5 +1,22 @@
name: Validate Docker Images (with Matrix Generation)
name: Validate Nightly Docker Images
on:
workflow_call:
inputs:
channel:
description: 'PyTorch channel to use (nightly, test, release, all)'
required: true
type: string
default: 'nightly'
generate_dockerhub_images:
description: 'Generate Docker Hub images (strip ghcr.io/ prefix for release)'
default: false
required: false
type: boolean
ref:
description: 'Reference to checkout, defaults to empty'
default: ""
required: false
type: string
workflow_dispatch:
inputs:
channel:
@@ -15,8 +32,13 @@ on:
description: 'Generate Docker Hub images (strip ghcr.io/ prefix for release)'
default: false
required: false
type: boolean

type: boolean
ref:
description: 'Reference to checkout, defaults to empty'
default: ""
required: false
type: string

jobs:
generate-matrix:
uses: pytorch/test-infra/.github/workflows/generate_docker_release_matrix.yml@main
@@ -31,7 +53,7 @@ jobs:
fail-fast: false
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
runner: linux.g5.4xlarge.nvidia.gpu
runner: ${{ matrix.validation_runner }}
repository: "pytorch/builder"
ref: ${{ inputs.ref || github.ref }}
job-name: cuda${{ matrix.cuda }}-cudnn${{ matrix.cudnn_version }}-${{ matrix.image_type }}
@@ -40,7 +62,16 @@
timeout: 180
script: |
set -ex
export MATRIX_GPU_ARCH_TYPE="cuda"
export MATRIX_GPU_ARCH_VERSION="${{ matrix.cuda }}"
export MATRIX_IMAGE_TYPE="${{ matrix.image_type }}"
export TARGET_OS="linux"
python test/smoke_test/smoke_test.py --package torchonly --runtime-error-check enabled
TORCH_COMPILE_CHECK="--torch-compile-check enabled"
if [[ ${MATRIX_IMAGE_TYPE} == "runtime" ]]; then
TORCH_COMPILE_CHECK="--torch-compile-check disabled"
fi
export MATRIX_GPU_ARCH_TYPE="cuda"
if [[ ${MATRIX_GPU_ARCH_VERSION} == "cpu" ]]; then
export MATRIX_GPU_ARCH_TYPE="cpu"
fi
python test/smoke_test/smoke_test.py --package torchonly --runtime-error-check disabled ${TORCH_COMPILE_CHECK}
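With the `workflow_call` trigger added above, other workflows can now reuse this validation job via `uses: pytorch/builder/.github/workflows/validate-docker-images.yml@main`, while the existing `workflow_dispatch` path can still be driven by hand. A minimal sketch of the manual path (assuming an authenticated GitHub CLI; the ref and channel values are illustrative):

    # Kick off the dispatch trigger of the validation workflow for the nightly channel.
    gh workflow run validate-docker-images.yml -R pytorch/builder --ref main -f channel=nightly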
57 changes: 11 additions & 46 deletions aarch64_linux/aarch64_wheel_ci_build.py
@@ -14,44 +14,6 @@ def list_dir(path: str) -> List[str]:
"""
return check_output(["ls", "-1", path]).decode().split("\n")


def build_OpenBLAS() -> None:
'''
Building OpenBLAS, because the package in many linux is old
'''
print('Building OpenBLAS')
openblas_build_flags = [
"NUM_THREADS=128",
"USE_OPENMP=1",
"NO_SHARED=0",
"DYNAMIC_ARCH=1",
"TARGET=ARMV8",
"CFLAGS=-O3",
]
openblas_checkout_dir = "OpenBLAS"

check_call(
[
"git",
"clone",
"https://github.com/OpenMathLib/OpenBLAS.git",
"-b",
"v0.3.25",
"--depth",
"1",
"--shallow-submodules",
]
)

check_call(["make", "-j8"]
+ openblas_build_flags,
cwd=openblas_checkout_dir)
check_call(["make", "-j8"]
+ openblas_build_flags
+ ["install"],
cwd=openblas_checkout_dir)


def build_ArmComputeLibrary() -> None:
"""
Using ArmComputeLibrary for aarch64 PyTorch
@@ -103,7 +65,7 @@ def update_wheel(wheel_path) -> None:
os.system(f"unzip {wheel_path} -d {folder}/tmp")
libs_to_copy = [
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
"/usr/local/cuda/lib64/libcudnn.so.8",
"/usr/local/cuda/lib64/libcudnn.so.9",
"/usr/local/cuda/lib64/libcublas.so.12",
"/usr/local/cuda/lib64/libcublasLt.so.12",
"/usr/local/cuda/lib64/libcudart.so.12",
@@ -116,12 +78,13 @@ def update_wheel(wheel_path) -> None:
"/usr/local/cuda/lib64/libnvJitLink.so.12",
"/usr/local/cuda/lib64/libnvrtc.so.12",
"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.4",
"/usr/local/cuda/lib64/libcudnn_adv_infer.so.8",
"/usr/local/cuda/lib64/libcudnn_adv_train.so.8",
"/usr/local/cuda/lib64/libcudnn_cnn_infer.so.8",
"/usr/local/cuda/lib64/libcudnn_cnn_train.so.8",
"/usr/local/cuda/lib64/libcudnn_ops_infer.so.8",
"/usr/local/cuda/lib64/libcudnn_ops_train.so.8",
"/usr/local/cuda/lib64/libcudnn_adv.so.9",
"/usr/local/cuda/lib64/libcudnn_cnn.so.9",
"/usr/local/cuda/lib64/libcudnn_graph.so.9",
"/usr/local/cuda/lib64/libcudnn_ops.so.9",
"/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",
"/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",
"/usr/local/cuda/lib64/libcudnn_heuristic.so.9",
"/opt/conda/envs/aarch64_env/lib/libgomp.so.1",
"/opt/OpenBLAS/lib/libopenblas.so.0",
"/acl/build/libarm_compute.so",
Expand All @@ -134,6 +97,9 @@ def update_wheel(wheel_path) -> None:
os.system(
f"cd {folder}/tmp/torch/lib/; patchelf --set-rpath '$ORIGIN' {folder}/tmp/torch/lib/libtorch_cuda.so"
)
os.system(
f"cd {folder}/tmp/torch/lib/; patchelf --set-rpath '$ORIGIN' {folder}/tmp/torch/lib/libcudnn_graph.so.9"
)
os.mkdir(f"{folder}/cuda_wheel")
os.system(f"cd {folder}/tmp/; zip -r {folder}/cuda_wheel/{wheelname} *")
shutil.move(
@@ -227,7 +193,6 @@ def parse_arguments():
elif branch.startswith(("v1.", "v2.")):
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "

build_OpenBLAS()
if enable_mkldnn:
build_ArmComputeLibrary()
print("build pytorch with mkldnn+acl backend")
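A quick way to sanity-check the patchelf calls above, sketched here under the assumption that the wheel has been unpacked into the same tmp layout used by update_wheel():

    # Both libraries should now resolve their bundled dependencies relative to themselves.
    cd tmp/torch/lib
    patchelf --print-rpath libtorch_cuda.so      # expected: $ORIGIN
    patchelf --print-rpath libcudnn_graph.so.9   # expected: $ORIGIN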
15 changes: 11 additions & 4 deletions analytics/validate_pypi_staging.py
@@ -15,13 +15,20 @@
"win_amd64",
"macosx_11_0_arm64",
]
PYTHON_VERSIONS = ["cp38", "cp39", "cp310", "cp311", "cp312"]
PYTHON_VERSIONS = [
"cp38",
"cp39",
"cp310",
"cp311",
"cp312"
]
S3_PYPI_STAGING = "pytorch-backup"
PACKAGE_RELEASES = {
"torch": "2.3.0",
"torchvision": "0.18.0",
"torchaudio": "2.3.0",
"torch": "2.3.1",
"torchvision": "0.18.1",
"torchaudio": "2.3.1",
"torchtext": "0.18.0",
"executorch": "0.2.1"
}

PATTERN_V = "Version:"
5 changes: 5 additions & 0 deletions common/aotriton_version.txt
@@ -0,0 +1,5 @@
0.6b
manylinux_2_17
rocm6
04b5df8c8123f90cba3ede7e971e6fbc6040d506
3db6ecbc915893ff967abd6e1b43bd5f54949868873be60dc802086c3863e648
21 changes: 21 additions & 0 deletions common/install_aotriton.sh
@@ -0,0 +1,21 @@
#!/bin/bash

set -ex

TARBALL='aotriton.tar.bz2'
# This read command always returns with exit code 1
read -d "\n" VER MANYLINUX ROCMBASE PINNED_COMMIT SHA256 < aotriton_version.txt || true
ARCH=$(uname -m)
AOTRITON_INSTALL_PREFIX="$1"
AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}.tar.bz2"

cd "${AOTRITON_INSTALL_PREFIX}"
# Must use -L to follow redirects
curl -L --retry 3 -o "${TARBALL}" "${AOTRITON_URL}"
ACTUAL_SHA256=$(sha256sum "${TARBALL}" | cut -d " " -f 1)
if [ "${SHA256}" != "${ACTUAL_SHA256}" ]; then
echo -n "Error: The SHA256 of downloaded tarball is ${ACTUAL_SHA256},"
echo " which does not match the expected value ${SHA256}."
exit
fi
tar xf "${TARBALL}" && rm -rf "${TARBALL}"
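For context on the `read -d` line above: `read` consumes the whole version file, splits its whitespace-separated fields into the five variables, and returns non-zero when it hits end of file, hence the `|| true`. A sketch of the mapping, assuming the five-line aotriton_version.txt added earlier in this diff:

    read -d "\n" VER MANYLINUX ROCMBASE PINNED_COMMIT SHA256 < aotriton_version.txt || true
    echo "${VER}"        # 0.6b
    echo "${MANYLINUX}"  # manylinux_2_17
    echo "${ROCMBASE}"   # rocm6
    # PINNED_COMMIT and SHA256 carry the pinned commit hash and the expected tarball checksum.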
32 changes: 17 additions & 15 deletions common/install_cuda.sh
@@ -2,6 +2,8 @@

set -ex

CUDNN_VERSION=9.1.0.70

function install_cusparselt_040 {
# cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
mkdir tmp_cusparselt && pushd tmp_cusparselt
@@ -25,7 +27,7 @@ function install_cusparselt_052 {
}

function install_118 {
echo "Installing CUDA 11.8 and cuDNN 8.7 and NCCL 2.15 and cuSparseLt-0.4.0"
echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL 2.15 and cuSparseLt-0.4.0"
rm -rf /usr/local/cuda-11.8 /usr/local/cuda
# install CUDA 11.8.0 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
@@ -36,10 +38,10 @@ function install_118 {

# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn && cd tmp_cudnn
wget -q https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz -O cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz
tar xf cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz
cp -a cudnn-linux-x86_64-8.7.0.84_cuda11-archive/include/* /usr/local/cuda/include/
cp -a cudnn-linux-x86_64-8.7.0.84_cuda11-archive/lib/* /usr/local/cuda/lib64/
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz
tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/include/* /usr/local/cuda/include/
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf tmp_cudnn

@@ -58,7 +60,7 @@ function install_118 {
}

function install_121 {
echo "Installing CUDA 12.1 and cuDNN 8.9 and NCCL 2.20.5 and cuSparseLt-0.5.2"
echo "Installing CUDA 12.1 and cuDNN ${CUDNN_VERSION} and NCCL 2.20.5 and cuSparseLt-0.5.2"
rm -rf /usr/local/cuda-12.1 /usr/local/cuda
# install CUDA 12.1.0 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run
@@ -69,10 +71,10 @@ function install_121 {

# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn && cd tmp_cudnn
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-8.9.2.26_cuda12-archive.tar.xz -O cudnn-linux-x86_64-8.9.2.26_cuda12-archive.tar.xz
tar xf cudnn-linux-x86_64-8.9.2.26_cuda12-archive.tar.xz
cp -a cudnn-linux-x86_64-8.9.2.26_cuda12-archive/include/* /usr/local/cuda/include/
cp -a cudnn-linux-x86_64-8.9.2.26_cuda12-archive/lib/* /usr/local/cuda/lib64/
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf tmp_cudnn

@@ -91,7 +93,7 @@ function install_121 {
}

function install_124 {
echo "Installing CUDA 12.4 and cuDNN 8.9 and NCCL 2.20.5 and cuSparseLt-0.5.2"
echo "Installing CUDA 12.4 and cuDNN ${CUDNN_VERSION} and NCCL 2.20.5 and cuSparseLt-0.5.2"
rm -rf /usr/local/cuda-12.4 /usr/local/cuda
# install CUDA 12.4.0 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run
@@ -102,10 +104,10 @@ function install_124 {

# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn && cd tmp_cudnn
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-8.9.2.26_cuda12-archive.tar.xz -O cudnn-linux-x86_64-8.9.2.26_cuda12-archive.tar.xz
tar xf cudnn-linux-x86_64-8.9.2.26_cuda12-archive.tar.xz
cp -a cudnn-linux-x86_64-8.9.2.26_cuda12-archive/include/* /usr/local/cuda/include/
cp -a cudnn-linux-x86_64-8.9.2.26_cuda12-archive/lib/* /usr/local/cuda/lib64/
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf tmp_cudnn

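The new CUDNN_VERSION variable only parameterizes the archive names the three installers fetch; an illustrative expansion for the cuda12 case:

    CUDNN_VERSION=9.1.0.70
    echo "cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz"
    # -> cudnn-linux-x86_64-9.1.0.70_cuda12-archive.tar.xz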
10 changes: 5 additions & 5 deletions common/install_cuda_aarch64.sh
@@ -14,7 +14,7 @@ function install_cusparselt_052 {
}

function install_124 {
echo "Installing CUDA 12.4 and cuDNN 8.9 and NCCL 2.20.5 and cuSparseLt-0.5.2"
echo "Installing CUDA 12.4 and cuDNN 9.1 and NCCL 2.20.5 and cuSparseLt-0.5.2"
rm -rf /usr/local/cuda-12.4 /usr/local/cuda
# install CUDA 12.4.0 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux_sbsa.run
@@ -25,10 +25,10 @@ function install_124 {

# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn && cd tmp_cudnn
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-8.9.2.26_cuda12-archive.tar.xz -O cudnn-linux-sbsa-8.9.2.26_cuda12-archive.tar.xz
tar xf cudnn-linux-sbsa-8.9.2.26_cuda12-archive.tar.xz
cp -a cudnn-linux-sbsa-8.9.2.26_cuda12-archive/include/* /usr/local/cuda/include/
cp -a cudnn-linux-sbsa-8.9.2.26_cuda12-archive/lib/* /usr/local/cuda/lib64/
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz -O cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz
tar xf cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz
cp -a cudnn-linux-sbsa-9.1.0.70_cuda12-archive/include/* /usr/local/cuda/include/
cp -a cudnn-linux-sbsa-9.1.0.70_cuda12-archive/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf tmp_cudnn

21 changes: 21 additions & 0 deletions common/install_openblas.sh
@@ -0,0 +1,21 @@
#!/bin/bash

set -ex

cd /
git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.25 --depth 1 --shallow-submodules


OPENBLAS_BUILD_FLAGS="
NUM_THREADS=128
USE_OPENMP=1
NO_SHARED=0
DYNAMIC_ARCH=1
TARGET=ARMV8
CFLAGS=-O3
"

OPENBLAS_CHECKOUT_DIR="OpenBLAS"

make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR}
make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR}
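This script replaces the build_OpenBLAS() helper removed from aarch64_wheel_ci_build.py above, so OpenBLAS is built once while the docker image is created rather than on every wheel build. A sketch of the hand-off (the docker-build invocation is assumed; the library path comes from the libs_to_copy list earlier in this diff):

    # At image-build time: compile OpenBLAS v0.3.25 and `make install` it to its default prefix.
    bash ./common/install_openblas.sh
    # At wheel-packaging time, aarch64_wheel_ci_build.py copies the cached library into the wheel:
    ls /opt/OpenBLAS/lib/libopenblas.so.0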
4 changes: 2 additions & 2 deletions conda/build_pytorch.sh
@@ -287,9 +287,9 @@ else
TRITON_VERSION=$(cat $pytorch_rootdir/.ci/docker/triton_version.txt)
if [[ -n "$OVERRIDE_PACKAGE_VERSION" && "$OVERRIDE_PACKAGE_VERSION" =~ .*dev.* ]]; then
TRITON_SHORTHASH=$(cut -c1-10 $pytorch_rootdir/.github/ci_commit_pins/triton.txt)
export CONDA_TRITON_CONSTRAINT=" - torchtriton==${TRITON_VERSION}+${TRITON_SHORTHASH} # [py < 312]"
export CONDA_TRITON_CONSTRAINT=" - torchtriton==${TRITON_VERSION}+${TRITON_SHORTHASH} # [py < 313]"
else
export CONDA_TRITON_CONSTRAINT=" - torchtriton==${TRITON_VERSION} # [py < 312]"
export CONDA_TRITON_CONSTRAINT=" - torchtriton==${TRITON_VERSION} # [py < 313]"
fi
fi

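The bump from `[py < 312]` to `[py < 313]` pairs with "Remove triton constraint for py312" above: the bracketed suffix is a conda-build line selector, so the torchtriton requirement is now emitted for Python 3.12 builds and only skipped for 3.13. A sketch of the line the recipe ends up with (version and hash are illustrative):

    TRITON_VERSION=3.0.0        # illustrative
    TRITON_SHORTHASH=0123456789 # illustrative
    echo " - torchtriton==${TRITON_VERSION}+${TRITON_SHORTHASH} # [py < 313]"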
6 changes: 3 additions & 3 deletions conda/pytorch-nightly/build.sh
@@ -59,12 +59,12 @@ if [[ -n "$build_with_cuda" ]]; then
if [[ $CUDA_VERSION == 11.8* ]]; then
TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST;3.7+PTX;9.0"
#for cuda 11.8 include all dynamic loading libraries
DEPS_LIST=(/usr/local/cuda/lib64/libcudnn*.so.8 /usr/local/cuda-11.8/extras/CUPTI/lib64/libcupti.so.11.8 /usr/local/cuda/lib64/libcusparseLt.so.0)
DEPS_LIST=(/usr/local/cuda/lib64/libcudnn*.so.9 /usr/local/cuda-11.8/extras/CUPTI/lib64/libcupti.so.11.8 /usr/local/cuda/lib64/libcusparseLt.so.0)
elif [[ $CUDA_VERSION == 12.1* || $CUDA_VERSION == 12.4* ]]; then
# cuda 12 does not support sm_3x
TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST;9.0"
# for cuda 12.1 (12.4) we use cudnn 8.8 (8.9) and include all dynamic loading libraries
DEPS_LIST=(/usr/local/cuda/lib64/libcudnn*.so.8 /usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12 /usr/local/cuda/lib64/libcusparseLt.so.0)
# for cuda 12.1 (12.4) we use cudnn 9.1 and include all dynamic loading libraries
DEPS_LIST=(/usr/local/cuda/lib64/libcudnn*.so.9 /usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12 /usr/local/cuda/lib64/libcusparseLt.so.0)
fi
if [[ -n "$OVERRIDE_TORCH_CUDA_ARCH_LIST" ]]; then
TORCH_CUDA_ARCH_LIST="$OVERRIDE_TORCH_CUDA_ARCH_LIST"
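The glob change from `libcudnn*.so.8` to `libcudnn*.so.9` matters because cuDNN 9 also renamed its component libraries (e.g. the `_adv_infer`/`_adv_train` pair collapses into `_adv`). A sketch of what the glob now matches, using the library names from the aarch64 libs_to_copy list above:

    ls /usr/local/cuda/lib64/libcudnn*.so.9
    # libcudnn.so.9  libcudnn_adv.so.9  libcudnn_cnn.so.9  libcudnn_graph.so.9
    # libcudnn_ops.so.9  libcudnn_heuristic.so.9
    # libcudnn_engines_precompiled.so.9  libcudnn_engines_runtime_compiled.so.9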
6 changes: 6 additions & 0 deletions libtorch/Dockerfile
@@ -81,6 +81,12 @@ RUN apt-get update -y && \
RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh

# Install AOTriton
COPY ./common/aotriton_version.txt aotriton_version.txt
COPY ./common/install_aotriton.sh install_aotriton.sh
RUN bash ./install_aotriton.sh /opt/rocm && rm install_aotriton.sh aotriton_version.txt
ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton

FROM ${BASE_TARGET} as final
COPY --from=openssl /opt/openssl /opt/openssl
# Install patchelf