
tpu ci module refactor #7

Merged
286 commits, merged on Nov 16, 2023
Commits
65a43e4
Test commit
mbzomowski Sep 26, 2023
47025b3
Test commit
mbzomowski Sep 26, 2023
45ddc66
Test commit
mbzomowski Sep 26, 2023
a7cef9a
Test commit
mbzomowski Sep 26, 2023
8c07126
Add terraform config files to manage tpu-ci infrastructure
mbzomowski Sep 27, 2023
75441f6
Moved PAT to GH secrets
mbzomowski Sep 27, 2023
b352ecf
Testing flux
mbzomowski Sep 27, 2023
c6ded80
Merge pull request #1 from mbzomowski/test_branch
mbzomowski Sep 27, 2023
eb64c4d
Testing tf-controller
mbzomowski Sep 27, 2023
9da183e
Test tf-controller
mbzomowski Sep 27, 2023
957999a
Merge pull request #3 from mbzomowski/test_branch
mbzomowski Sep 27, 2023
cc868af
Test new config
mbzomowski Sep 27, 2023
e8820bd
Test reduction in nodes
mbzomowski Sep 27, 2023
bd992eb
Merge pull request #4 from mbzomowski/test_branch
mbzomowski Sep 27, 2023
bd5621a
Update tf-controller
mbzomowski Sep 28, 2023
a39cf6e
Remove branch planner from tf-controller
mbzomowski Sep 29, 2023
0ba0850
Merge pull request #5 from mbzomowski/test_branch
mbzomowski Sep 29, 2023
7b7fdaa
Fix kubernetes provider
mbzomowski Oct 3, 2023
a28a9d9
Merge pull request #6 from mbzomowski/test_branch
mbzomowski Oct 3, 2023
0498545
Fix kubernetes provider
mbzomowski Oct 3, 2023
921ba77
Add rolebinding and role for tf-runner user
mbzomowski Oct 3, 2023
fa8a7d6
Fix kubernetes provider
mbzomowski Oct 3, 2023
7183c4e
Fix kubernetes provider
mbzomowski Oct 3, 2023
f8f13f4
Fix kubernetes provider
mbzomowski Oct 3, 2023
80ba824
Add debug logging to tf-controller pods
mbzomowski Oct 3, 2023
4cdc8f2
Remove env var from tf-controller
mbzomowski Oct 3, 2023
d56cdcc
Fix kubernetes provider
mbzomowski Oct 3, 2023
b9029bb
Fix kubernetes provider
mbzomowski Oct 3, 2023
be51922
Fix kubernetes provider
mbzomowski Oct 3, 2023
067ac68
Fix kubernetes provider
mbzomowski Oct 3, 2023
a260990
Fix kubernetes provider
mbzomowski Oct 3, 2023
34226ea
Add workload identity for cluster
mbzomowski Oct 4, 2023
0491610
Remove rolebinding
mbzomowski Oct 4, 2023
2b2e666
Testing to see if gitops finally works
mbzomowski Oct 4, 2023
d5a553d
Fix service account name in runnerpod
mbzomowski Oct 4, 2023
4ecff99
Fix ksa in runnerpod
mbzomowski Oct 4, 2023
1a859ef
Fixed ksa name again
mbzomowski Oct 4, 2023
04bea37
Added role and rolebinding for ksa secret access
mbzomowski Oct 4, 2023
5e81fab
Please let this work
mbzomowski Oct 4, 2023
57e4271
Added namespace to role
mbzomowski Oct 4, 2023
d866794
Testing gitops
mbzomowski Oct 4, 2023
de5e3be
Add leases permissions for ksa
mbzomowski Oct 4, 2023
be42716
Add delete secrets permission for ksa
mbzomowski Oct 4, 2023
61f458f
Add refresh before apply for tf object
mbzomowski Oct 4, 2023
41308fa
Migrate tf state to gcs
mbzomowski Oct 4, 2023
8f30c08
Add backend config for tf object
mbzomowski Oct 4, 2023
1022fc2
Fix typo in backendconfig
mbzomowski Oct 4, 2023
3fd28b1
Remove ignore for secrets
mbzomowski Oct 4, 2023
b1dafe7
Add back secrets to ignore and set destroy to false in tf-object
mbzomowski Oct 5, 2023
fb2c826
Removed secret.tf file
mbzomowski Oct 5, 2023
8553f58
Set tf-controller pods to 2
mbzomowski Oct 5, 2023
9085244
Change workflow to setup pytorch/xla from most recent commit
mbzomowski Oct 5, 2023
e7be023
Change workflow
mbzomowski Oct 5, 2023
9218c49
Testing workflow
mbzomowski Oct 5, 2023
c16f075
Testing workflow
mbzomowski Oct 5, 2023
7f7cf4b
Testing workflow
mbzomowski Oct 5, 2023
c9b1221
Test workflow
mbzomowski Oct 5, 2023
184ec17
Test workflow
mbzomowski Oct 5, 2023
bf75e0d
Test workflow
mbzomowski Oct 5, 2023
721827c
Test workflow
mbzomowski Oct 5, 2023
d245af0
Test workflow
mbzomowski Oct 5, 2023
85e3969
Test workflow
mbzomowski Oct 5, 2023
33e379b
Test workflow
mbzomowski Oct 5, 2023
c2d8707
Test workflow
mbzomowski Oct 5, 2023
95a3d27
Test workflow
mbzomowski Oct 5, 2023
38998db
Test workflow
mbzomowski Oct 5, 2023
802975e
Test workflow
mbzomowski Oct 5, 2023
5ee1a55
Test workflow
mbzomowski Oct 5, 2023
190d311
Test workflow
mbzomowski Oct 5, 2023
0ab30ee
Test workflow
mbzomowski Oct 5, 2023
e53136d
Test workflow
mbzomowski Oct 5, 2023
4f35ede
Test workflow
mbzomowski Oct 5, 2023
21b2777
Test workflow; sleeping runner
mbzomowski Oct 6, 2023
a873caf
Test workflow
mbzomowski Oct 6, 2023
505755e
Test workflow
mbzomowski Oct 6, 2023
8ff9f72
Test workflow
mbzomowski Oct 6, 2023
c51ccbc
Test workflow
mbzomowski Oct 6, 2023
1370e7e
Test workflow
mbzomowski Oct 6, 2023
99a022a
Test workflow
mbzomowski Oct 6, 2023
3e7c78c
Test workflow
mbzomowski Oct 6, 2023
d92a1ae
Test workflow
mbzomowski Oct 6, 2023
5d606aa
Test workflow
mbzomowski Oct 6, 2023
62ebef3
Test workflow
mbzomowski Oct 6, 2023
7b4dad5
Test workflow
mbzomowski Oct 6, 2023
c279670
Test workflow
mbzomowski Oct 6, 2023
c87a8c1
Test workflow
mbzomowski Oct 6, 2023
35b34f5
Test workflow
mbzomowski Oct 6, 2023
c254705
Test workflow
mbzomowski Oct 6, 2023
fac0a7c
Test workflow
mbzomowski Oct 6, 2023
8c96311
Test workflow
mbzomowski Oct 6, 2023
470d600
Test workflow
mbzomowski Oct 6, 2023
1e93431
Test workflow
mbzomowski Oct 6, 2023
2f8d26f
Test workflow
mbzomowski Oct 6, 2023
97da938
Test workflow
mbzomowski Oct 6, 2023
b52d0b6
Test workflow
mbzomowski Oct 6, 2023
a7bffcb
Test workflow
mbzomowski Oct 6, 2023
dab54b3
Test workflow
mbzomowski Oct 6, 2023
e8828f4
Changed runner image to custom image
mbzomowski Oct 6, 2023
85b0c7b
Test workflow
mbzomowski Oct 6, 2023
eeb3f9f
Test workflow
mbzomowski Oct 6, 2023
8b12b18
Test workflow
mbzomowski Oct 6, 2023
c541585
Test workflow
mbzomowski Oct 6, 2023
07209a6
Test workflow
mbzomowski Oct 6, 2023
75c7959
Test workflow
mbzomowski Oct 6, 2023
dabef74
Test workflow
mbzomowski Oct 7, 2023
5daf390
Test workflow
mbzomowski Oct 7, 2023
0a52f7b
Test workflow
mbzomowski Oct 7, 2023
b2c85f1
Test workflow
mbzomowski Oct 7, 2023
ca08638
Test workflow
mbzomowski Oct 7, 2023
f50f9c6
Test workflow
mbzomowski Oct 7, 2023
e31ae23
Test workflow
mbzomowski Oct 9, 2023
deecc31
Test workflow
mbzomowski Oct 9, 2023
833be5c
Test workflow
mbzomowski Oct 9, 2023
ed7400c
Test workflow
mbzomowski Oct 9, 2023
85f5365
Merge remote-tracking branch 'upstream/master'
mbzomowski Oct 9, 2023
08c3ce5
Test workflow
mbzomowski Oct 9, 2023
60e489b
Test workflow
mbzomowski Oct 9, 2023
d09b702
Test workflow
mbzomowski Oct 9, 2023
7a9e901
Test workflow
mbzomowski Oct 9, 2023
b4b723d
Test workflow
mbzomowski Oct 9, 2023
ba20f0a
Test workflow
mbzomowski Oct 9, 2023
44dedbf
Test workflow
mbzomowski Oct 9, 2023
388d0cb
Test workflow
mbzomowski Oct 9, 2023
5b3e916
Test workflow
mbzomowski Oct 10, 2023
84adb2e
Test workflow
mbzomowski Oct 10, 2023
8681a4e
Test workflow
mbzomowski Oct 10, 2023
ae3ecfd
Test workflow
mbzomowski Oct 10, 2023
9fb105d
Test workflow
mbzomowski Oct 10, 2023
f9d41d8
Test workflow
mbzomowski Oct 10, 2023
4c43385
Test workflow
mbzomowski Oct 10, 2023
3623a13
Test workflow
mbzomowski Oct 10, 2023
4cd6e1e
Test workflow
mbzomowski Oct 10, 2023
fa8516b
Test workflow
mbzomowski Oct 10, 2023
0eb2da3
Test workflow
mbzomowski Oct 10, 2023
c707fb1
Test workflow
mbzomowski Oct 10, 2023
648f50b
Test workflow
mbzomowski Oct 11, 2023
16f2e91
Test workflow
mbzomowski Oct 11, 2023
962bfad
Test workflow
mbzomowski Oct 11, 2023
509bb8c
Test workflow
mbzomowski Oct 11, 2023
59beccf
Test workflow
mbzomowski Oct 11, 2023
df47486
Test workflow
mbzomowski Oct 11, 2023
79f221f
Test workflow
mbzomowski Oct 11, 2023
388d856
Test workflow
mbzomowski Oct 11, 2023
ec958dc
Test workflow
mbzomowski Oct 11, 2023
61daabb
Test workflow
mbzomowski Oct 11, 2023
6b848fb
Test workflow
mbzomowski Oct 11, 2023
6f38d32
Test workflow
mbzomowski Oct 11, 2023
3156fb2
Test workflow
mbzomowski Oct 11, 2023
0c0e4c8
Test workflow
mbzomowski Oct 11, 2023
d0a8b2f
Test workflow
mbzomowski Oct 11, 2023
73c8715
Test workflow
mbzomowski Oct 11, 2023
cc08c80
Test workflow
mbzomowski Oct 11, 2023
6c8aa54
Test workflow
mbzomowski Oct 11, 2023
87a6f8d
Test workflow
mbzomowski Oct 11, 2023
4a02373
Test workflow
mbzomowski Oct 11, 2023
800f515
Test workflow
mbzomowski Oct 11, 2023
b8a71df
Test workflow
mbzomowski Oct 11, 2023
4920b4d
Test workflow
mbzomowski Oct 11, 2023
2f4e096
Test workflow
mbzomowski Oct 11, 2023
fa19f00
Test workflow
mbzomowski Oct 11, 2023
db4aaea
Test workflow
mbzomowski Oct 11, 2023
9c37b4d
Test workflow
mbzomowski Oct 11, 2023
652c1ba
Cleanup files from test runs
mbzomowski Oct 12, 2023
127b311
Test
mbzomowski Oct 13, 2023
ae8e2de
Test
mbzomowski Oct 13, 2023
342d184
Testing workflow
mbzomowski Oct 16, 2023
0e457ec
Clean up unused TPU CI files
mbzomowski Oct 20, 2023
e03ae3a
Scale tf-controller down to 1 pod
mbzomowski Oct 20, 2023
693b6cb
Refactored TPU CI into an ARC module
mbzomowski Nov 14, 2023
cfec375
Add second workflow job and fix repo url error
mbzomowski Nov 14, 2023
338b1ce
Small change to TF config formatting
mbzomowski Nov 15, 2023
571b180
Testing the speed of TPU node autoscaling
mbzomowski Nov 15, 2023
8419234
Update artifacts.auto.tfvars for CUDA 12.1 (#5683)
ManfeiBai Oct 9, 2023
4d8850d
Add API to assemble CPU shards to a sharded tensor (#5681)
jonb377 Oct 9, 2023
fe337ad
Initial commit for CheckpointManager (#5678)
jonb377 Oct 10, 2023
5cfe6f1
Fix `masked_fill` broadcasting. (#5688)
ysiraichi Oct 10, 2023
9f4235a
Conditionally set default TPU settings in `__init__.py` (#5696)
will-cromar Oct 10, 2023
572daad
Disable xla backend for SPMD (#5690)
jonb377 Oct 11, 2023
734315c
Add support for unused params (#5694)
qihqi Oct 11, 2023
642e026
Open XLA pin update (#5675)
qihqi Oct 11, 2023
6ca267a
Update CI image with dev container image (#5290)
lsy323 Oct 12, 2023
80cda6c
Support synchronous saving and loading in CheckpointManager (#5693)
jonb377 Oct 13, 2023
6209e06
Reduce unnecessary tensor allocation in Adam and AdamW (#5700)
baoleai Oct 13, 2023
fb53d22
Support async checkpointing through CheckpointManager (#5697)
jonb377 Oct 13, 2023
2930e46
Add --net=host and --shm-size=16g flag to the docker run command in G…
vanbasten23 Oct 13, 2023
919b348
Don't set $TPU_LIBRARY_PATH during import (#5698)
will-cromar Oct 16, 2023
8ffd7bd
pass bundle object to make_tf_function (#5708)
haozha111 Oct 16, 2023
67d00ad
Filter tensor arguments from traced model. (#5689)
ysiraichi Oct 16, 2023
7c01f2a
Update Troubleshooting with some sanity check example (#5705)
JackCaoG Oct 17, 2023
fab3072
Add multi-host GPU support (#5657)
vanbasten23 Oct 19, 2023
ce55d10
Add support for `_unsafe_index`. (#5707)
ysiraichi Oct 19, 2023
1a8ba58
Run decomp before processing (#5713)
qihqi Oct 20, 2023
2ea21d8
fix typo in TROUBLESHOOTING (#5716)
JackCaoG Oct 20, 2023
108e6fc
Remove GetTensorsFused since we don't have op-by-op anymore (#5718)
JackCaoG Oct 21, 2023
ec92c8c
When exporting StableHLO to SavedModel, also include the original var…
haozha111 Oct 22, 2023
e49176e
Update OpenXLA pin to 20231022 (#5720)
alanwaketan Oct 23, 2023
014e73f
Make sure printing XLA tensor only execute the HLO once (#5721)
JackCaoG Oct 24, 2023
445e85d
add op lowering for _prelu_kernel_backward (#5724)
zpcore Oct 24, 2023
16418fd
Patch RNG for better memory utilization in dropout layers (#5710)
yeounoh Oct 24, 2023
f418091
Ensure dist runtime is in sync before shutting down. (#5714)
vanbasten23 Oct 24, 2023
9c8108c
Make the git version appears in pip (#5728)
alanwaketan Oct 25, 2023
e9ccdc2
Fix the missing parameter error when running mp_imagenet with torchru…
vanbasten23 Oct 25, 2023
c7c2001
Promote in convolution (#5727)
qihqi Oct 25, 2023
92932d6
Revert "Don't set $TPU_LIBRARY_PATH during import (#5698)" (#5731)
alanwaketan Oct 25, 2023
b752c66
Set --xla_latency_hiding_scheduler_rerun to 1 (#5736)
alanwaketan Oct 26, 2023
b254f8d
Merge `--pjrt_distributed` flag with `--ddp` flag. (#5732)
will-cromar Oct 26, 2023
51f2b58
Mangle root scope TF variable name during tf.saved_model export (#5738)
lsy323 Oct 26, 2023
34ed6d4
Add Python 3.9 build for 2.1 release (#5744)
will-cromar Oct 26, 2023
fcb6323
Add doc for multinode GPU training. (#5704)
vanbasten23 Oct 27, 2023
63b2a47
Add information about on-going DTensor API in spmd.md (#5735)
yeounoh Oct 27, 2023
fedacf9
Update set_default_tensor_type to set_default_dtype (#5734)
JackCaoG Oct 30, 2023
722c06a
Support SPMD through the xla:// init_method (#5706)
jonb377 Oct 30, 2023
0955634
Remove master IP discovery test for MP (#5748)
jonb377 Oct 30, 2023
1a215e5
Add tooling to explain why a graph execution happens (#5723)
JackCaoG Oct 31, 2023
80e7d87
Fix XLA tensor storage device by using `XlaDeviceToAtenDevice`. (#5743)
ysiraichi Oct 31, 2023
50fd007
Destroy the ComputationClient when the program exits (#5750)
will-cromar Oct 31, 2023
6a08f88
Correct the multinode training doc (#5747)
vanbasten23 Oct 31, 2023
1386459
Support PreemptionSyncManager in XlaCoordinator (#5733)
jonb377 Nov 1, 2023
6460448
Transfer data directly to the device (#5752)
will-cromar Nov 2, 2023
898b36e
Revert "Transfer data directly to the device (#5752)" (#5765)
will-cromar Nov 2, 2023
51fca39
update doc to use PJRT_DEVICE=CUDA instead of PJRT_DEVICE=GPU (#5754)
vanbasten23 Nov 4, 2023
abb34a3
Remove `_unsafe_index` implementation. (#5769)
ysiraichi Nov 6, 2023
7ccc70f
Include Dynamo execution in the execution cause analysis (#5758)
JackCaoG Nov 7, 2023
c17f1f6
fix squeeze op lowering issue when dim is not in sorted order (#5751)
zpcore Nov 7, 2023
254c283
Support autocheckpointing (#5753)
jonb377 Nov 8, 2023
93f849a
fixing neuron import (#5775)
aws-kingrj Nov 8, 2023
83aef4b
Disable flaky cpp test (#5779)
JackCaoG Nov 8, 2023
03d902a
Refactor type conversion functions out of `//torch_xla/csrc:tensor` (…
will-cromar Nov 9, 2023
74f263a
Disable C++ test correctly (#5783)
JackCaoG Nov 9, 2023
25535ce
Use default method in stablehlo txt/bytecode getter of StableHLOGraph…
lsy323 Nov 10, 2023
452665b
Implement mark_sharding as a custom op to support dynamo spmd activat…
wonjoolee95 Nov 11, 2023
8285dfc
Remove some unused code from `csrc/runtime` (#5785)
will-cromar Nov 13, 2023
897e3b5
Make the pjrt gpu allocator configurable (#5759)
anw90 Nov 13, 2023
b9a045f
disable test_set temporarily (#5792)
yeounoh Nov 13, 2023
1d3c5b9
[SPMD] support DTensor API integration (#5776)
yeounoh Nov 13, 2023
6a39d05
Transfer data directly to the device (#5772)
will-cromar Nov 13, 2023
61bf7d4
Lower AtenFull op (#5781)
danielvegamyhre Nov 14, 2023
0845e97
Add GKE support and various usability improvements in CheckpointManag…
jonb377 Nov 14, 2023
cbdb18f
Use TPU profiler plugin (#5793)
will-cromar Nov 14, 2023
e795565
Record the lazy tracing time(C++) in metrics (#5757)
JackCaoG Nov 14, 2023
d16f8c1
Don't set $TPU_LIBRARY_PATH during import
will-cromar Nov 14, 2023
7fe9f76
Enable passing down dynamic dimensions from torch to XLA (#5790)
lsy323 Nov 15, 2023
6c5abef
Add python 3.11 trigger for 2.1 release build (#5810)
JackCaoG Nov 15, 2023
6e107d6
Use upstream XLA concurrency utilities (#5799)
will-cromar Nov 15, 2023
aabeb25
[SPMD] fix bug with XLAShardedTensor.__repr__ (#5807)
yeounoh Nov 16, 2023
559c0c8
Clean up tpu ci files
mbzomowski Nov 16, 2023
0aa0f11
Move tpu ci files to infra/
mbzomowski Nov 16, 2023
a324f07
Change workflow trigger to workflow_dispatch and schedule
mbzomowski Nov 16, 2023
45ed387
Fix workflow
mbzomowski Nov 16, 2023
ec05372
Testing workflow manual trigger
mbzomowski Nov 16, 2023
5 changes: 4 additions & 1 deletion .circleci/build.sh
@@ -41,14 +41,17 @@ apply_patches

python -c "import fcntl; fcntl.fcntl(1, fcntl.F_SETFL, 0)"

# We always build PyTorch without CUDA support.
export USE_CUDA=0
python setup.py install

sccache --show-stats

source $XLA_DIR/xla_env
export GCLOUD_SERVICE_KEY_FILE="$XLA_DIR/default_credentials.json"
export SILO_NAME='cache-silo-ci-gcc-11' # cache bucket for CI
export SILO_NAME='cache-silo-ci-dev-3.8_cuda_12.1' # cache bucket for CI
export BUILD_CPP_TESTS='1'
export TF_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_70,sm_75,compute_80,$TF_CUDA_COMPUTE_CAPABILITIES"
build_torch_xla $XLA_DIR

popd
41 changes: 11 additions & 30 deletions .circleci/common.sh
@@ -92,27 +92,6 @@ function install_deps_pytorch_xla() {

sudo ln -s "$(command -v bazelisk)" /usr/bin/bazel

# Install gcc-11
sudo apt-get update
# Update ppa for GCC
sudo apt-get install -y software-properties-common
sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
sudo apt update -y
sudo apt install -y gcc-11
sudo apt install -y g++-11
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 100

export NVCC_PREPEND_FLAGS='-ccbin /usr/bin/g++-11'

# Hack similar to https://github.com/pytorch/pytorch/pull/105227/files#diff-9e59213240d3b55d2ddc53c8c096db9eece0665d64f46473454f9dc0c10fd804
sudo rm /opt/conda/lib/libstdc++.so*

# Update gcov for test coverage
sudo update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-11 100
sudo update-alternatives --install /usr/bin/gcov-dump gcov-dump /usr/bin/gcov-dump-11 100
sudo update-alternatives --install /usr/bin/gcov-tool gcov-tool /usr/bin/gcov-tool-11 100

# Symnlink the missing cuda headers if exists
CUBLAS_PATTERN="/usr/include/cublas*"
if ls $CUBLAS_PATTERN 1> /dev/null 2>&1; then
@@ -148,16 +127,18 @@ function run_torch_xla_python_tests() {
else
./test/run_tests.sh

# GPU tests
# CUDA tests
if [ -x "$(command -v nvidia-smi)" ]; then
# These tests fail on GPU with 03/30 TF-pin update (https://github.com/pytorch/xla/pull/4840)
# These tests fail on CUDA with 03/30 TF-pin update (https://github.com/pytorch/xla/pull/4840)
PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
# TODO(xiowei replace gpu with cuda): remove the test below with PJRT_DEVICE=GPU because PJRT_DEVICE=GPU is being deprecated.
PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --auto_wrap_policy type_based --use_small_fake_sample --num_epochs=1
XLA_DISABLE_FUNCTIONALIZATION=1 PJRT_DEVICE=GPU python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --auto_wrap_policy type_based --use_small_fake_sample --num_epochs=1
XLA_DISABLE_FUNCTIONALIZATION=1 PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
# Syncfree SGD optimizer tests
if [ -d ./torch_xla/amp/syncfree ]; then
echo "Running Syncfree Optimizer Test"
PJRT_DEVICE=GPU python test/test_syncfree_optimizers.py
PJRT_DEVICE=CUDA python test/test_syncfree_optimizers.py

# Following test scripts are mainly useful for
# performance evaluation & comparison among different
@@ -192,9 +173,9 @@ function run_torch_xla_cpp_tests() {
if [ "$USE_COVERAGE" != "0" ]; then
# TODO(yeounoh) shard the coverage testing
if [ -x "$(command -v nvidia-smi)" ]; then
PJRT_DEVICE=GPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
PJRT_DEVICE=CUDA test/cpp/run_tests.sh $EXTRA_ARGS -L""
cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/cov1.dat
PJRT_DEVICE=GPU test/cpp/run_tests.sh -X early_sync -F AtenXlaTensorTest.TestEarlySyncLiveTensors -L"" $EXTRA_ARGS
PJRT_DEVICE=CUDA test/cpp/run_tests.sh -X early_sync -F AtenXlaTensorTest.TestEarlySyncLiveTensors -L"" $EXTRA_ARGS
cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/cov2.dat
lcov --add-tracefile /tmp/cov1.dat -a /tmp/cov2.dat -o /tmp/merged.dat
else
else
# Shard GPU testing
if [ -x "$(command -v nvidia-smi)" ]; then
PJRT_DEVICE=GPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
PJRT_DEVICE=GPU test/cpp/run_tests.sh -X early_sync -F AtenXlaTensorTest.TestEarlySyncLiveTensors -L"" $EXTRA_ARGS
PJRT_DEVICE=CUDA test/cpp/run_tests.sh $EXTRA_ARGS -L""
PJRT_DEVICE=CUDA test/cpp/run_tests.sh -X early_sync -F AtenXlaTensorTest.TestEarlySyncLiveTensors -L"" $EXTRA_ARGS
else
PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
fi
59 changes: 28 additions & 31 deletions .circleci/docker/Dockerfile
@@ -1,13 +1,13 @@
# This requires cuda & cudnn packages pre-installed in the base image.
# Other available cuda images are listed at https://hub.docker.com/r/nvidia/cuda
ARG base_image="nvidia/cuda:11.7.0-cudnn8-devel-ubuntu18.04"
ARG base_image="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_12.1"
FROM "${base_image}"

ARG python_version="3.8"
ARG cuda="1"
ARG cuda_compute="5.2,7.5"
ARG cc="clang-8"
ARG cxx="clang++-8"
ARG cc="clang"
ARG cxx="clang++"
ARG cxx_abi="1"
ARG tpuvm=""

@@ -37,38 +37,15 @@ ENV CXX "${cxx}"
# Whether to build for TPUVM mode
ENV TPUVM_MODE "${tpuvm}"

# Rotate nvidia repo public key (last updated: 04/27/2022)
# Unfortunately, nvidia/cuda image is shipped with invalid public key
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub

# Install base system packages
RUN apt-get clean && apt-get update
RUN apt-get upgrade -y
RUN apt-get install --fix-missing -y python-pip python3-pip git curl libopenblas-dev vim jq \
apt-transport-https ca-certificates procps openssl sudo wget libssl-dev libc6-dbg

# Install clang & llvm
ADD ./install_llvm_clang.sh install_llvm_clang.sh
RUN bash ./install_llvm_clang.sh

# Install clang as upstream CI forces clang
RUN apt-get install -y clang
# Install valgrind
ADD ./install_valgrind.sh install_valgrind.sh
COPY ./install_valgrind.sh install_valgrind.sh
RUN bash ./install_valgrind.sh

# Sets up jenkins user.
RUN useradd jenkins && \
mkdir /home/jenkins && \
chown jenkins /home/jenkins
RUN echo 'jenkins ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers

RUN mkdir -p /opt/conda /opt/cargo /opt/rustup /workspace /var/lib/jenkins && \
chown jenkins /opt/conda /opt/cargo /opt/rustup /workspace /var/lib/jenkins
USER jenkins
WORKDIR /workspace

# Install openmpi for CUDA
run sudo apt-get install -y ssh
run sudo apt-get install -y --allow-downgrades --allow-change-held-packages openmpi-bin libopenmpi-dev
run apt-get install -y ssh
run apt-get install -y --allow-downgrades --allow-change-held-packages openmpi-bin libopenmpi-dev

# Builds and configure sccache
ENV OPENSSL_INCLUDE_DIR /usr/include/openssl

ENV PATH $CARGO_HOME/bin:$PATH

# Upstream CI requires jq
RUN apt-get install -y jq

# TODO: Add exec permisson for all users in base image.
RUN chmod a+x /usr/local/bin/bazel
# TODO: move sudo installation in base image.
RUN apt-get install -y sudo

RUN useradd jenkins && \
mkdir /home/jenkins && \
chown jenkins /home/jenkins
RUN echo 'jenkins ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers

RUN mkdir -p /opt/conda /opt/cargo /opt/rustup /workspace /var/lib/jenkins && \
chown jenkins /opt/conda /opt/cargo /opt/rustup /workspace /var/lib/jenkins
ENV PATH /home/jenkins/.local/bin:$PATH
USER jenkins
WORKDIR /workspace

# Installs and configures Conda.
ADD ./install_conda.sh install_conda.sh
RUN sudo chown jenkins ./install_conda.sh
RUN echo "conda activate base" >> ~/.bashrc
RUN echo "export TF_CPP_LOG_THREAD_ID=1" >> ~/.bashrc
ENV PATH /opt/conda/bin:$PATH
ENV LD_LIBRARY_PATH /lib/x86_64-linux-gnu/:/usr/lib/x86_64-linux-gnu/:/opt/conda/lib/:$LD_LIBRARY_PATH

RUN bash -c "source ~/.bashrc"
CMD ["bash"]
7 changes: 2 additions & 5 deletions .circleci/docker/install_conda.sh
@@ -4,7 +4,7 @@ set -ex

PYTHON_VERSION=$1
CONDA_PREFIX=$2
DEFAULT_PYTHON_VERSION=3.7
DEFAULT_PYTHON_VERSION=3.8


function install_and_setup_conda() {
conda update -y -n base conda
conda install -y python=$PYTHON_VERSION

conda install -y nomkl numpy=1.18.5 pyyaml setuptools cmake \
conda install -y nomkl numpy=1.18.5 pyyaml setuptools \
cffi typing tqdm coverage hypothesis dataclasses cython

/usr/bin/yes | pip install mkl==2022.2.1
/usr/bin/yes | pip install --upgrade numba
/usr/bin/yes | pip install cloud-tpu-client
/usr/bin/yes | pip install expecttest==0.1.3
/usr/bin/yes | pip install ninja # Install ninja to speedup the build
# Using Ninja requires CMake>=3.13, PyTorch requires CMake>=3.18
/usr/bin/yes | pip install "cmake>=3.18" --upgrade
/usr/bin/yes | pip install absl-py
# Additional PyTorch requirements
/usr/bin/yes | pip install scikit-image scipy==1.6.3
2 changes: 1 addition & 1 deletion .circleci/docker/install_valgrind.sh
100644 → 100755
@@ -9,7 +9,7 @@ tar -xjf valgrind-${VALGRIND_VERSION}.tar.bz2
cd valgrind-${VALGRIND_VERSION}
./configure --prefix=/usr/local
make -j6
sudo make install
make install
cd ../../
rm -rf valgrind_build
alias valgrind="/usr/local/bin/valgrind"
2 changes: 1 addition & 1 deletion .circleci/test.sh
@@ -26,5 +26,5 @@ function install_torchvision() {
install_torchvision

export GCLOUD_SERVICE_KEY_FILE="$XLA_DIR/default_credentials.json"
export SILO_NAME='cache-silo-ci-gcc-11' # cache bucket for CI
export SILO_NAME='cache-silo-ci-dev-3.8_cuda_12.1' # cache bucket for CI
run_torch_xla_tests $PYTORCH_DIR $XLA_DIR $USE_COVERAGE
10 changes: 4 additions & 6 deletions .github/workflows/_build.yml
@@ -73,12 +73,12 @@ jobs:
# if image layers are not present in the repo.
# Note: disable the following 2 lines while testing a new image, so we do not
# push to the upstream.
docker tag "${GCR_DOCKER_IMAGE}" "${ECR_DOCKER_IMAGE_BASE}:v1.0" >/dev/null
docker push "${ECR_DOCKER_IMAGE_BASE}:v1.0" >/dev/null
docker tag "${GCR_DOCKER_IMAGE}" "${ECR_DOCKER_IMAGE_BASE}:v1.1-lite" >/dev/null
docker push "${ECR_DOCKER_IMAGE_BASE}:v1.1-lite" >/dev/null
- name: Start the container
shell: bash
run: |
pid=$(docker run -t -d -w "$WORKDIR" "${GCR_DOCKER_IMAGE}")
pid=$(docker run --privileged -t -d -w "$WORKDIR" "${GCR_DOCKER_IMAGE}")
docker exec -u jenkins "${pid}" sudo chown -R jenkins "${WORKDIR}"
docker cp "${GITHUB_WORKSPACE}/." "$pid:$WORKDIR"
echo "pid=${pid}" >> "${GITHUB_ENV}"
shell: bash
run: |
echo "declare -x SCCACHE_BUCKET=${SCCACHE_BUCKET}" | docker exec -i "${pid}" sh -c "cat >> env"
echo "declare -x CC=clang-8 CXX=clang++-8" | docker exec -i "${pid}" sh -c "cat >> xla_env"
echo "declare -x DISABLE_XRT=${DISABLE_XRT}" | docker exec -i "${pid}" sh -c "cat >> xla_env"
echo "declare -x XLA_CUDA=${XLA_CUDA}" | docker exec -i "${pid}" sh -c "cat >> xla_env"
echo "declare -x BAZEL_REMOTE_CACHE=1" | docker exec -i "${pid}" sh -c "cat >> xla_env"
- name: Build
shell: bash
run: |
docker exec -u jenkins "${pid}" bash -c ". ~/.bashrc && .circleci/build.sh"

docker exec --privileged -u jenkins "${pid}" bash -c ".circleci/build.sh"
- name: Cleanup build env
shell: bash
run: |
2 changes: 1 addition & 1 deletion .github/workflows/_coverage.yml
@@ -94,7 +94,7 @@ jobs:
- name: Test
shell: bash
run: |
docker exec -u jenkins "${pid}" bash -c '. ~/.bashrc && .circleci/${{ inputs.test-script }}'
docker exec -u jenkins "${pid}" bash -c '.circleci/${{ inputs.test-script }}'
- name: Upload coverage results
if: ${{ inputs.collect-coverage }}
shell: bash
2 changes: 1 addition & 1 deletion .github/workflows/_docs.yml
@@ -43,7 +43,7 @@ jobs:
echo "pid=${pid}" >> "${GITHUB_ENV}"
- name: Build & publish docs
shell: bash
run: docker exec -u jenkins "${pid}" bash -c '. ~/.bashrc && .circleci/doc_push.sh'
run: docker exec -u jenkins "${pid}" bash -c '.circleci/doc_push.sh'
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()
2 changes: 1 addition & 1 deletion .github/workflows/_test.yml
@@ -116,7 +116,7 @@ jobs:
- name: Test
shell: bash
run: |
docker exec -u jenkins "${pid}" bash -c '. ~/.bashrc && .circleci/${{ inputs.test-script }}'
docker exec --privileged -u jenkins "${pid}" bash -c '.circleci/${{ inputs.test-script }}'
- name: Upload coverage results
if: ${{ inputs.collect-coverage }}
shell: bash
5 changes: 2 additions & 3 deletions .github/workflows/build_and_test.yml
@@ -19,8 +19,7 @@ jobs:
uses: ./.github/workflows/_build.yml
with:
ecr-docker-image-base: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base
gcr-docker-image: gcr.io/tpu-pytorch/xla_base:latest
disable_xrt: 1
gcr-docker-image: gcr.io/tpu-pytorch/xla_base:dev-3.8_cuda_12.1
cuda: 1
secrets:
gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
with:
docker-image: ${{ needs.build.outputs.docker-image }}
runner: linux.8xlarge.nvidia.gpu
timeout-minutes: 300
timeout-minutes: 180
disable-xrt: 1
secrets:
gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
4 changes: 2 additions & 2 deletions .github/workflows/build_and_test_xrt.yml
@@ -18,7 +18,7 @@ jobs:
uses: ./.github/workflows/_build.yml
with:
ecr-docker-image-base: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base
gcr-docker-image: gcr.io/tpu-pytorch/xla_base:latest
gcr-docker-image: gcr.io/tpu-pytorch/xla_base:dev-3.8_cuda_12.1
disable_xrt: 0
cuda: 1
secrets:
with:
docker-image: ${{ needs.build.outputs.docker-image }}
runner: linux.8xlarge.nvidia.gpu
timeout-minutes: 300
timeout-minutes: 180
disable-xrt: 0
secrets:
gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
27 changes: 27 additions & 0 deletions .github/workflows/tpu-ci.yml
@@ -0,0 +1,27 @@
name: TPU Test
run-name: CI Testing
on:
  workflow_dispatch:
  schedule:
    - cron: '0 16,20,0 * * 1-5'
jobs:
  tpu-test:
    runs-on: v4-runner-set
    steps:
      - run: |
          git clone --recursive https://github.com/pytorch/pytorch
          cd pytorch/
          python3 setup.py install --user
          git clone --recursive https://github.com/mbzomowski/xla.git
      - env:
          BAZEL_VERBOSE: 1
          BUNDLE_LIBTPU: 1
          TPUVM_MODE: 1
        run: |
          cd pytorch/xla
          python3 setup.py install --user
      - env:
          PJRT_DEVICE: TPU
        run: |
          cd pytorch/xla
          python3 -u test/test_operations.py -v
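The `schedule` trigger in this workflow uses cron syntax that GitHub Actions evaluates in UTC, so `'0 16,20,0 * * 1-5'` fires at 16:00, 20:00, and 00:00 UTC on weekdays. A small sketch converting those trigger hours to US Pacific time (the sample date and target timezone are illustrative assumptions, not part of the workflow):

```python
from datetime import datetime, timezone
from zoneinfo import ZoneInfo

# The cron '0 16,20,0 * * 1-5' fires at minute 0 of hours 16, 20, and 0 UTC, Mon-Fri.
pacific = ZoneInfo("America/Los_Angeles")
local_times = []
for hour in (16, 20, 0):
    # Use an arbitrary weekday in November 2023 (PST, UTC-8) as the sample date.
    utc = datetime(2023, 11, 16, hour, tzinfo=timezone.utc)
    local_times.append(utc.astimezone(pacific).strftime("%H:%M"))
print(local_times)  # ['08:00', '12:00', '16:00']
```

On a Pacific-standard-time date, the three runs land at 08:00, 12:00, and 16:00 local time.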
2 changes: 1 addition & 1 deletion .kokoro/Dockerfile
@@ -47,7 +47,7 @@ ARG SCCACHE="$(which sccache)"

WORKDIR /pytorch/xla
ARG GCLOUD_SERVICE_KEY_FILE="/pytorch/xla/default_credentials.json"
ARG SILO_NAME='cache-silo-ci-gcc-11' # cache bucket for CI
ARG SILO_NAME='cache-silo-ci-dev-3.8_cuda_12.1' # cache bucket for CI
RUN time pip install -e .

# Run tests
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -94,7 +94,7 @@ To run the tests, follow __one__ of the options below:
* Run on GPU:

```Shell
export PJRT_DEVICE=GPU GPU_NUM_DEVICES=${NUM_GPU}
export PJRT_DEVICE=CUDA GPU_NUM_DEVICES=${NUM_GPU}
```

For more detail on configuring the runtime, please refer to [this doc](https://github.com/pytorch/xla/blob/master/docs/pjrt.md#quickstart)
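Runtime selection in the snippet above is driven entirely by environment variables. The helper below is a hypothetical sketch of that lookup for illustration only; it is not part of the torch_xla API:

```python
import os

def pjrt_device(default: str = "CPU") -> str:
    # Hypothetical helper: torch_xla consults PJRT_DEVICE at import time;
    # this merely mirrors the environment-variable lookup.
    return os.environ.get("PJRT_DEVICE", default)

# Equivalent of: export PJRT_DEVICE=CUDA GPU_NUM_DEVICES=4
os.environ["PJRT_DEVICE"] = "CUDA"
os.environ["GPU_NUM_DEVICES"] = "4"
print(pjrt_device())  # CUDA
```

Unsetting `PJRT_DEVICE` makes the helper fall back to its default, mirroring how an unconfigured shell would behave.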
2 changes: 1 addition & 1 deletion README.md
@@ -111,7 +111,7 @@ If you're using `DistributedDataParallel`, make the following changes:
Additional information on PyTorch/XLA, including a description of its semantics
and functions, is available at [PyTorch.org](http://pytorch.org/xla/). See the
[API Guide](API_GUIDE.md) for best practices when writing networks that run on
XLA devices (TPU, GPU, CPU and...).
XLA devices (TPU, CUDA, CPU and...).

Our comprehensive user guides are available at:
