# Patch summary: build the PyTorch/XLA CUDA plugin wheel in its own CI job.
#
# - .github/workflows/_build_plugin.yml (new): reusable workflow that runs the
#   Ansible playbook with stage=build_plugin inside the dev image and uploads
#   /dist/*.whl as the `cuda-plugin` artifact.
# - .github/workflows/build_and_test.yml: adds a `build-cuda-plugin` job that
#   calls the new reusable workflow.
# - infra/ansible/playbook.yaml: accepts stage=build_plugin; the build_srcs
#   role now runs only when stage == "build" and the new build_plugin role
#   only when stage == "build_plugin".
# - infra/ansible/roles/build_plugin (new): creates /dist, builds the plugin
#   wheel into it, then finds and installs the wheel. The plugin build task is
#   removed from the build_srcs role.
# - plugins/cuda: `version` becomes dynamic in pyproject.toml and is set in
#   setup.py to a date-stamped 2.4.0.dev version; numpy is added to the build
#   requirements.
#
# NOTE(review): the `docker-image` workflow output reads
#   jobs.build.outputs.docker-image, but the `build` job declares no outputs.
# NOTE(review): the `cuda` input is declared but never referenced in the
#   workflow.
# NOTE(review): in the build_plugin role only the build task is guarded by
#   `when: accelerator == "cuda"`; the find and install tasks always run.
# NOTE(review): leading whitespace on context lines was reconstructed from a
#   whitespace-collapsed copy of this patch -- confirm against the target tree
#   before applying.
diff --git a/.github/workflows/_build_plugin.yml b/.github/workflows/_build_plugin.yml
new file mode 100644
index 00000000000..d9937f91657
--- /dev/null
+++ b/.github/workflows/_build_plugin.yml
@@ -0,0 +1,57 @@
+name: build-cuda-plugin
+on:
+  workflow_call:
+    inputs:
+      dev-image:
+        required: true
+        type: string
+        description: Base image for builds
+      runner:
+        required: false
+        type: string
+        description: Runner type for the test
+        default: linux.12xlarge
+      cuda:
+        required: false
+        type: string
+        description: Whether to build XLA with CUDA
+        default: '1'
+
+    secrets:
+      gcloud-service-key:
+        required: true
+        description: Secret to access Bazel build cache
+
+    outputs:
+      docker-image:
+        value: ${{ jobs.build.outputs.docker-image }}
+        description: The docker image containing the built PyTorch.
+jobs:
+  build:
+    runs-on: ${{ inputs.runner }}
+    container:
+      image: ${{ inputs.dev-image }}
+    env:
+      GCLOUD_SERVICE_KEY: ${{ secrets.gcloud-service-key }}
+      GOOGLE_APPLICATION_CREDENTIALS: /tmp/default_credentials.json
+      BAZEL_JOBS: 16
+      BAZEL_REMOTE_CACHE: 1
+    steps:
+      - name: Setup gcloud
+        shell: bash
+        run: |
+          echo "${GCLOUD_SERVICE_KEY}" > "${GOOGLE_APPLICATION_CREDENTIALS}"
+      - name: Checkout repo
+        uses: actions/checkout@v4
+        with:
+          path: pytorch/xla
+      - name: Build
+        shell: bash
+        run: |
+          cd pytorch/xla/infra/ansible
+          ansible-playbook playbook.yaml -vvv -e "stage=build_plugin arch=amd64 accelerator=cuda src_root=${GITHUB_WORKSPACE}" --skip-tags=fetch_srcs,install_deps
+      - name: Upload wheel
+        uses: actions/upload-artifact@v4
+        with:
+          name: cuda-plugin
+          path: /dist/*.whl
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 41bca83b5cb..45e9fc86b51 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -28,6 +28,14 @@ jobs:
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
+  build-cuda-plugin:
+    name: "Build XLA CUDA plugin"
+    uses: ./.github/workflows/_build_plugin.yml
+    with:
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_12.1
+    secrets:
+      gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
+
   test-cpu:
     name: "CPU tests"
     uses: ./.github/workflows/_test.yml
diff --git a/infra/ansible/playbook.yaml b/infra/ansible/playbook.yaml
index 3484fdc72ce..524b2a8c70c 100644
--- a/infra/ansible/playbook.yaml
+++ b/infra/ansible/playbook.yaml
@@ -16,7 +16,7 @@
           "Pass the required variable with: --e \"{{ item.name }}=\""
       loop:
         - name: stage
-          pattern: ^(build|release)$
+          pattern: ^(build|build_plugin|release)$
         - name: arch
           pattern: ^(aarch64|amd64)$
         - name: accelerator
@@ -73,6 +73,7 @@
         src_root: "/src"
       tags: fetch_srcs
 
+    # TODO: better name now that there are two builds
     - role: build_srcs
       vars:
         src_root: "/src"
@@ -81,8 +82,20 @@
           combine(build_env[arch] | default({}, true)) |
           combine(build_env[accelerator] | default({}, true))
         }}"
+      when: stage == "build"
       tags: build_srcs
 
+    - role: build_plugin
+      vars:
+        src_root: "/src"
+        env_vars: "{{
+          build_env.common | default({}, true) |
+          combine(build_env[arch] | default({}, true)) |
+          combine(build_env[accelerator] | default({}, true))
+        }}"
+      when: stage == "build_plugin"
+      tags: build_plugin
+
     - role: configure_env
       vars:
         env_vars: "{{
diff --git a/infra/ansible/roles/build_plugin/tasks/main.yaml b/infra/ansible/roles/build_plugin/tasks/main.yaml
new file mode 100644
index 00000000000..8bc1f561d98
--- /dev/null
+++ b/infra/ansible/roles/build_plugin/tasks/main.yaml
@@ -0,0 +1,23 @@
+- name: Create /dist directory for exported wheels
+  ansible.builtin.file:
+    path: /dist
+    state: directory
+    mode: '0755'
+
+- name: Build PyTorch/XLA CUDA Plugin
+  ansible.builtin.command:
+    cmd: pip wheel -w /dist plugins/cuda -v
+    chdir: "{{ (src_root, 'pytorch/xla') | path_join }}"
+  environment: "{{ env_vars }}"
+  when: accelerator == "cuda"
+
+- name: Find plugin *.whl files in /dist
+  ansible.builtin.find:
+    path: "/dist"
+    pattern: "torch_xla_cuda_plugin*.whl"
+  register: plugin_wheels
+
+- name: Install plugin wheels
+  ansible.builtin.pip:
+    name: "{{ plugin_wheels.files | map(attribute='path') }}"
+    state: "forcereinstall"
diff --git a/infra/ansible/roles/build_srcs/tasks/main.yaml b/infra/ansible/roles/build_srcs/tasks/main.yaml
index bc708e2c680..d945f150d38 100644
--- a/infra/ansible/roles/build_srcs/tasks/main.yaml
+++ b/infra/ansible/roles/build_srcs/tasks/main.yaml
@@ -23,13 +23,6 @@
     chdir: "{{ (src_root, 'pytorch/xla') | path_join }}"
   environment: "{{ env_vars }}"
 
-- name: Build PyTorch/XLA CUDA Plugin
-  ansible.builtin.command:
-    cmd: pip wheel -w dist plugins/cuda -v
-    chdir: "{{ (src_root, 'pytorch/xla') | path_join }}"
-  environment: "{{ env_vars }}"
-  when: accelerator == "cuda"
-
 - name: Find XLA *.whl files in pytorch/xla/dist
   ansible.builtin.find:
     path: "{{ (src_root, 'pytorch/xla/dist') | path_join }}"
diff --git a/plugins/cuda/pyproject.toml b/plugins/cuda/pyproject.toml
index fd8bbf59f6c..d44a2ea3bd5 100644
--- a/plugins/cuda/pyproject.toml
+++ b/plugins/cuda/pyproject.toml
@@ -1,15 +1,15 @@
 [build-system]
-requires = ["setuptools"]
+requires = ["setuptools", "numpy"]
 build-backend = "setuptools.build_meta"
 
 [project]
 name = "torch_xla_cuda_plugin"
-version = "0.0.1"
 authors = [
     {name = "PyTorch/XLA Dev Team", email = "pytorch-xla@googlegroups.com"},
 ]
 description = "PyTorch/XLA CUDA Plugin"
 requires-python = ">=3.8"
+dynamic = ["version"]
 
 [tool.setuptools.package-data]
 torch_xla_cuda_plugin = ["lib/*.so"]
diff --git a/plugins/cuda/setup.py b/plugins/cuda/setup.py
index 4207d598ed2..6f155d99d00 100644
--- a/plugins/cuda/setup.py
+++ b/plugins/cuda/setup.py
@@ -1,3 +1,4 @@
+import datetime
 import os
 import sys
 
@@ -10,4 +11,7 @@
 build_util.bazel_build('@xla//xla/pjrt/c:pjrt_c_api_gpu_plugin.so',
                        'torch_xla_cuda_plugin/lib', ['--config=cuda'])
 
-setuptools.setup()
+setuptools.setup(
+    # TODO: Use a common version file
+    version=f'2.4.0.dev{datetime.date.today().strftime("%Y%m%d")}'
+)