# Page residue, not part of the workflow definition:
#   "Modifying data_parallel_tutorial.py to enable multiple GPU support" (#1154)
#   GitHub notice: this file contains bidirectional Unicode text that may be interpreted or
#   compiled differently than it appears. To review, open the file in an editor that reveals
#   hidden Unicode characters. (Learn more about bidirectional Unicode characters.)
# Workflow header: display name, triggers, and run-concurrency policy.
name: Build tutorials

# Runs for every pull request and for pushes to the main branch.
on:
  pull_request:
  push:
    branches:
      - main

# One in-flight run per workflow and pull-request number (or commit SHA when the event has no
# pull request); a newer run in the same group cancels the one still in progress.
# NOTE(review): with the triggers visible here (pull_request and push only) the
# workflow_dispatch / schedule terms always render as "false" — presumably kept for parity
# with other workflows; confirm before simplifying.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true
jobs:
  # Sharded build: one job per matrix entry. Each job forwards its shard index and the shard
  # count to .jenkins/build.sh (run inside the container) as WORKER_ID / NUM_WORKERS.
  worker:
    name: pytorch_tutorial_build_worker
    strategy:
      matrix:
        include:
          - { shard: 1, num_shards: 15, runner: "linux.16xlarge.nvidia.gpu" }
          - { shard: 2, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" }
          - { shard: 3, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" }
          - { shard: 4, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" }
          - { shard: 5, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" }
          - { shard: 6, num_shards: 15, runner: "linux.g5.4xlarge.nvidia.gpu" }
          - { shard: 7, num_shards: 15, runner: "linux.4xlarge.nvidia.gpu" }
          - { shard: 8, num_shards: 15, runner: "linux.4xlarge.nvidia.gpu" }
          - { shard: 9, num_shards: 15, runner: "linux.4xlarge.nvidia.gpu" }
          - { shard: 10, num_shards: 15, runner: "linux.4xlarge.nvidia.gpu" }
          - { shard: 11, num_shards: 15, runner: "linux.4xlarge.nvidia.gpu" }
          - { shard: 12, num_shards: 15, runner: "linux.4xlarge.nvidia.gpu" }
          - { shard: 13, num_shards: 15, runner: "linux.4xlarge.nvidia.gpu" }
          - { shard: 14, num_shards: 15, runner: "linux.4xlarge.nvidia.gpu" }
          - { shard: 15, num_shards: 15, runner: "linux.4xlarge.nvidia.gpu" }
      # Let the remaining shards keep running when one shard fails.
      fail-fast: false
    runs-on: ${{ matrix.runner }}
    env:
      # Image repository only; the tag is computed in the "Calculate docker image" step.
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9"
      # NOTE(review): "9" sits next to an image named cuda12.1 — confirm what reads this value.
      CUDA_VERSION: "9"
    steps:
      - name: Setup SSH (Click me for login details)
        uses: pytorch/test-infra/.github/actions/setup-ssh@main
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
          instructions: |
            All testing is done inside the container, to start an interactive session run:
              docker exec -it $(docker container ps --format '{{.ID}}') bash

      - name: Checkout Tutorials
        uses: actions/checkout@v3

      - name: Setup Linux
        uses: pytorch/pytorch/.github/actions/setup-linux@main

      # Per the step name, this action is what provides GPU_FLAG for the Build step below.
      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        uses: pytorch/test-infra/.github/actions/setup-nvidia@main

      # Publishes "<DOCKER_IMAGE>:<tag>" as the step output `docker-image`, where <tag> is
      # whatever .jenkins/get_docker_tag.py prints.
      - name: Calculate docker image
        shell: bash
        id: docker-image
        run: |
          set -ex
          # for some reason, pip installs it in a different place than what is looked at in the py file
          pip3 install requests==2.26
          pyTorchDockerImageTag=$(python3 .jenkins/get_docker_tag.py)
          echo "docker-image=${DOCKER_IMAGE}:${pyTorchDockerImageTag}" >> "${GITHUB_OUTPUT}"

      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: ${{ steps.docker-image.outputs.docker-image }}

      # Starts a detached container from the computed image with the checkout mounted at
      # /var/lib/jenkins/workspace, deletes /opt/cache/bin/* as root inside it, then runs
      # .jenkins/build.sh in that container. The `-e NAME` flags forward this step's env
      # values into the container by name.
      # NOTE(review): --name="${container_name}" is expanded while container_name is still
      # being assigned by the same command substitution, so it is empty unless the variable
      # already exists in the environment; the captured value is whatever
      # `docker run --detach` prints. Confirm before relying on the container name.
      - name: Build
        shell: bash
        env:
          # Shadows the job-level DOCKER_IMAGE with the fully tagged image reference.
          DOCKER_IMAGE: ${{ steps.docker-image.outputs.docker-image }}
          NUM_WORKERS: ${{ matrix.num_shards }}
          WORKER_ID: ${{ matrix.shard }}
          COMMIT_ID: ${{ github.sha }}
          JOB_TYPE: worker
          COMMIT_SOURCE: ${{ github.ref }}
        run: |
          set -ex
          chmod +x ".jenkins/build.sh"
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e WORKER_ID \
            -e NUM_WORKERS \
            -e COMMIT_ID \
            -e JOB_TYPE \
            -e COMMIT_SOURCE \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --tty \
            --detach \
            --user jenkins \
            --shm-size=2gb \
            --name="${container_name}" \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          echo "rm /opt/cache/bin/*" | docker exec -u root -i "${container_name}" bash
          docker exec -t "${container_name}" sh -c ".jenkins/build.sh"

      # Always runs, including after a failed or cancelled build.
      - name: Teardown Linux
        uses: pytorch/test-infra/.github/actions/teardown-linux@main
        if: always()

  # Single follow-up job: starts only after the worker shards have completed successfully,
  # runs .jenkins/build.sh with JOB_TYPE=manager, and uploads a docs preview for pull requests.
  manager:
    name: pytorch_tutorial_build_manager
    needs: worker
    runs-on: [self-hosted, linux.2xlarge]
    # Binds the pytorchbot-env environment only for runs on refs/heads/main; any other ref
    # gets an empty environment name.
    environment: ${{ github.ref == 'refs/heads/main' && 'pytorchbot-env' || '' }}
    env:
      # Image repository only; the tag is computed in the "Calculate docker image" step.
      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9"
      # NOTE(review): "9" sits next to an image named cuda12.1 — confirm what reads this value.
      CUDA_VERSION: "9"
    steps:
      - name: Setup SSH (Click me for login details)
        uses: pytorch/test-infra/.github/actions/setup-ssh@main
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
          instructions: |
            All testing is done inside the container, to start an interactive session run:
              docker exec -it $(docker container ps --format '{{.ID}}') bash

      - name: Checkout Tutorials
        uses: actions/checkout@v3

      - name: Setup Linux
        uses: pytorch/pytorch/.github/actions/setup-linux@main

      # Publishes "<DOCKER_IMAGE>:<tag>" as the step output `docker-image`, where <tag> is
      # whatever .jenkins/get_docker_tag.py prints.
      - name: Calculate docker image
        shell: bash
        id: docker-image
        run: |
          set -ex
          # for some reason, pip installs it in a different place than what is looked at in the py file
          pip3 install requests==2.26
          pyTorchDockerImageTag=$(python3 .jenkins/get_docker_tag.py)
          echo "docker-image=${DOCKER_IMAGE}:${pyTorchDockerImageTag}" >> "${GITHUB_OUTPUT}"

      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: ${{ steps.docker-image.outputs.docker-image }}

      # Same container launch as the worker Build step, minus --shm-size and with
      # GITHUB_PYTORCHBOT_TOKEN forwarded into the container as well.
      # NOTE(review): no step visible in this job sets GPU_FLAG, so ${GPU_FLAG:-} expands to
      # nothing unless the runner environment provides it.
      # NOTE(review): --name="${container_name}" is expanded while container_name is still
      # being assigned by the same command substitution, so it is empty unless the variable
      # already exists in the environment; the captured value is whatever
      # `docker run --detach` prints. Confirm before relying on the container name.
      - name: Build
        shell: bash
        env:
          # Shadows the job-level DOCKER_IMAGE with the fully tagged image reference.
          DOCKER_IMAGE: ${{ steps.docker-image.outputs.docker-image }}
          # Hard-coded; matches num_shards in the worker matrix and must be kept in sync by hand.
          NUM_WORKERS: 15
          # NOTE(review): this job defines no matrix, so matrix.shard presumably renders as an
          # empty string here — confirm build.sh ignores WORKER_ID when JOB_TYPE is manager.
          WORKER_ID: ${{ matrix.shard }}
          COMMIT_ID: ${{ github.sha }}
          JOB_TYPE: manager
          COMMIT_SOURCE: ${{ github.ref }}
          GITHUB_PYTORCHBOT_TOKEN: ${{ secrets.PYTORCHBOT_TOKEN }}
        run: |
          set -ex
          chmod +x ".jenkins/build.sh"
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e WORKER_ID \
            -e NUM_WORKERS \
            -e COMMIT_ID \
            -e JOB_TYPE \
            -e COMMIT_SOURCE \
            -e GITHUB_PYTORCHBOT_TOKEN \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --tty \
            --detach \
            --user jenkins \
            --name="${container_name}" \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          echo "rm /opt/cache/bin/*" | docker exec -u root -i "${container_name}" bash
          docker exec -t "${container_name}" sh -c ".jenkins/build.sh"

      # Pull-request runs only: upload the `docs` path to the doc-previews bucket under
      # pytorch/tutorials/<PR number>; the step errors out when no files are found.
      - name: Upload docs preview
        uses: seemethere/upload-artifact-s3@v5
        if: ${{ github.event_name == 'pull_request' }}
        with:
          retention-days: 14
          s3-bucket: doc-previews
          if-no-files-found: error
          path: docs
          s3-prefix: pytorch/tutorials/${{ github.event.pull_request.number }}

      # Always runs, including after a failed or cancelled build.
      - name: Teardown Linux
        uses: pytorch/test-infra/.github/actions/teardown-linux@main
        if: always()