diff --git a/.github/workflows/h100_benchmark.yml b/.github/workflows/h100_benchmark.yml
new file mode 100644
index 0000000000..864ca724f7
--- /dev/null
+++ b/.github/workflows/h100_benchmark.yml
@@ -0,0 +1,49 @@
+name: Run H100 Benchmarks
+
+on:
+  schedule:
+    - cron: '0 0 * * *'  # Runs at midnight UTC every day
+  # Manual trigger. NOTE: no label-based ("h100") trigger is configured here.
+  workflow_dispatch:
+
+# On main each run gets its own group (run_number), so runs never cancel each
+# other; on any other ref a newer run cancels the in-progress one.
+concurrency:
+  group: h100-benchmark-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+# NOTE(review): GitHub does not propagate workflow-level env to a called
+# reusable workflow (the job below is one) - confirm HF_TOKEN reaches the script.
+env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}
+
+jobs:
+  test:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: H100
+            runs-on: linux.aws.h100
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu121'
+            gpu-arch-type: "cuda"
+            gpu-arch-version: "12.1"
+
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      timeout: 60
+      runner: ${{ matrix.runs-on }}
+      gpu-arch-type: ${{ matrix.gpu-arch-type }}
+      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      script: |
+        conda create -n venv python=3.9 -y
+        conda activate venv
+        echo "::group::Install newer objcopy that supports --set-section-alignment"
+        yum install -y devtoolset-10-binutils
+        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+        echo "::endgroup::"
+        python -m pip install --upgrade pip
+        pip install ${{ matrix.torch-spec }}
+        pip install -r dev-requirements.txt
+        pip install .
+        pytest test/ --verbose -s