From 45f06618d3c26aca2efdea421d117221503d8340 Mon Sep 17 00:00:00 2001 From: crusaderky Date: Thu, 15 Sep 2022 22:05:07 +0100 Subject: [PATCH 1/8] Repeat A/B tests --- .github/workflows/ab_tests.yml | 196 +++------ .github/workflows/tests.yml | 211 +++------ ....yaml.rename_me => AB_baseline.conda.yaml} | 6 +- ...k.yaml.rename_me => AB_baseline.dask.yaml} | 2 +- ...da.yaml.rename_me => AB_sample.conda.yaml} | 5 +- ...ask.yaml.rename_me => AB_sample.dask.yaml} | 0 AB_environments/README.md | 58 ++- AB_environments/config.yaml | 16 + ci/scripts/discover_ab_environments.py | 35 +- dashboard.py | 412 ++++++++++++------ 10 files changed, 499 insertions(+), 442 deletions(-) rename AB_environments/{AB_baseline.conda.yaml.rename_me => AB_baseline.conda.yaml} (87%) rename AB_environments/{AB_baseline.dask.yaml.rename_me => AB_baseline.dask.yaml} (68%) rename AB_environments/{AB_sample.conda.yaml.rename_me => AB_sample.conda.yaml} (68%) rename AB_environments/{AB_sample.dask.yaml.rename_me => AB_sample.dask.yaml} (100%) create mode 100644 AB_environments/config.yaml diff --git a/.github/workflows/ab_tests.yml b/.github/workflows/ab_tests.yml index 9865e0e08e..85261aa450 100644 --- a/.github/workflows/ab_tests.yml +++ b/.github/workflows/ab_tests.yml @@ -20,34 +20,44 @@ jobs: name: Discover A/B environments runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: fetch-depth: 0 - - uses: actions/setup-python@v4 + + - name: Install Python + uses: actions/setup-python@v4 with: python-version: '3.10' - - id: set-matrix + + - name: Install dependencies + run: pip install PyYaml + + - name: Generate dynamic matrix + id: set-matrix run: echo "::set-output name=matrix::$(python ci/scripts/discover_ab_environments.py)" + outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} + matrix: ${{ steps.set-matrix.outputs.matrix }} # Everything below this point runs iff there are files matching - # AB_environments/AB_*.conda.yaml - # 
AB_environments/AB_*.dask.yaml + # AB_environments/AB_*.{conda,dask}.yaml + # and AB_environments/config.yaml set repeat > 0 software: - name: Setup + name: Setup - ${{ matrix.runtime-version }} py${{ matrix.python-version }} runs-on: ubuntu-latest needs: discover_ab_envs - if: ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} + if: ${{ fromJson(needs.discover_ab_envs.outputs.matrix).runtime }} strategy: fail-fast: false matrix: python-version: ["3.9"] - runtime-version: ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} + runtime-version: ${{ fromJson(needs.discover_ab_envs.outputs.matrix).runtime }} steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: fetch-depth: 0 @@ -60,6 +70,13 @@ jobs: python-version: ${{ matrix.python-version }} environment-file: ci/environment.yml + - name: Create null hypothesis as a copy of baseline + if: matrix.runtime-version == 'AB_null_hypothesis' + run: | + cd AB_environments + cp AB_baseline.conda.yaml AB_null_hypothesis.conda.yaml + cp AB_baseline.dask.yaml AB_null_hypothesis.dask.yaml + - name: Build Coiled Software Environment env: DASK_COILED__TOKEN: ${{ secrets.COILED_BENCHMARK_BOT_TOKEN }} @@ -101,8 +118,8 @@ jobs: software_name.txt test_upstream.txt - runtime: - name: Runtime - ${{ matrix.os }}, Python ${{ matrix.python-version }}, Runtime ${{ matrix.runtime-version }} + tests: + name: A/B Tests - ${{ matrix.category }} ${{ matrix.runtime-version }} ${{ matrix.os }} py${{ matrix.python-version }} needs: [discover_ab_envs, software] runs-on: ${{ matrix.os }} timeout-minutes: 120 @@ -111,10 +128,13 @@ jobs: matrix: os: [ubuntu-latest] python-version: ["3.9"] - runtime-version: ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} + category: ${{ fromJson(needs.discover_ab_envs.outputs.matrix).category }} + runtime-version: ${{ fromJson(needs.discover_ab_envs.outputs.matrix).runtime }} + repeat: ${{ fromJson(needs.discover_ab_envs.outputs.matrix).repeat }} steps: - - uses: 
actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: fetch-depth: 0 @@ -128,7 +148,6 @@ jobs: environment-file: ci/environment.yml - name: Download software environment assets - if: matrix.runtime-version == 'latest' || startsWith(matrix.runtime-version, 'AB_') uses: actions/download-artifact@v3 with: name: software-environment-${{ matrix.runtime-version }}-py${{ matrix.python-version }} @@ -145,140 +164,32 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.RUNTIME_CI_BOT_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.RUNTIME_CI_BOT_AWS_SECRET_ACCESS_KEY }} COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - DB_NAME: runtime-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - BENCHMARK: true - run: bash ci/scripts/run_tests.sh tests/runtime - - - name: Upload benchmark results - uses: actions/upload-artifact@v3 - if: always() - with: - name: runtime-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - path: runtime-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - - benchmarks: - name: Benchmarks - ${{ matrix.os }}, Python ${{ matrix.python-version }}, Runtime ${{ matrix.runtime-version }} - needs: [discover_ab_envs, software] - runs-on: ${{ matrix.os }} - timeout-minutes: 120 - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - python-version: ["3.9"] - runtime-version: ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} - - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Set up environment - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - use-mamba: true - condarc-file: ci/condarc - python-version: ${{ matrix.python-version }} - environment-file: ci/environment.yml - - - name: Download software environment assets - uses: actions/download-artifact@v3 - with: - name: software-environment-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - - - name: Install 
coiled-runtime - env: - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - run: source ci/scripts/install_coiled_runtime.sh - - - name: Run benchmarking tests - id: benchmarking_tests - env: - DASK_COILED__TOKEN: ${{ secrets.COILED_BENCHMARK_BOT_TOKEN }} - AWS_ACCESS_KEY_ID: ${{ secrets.RUNTIME_CI_BOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.RUNTIME_CI_BOT_AWS_SECRET_ACCESS_KEY }} - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - DB_NAME: benchmark-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - BENCHMARK: true - run: bash ci/scripts/run_tests.sh tests/benchmarks - - - name: Upload benchmark results - uses: actions/upload-artifact@v3 - if: always() - with: - name: benchmark-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - path: benchmark-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - - stability: - name: Stability - ${{ matrix.os }}, Python ${{ matrix.python-version }}, Runtime ${{ matrix.runtime-version }} - needs: [discover_ab_envs, software] - runs-on: ${{ matrix.os }} - timeout-minutes: 120 - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - python-version: ["3.9"] - runtime-version: ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} - - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Set up environment - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - use-mamba: true - condarc-file: ci/condarc - python-version: ${{ matrix.python-version }} - environment-file: ci/environment.yml - - - name: Download software environment assets - if: matrix.runtime-version == 'latest' || startsWith(matrix.runtime-version, 'AB_') - uses: actions/download-artifact@v3 - with: - name: software-environment-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - - - name: Install coiled-runtime - env: - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - run: source 
ci/scripts/install_coiled_runtime.sh - - - name: Run stability tests - id: stability_tests - env: - DASK_COILED__TOKEN: ${{ secrets.COILED_BENCHMARK_BOT_TOKEN }} - AWS_ACCESS_KEY_ID: ${{ secrets.RUNTIME_CI_BOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.RUNTIME_CI_BOT_AWS_SECRET_ACCESS_KEY }} - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - DB_NAME: stability-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db + DB_NAME: ${{ matrix.category }}-${{ matrix.os }}-${{ matrix.runtime-version }}-${{ matrix.repeat }}-py${{ matrix.python-version }}.db BENCHMARK: true CLUSTER_DUMP: true - run: bash ci/scripts/run_tests.sh tests/stability + run: bash ci/scripts/run_tests.sh tests/${{ matrix.category }} - name: Upload benchmark results uses: actions/upload-artifact@v3 if: always() with: - name: stability-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - path: stability-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db + name: ${{ matrix.category }}-${{ matrix.os }}-${{ matrix.runtime-version }}-${{ matrix.repeat }}-py${{ matrix.python-version }} + path: ${{ matrix.category }}-${{ matrix.os }}-${{ matrix.runtime-version }}-${{ matrix.repeat }}-py${{ matrix.python-version }}.db cleanup: - needs: [discover_ab_envs, software, runtime, benchmarks, stability] - if: always() && ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} + needs: [discover_ab_envs, software, tests] + if: always() && ${{ fromJson(needs.discover_ab_envs.outputs.matrix).runtime }} name: Cleanup runs-on: ubuntu-latest strategy: fail-fast: false matrix: python-version: ["3.9"] - runtime-version: ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} + runtime-version: ${{ fromJson(needs.discover_ab_envs.outputs.matrix).runtime }} steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 - name: Set up Python uses: actions/setup-python@v4 @@ -302,9 +213,9 @@ jobs: coiled env 
delete $SOFTWARE_NAME process-results: - needs: [discover_ab_envs, runtime, benchmarks, stability] + needs: [discover_ab_envs, tests] name: Combine separate benchmark results - if: always() && ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} + if: always() && ${{ fromJson(needs.discover_ab_envs.outputs.matrix).runtime }} runs-on: ubuntu-latest concurrency: # Fairly strict concurrency rule to avoid stepping on benchmark db. @@ -312,14 +223,17 @@ jobs: group: process-benchmarks cancel-in-progress: false steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 - - uses: actions/setup-python@v4 + - name: Install Python + uses: actions/setup-python@v4 - name: Install dependencies run: pip install alembic - - uses: actions/download-artifact@v3 + - name: Download artifacts + uses: actions/download-artifact@v3 with: path: benchmarks @@ -337,15 +251,17 @@ jobs: static-site: needs: [discover_ab_envs, process-results] # Always generate the site, as this can be skipped even if an indirect dependency fails (like a test run) - if: always() && ${{ fromJson(needs.discover_ab_envs.outputs.matrix) }} + if: always() && ${{ fromJson(needs.discover_ab_envs.outputs.matrix).runtime }} name: Build static dashboards runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: fetch-depth: 0 - - uses: actions/download-artifact@v3 + - name: Download artifacts + uses: actions/download-artifact@v3 with: name: benchmark.db diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 307a615f1f..209b3f9e60 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -23,7 +23,7 @@ defaults: jobs: software: - name: Setup + name: Setup - py${{ matrix.python-version }} runs-on: ubuntu-latest strategy: fail-fast: false @@ -31,7 +31,8 @@ jobs: python-version: ["3.8", "3.9", "3.10"] steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: fetch-depth: 0 
@@ -109,20 +110,58 @@ jobs: test_upstream.txt ab_baseline.txt - runtime: - name: Runtime - ${{ matrix.os }}, Python ${{ matrix.python-version }}, Runtime ${{ matrix.runtime-version }} + tests: + name: Tests - ${{ matrix.category }} ${{ matrix.runtime-version }} ${{ matrix.os }} py${{ matrix.python-version }} needs: software runs-on: ${{ matrix.os }} timeout-minutes: 120 strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] + os: [ubuntu-latest] python-version: ["3.9"] - runtime-version: ["latest", "0.0.4", "0.1.0"] + category: [runtime, benchmarks, stability] + runtime-version: [latest, "0.0.4", "0.1.0"] + include: + # Run stability tests on Python 3.8 + - category: stability + python-version: "3.8" + runtime-version: latest + os: ubuntu-latest + - category: stability + python-version: "3.8" + runtime-version: "0.0.4" + os: ubuntu-latest + - category: stability + python-version: "3.8" + runtime-version: "0.1.0" + os: ubuntu-latest + # Run stability tests on Python 3.10 + - category: stability + python-version: "3.10" + runtime-version: latest + os: ubuntu-latest + - category: stability + python-version: "3.10" + runtime-version: "0.0.4" + os: ubuntu-latest + - category: stability + python-version: "3.10" + runtime-version: "0.1.0" + os: ubuntu-latest + # Run stability tests on Python Windows and MacOS (latest py39 only) + - category: stability + python-version: "3.9" + runtime-version: latest + os: windows-latest + - category: stability + python-version: "3.9" + runtime-version: latest + os: macos-latest steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: fetch-depth: 0 @@ -153,137 +192,20 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.RUNTIME_CI_BOT_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.RUNTIME_CI_BOT_AWS_SECRET_ACCESS_KEY }} COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - DB_NAME: runtime-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - BENCHMARK: true - run: bash 
ci/scripts/run_tests.sh tests/runtime - - - name: Upload benchmark results - uses: actions/upload-artifact@v3 - if: always() - with: - name: runtime-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - path: runtime-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - - benchmarks: - name: Benchmarks - ${{ matrix.os }}, Python ${{ matrix.python-version }}, Runtime ${{ matrix.runtime-version }} - needs: software - runs-on: ${{ matrix.os }} - timeout-minutes: 120 - strategy: - fail-fast: false - matrix: - os: ["ubuntu-latest"] - python-version: ["3.9"] - runtime-version: ["latest", "0.0.4", "0.1.0"] - - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Set up environment - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - use-mamba: true - condarc-file: ci/condarc - python-version: ${{ matrix.python-version }} - environment-file: ci/environment.yml - - - name: Download software environment assets - if: matrix.runtime-version == 'latest' - uses: actions/download-artifact@v3 - with: - name: software-environment-py${{ matrix.python-version }} - - - name: Install coiled-runtime - env: - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - run: source ci/scripts/install_coiled_runtime.sh - - - name: Run benchmarking tests - id: benchmarking_tests - env: - DASK_COILED__TOKEN: ${{ secrets.COILED_BENCHMARK_BOT_TOKEN }} - AWS_ACCESS_KEY_ID: ${{ secrets.RUNTIME_CI_BOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.RUNTIME_CI_BOT_AWS_SECRET_ACCESS_KEY }} - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - DB_NAME: benchmark-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - BENCHMARK: true - run: bash ci/scripts/run_tests.sh tests/benchmarks - - - name: Upload benchmark results - uses: actions/upload-artifact@v3 - if: always() - with: - name: benchmark-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ 
matrix.python-version }} - path: benchmark-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db - - stability: - name: Stability - ${{ matrix.os }}, Python ${{ matrix.python-version }}, Runtime ${{ matrix.runtime-version }} - needs: software - runs-on: ${{ matrix.os }} - timeout-minutes: 120 - strategy: - fail-fast: false - matrix: - os: ["ubuntu-latest"] - python-version: ["3.8", "3.9", "3.10"] - runtime-version: ["latest", "0.0.4", "0.1.0"] - include: - - python-version: "3.9" - runtime-version: "latest" - os: "windows-latest" - - python-version: "3.9" - runtime-version: "latest" - os: "macos-latest" - - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Set up environment - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - use-mamba: true - condarc-file: ci/condarc - python-version: ${{ matrix.python-version }} - environment-file: ci/environment.yml - - - name: Download software environment assets - if: matrix.runtime-version == 'latest' - uses: actions/download-artifact@v3 - with: - name: software-environment-py${{ matrix.python-version }} - - - name: Install coiled-runtime - env: - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - run: source ci/scripts/install_coiled_runtime.sh - - - name: Run stability tests - id: stability_tests - env: - DASK_COILED__TOKEN: ${{ secrets.COILED_BENCHMARK_BOT_TOKEN }} - AWS_ACCESS_KEY_ID: ${{ secrets.RUNTIME_CI_BOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.RUNTIME_CI_BOT_AWS_SECRET_ACCESS_KEY }} - COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} - DB_NAME: stability-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db + DB_NAME: ${{ matrix.category }}-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db BENCHMARK: true CLUSTER_DUMP: true - run: bash ci/scripts/run_tests.sh tests/stability + run: bash ci/scripts/run_tests.sh tests/${{ matrix.category }} - name: 
Upload benchmark results uses: actions/upload-artifact@v3 if: always() with: - name: stability-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }} - path: stability-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db + name: ${{ matrix.category }}-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }} + path: ${{ matrix.category }}-${{ matrix.os }}-${{ matrix.runtime-version }}-py${{ matrix.python-version }}.db cleanup: - needs: [software, runtime, benchmarks, stability] + needs: [software, tests] if: always() name: Cleanup runs-on: ubuntu-latest @@ -292,7 +214,8 @@ jobs: matrix: python-version: ["3.8", "3.9", "3.10"] steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 - name: Set up Python uses: actions/setup-python@v4 @@ -316,7 +239,7 @@ jobs: coiled env delete $SOFTWARE_NAME process-results: - needs: [runtime, benchmarks, stability] + needs: tests name: Combine separate benchmark results if: always() && github.repository == 'coiled/coiled-runtime' runs-on: ubuntu-latest @@ -326,14 +249,17 @@ jobs: group: process-benchmarks cancel-in-progress: false steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 - - uses: actions/setup-python@v4 + - name: Install Python + uses: actions/setup-python@v4 - name: Install dependencies run: pip install alembic - - uses: actions/download-artifact@v3 + - name: Download artifacts + uses: actions/download-artifact@v3 with: path: benchmarks @@ -374,7 +300,8 @@ jobs: name: Detect regressions runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: fetch-depth: 0 @@ -406,15 +333,12 @@ jobs: report: name: report - needs: [runtime, benchmarks, stability, regressions] + needs: [tests, regressions] if: | always() && github.event_name != 'pull_request' && github.repository == 'coiled/coiled-runtime' - && (needs.runtime.result == 'failure' || - 
needs.benchmarks.result == 'failure' || - needs.stability.result == 'failure' || - needs.regressions.result == 'failure') + && (needs.tests.result == 'failure' || needs.regressions.result == 'failure') runs-on: ubuntu-latest defaults: @@ -437,7 +361,6 @@ jobs: labels: ["ci-failure"], }) - static-site: needs: process-results # Always generate the site, as this can be skipped even if an indirect dependency fails (like a test run) @@ -445,11 +368,13 @@ jobs: name: Build static dashboards runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: fetch-depth: 0 - - uses: actions/download-artifact@v3 + - name: Download tests database + uses: actions/download-artifact@v3 with: name: benchmark.db diff --git a/AB_environments/AB_baseline.conda.yaml.rename_me b/AB_environments/AB_baseline.conda.yaml similarity index 87% rename from AB_environments/AB_baseline.conda.yaml.rename_me rename to AB_environments/AB_baseline.conda.yaml index 8485352382..2beac715c6 100644 --- a/AB_environments/AB_baseline.conda.yaml.rename_me +++ b/AB_environments/AB_baseline.conda.yaml @@ -1,5 +1,5 @@ # Special environment file for A/B testing, used as the baseline environment. -# Change contents as needed and remove the .rename_me suffix. +# Change contents, but do not rename. 
channels: - conda-forge dependencies: @@ -14,6 +14,6 @@ dependencies: # - You can point to your own git fork instead # For example, if you want to test a PR before it's merged into main, you should # change this to the dask/dask and/or dask/distributed git tip - - dask==2022.8.1 - - distributed=2022.8.1 + - dask==2022.9.0 + - distributed==2022.9.0 # - git+https://github.com/dask/distributed@dd81b424971e81616e1a52fa09ce4698a5002d41 diff --git a/AB_environments/AB_baseline.dask.yaml.rename_me b/AB_environments/AB_baseline.dask.yaml similarity index 68% rename from AB_environments/AB_baseline.dask.yaml.rename_me rename to AB_environments/AB_baseline.dask.yaml index 8c296301be..cd1d2e38d3 100644 --- a/AB_environments/AB_baseline.dask.yaml.rename_me +++ b/AB_environments/AB_baseline.dask.yaml @@ -1,3 +1,3 @@ # Special environment file for A/B testing, used as the baseline environment. -# Change contents as needed and remove the .rename_me suffix. +# Change contents, but do not rename. # Leave empty if you don't want to override anything. 
diff --git a/AB_environments/AB_sample.conda.yaml.rename_me b/AB_environments/AB_sample.conda.yaml similarity index 68% rename from AB_environments/AB_sample.conda.yaml.rename_me rename to AB_environments/AB_sample.conda.yaml index 87b6409f3f..46dbb07913 100644 --- a/AB_environments/AB_sample.conda.yaml.rename_me +++ b/AB_environments/AB_sample.conda.yaml @@ -10,5 +10,6 @@ dependencies: - python=3.9 - coiled-runtime=0.1.0 - pip: - - dask==2022.8.1 - - git+https://github.com/dask/distributed@dd81b424971e81616e1a52fa09ce4698a5002d41 + - dask==2022.9.0 + # - distributed==2022.9.0 + - git+https://github.com/dask/distributed@1fd07f03cacee6fde81d13282568a727bce789b9 diff --git a/AB_environments/AB_sample.dask.yaml.rename_me b/AB_environments/AB_sample.dask.yaml similarity index 100% rename from AB_environments/AB_sample.dask.yaml.rename_me rename to AB_environments/AB_sample.dask.yaml diff --git a/AB_environments/README.md b/AB_environments/README.md index ddc056c2a2..a647c7be85 100644 --- a/AB_environments/README.md +++ b/AB_environments/README.md @@ -34,8 +34,8 @@ dependencies: - python=3.9 - coiled-runtime=0.1.0 - pip: - - dask==2022.8.1 - - distributed=2022.8.1 + - dask==2022.9.0 + - distributed==2022.9.0 ``` In this example it's using `coiled-runtime` as a base, but it doesn't have to. If you do use `coiled-runtime` though, you must install any conflicting packages with pip; in the @@ -47,8 +47,8 @@ arbitrary forks, e.g. ```yaml - pip: - - dask==2022.8.1 - - git+https://github.com/yourname/distributed@dd81b424971e81616e1a52fa09ce4698a5002d41 + - dask==2022.9.0 + - git+https://github.com/yourname/distributed@1fd07f03cacee6fde81d13282568a727bce789b9 ``` The second file in each pair is a dask config file. If you don't want to change the config, you must create an empty file. 
@@ -66,8 +66,32 @@ If you create *any* files in `AB_environments/`, you *must* create the baseline - `AB_baseline.conda.yaml` - `AB_baseline.dask.yaml` -#### Complete example -We want to test the impact of disabling work stealing. We create 4 files: +### 4. Tweak configuration file +Open `AB_environments/config.yaml` and set the `repeat` setting to a number higher than 0. +This enables the A/B tests. +Setting a low number of repeated runs is faster and cheaper, but will result in higher +variance. + +`repeat` must remain set to 0 in the main branch, thus completely disabling +A/B tests, in order to avoid unnecessary runs. + +In the same file, you can also set the `test_null_hypothesis` flag to true to +automatically create a verbatim copy of AB_baseline and then compare the two in the A/B +tests. Set it to false to save some money if you are already confident that the 'repeat' +setting is high enough. + +Finally, the files offers a `categories` list. These are the subdirectories of `tests/` +which you wish to run. + +### 5. (optional) Tweak tests +Nothing prevents you from changing the tests themselves. + +For example, you may be interested in a single test, but you don't want to run its +whole category; all you need to do is open the test files and delete what you don't care +about. + +### Complete example +You want to test the impact of disabling work stealing. You'll create at least 4 files: - `AB_environments/AB_baseline.conda.yaml`: ```yaml @@ -77,8 +101,8 @@ dependencies: - python=3.9 - coiled-runtime=0.1.0 - pip: - - dask==2022.8.1 - - distributed=2022.8.1 + - dask==2022.9.0 + - distributed==2022.9.0 ``` - `AB_environments/AB_baseline.dask.yaml`: (empty file) - `AB_environments/AB_no_steal.conda.yaml`: (same as baseline) @@ -89,8 +113,18 @@ distributed: work-stealing: False ``` -### 4. Run CI -- `git push`. Note: we are *not* creating a PR. 
+- `AB_environments/config.yaml`: +```yaml +repeat: 5 +test_null_hypothesis: true +categories: + - runtime + - benchmarks + - stability +``` + +### 6. Run CI +- `git push`. Note: you should *not* open a Pull Request. - Open https://github.com/coiled/coiled-runtime/actions/workflows/ab_tests.yml and wait for the run to complete. - Open the run from the link above. In the Summary tab, scroll down and download the @@ -98,9 +132,11 @@ distributed: Note: artifacts will appear only after the run is complete. - Decompress `static-dashboard.zip` and open `index.html` in your browser. -### 5. Clean up + +### 7. Clean up Remember to delete the branch once you're done. + ### Troubleshooting #### Problem: diff --git a/AB_environments/config.yaml b/AB_environments/config.yaml new file mode 100644 index 0000000000..9c1e3f011c --- /dev/null +++ b/AB_environments/config.yaml @@ -0,0 +1,16 @@ +# Number of times to run each test suite. +# Lower values are faster and cheaper but will result in higher variance. +# This must remain set to 0 in the main branch, thus completely disabling +# A/B tests, in order to avoid unnecessary runs. +repeat: 0 + +# Set to true to automatically create a verbatim copy of AB_baseline and then compare +# the two in the A/B tests. Set to false to save some money if you are already confident +# that the 'repeat' setting is high enough. +test_null_hypothesis: true + +# Tests categories to run. These are subdirectories of tests/. 
+categories: + - runtime + - benchmarks + - stability diff --git a/ci/scripts/discover_ab_environments.py b/ci/scripts/discover_ab_environments.py index 60db39bf9a..f13610338d 100644 --- a/ci/scripts/discover_ab_environments.py +++ b/ci/scripts/discover_ab_environments.py @@ -1,22 +1,47 @@ +from __future__ import annotations + import glob import json import os.path +import yaml + -def main(): - envs = [] +def build_json() -> dict[str, list[int]]: + with open("AB_environments/config.yaml") as fh: + cfg = yaml.safe_load(fh) + if not isinstance(cfg.get("repeat"), int) or cfg["repeat"] < 0: + raise ValueError("AB_environments/config.yaml: missing key {repeat: N}") + if not cfg["repeat"]: + return {"repeat": [], "runtime": [], "category": []} + + runtimes = [] for conda_fname in sorted(glob.glob("AB_environments/AB_*.conda.yaml")): env_name = os.path.basename(conda_fname)[: -len(".conda.yaml")] dask_fname = f"AB_environments/{env_name}.dask.yaml" # Raise FileNotFoundError if missing open(dask_fname).close() - envs.append(env_name) + runtimes.append(env_name) + + if not runtimes: + return {"repeat": [], "runtime": [], "category": []} - if envs and "AB_baseline" not in envs: + if "AB_baseline" not in runtimes: # If any A/B environments are defined, AB_baseline is required raise FileNotFoundError("AB_environments/AB_baseline.conda.yaml") - print(json.dumps(envs)) + if cfg["test_null_hypothesis"]: + runtimes += ["AB_null_hypothesis"] + + return { + "repeat": list(range(1, cfg["repeat"] + 1)), + "runtime": runtimes, + "category": cfg["categories"], + } + + +def main() -> None: + print(json.dumps(build_json())) if __name__ == "__main__": diff --git a/dashboard.py b/dashboard.py index ff819dee91..5fcd3a7424 100644 --- a/dashboard.py +++ b/dashboard.py @@ -4,15 +4,19 @@ import glob import importlib import inspect +import operator import pathlib -from typing import Literal, NamedTuple +from collections.abc import Callable +from typing import Any, Literal, NamedTuple import 
altair +import numpy import pandas import panel import sqlalchemy from bokeh.resources import INLINE +altair.data_transformers.enable("default", max_rows=None) panel.extension("vega") @@ -53,121 +57,222 @@ def load_test_source() -> None: print(f"Discovered {len(source)} tests") -def align_to_baseline(df: pandas.DataFrame, baseline: str) -> pandas.DataFrame | None: - """Add columns +def calc_ab_confidence_intervals( + df: pandas.DataFrame, field_name: str, A: str, B: str +) -> pandas.DataFrame: + """Calculate p(B / A - 1) > x and p(B / A - 1) < -x for discrete x, where A and B + are runtimes, for all tests in df. + + Algorithm + --------- + https://towardsdatascience.com/a-practical-guide-to-a-b-tests-in-python-66666f5c3b02 + + Returns + ------- + DataFrame: + + fullname + Test name with category, e.g. bencharks/test_foo.py::test_123[1] + fullname_no_category + Test name without category, e.g. test_foo.py::test_123[1] + x + Confidence interval [-0.5, 0.5]. Note that element 0 will be repeated. 
+ xlabel + "<-{p*100}% | x < 0 + ">{p*100}% | x > 0 + p + p(B/A-1) < x | x < 0 + p(B/A-1) > x | x > 0 + color + 0 if p=1 and x < 0 + 0.5 if p=0 + 1 if p=1 and x > 0 + plus all shades in between + """ - - duration_baseline - - average_memory_baseline - - peak_memory_baseline - - duration_delta (A/B - 1) - - average_memory_delta (A/B - 1) - - peak_memory_delta (A/B - 1) + def bootstrap_mean(df_i: pandas.DataFrame) -> pandas.DataFrame: + boot = df_i[field_name].sample(frac=10_000, replace=True).to_frame() + boot["i"] = pandas.RangeIndex(boot.shape[0]) // df_i.shape[0] + out = boot.groupby("i").mean().reset_index()[[field_name]] + assert out.shape == (10_000, 1) + out.index.name = "bootstrap_run" + return out + + # DataFrame with 20,000 rows per test exactly, with columns + # [fullname, fullname_no_category, runtime, bootstrap_run, {field_name}] + bootstrapped = ( + df.groupby(["fullname", "fullname_no_category", "runtime"]) + .apply(bootstrap_mean) + .reset_index() + ) - Baseline values are from the matching rows given the same test name and the baseline - runtime. Note that this means that df is expected to have exactly 1 test in the - baseline runtime for each test in every other runtime. - """ - df_baseline = df[df["runtime"] == baseline] - - if df_baseline.empty: - # Typically a misspelling. However, this can legitimately happen in CI if all - # three jobs of the baseline runtime failed early. 
- print( - f"Baseline runtime {baseline!r} not found; valid choices are:", - ", ".join(df["runtime"].unique()), + # DataFrame with 10,000 rows per test exactly, with columns + # [fullname, fullname_no_category, bootstrap_run, {A}, {B}, diff] + pivot = bootstrapped.pivot( + ["fullname", "fullname_no_category", "bootstrap_run"], + "runtime", + field_name, + ).reset_index() + pivot["diff"] = pivot[B] / pivot[A] - 1 + + def confidence( + df_i: pandas.DataFrame, + x: numpy.ndarray, + op: Literal["<", ">"], + cmp: Callable[[Any, Any], bool], + color_factor: float, + ) -> pandas.DataFrame: + xlabel = [f"{op}{xi * 100:.0f}%" for xi in x] + p = (cmp(df_i["diff"].values.reshape([-1, 1]), x)).sum(axis=0) / df_i.shape[0] + color = color_factor * p / 2 + 0.5 + return pandas.DataFrame({"x": x, "xlabel": xlabel, "p": p, "color": color}) + + pivot_groups = pivot.groupby(["fullname", "fullname_no_category"])[["diff"]] + x_neg = numpy.linspace(-0.8, 0, 17) + x_pos = numpy.linspace(0, 0.8, 17) + conf_neg, conf_pos = [ + # DataFrame with 1 row per element of x_neg/x_pos and columns + # [fullname, fullname_no_category, x, xlabel, p, color] + ( + pivot_groups.apply(confidence, p, op, cmp, color_factor) + .reset_index() + .drop("level_2", axis=1) ) - return None - - baseline_names = df_baseline["fullname"].unique() - all_names = df["fullname"].unique() - - assert len(baseline_names) == df_baseline.shape[0] - if len(baseline_names) < len(all_names): - # This will happen in CI if one or two out of three jobs of the baseline failed. - # Note that df contains the latest run only. It means that tests on all runtimes - # (including historical ones) should be from the coiled-runtime git tip, so - # adding or removing tests should not cause a mismatch. 
- print( - f"Baseline runtime {baseline!r} is missing some tests:", - ", ".join(set(all_names) - set(baseline_names)), + for (p, op, cmp, color_factor) in ( + (x_neg, "<", operator.lt, -1), + (x_pos, ">", operator.gt, 1), ) - return None - - columns = [spec.field_name for spec in SPECS] - df_baseline = ( - df_baseline.set_index("fullname") - .loc[df["fullname"], columns] - .rename(columns={k: k + "_baseline" for k in columns}) - ) - df_baseline.index = df.index - df = pandas.concat([df, df_baseline], axis=1) - for column in columns: - df[column + "_delta"] = (df[column] / df[column + "_baseline"] - 1) * 100 - return df + ] + return pandas.concat([conf_neg, conf_pos], axis=0) def make_barchart( df: pandas.DataFrame, spec: ChartSpec, title: str, - baseline: str | None, -) -> altair.Chart | None: +) -> tuple[altair.Chart | None, int]: """Make a single Altair barchart for a given test or runtime""" df = df.dropna(subset=[spec.field_name, "start"]) if not len(df): # Some tests do not have average_memory or peak_memory measures, only runtime - return None + return None, 0 - fields = [ - spec.field_name, - "fullname", - "fullname_no_category", - "dask_version", - "distributed_version", - "runtime", + df = df[ + [ + spec.field_name, + "fullname", + "fullname_no_category", + "dask_version", + "distributed_version", + "runtime", + ] ] - height = max(df.shape[0] * 20 + 50, 90) tooltip = [ altair.Tooltip("fullname:N", title="Test"), + altair.Tooltip("runtime:N", title="Runtime"), altair.Tooltip("dask_version:N", title="Dask"), altair.Tooltip("distributed_version:N", title="Distributed"), - altair.Tooltip(f"{spec.field_name}:Q", title=f"{spec.field_desc} {spec.unit}"), + altair.Tooltip(f"count({spec.field_name}):N", title="Number of runs"), + altair.Tooltip(f"stdev({spec.field_name}):Q", title=f"std dev {spec.unit}"), + altair.Tooltip(f"min({spec.field_name}):Q", title=f"min {spec.unit}"), + altair.Tooltip(f"median({spec.field_name}):Q", title=f"median {spec.unit}"), + 
altair.Tooltip(f"mean({spec.field_name}):Q", title=f"mean {spec.unit}"), + altair.Tooltip(f"max({spec.field_name}):Q", title=f"max {spec.unit}"), ] by_test = len(df["fullname"].unique()) == 1 if by_test: df = df.sort_values("runtime", key=runtime_sort_key_pd) y = altair.Y("runtime", title="Runtime", sort=None) + n_bars = df["runtime"].unique().size else: y = altair.Y("fullname_no_category", title="Test name") + n_bars = df["fullname_no_category"].unique().size - if baseline: - fields += [ - f"{spec.field_name}_delta", - f"{spec.field_name}_baseline", - ] - x = altair.X( - f"{spec.field_name}_delta", - title=f"{spec.field_desc} (delta % from {baseline})", - ) - tooltip += [ - altair.Tooltip( - f"{spec.field_name}_baseline:Q", title=f"{baseline} {spec.unit}" + height = max(n_bars * 20 + 50, 90) + + bars = ( + altair.Chart(width=800, height=height) + .mark_bar() + .encode( + x=altair.X( + f"median({spec.field_name}):Q", title=f"{spec.field_desc} {spec.unit}" ), - altair.Tooltip(f"{spec.field_name}_delta:Q", title="Delta %"), + y=y, + tooltip=tooltip, + ) + ) + ticks = ( + altair.Chart() + .mark_tick(color="black") + .encode(x=f"mean({spec.field_name})", y=y) + ) + error_bars = ( + altair.Chart().mark_errorbar(extent="stdev").encode(x=spec.field_name, y=y) + ) + chart = ( + altair.layer(bars, ticks, error_bars, data=df) + .properties(title=title) + .configure(autosize="fit") + ) + + return chart, height + + +def make_ab_confidence_map( + df: pandas.DataFrame, + spec: ChartSpec, + title: str, + baseline: str, +) -> tuple[altair.Chart | None, int]: + """Make a single Altair heatmap of p(B/A - 1) confidence intervals, where B is the + examined runtime and A is the baseline, for all tests for a given measure. 
+ """ + df = df.dropna(subset=[spec.field_name, "start"]) + if not len(df): + # Some tests do not have average_memory or peak_memory measures, only runtime + return None, 0 + + df = df[ + [ + spec.field_name, + "fullname", + "fullname_no_category", + "runtime", ] - else: - x = altair.X(spec.field_name, title=f"{spec.field_desc} {spec.unit}") + ] + runtimes = df["runtime"].unique() + A = baseline + B = next(r for r in runtimes if r != baseline) + conf = calc_ab_confidence_intervals(df, spec.field_name, A, B) - return ( - altair.Chart(df[fields], width=800, height=height) - .mark_bar() - .encode(x=x, y=y, tooltip=tooltip) + n_bars = df["fullname_no_category"].unique().size + height = max(n_bars * 20 + 50, 90) + + chart = ( + altair.Chart(conf, width=800, height=height) + .mark_rect() + .encode( + x=altair.X("xlabel:O", title="confidence threshold (B/A - 1)", sort=None), + y=altair.Y("fullname_no_category:O", title="Test"), + color=altair.Color( + "color:Q", + scale=altair.Scale(scheme="redblue", domain=[0, 1], reverse=True), + legend=None, + ), + tooltip=[ + altair.Tooltip("fullname:O", title="Test Name"), + altair.Tooltip("xlabel:O", title="Confidence threshold"), + altair.Tooltip("p:Q", format=".2p", title="p(B/A-1) exceeds threshold"), + ], + ) .properties(title=title) .configure(autosize="fit") ) + return chart, height + def make_timeseries( df: pandas.DataFrame, spec: ChartSpec, title: str @@ -229,7 +334,7 @@ def make_timeseries( def make_test_report( df: pandas.DataFrame, - kind: Literal["barchart" | "timeseries"], + kind: Literal["barchart" | "timeseries" | "A/B"], title: str, sourcename: str | None = None, baseline: str | None = None, @@ -240,17 +345,19 @@ def make_test_report( if kind == "timeseries": assert not baseline chart = make_timeseries(df, spec, title) + height = 384 + elif kind == "barchart": + assert not baseline + chart, height = make_barchart(df, spec, title) + elif kind == "A/B": + assert baseline + chart, height = make_ab_confidence_map(df, 
spec, title, baseline=baseline) else: - chart = make_barchart(df, spec, title, baseline) + raise ValueError(kind) # pragma: nocover if not chart: continue tabs.append((spec.field_desc, chart)) - if kind == "timeseries": - height = 384 - else: - height = max(df.shape[0] * 20 + 50, 90) - if sourcename in source: code = panel.pane.Markdown( f"```python\n{source[sourcename]}\n```", @@ -281,10 +388,8 @@ def make_timeseries_html_report( categories = sorted(df[df.runtime == runtime].category.unique()) tabs = [] for category in categories: - df_by_test = ( - df[(df.runtime == runtime) & (df.category == category)] - .sort_values("sourcename") - .groupby("sourcename") + df_by_test = df[(df.runtime == runtime) & (df.category == category)].groupby( + "sourcename" ) panes = [ make_test_report( @@ -302,29 +407,22 @@ def make_timeseries_html_report( doc.save(out_fname, title=runtime, resources=INLINE) -def make_ab_html_report( +def make_barchart_html_report( df: pandas.DataFrame, output_dir: pathlib.Path, by_test: bool, - baseline: str | None, ) -> None: - """Generate HTML report for the latest CI run, comparing all runtimes (e.g. - coiled-upstream-py3.9) against a baseline runtime + """Generate HTML report containing bar charts showing statistical information + (mean, median, etc). Create one tab for each test category (e.g. benchmarks, runtime, stability), one graph for each runtime and one bar for each test OR one graph for each test and one bar for each runtime, and one graph tab for each measure (wall clock, average memory, peak memory). - - If a baseline runtime is defined, all measures are expressed relative to the - baseline; otherwise they're expressed in absolute terms. 
""" out_fname = str( output_dir.joinpath( - "AB_by_" - + ("test" if by_test else "runtime") - + (f"_vs_{baseline}" if baseline else "") - + ".html" + "barcharts_by_" + ("test" if by_test else "runtime") + ".html" ) ) print(f"Generating {out_fname}") @@ -333,36 +431,25 @@ def make_ab_html_report( tabs = [] for category in categories: if by_test: - df_by_test = ( - df[df.category == category] - .sort_values(["sourcename", "fullname"]) - .groupby(["sourcename", "fullname"]) - ) + df_by_test = df[df.category == category].groupby(["sourcename", "fullname"]) panes = [ make_test_report( df_by_test.get_group((sourcename, fullname)), kind="barchart", title=fullname, sourcename=sourcename, - baseline=baseline, ) for sourcename, fullname in df_by_test.groups ] else: - df_by_runtime = ( - df[df.category == category] - .sort_values("runtime", key=runtime_sort_key_pd) - .groupby("runtime") - ) + df_by_runtime = df[df.category == category].groupby("runtime") panes = [ make_test_report( df_by_runtime.get_group(runtime), kind="barchart", title=runtime, - baseline=baseline, ) for runtime in sorted(df_by_runtime.groups, key=runtime_sort_key) - if runtime != baseline ] flex = panel.FlexBox(*panes, align_items="start", justify_content="start") tabs.append((category.title(), flex)) @@ -370,11 +457,69 @@ def make_ab_html_report( doc.save( out_fname, - title="A/B by " - + ("test" if by_test else "runtime") - + (f" vs. {baseline}" if baseline else ""), + title="Bar charts by " + ("test" if by_test else "runtime"), + resources=INLINE, + ) + + +def make_ab_html_report( + df: pandas.DataFrame, + output_dir: pathlib.Path, + baseline: str, +) -> bool: + """Generate HTML report containing heat maps for confidence intervals relative to + a baseline runtime, e.g. p(B/A-1) > 10% + + Create one tab for each test category (e.g. benchmarks, runtime, stability), one + graph for each runtime, and one graph tab for each measure (wall clock, average + memory, peak memory). 
+ + Returns + ------- + True if the report was generated; False otherwise + """ + out_fname = str(output_dir.joinpath(f"AB_vs_{baseline}.html")) + print(f"Generating {out_fname}") + + categories = sorted(df.category.unique()) + tabs = [] + for category in categories: + df_by_runtime = df[df.category == category].groupby("runtime") + if baseline not in df_by_runtime.groups: + # Typically a misspelling. However, this can legitimately happen in CI if + # all three jobs of the baseline runtime failed early. + print( + f"Baseline runtime {baseline!r} not found; valid choices are:", + ", ".join(df["runtime"].unique()), + ) + return False + + panes = [ + make_test_report( + pandas.concat( + [ + df_by_runtime.get_group(runtime), + df_by_runtime.get_group(baseline), + ], + axis=0, + ), + kind="A/B", + title=runtime, + baseline=baseline, + ) + for runtime in sorted(df_by_runtime.groups, key=runtime_sort_key) + if runtime != baseline + ] + flex = panel.FlexBox(*panes, align_items="start", justify_content="start") + tabs.append((category.title(), flex)) + doc = panel.Tabs(*tabs, margin=12) + + doc.save( + out_fname, + title="A/B confidence intervals vs. " + baseline, resources=INLINE, ) + return True def make_index_html_report( @@ -385,12 +530,12 @@ def make_index_html_report( index_txt += "### Historical timeseries\n" for runtime in runtimes: index_txt += f"- [{runtime}](./{runtime}.html)\n" - index_txt += "\n\n### A/B tests\n" - index_txt += "- [by test](./AB_by_test.html)\n" - index_txt += "- [by runtime](./AB_by_runtime.html)\n" + index_txt += "\n\n### Statistical analysis\n" + index_txt += "- [Bar charts, by test](./barcharts_by_test.html)\n" + index_txt += "- [Bar charts, by runtime](./barcharts_by_runtime.html)\n" for baseline in baselines: index_txt += ( - f"- [by runtime vs. {baseline}](./AB_by_runtime_vs_{baseline}.html)\n" + f"- [A/B confidence intervals vs. 
{baseline}](./AB_vs_{baseline}.html)\n" ) index = panel.pane.Markdown(index_txt, width=800) @@ -503,24 +648,17 @@ def main() -> None: for runtime in runtimes: make_timeseries_html_report(df, output_dir, runtime) - # Select only the latest run for each runtime. This may pick up historical runs (up - # to 6h old) if they have not been rerun in the current pull/PR. - # TODO This is fragile. Keep the latest and historical databases separate, or record - # the coiled-runtime git hash and use it to filter? - max_end = df.sort_values("end").groupby(["runtime", "category"]).tail(1) - max_end = max_end[max_end["end"] > max_end["end"].max() - pandas.Timedelta("6h")] - session_ids = max_end["session_id"].unique() - latest_run = df[df["session_id"].isin(session_ids)] - - make_ab_html_report(latest_run, output_dir, by_test=True, baseline=None) - make_ab_html_report(latest_run, output_dir, by_test=False, baseline=None) + # Do not use data that is more than a week old in statistical analysis + df_recent = df[df["end"] > df["end"].max() - pandas.Timedelta("7d")] + + make_barchart_html_report(df_recent, output_dir, by_test=True) + make_barchart_html_report(df_recent, output_dir, by_test=False) + baselines = [] for baseline in args.baseline: - df_baseline = align_to_baseline(latest_run, baseline) - if df_baseline is None: - continue - baselines.append(baseline) - make_ab_html_report(df_baseline, output_dir, by_test=False, baseline=baseline) + has_baseline = make_ab_html_report(df_recent, output_dir, baseline) + if has_baseline: + baselines.append(baseline) make_index_html_report(output_dir, runtimes, baselines) From c58806c8f7c84a25ea8a90f22998c0f56e6cf4bd Mon Sep 17 00:00:00 2001 From: crusaderky Date: Tue, 20 Sep 2022 11:44:09 +0100 Subject: [PATCH 2/8] Throttle concurrent runs --- .github/workflows/ab_tests.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/ab_tests.yml b/.github/workflows/ab_tests.yml index a9a4b2281f..f1ec82c7c3 100644 --- 
a/.github/workflows/ab_tests.yml +++ b/.github/workflows/ab_tests.yml @@ -51,6 +51,11 @@ jobs: timeout-minutes: 120 strategy: fail-fast: false + # AWS implements limiters to how many EC2 instances you can spawn in parallel *on + # the same AWS account*. If such limit is reached, jobs will randomly fail when + # trying to create the Coiled clusters, and restarting failed jobs won't fix the + # problem. + max-parallel: 20 matrix: os: [ubuntu-latest] python-version: ["3.9"] From 154451860c612efb18db0e67dcf4fdeb8edfdd77 Mon Sep 17 00:00:00 2001 From: crusaderky Date: Tue, 20 Sep 2022 12:49:55 +0100 Subject: [PATCH 3/8] tweaks --- .github/workflows/ab_tests.yml | 3 +++ AB_environments/AB_sample.dask.yaml | 6 +++--- AB_environments/config.yaml | 4 ++-- ci/scripts/dask_config_to_env.py | 5 ++--- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ab_tests.yml b/.github/workflows/ab_tests.yml index f1ec82c7c3..93b9d3e083 100644 --- a/.github/workflows/ab_tests.yml +++ b/.github/workflows/ab_tests.yml @@ -84,6 +84,9 @@ jobs: run: | source ci/scripts/install_coiled_runtime.sh AB_environments/${{ matrix.runtime-version }}.conda.yaml + - name: Convert dask config into environment variables + run: python ci/scripts/dask_config_to_env.py AB_environments/${{ matrix.runtime-version }}.dask.yaml >> $GITHUB_ENV + - name: Run Coiled Runtime Tests id: test env: diff --git a/AB_environments/AB_sample.dask.yaml b/AB_environments/AB_sample.dask.yaml index fd743b1f76..1442587efe 100644 --- a/AB_environments/AB_sample.dask.yaml +++ b/AB_environments/AB_sample.dask.yaml @@ -3,6 +3,6 @@ # All files *must* be called AB_.conda.yaml # and *must* be accompanied by AB_.dask.yaml. # Leave empty if you don't want to override anything. 
-# distributed: -# scheduler: -# worker-saturation: 1.2 +distributed: + scheduler: + worker-saturation: 1.2 diff --git a/AB_environments/config.yaml b/AB_environments/config.yaml index 9c1e3f011c..e9e4f5f3b1 100644 --- a/AB_environments/config.yaml +++ b/AB_environments/config.yaml @@ -11,6 +11,6 @@ test_null_hypothesis: true # Tests categories to run. These are subdirectories of tests/. categories: - - runtime - benchmarks - - stability + # - runtime + # - stability diff --git a/ci/scripts/dask_config_to_env.py b/ci/scripts/dask_config_to_env.py index 74897fc45d..b151ee5920 100755 --- a/ci/scripts/dask_config_to_env.py +++ b/ci/scripts/dask_config_to_env.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -"""Read a dask config file and print it out in the format `-e ENV=VALUE ENV=VALUE ...` +"""Read a dask config file and print it out in the format `ENV=VALUE\nENV=VALUE ...` This script is a work-around to not being able to upload dask config files to `conda env create`. """ @@ -14,9 +14,8 @@ def main(fname: str) -> None: with open(fname) as fh: cfg = yaml.safe_load(fh) - # Print nothing in case of empty file, comments only, or empty dict if cfg: - print("-e " + " ".join(traverse(cfg, []))) + print("\n".join(traverse(cfg, []))) def traverse(node: dict | list | str | float | None, path: list[str]) -> Iterator[str]: From 79f4fa17b2dab238163feed9dd3ef594e4a5e1ee Mon Sep 17 00:00:00 2001 From: crusaderky Date: Tue, 20 Sep 2022 12:56:04 +0100 Subject: [PATCH 4/8] Add upstream --- .github/workflows/tests.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 80217f7c7c..b4db13c326 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -32,9 +32,13 @@ jobs: os: [ubuntu-latest] python-version: ["3.9"] category: [runtime, benchmarks, stability] - runtime-version: [latest, "0.0.4", "0.1.0"] + runtime-version: [upstream, latest, "0.0.4", "0.1.0"] include: # Run stability 
tests on Python 3.8 + - category: stability + python-version: "3.8" + runtime-version: upstream + os: ubuntu-latest - category: stability python-version: "3.8" runtime-version: latest @@ -48,6 +52,10 @@ jobs: runtime-version: "0.1.0" os: ubuntu-latest # Run stability tests on Python 3.10 + - category: stability + python-version: "3.10" + runtime-version: upstream + os: ubuntu-latest - category: stability python-version: "3.10" runtime-version: latest From a0a4ac3187f3d94bfba0112edb56609f7d2e33fd Mon Sep 17 00:00:00 2001 From: crusaderky Date: Tue, 20 Sep 2022 17:20:26 +0100 Subject: [PATCH 5/8] Env variables --- conftest.py | 8 +++++++- tests/benchmarks/test_parquet.py | 3 ++- tests/benchmarks/test_work_stealing.py | 6 ++++-- tests/runtime/test_coiled.py | 6 ++++-- tests/stability/test_deadlock.py | 3 ++- tests/stability/test_spill.py | 28 +++++++++++++++----------- 6 files changed, 35 insertions(+), 19 deletions(-) diff --git a/conftest.py b/conftest.py index ac6e4efa1b..7b8fe33871 100644 --- a/conftest.py +++ b/conftest.py @@ -397,8 +397,13 @@ def test_name_uuid(request): return f"{request.node.originalname}-{uuid.uuid4().hex}" +@pytest.fixture(scope="session") +def dask_env_variables(): + return {k: v for k, v in os.environ.items() if k.startswith("DASK_")} + + @pytest.fixture(scope="module") -def small_cluster(request): +def small_cluster(request, dask_env_variables): # Extract `backend_options` for cluster from `backend_options` markers backend_options = merge( m.kwargs for m in request.node.iter_markers(name="backend_options") @@ -411,6 +416,7 @@ def small_cluster(request): scheduler_vm_types=["t3.xlarge"], backend_options=backend_options, package_sync=True, + environ=dask_env_variables, ) as cluster: yield cluster diff --git a/tests/benchmarks/test_parquet.py b/tests/benchmarks/test_parquet.py index feeaa7666b..69b8c03858 100644 --- a/tests/benchmarks/test_parquet.py +++ b/tests/benchmarks/test_parquet.py @@ -15,13 +15,14 @@ @pytest.fixture(scope="module") 
-def parquet_cluster(): +def parquet_cluster(dask_env_variables): with Cluster( f"parquet-{uuid.uuid4().hex[:8]}", n_workers=N_WORKERS, worker_vm_types=["m5.xlarge"], scheduler_vm_types=["m5.xlarge"], package_sync=True, + environ=dask_env_variables, ) as cluster: yield cluster diff --git a/tests/benchmarks/test_work_stealing.py b/tests/benchmarks/test_work_stealing.py index ae9fc47b66..9f45868a1c 100644 --- a/tests/benchmarks/test_work_stealing.py +++ b/tests/benchmarks/test_work_stealing.py @@ -23,7 +23,7 @@ def test_trivial_workload_should_not_cause_work_stealing(small_client): reason="https://github.com/dask/distributed/issues/6624", ) def test_work_stealing_on_scaling_up( - test_name_uuid, upload_cluster_dump, benchmark_all + test_name_uuid, upload_cluster_dump, benchmark_all, dask_env_variables ): with Cluster( name=test_name_uuid, @@ -32,6 +32,7 @@ def test_work_stealing_on_scaling_up( scheduler_vm_types=["t3.xlarge"], wait_for_workers=True, package_sync=True, + environ=dask_env_variables, ) as cluster: with Client(cluster) as client: with upload_cluster_dump(client, cluster), benchmark_all(client): @@ -77,7 +78,7 @@ def clog(n): def test_work_stealing_on_straggling_worker( - test_name_uuid, upload_cluster_dump, benchmark_all + test_name_uuid, upload_cluster_dump, benchmark_all, dask_env_variables ): with Cluster( name=test_name_uuid, @@ -85,6 +86,7 @@ def test_work_stealing_on_straggling_worker( worker_vm_types=["t3.medium"], scheduler_vm_types=["t3.xlarge"], wait_for_workers=True, + environ=dask_env_variables, ) as cluster: with Client(cluster) as client: with upload_cluster_dump(client, cluster), benchmark_all(client): diff --git a/tests/runtime/test_coiled.py b/tests/runtime/test_coiled.py index e75fbc736b..7ed5693645 100644 --- a/tests/runtime/test_coiled.py +++ b/tests/runtime/test_coiled.py @@ -35,9 +35,11 @@ def test_quickstart_parquet(small_client): assert not result.empty -def test_default_cluster_spinup_time(request, auto_benchmark_time): +def 
test_default_cluster_spinup_time(request, auto_benchmark_time, dask_env_variables): with Cluster( - name=f"{request.node.originalname}-{uuid.uuid4().hex[:8]}", package_sync=True + name=f"{request.node.originalname}-{uuid.uuid4().hex[:8]}", + package_sync=True, + environ=dask_env_variables, ): pass diff --git a/tests/stability/test_deadlock.py b/tests/stability/test_deadlock.py index 8c29d658e2..6ee4825cbb 100644 --- a/tests/stability/test_deadlock.py +++ b/tests/stability/test_deadlock.py @@ -13,7 +13,7 @@ @pytest.mark.skip( reason="Skip until https://github.com/dask/distributed/pull/6637 is merged" ) -def test_repeated_merge_spill(upload_cluster_dump, benchmark_all): +def test_repeated_merge_spill(upload_cluster_dump, benchmark_all, dask_env_variables): with Cluster( name=f"test_deadlock-{uuid.uuid4().hex}", n_workers=20, @@ -21,6 +21,7 @@ def test_repeated_merge_spill(upload_cluster_dump, benchmark_all): scheduler_vm_types=["t3.xlarge"], wait_for_workers=True, package_sync=True, + environ=dask_env_variables, ) as cluster: with Client(cluster) as client: with upload_cluster_dump(client, cluster), benchmark_all(client): diff --git a/tests/stability/test_spill.py b/tests/stability/test_spill.py index 04bedc4b33..2e9f191d03 100644 --- a/tests/stability/test_spill.py +++ b/tests/stability/test_spill.py @@ -4,10 +4,11 @@ import pytest from coiled import Cluster from dask.distributed import Client, wait +from toolz import merge @pytest.fixture(scope="module") -def spill_cluster(): +def spill_cluster(dask_env_variables): with Cluster( f"spill-{uuid.uuid4().hex[:8]}", n_workers=5, @@ -16,17 +17,20 @@ def spill_cluster(): worker_vm_types=["t3.large"], scheduler_vm_types=["t3.xlarge"], wait_for_workers=True, - environ={ - # Note: We set allowed-failures to ensure that no tasks are not retried - # upon ungraceful shutdown behavior during adaptive scaling - # but we receive a KilledWorker() instead. 
- "DASK_DISTRIBUTED__SCHEDULER__ALLOWED_FAILURES": "0",
- # We need to limit the number of connections to avoid getting `oom-killed`.
- # See https://github.com/coiled/coiled-runtime/pull/229#discussion_r946807049
- # for a longer discussion
- "DASK_DISTRIBUTED__WORKER__CONNECTIONS__INCOMING": "1",
- "DASK_DISTRIBUTED__WORKER__CONNECTIONS__OUTGOING": "1",
- },
+ environ=merge(
+ dask_env_variables,
+ {
+ # Note: We set allowed-failures to ensure that tasks are not retried
+ # upon ungraceful shutdown behavior during adaptive scaling but we
+ # receive a KilledWorker() instead.
+ "DASK_DISTRIBUTED__SCHEDULER__ALLOWED_FAILURES": "0",
+ # We need to limit the number of connections to avoid getting
+ # `oom-killed`. For a longer discussion, see
+ # https://github.com/coiled/coiled-runtime/pull/229#discussion_r946807049
+ "DASK_DISTRIBUTED__WORKER__CONNECTIONS__INCOMING": "1",
+ "DASK_DISTRIBUTED__WORKER__CONNECTIONS__OUTGOING": "1",
+ },
+ ),
 ) as cluster:
 yield cluster

From cce60d044a698cb6ba5057c33acda73abad54605 Mon Sep 17 00:00:00 2001
From: crusaderky
Date: Tue, 20 Sep 2022 17:20:51 +0100
Subject: [PATCH 6/8] temp enable A/B

---
 AB_environments/config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/AB_environments/config.yaml b/AB_environments/config.yaml
index e9e4f5f3b1..17200c7ca1 100644
--- a/AB_environments/config.yaml
+++ b/AB_environments/config.yaml
@@ -2,7 +2,7 @@
 # Lower values are faster and cheaper but will result in higher variance.
 # This must remain set to 0 in the main branch, thus completely disabling
 # A/B tests, in order to avoid unnecessary runs.
-repeat: 0
+repeat: 3
 
 # Set to true to automatically create a verbatim copy of AB_baseline and then compare
 # the two in the A/B tests. 
Set to false to save some money if you are already confident From 603a55fa3b069eab3164b071b605fd8c1a66703f Mon Sep 17 00:00:00 2001 From: crusaderky Date: Tue, 20 Sep 2022 17:41:12 +0100 Subject: [PATCH 7/8] fix null hypothesis --- .github/workflows/ab_tests.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/ab_tests.yml b/.github/workflows/ab_tests.yml index 93b9d3e083..e4c51ab296 100644 --- a/.github/workflows/ab_tests.yml +++ b/.github/workflows/ab_tests.yml @@ -78,6 +78,13 @@ jobs: python-version: ${{ matrix.python-version }} environment-file: ci/environment.yml + - name: Create null hypothesis as a copy of baseline + if: matrix.runtime-version == 'AB_null_hypothesis' + run: | + cd AB_environments + cp AB_baseline.conda.yaml AB_null_hypothesis.conda.yaml + cp AB_baseline.dask.yaml AB_null_hypothesis.dask.yaml + - name: Install coiled-runtime env: COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} From 911f0c60eb4d43139a8a983d7eac34c04c64e049 Mon Sep 17 00:00:00 2001 From: crusaderky Date: Wed, 21 Sep 2022 00:40:04 +0100 Subject: [PATCH 8/8] revert temp changes --- AB_environments/AB_sample.dask.yaml | 6 +++--- AB_environments/config.yaml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/AB_environments/AB_sample.dask.yaml b/AB_environments/AB_sample.dask.yaml index 1442587efe..fd743b1f76 100644 --- a/AB_environments/AB_sample.dask.yaml +++ b/AB_environments/AB_sample.dask.yaml @@ -3,6 +3,6 @@ # All files *must* be called AB_.conda.yaml # and *must* be accompanied by AB_.dask.yaml. # Leave empty if you don't want to override anything. -distributed: - scheduler: - worker-saturation: 1.2 +# distributed: +# scheduler: +# worker-saturation: 1.2 diff --git a/AB_environments/config.yaml b/AB_environments/config.yaml index 17200c7ca1..e9e4f5f3b1 100644 --- a/AB_environments/config.yaml +++ b/AB_environments/config.yaml @@ -2,7 +2,7 @@ # Lower values are faster and cheaper but will result in higher variance. 
# This must remain set to 0 in the main branch, thus completely disabling # A/B tests, in order to avoid unnecessary runs. -repeat: 3 +repeat: 0 # Set to true to automatically create a verbatim copy of AB_baseline and then compare # the two in the A/B tests. Set to false to save some money if you are already confident