diff --git a/.github/workflows/performance.yml b/.github/workflows/performance.yml
new file mode 100644
index 00000000000..7b72d85797e
--- /dev/null
+++ b/.github/workflows/performance.yml
@@ -0,0 +1,181 @@
+
+name: Performance Regression Testing
+# Schedule triggers
+on:
+  # TODO this is just while developing
+  pull_request:
+    branches:
+      - 'develop'
+      - 'performance-regression-testing'
+  schedule:
+    # runs twice a day at 10:05am and 10:05pm
+    - cron: '5 10,22 * * *'
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+jobs:
+
+  # checks fmt of runner code
+  # purposefully not a dependency of any other job
+  # will block merging, but not prevent developing
+  fmt:
+    name: Cargo fmt
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions-rs/toolchain@v1
+        with:
+          profile: minimal
+          toolchain: stable
+          override: true
+      - run: rustup component add rustfmt
+      - uses: actions-rs/cargo@v1
+        with:
+          command: fmt
+          args: --manifest-path performance/runner/Cargo.toml --all -- --check
+
+  # runs any tests associated with the runner
+  # these tests make sure the runner logic is correct
+  test-runner:
+    name: Test Runner
+    runs-on: ubuntu-latest
+    env:
+      # turns warnings into errors
+      RUSTFLAGS: "-D warnings"
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions-rs/toolchain@v1
+        with:
+          profile: minimal
+          toolchain: stable
+          override: true
+      - uses: actions-rs/cargo@v1
+        with:
+          command: test
+          args: --manifest-path performance/runner/Cargo.toml
+
+  # build an optimized binary to be used as the runner in later steps
+  build-runner:
+    needs: [test-runner]
+    name: Build Runner
+    runs-on: ubuntu-latest
+    env:
+      RUSTFLAGS: "-D warnings"
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions-rs/toolchain@v1
+        with:
+          profile: minimal
+          toolchain: stable
+          override: true
+      - uses: actions-rs/cargo@v1
+        with:
+          command: build
+          args: --release --manifest-path performance/runner/Cargo.toml
+      - uses: actions/upload-artifact@v2
+        with:
+          name: runner
+          path: performance/runner/target/release/runner
+
+  # run the performance measurements on the current or default branch
+  measure-dev:
+    needs: [build-runner]
+    name: Measure Dev Branch
+    runs-on: ubuntu-latest
+    steps:
+      - name: checkout dev
+        uses: actions/checkout@v2
+      - name: Setup Python
+        uses: actions/setup-python@v2.2.2
+        with:
+          python-version: '3.8'
+      - name: install dbt
+        run: pip install -r dev-requirements.txt -r editable-requirements.txt
+      - name: install hyperfine
+        run: wget https://github.com/sharkdp/hyperfine/releases/download/v1.11.0/hyperfine_1.11.0_amd64.deb && sudo dpkg -i hyperfine_1.11.0_amd64.deb
+      - uses: actions/download-artifact@v2
+        with:
+          name: runner
+      - name: change permissions
+        run: chmod +x ./runner
+      - name: run
+        run: ./runner measure -b dev -p ${{ github.workspace }}/performance/projects/
+      - uses: actions/upload-artifact@v2
+        with:
+          name: dev-results
+          path: performance/results/
+
+  # run the performance measurements on the release branch which we use
+  # as a performance baseline. This part takes by far the longest, so
+  # we do everything we can first so the job fails fast.
+  # -----
+  # we need to checkout dbt twice in this job: once for the baseline dbt
+  # version, and once to get the latest regression testing projects,
+  # metrics, and runner code from the develop or current branch so that
+  # the calculations match for both versions of dbt we are comparing.
+  measure-baseline:
+    needs: [build-runner]
+    name: Measure Baseline Branch
+    runs-on: ubuntu-latest
+    steps:
+      - name: checkout latest
+        uses: actions/checkout@v2
+        with:
+          ref: '0.20.latest'
+      - name: Setup Python
+        uses: actions/setup-python@v2.2.2
+        with:
+          python-version: '3.8'
+      - name: move repo up a level
+        run: mkdir ${{ github.workspace }}/../baseline/ && cp -r ${{ github.workspace }} ${{ github.workspace }}/../baseline
+      - name: "[debug] ls new dbt location"
+        run: ls ${{ github.workspace }}/../baseline/dbt/
+      # installation creates egg-links so we have to preserve source
+      - name: install dbt from new location
+        run: cd ${{ github.workspace }}/../baseline/dbt/ && pip install -r dev-requirements.txt -r editable-requirements.txt
+      # checkout the current branch to get all the target projects
+      # this deletes the old checked out code which is why we had to copy before
+      - name: checkout dev
+        uses: actions/checkout@v2
+      - name: install hyperfine
+        run: wget https://github.com/sharkdp/hyperfine/releases/download/v1.11.0/hyperfine_1.11.0_amd64.deb && sudo dpkg -i hyperfine_1.11.0_amd64.deb
+      - uses: actions/download-artifact@v2
+        with:
+          name: runner
+      - name: change permissions
+        run: chmod +x ./runner
+      - name: run runner
+        run: ./runner measure -b baseline -p ${{ github.workspace }}/performance/projects/
+      - uses: actions/upload-artifact@v2
+        with:
+          name: baseline-results
+          path: performance/results/
+
+  # detect regressions on the output generated from measuring
+  # the two branches. Exits with a non-zero code if a regression is detected.
+  calculate-regressions:
+    needs: [measure-dev, measure-baseline]
+    name: Compare Results
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/download-artifact@v2
+        with:
+          name: dev-results
+      - uses: actions/download-artifact@v2
+        with:
+          name: baseline-results
+      - name: "[debug] ls result files"
+        run: ls
+      - uses: actions/download-artifact@v2
+        with:
+          name: runner
+      - name: change permissions
+        run: chmod +x ./runner
+      - name: run calculation
+        run: ./runner calculate -r ./
+      # always attempt to upload the results even if regressions were found
+      - uses: actions/upload-artifact@v2
+        if: ${{ always() }}
+        with:
+          name: final-calculations
+          path: ./final_calculations.json
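For reference, the `final-calculations` artifact uploaded by the last job is just the runner's `Vec<Calculation>` serialized to JSON (the `Calculation` and `Data` types appear in `performance/runner/src/calculate.rs` later in this diff). A minimal sketch of building one entry, with illustrative numbers rather than real measurements:

```rust
use crate::calculate::{Calculation, Data};

// A sketch of one entry in final_calculations.json; the metric name is
// "median_" plus the run key recovered from a result filename.
fn example_entry() -> Calculation {
    Calculation {
        metric: "median_parse_01_dummy_project.json".to_owned(),
        // not a regression: 1.29 / 1.28 ≈ 1.008 is under the 1.05 threshold
        regression: false,
        data: Data {
            threshold: 1.05,
            difference: 1.008,
            baseline: 1.28,
            dev: 1.29,
        },
    }
}
```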
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b3c9e14641e..3febb8547b1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,7 @@
 - Fix for RPC requests that raise a RecursionError when serializing Undefined values as JSON ([#3464](https://github.com/dbt-labs/dbt/issues/3464), [#3687](https://github.com/dbt-labs/dbt/pull/3687))
 
 ### Under the hood
+- Add performance regression testing ([#3602](https://github.com/dbt-labs/dbt/pull/3602))
 - Improve default view and table materialization performance by checking relational cache before attempting to drop temp relations ([#3112](https://github.com/fishtown-analytics/dbt/issues/3112), [#3468](https://github.com/fishtown-analytics/dbt/pull/3468))
 - Add optional `sslcert`, `sslkey`, and `sslrootcert` profile arguments to the Postgres connector. ([#3472](https://github.com/fishtown-analytics/dbt/pull/3472), [#3473](https://github.com/fishtown-analytics/dbt/pull/3473))
 - Move the example project used by `dbt init` into `dbt` repository, to avoid cloning an external repo ([#3005](https://github.com/fishtown-analytics/dbt/pull/3005), [#3474](https://github.com/fishtown-analytics/dbt/pull/3474), [#3536](https://github.com/fishtown-analytics/dbt/pull/3536))
diff --git a/performance/README.md b/performance/README.md
new file mode 100644
index 00000000000..71aba30b989
--- /dev/null
+++ b/performance/README.md
@@ -0,0 +1,18 @@
+# Performance Regression Testing
+This directory includes dbt project setups to test on, and a test runner written in Rust that runs specific dbt commands on each of the projects. Orchestration is done via the GitHub Actions workflow in `/.github/workflows/performance.yml`. The workflow is scheduled to run every night, but it can also be triggered manually.
+
+The GitHub workflow hardcodes our baseline branch for performance metrics as `0.20.latest`. As future versions become faster, this branch will be updated to hold us to those new standards.
+
+## Adding a new dbt project
+Just make a new directory under `performance/projects/`. It will automatically be picked up by the tests.
+
+## Adding a new dbt command
+In `runner/src/measure.rs::measure`, add a metric to the `metrics` Vec, as sketched below. You don't need the Rust toolchain installed locally; the GitHub Action handles compilation.
+
+## Future work
+- add more projects to test different configurations that have been known bottlenecks
+- add more dbt commands to measure
+- use the uploaded json artifacts to store results so they can be graphed over time
+- read new metrics from a file so no one has to edit Rust source to add them to the suite
+- publish and pull down the latest runner binary instead of building it every time
+- pull down the latest stable dbt version as the baseline instead of setting it manually
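For example, a second metric measuring a hypothetical `dbt compile` run could be added alongside the existing `parse` entry. This is a sketch against the `Metric` struct defined in `measure.rs` later in this diff; only `parse` is actually part of this PR:

```rust
// In runner/src/measure.rs, inside `measure`:
let metrics: Vec<Metric> = vec![
    Metric {
        name: "parse",
        prepare: "rm -rf target/",
        cmd: "dbt parse --no-version-check",
    },
    // hypothetical addition: time `dbt compile` the same way
    Metric {
        name: "compile",
        prepare: "rm -rf target/",
        cmd: "dbt compile --no-version-check",
    },
];
```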
diff --git a/performance/project_config/.user.yml b/performance/project_config/.user.yml
new file mode 100644
index 00000000000..1870bd1ea47
--- /dev/null
+++ b/performance/project_config/.user.yml
@@ -0,0 +1 @@
+id: 5d0c160e-f817-4b77-bce3-ffb2e37f0c9b
diff --git a/performance/project_config/profiles.yml b/performance/project_config/profiles.yml
new file mode 100644
index 00000000000..7b1c39a1f9e
--- /dev/null
+++ b/performance/project_config/profiles.yml
@@ -0,0 +1,12 @@
+default:
+  target: dev
+  outputs:
+    dev:
+      type: postgres
+      host: localhost
+      user: dummy
+      password: dummy_password
+      port: 5432
+      dbname: dummy
+      schema: dummy
+      threads: 4
\ No newline at end of file
diff --git a/performance/projects/01_dummy_project/dbt_project.yml b/performance/projects/01_dummy_project/dbt_project.yml
new file mode 100644
index 00000000000..95e1020def8
--- /dev/null
+++ b/performance/projects/01_dummy_project/dbt_project.yml
@@ -0,0 +1,38 @@
+
+# Name your package! Package names should contain only lowercase characters
+# and underscores. A good package name should reflect your organization's
+# name or the intended use of these models
+name: 'my_new_package'
+version: 1.0.0
+config-version: 2
+
+# This setting configures which "profile" dbt uses for this project. Profiles contain
+# database connection information, and should be configured in the ~/.dbt/profiles.yml file
+profile: 'default'
+
+# These configurations specify where dbt should look for different types of files.
+# The `source-paths` config, for example, states that source models can be found
+# in the "models/" directory. You probably won't need to change these!
+source-paths: ["models"]
+analysis-paths: ["analysis"]
+test-paths: ["tests"]
+data-paths: ["data"]
+macro-paths: ["macros"]
+
+target-path: "target"  # directory which will store compiled SQL files
+clean-targets:         # directories to be removed by `dbt clean`
+  - "target"
+  - "dbt_modules"
+
+# You can define configurations for models in the `source-paths` directory here.
+# Using these configurations, you can enable or disable models, change how they
+# are materialized, and more!
+
+# In this example config, we tell dbt to build all models in the example/ directory
+# as views (the default). These settings can be overridden in the individual model files
+# using the `{{ config(...) }}` macro.
+models:
+  my_new_package:
+    # Applies to all files under models/example/
+    example:
+      materialized: view
diff --git a/performance/projects/01_dummy_project/models/path_0/node_0.sql b/performance/projects/01_dummy_project/models/path_0/node_0.sql
new file mode 100644
index 00000000000..26d9cae7b5b
--- /dev/null
+++ b/performance/projects/01_dummy_project/models/path_0/node_0.sql
@@ -0,0 +1 @@
+select 1 as id
\ No newline at end of file
diff --git a/performance/projects/01_dummy_project/models/path_0/node_0.yml b/performance/projects/01_dummy_project/models/path_0/node_0.yml
new file mode 100644
index 00000000000..282e56882f0
--- /dev/null
+++ b/performance/projects/01_dummy_project/models/path_0/node_0.yml
@@ -0,0 +1,11 @@
+models:
+- columns:
+  - name: id
+    tests:
+    - unique
+    - not_null
+    - relationships:
+        field: id
+        to: node_0
+  name: node_0
+version: 2
diff --git a/performance/projects/01_dummy_project/models/path_0/node_1.sql b/performance/projects/01_dummy_project/models/path_0/node_1.sql
new file mode 100644
index 00000000000..cb5b97f45c1
--- /dev/null
+++ b/performance/projects/01_dummy_project/models/path_0/node_1.sql
@@ -0,0 +1,3 @@
+select 1 as id
+union all
+select * from {{ ref('node_0') }}
\ No newline at end of file
diff --git a/performance/projects/01_dummy_project/models/path_0/node_1.yml b/performance/projects/01_dummy_project/models/path_0/node_1.yml
new file mode 100644
index 00000000000..2899ddf532f
--- /dev/null
+++ b/performance/projects/01_dummy_project/models/path_0/node_1.yml
@@ -0,0 +1,11 @@
+models:
+- columns:
+  - name: id
+    tests:
+    - unique
+    - not_null
+    - relationships:
+        field: id
+        to: node_0
+  name: node_1
+version: 2
diff --git a/performance/projects/01_dummy_project/models/path_0/node_2.sql b/performance/projects/01_dummy_project/models/path_0/node_2.sql
new file mode 100644
index 00000000000..cb5b97f45c1
--- /dev/null
+++ b/performance/projects/01_dummy_project/models/path_0/node_2.sql
@@ -0,0 +1,3 @@
+select 1 as id
+union all
+select * from {{ ref('node_0') }}
\ No newline at end of file
diff --git a/performance/projects/01_dummy_project/models/path_0/node_2.yml b/performance/projects/01_dummy_project/models/path_0/node_2.yml
new file mode 100644
index 00000000000..d2582eabbd8
--- /dev/null
+++ b/performance/projects/01_dummy_project/models/path_0/node_2.yml
@@ -0,0 +1,11 @@
+models:
+- columns:
+  - name: id
+    tests:
+    - unique
+    - not_null
+    - relationships:
+        field: id
+        to: node_0
+  name: node_2
+version: 2
diff --git a/performance/projects/02_dummy_project/dbt_project.yml b/performance/projects/02_dummy_project/dbt_project.yml
new file mode 100644
index 00000000000..95e1020def8
--- /dev/null
+++ b/performance/projects/02_dummy_project/dbt_project.yml
@@ -0,0 +1,38 @@
+
+# Name your package! Package names should contain only lowercase characters
+# and underscores. A good package name should reflect your organization's
+# name or the intended use of these models
+name: 'my_new_package'
+version: 1.0.0
+config-version: 2
+
+# This setting configures which "profile" dbt uses for this project. Profiles contain
+# database connection information, and should be configured in the ~/.dbt/profiles.yml file
+profile: 'default'
+
+# These configurations specify where dbt should look for different types of files.
+# The `source-paths` config, for example, states that source models can be found
+# in the "models/" directory. You probably won't need to change these!
+source-paths: ["models"]
+analysis-paths: ["analysis"]
+test-paths: ["tests"]
+data-paths: ["data"]
+macro-paths: ["macros"]
+
+target-path: "target"  # directory which will store compiled SQL files
+clean-targets:         # directories to be removed by `dbt clean`
+  - "target"
+  - "dbt_modules"
+
+# You can define configurations for models in the `source-paths` directory here.
+# Using these configurations, you can enable or disable models, change how they
+# are materialized, and more!
+
+# In this example config, we tell dbt to build all models in the example/ directory
+# as views (the default). These settings can be overridden in the individual model files
+# using the `{{ config(...) }}` macro.
+models:
+  my_new_package:
+    # Applies to all files under models/example/
+    example:
+      materialized: view
diff --git a/performance/projects/02_dummy_project/models/path_0/node_0.sql b/performance/projects/02_dummy_project/models/path_0/node_0.sql
new file mode 100644
index 00000000000..26d9cae7b5b
--- /dev/null
+++ b/performance/projects/02_dummy_project/models/path_0/node_0.sql
@@ -0,0 +1 @@
+select 1 as id
\ No newline at end of file
diff --git a/performance/projects/02_dummy_project/models/path_0/node_0.yml b/performance/projects/02_dummy_project/models/path_0/node_0.yml
new file mode 100644
index 00000000000..282e56882f0
--- /dev/null
+++ b/performance/projects/02_dummy_project/models/path_0/node_0.yml
@@ -0,0 +1,11 @@
+models:
+- columns:
+  - name: id
+    tests:
+    - unique
+    - not_null
+    - relationships:
+        field: id
+        to: node_0
+  name: node_0
+version: 2
diff --git a/performance/projects/02_dummy_project/models/path_0/node_1.sql b/performance/projects/02_dummy_project/models/path_0/node_1.sql
new file mode 100644
index 00000000000..cb5b97f45c1
--- /dev/null
+++ b/performance/projects/02_dummy_project/models/path_0/node_1.sql
@@ -0,0 +1,3 @@
+select 1 as id
+union all
+select * from {{ ref('node_0') }}
\ No newline at end of file
diff --git a/performance/projects/02_dummy_project/models/path_0/node_1.yml b/performance/projects/02_dummy_project/models/path_0/node_1.yml
new file mode 100644
index 00000000000..2899ddf532f
--- /dev/null
+++ b/performance/projects/02_dummy_project/models/path_0/node_1.yml
@@ -0,0 +1,11 @@
+models:
+- columns:
+  - name: id
+    tests:
+    - unique
+    - not_null
+    - relationships:
+        field: id
+        to: node_0
+  name: node_1
+version: 2
diff --git a/performance/projects/02_dummy_project/models/path_0/node_2.sql b/performance/projects/02_dummy_project/models/path_0/node_2.sql
new file mode 100644
index 00000000000..cb5b97f45c1
--- /dev/null
+++ b/performance/projects/02_dummy_project/models/path_0/node_2.sql
@@ -0,0 +1,3 @@
+select 1 as id
+union all
+select * from {{ ref('node_0') }}
\ No newline at end of file
diff --git a/performance/projects/02_dummy_project/models/path_0/node_2.yml b/performance/projects/02_dummy_project/models/path_0/node_2.yml
new file mode 100644
index 00000000000..d2582eabbd8
--- /dev/null
+++ b/performance/projects/02_dummy_project/models/path_0/node_2.yml
@@ -0,0 +1,11 @@
+models:
+- columns:
+  - name: id
+    tests:
+    - unique
+    - not_null
+    - relationships:
+        field: id
+        to: node_0
+  name: node_2
+version: 2
diff --git a/performance/results/.gitignore b/performance/results/.gitignore
new file mode 100644
index 00000000000..b8e28556b4b
--- /dev/null
+++ b/performance/results/.gitignore
@@ -0,0 +1,5 @@
+# all files here are generated results
+*
+
+# except this one
+!.gitignore
diff --git a/performance/runner/.gitignore b/performance/runner/.gitignore
new file mode 100644
index 00000000000..054a2acc8a4
--- /dev/null
+++ b/performance/runner/.gitignore
@@ -0,0 +1,2 @@
+target/
+projects/*/logs
diff --git a/performance/runner/Cargo.lock b/performance/runner/Cargo.lock
new file mode 100644
index 00000000000..8b90cedfef8
--- /dev/null
+++ b/performance/runner/Cargo.lock
@@ -0,0 +1,307 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "ansi_term"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b"
+dependencies = [
+ "winapi",
+]
+
+[[package]]
+name = "atty"
+version = "0.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "winapi",
+]
+
+[[package]]
+name = "bitflags"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
+
+[[package]]
+name = "clap"
+version = "2.33.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002"
+dependencies = [
+ "ansi_term",
+ "atty",
+ "bitflags",
+ "strsim",
+ "textwrap",
+ "unicode-width",
+ "vec_map",
+]
+
+[[package]]
+name = "either"
+version = "1.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
+
+[[package]]
+name = "heck"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c"
+dependencies = [
+ "unicode-segmentation",
+]
+
+[[package]]
+name = "hermit-abi"
+version = "0.1.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "itertools"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "69ddb889f9d0d08a67338271fa9b62996bc788c7796a5c18cf057420aaed5eaf"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "itoa"
+version = "0.4.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736"
+
+[[package]]
+name = "lazy_static"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+
+[[package]]
+name = "libc"
+version = "0.2.98"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "320cfe77175da3a483efed4bc0adc1968ca050b098ce4f2f1c13a56626128790"
+
+[[package]]
+name = "proc-macro-error"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
+dependencies = [
+ "proc-macro-error-attr",
+ "proc-macro2",
+ "quote",
+ "syn",
+ "version_check",
+]
+
+[[package]]
+name = "proc-macro-error-attr"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "version_check",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c7ed8b8c7b886ea3ed7dde405212185f423ab44682667c8c6dd14aa1d9f6612"
+dependencies = [
+ "unicode-xid",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "runner"
+version = "0.1.0"
+dependencies = [
+ "itertools",
+ "serde",
+ "serde_json",
+ "structopt",
+ "thiserror",
+]
+
+[[package]]
+name = "ryu"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e"
+
+[[package]]
+name = "serde"
+version = "1.0.127"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f03b9878abf6d14e6779d3f24f07b2cfa90352cfec4acc5aab8f1ac7f146fae8"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.127"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a024926d3432516606328597e0f224a51355a493b49fdd67e9209187cbe55ecc"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "serde_json"
+version = "1.0.66"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "336b10da19a12ad094b59d870ebde26a45402e5b470add4b5fd03c5048a32127"
+dependencies = [
+ "itoa",
+ "ryu",
+ "serde",
+]
+
+[[package]]
+name = "strsim"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
+
+[[package]]
+name = "structopt"
+version = "0.3.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "69b041cdcb67226aca307e6e7be44c8806423d83e018bd662360a93dabce4d71"
+dependencies = [
+ "clap",
+ "lazy_static",
+ "structopt-derive",
+]
+
+[[package]]
+name = "structopt-derive"
+version = "0.4.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7813934aecf5f51a54775e00068c237de98489463968231a51746bbbc03f9c10"
+dependencies = [
+ "heck",
+ "proc-macro-error",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "syn"
+version = "1.0.74"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1873d832550d4588c3dbc20f01361ab00bfe741048f71e3fecf145a7cc18b29c"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-xid",
+]
+
+[[package]]
+name = "textwrap"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
+dependencies = [
"unicode-width", +] + +[[package]] +name = "thiserror" +version = "1.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93119e4feac1cbe6c798c34d3a53ea0026b0b1de6a120deef895137c0529bfe2" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "060d69a0afe7796bf42e9e2ff91f5ee691fb15c53d38b4b62a9a53eb23164745" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "unicode-segmentation" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" + +[[package]] +name = "unicode-width" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3" + +[[package]] +name = "unicode-xid" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" + +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" + +[[package]] +name = "version_check" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/performance/runner/Cargo.toml b/performance/runner/Cargo.toml new file mode 100644 index 00000000000..a1581255689 --- /dev/null +++ b/performance/runner/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "runner" +version = "0.1.0" +edition = "2018" + +[dependencies] +itertools = "0.10.1" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +structopt = "0.3" +thiserror = "1.0.26" diff --git a/performance/runner/src/calculate.rs b/performance/runner/src/calculate.rs new file mode 100644 index 00000000000..aff405d109b --- /dev/null +++ b/performance/runner/src/calculate.rs @@ -0,0 +1,269 @@ +use crate::exceptions::{CalculateError, IOError}; +use itertools::Itertools; +use serde::{Deserialize, Serialize}; +use std::fs; +use std::fs::DirEntry; +use std::path::{Path, PathBuf}; + +// This type exactly matches the type of array elements +// from hyperfine's output. Deriving `Serialize` and `Deserialize` +// gives us read and write capabilities via json_serde. 
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct Measurement {
+    pub command: String,
+    pub mean: f64,
+    pub stddev: f64,
+    pub median: f64,
+    pub user: f64,
+    pub system: f64,
+    pub min: f64,
+    pub max: f64,
+    pub times: Vec<f64>,
+}
+
+// This type exactly matches the type of hyperfine's output.
+// Deriving `Serialize` and `Deserialize` gives us read and
+// write capabilities via serde_json.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct Measurements {
+    pub results: Vec<Measurement>,
+}
+
+// Output data from a comparison between runs on the baseline
+// and dev branches.
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub struct Data {
+    pub threshold: f64,
+    pub difference: f64,
+    pub baseline: f64,
+    pub dev: f64,
+}
+
+// The full output from a comparison between runs on the baseline
+// and dev branches.
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub struct Calculation {
+    pub metric: String,
+    pub regression: bool,
+    pub data: Data,
+}
+
+// A type to describe which measurement we are working with. This
+// information is parsed from the filename of hyperfine's output.
+#[derive(Debug, Clone, PartialEq)]
+pub struct MeasurementGroup {
+    pub version: String,
+    pub run: String,
+    pub measurement: Measurement,
+}
+
+// Given two measurements, return all the calculations. Calculations are
+// flagged as regressions or not regressions.
+fn calculate(metric: &str, dev: &Measurement, baseline: &Measurement) -> Vec<Calculation> {
+    let median_threshold = 1.05; // 5% regression threshold
+    let median_difference = dev.median / baseline.median;
+
+    let stddev_threshold = 1.20; // 20% regression threshold
+    let stddev_difference = dev.stddev / baseline.stddev;
+
+    vec![
+        Calculation {
+            metric: ["median", metric].join("_"),
+            regression: median_difference > median_threshold,
+            data: Data {
+                threshold: median_threshold,
+                difference: median_difference,
+                baseline: baseline.median,
+                dev: dev.median,
+            },
+        },
+        Calculation {
+            metric: ["stddev", metric].join("_"),
+            regression: stddev_difference > stddev_threshold,
+            data: Data {
+                threshold: stddev_threshold,
+                difference: stddev_difference,
+                baseline: baseline.stddev,
+                dev: dev.stddev,
+            },
+        },
+    ]
+}
+
+// Given a directory, read all files in the directory and return each
+// filename with the deserialized json contents of that file.
+fn measurements_from_files(
+    results_directory: &Path,
+) -> Result<Vec<(PathBuf, Measurements)>, CalculateError> {
+    fs::read_dir(results_directory)
+        .or_else(|e| Err(IOError::ReadErr(results_directory.to_path_buf(), Some(e))))
+        .or_else(|e| Err(CalculateError::CalculateIOError(e)))?
+        .into_iter()
+        .map(|entry| {
+            let ent: DirEntry = entry
+                .or_else(|e| Err(IOError::ReadErr(results_directory.to_path_buf(), Some(e))))
+                .or_else(|e| Err(CalculateError::CalculateIOError(e)))?;
+
+            Ok(ent.path())
+        })
+        .collect::<Result<Vec<PathBuf>, CalculateError>>()?
+        .iter()
+        .filter(|path| {
+            path.extension()
+                .and_then(|ext| ext.to_str())
+                .map_or(false, |ext| ext.ends_with("json"))
+        })
+        .map(|path| {
+            fs::read_to_string(path)
+                .or_else(|e| Err(IOError::BadFileContentsErr(path.clone(), Some(e))))
+                .or_else(|e| Err(CalculateError::CalculateIOError(e)))
+                .and_then(|contents| {
+                    serde_json::from_str::<Measurements>(&contents)
+                        .or_else(|e| Err(CalculateError::BadJSONErr(path.clone(), Some(e))))
+                })
+                .map(|m| (path.clone(), m))
+        })
+        .collect()
+}
+
+// Given a list of filename-measurement pairs, detect any regressions by grouping
+// measurements together by filename.
+fn calculate_regressions(
+    measurements: &[(&PathBuf, &Measurement)],
+) -> Result<Vec<Calculation>, CalculateError> {
+    /*
+        Strategy of this function body:
+        1. [Measurement] -> [MeasurementGroup]
+        2. Sort the MeasurementGroups
+        3. Group the MeasurementGroups by "run"
+        4. Call `calculate` with the two resulting Measurements as input
+    */
+
+    let mut measurement_groups: Vec<MeasurementGroup> = measurements
+        .iter()
+        .map(|(p, m)| {
+            p.file_name()
+                .ok_or_else(|| IOError::MissingFilenameErr(p.to_path_buf()))
+                .and_then(|name| {
+                    name.to_str()
+                        .ok_or_else(|| IOError::FilenameNotUnicodeErr(p.to_path_buf()))
+                })
+                .map(|name| {
+                    let parts: Vec<&str> = name.split("_").collect();
+                    MeasurementGroup {
+                        version: parts[0].to_owned(),
+                        run: parts[1..].join("_"),
+                        measurement: (*m).clone(),
+                    }
+                })
+        })
+        .collect::<Result<Vec<MeasurementGroup>, IOError>>()
+        .or_else(|e| Err(CalculateError::CalculateIOError(e)))?;
+
+    measurement_groups.sort_by(|x, y| (&x.run, &x.version).cmp(&(&y.run, &y.version)));
+
+    // locking up mutation
+    let sorted_measurement_groups = measurement_groups;
+
+    let calculations: Vec<Calculation> = sorted_measurement_groups
+        .iter()
+        .group_by(|x| &x.run)
+        .into_iter()
+        .map(|(_, g)| {
+            let mut groups: Vec<&MeasurementGroup> = g.collect();
+            groups.sort_by(|x, y| x.version.cmp(&y.version));
+
+            match groups.len() {
+                2 => {
+                    let dev = &groups[1];
+                    let baseline = &groups[0];
+
+                    if dev.version == "dev" && baseline.version == "baseline" {
+                        Ok(calculate(&dev.run, &dev.measurement, &baseline.measurement))
+                    } else {
+                        Err(CalculateError::BadBranchNameErr(
+                            baseline.version.clone(),
+                            dev.version.clone(),
+                        ))
+                    }
+                }
+                i => {
+                    let gs: Vec<MeasurementGroup> = groups.into_iter().map(|x| x.clone()).collect();
+                    Err(CalculateError::BadGroupSizeErr(i, gs))
+                }
+            }
+        })
+        .collect::<Result<Vec<Vec<Calculation>>, CalculateError>>()?
+        .concat();
+
+    Ok(calculations)
+}
+
+// Top-level function. Given a path for the result directory, call the above
+// functions to compare and collect calculations. Calculations include both
+// metrics that fall within the threshold and regressions.
+pub fn regressions(results_directory: &PathBuf) -> Result<Vec<Calculation>, CalculateError> {
+    measurements_from_files(Path::new(&results_directory)).and_then(|v| {
+        // exit early with an Err if there are no results to process
+        if v.len() <= 0 {
+            Err(CalculateError::NoResultsErr(results_directory.clone()))
+        // we expect two runs for each project-metric pairing: one for each branch,
+        // baseline and dev. An odd result count is unexpected.
+        } else if v.len() % 2 == 1 {
+            Err(CalculateError::OddResultsCountErr(
+                v.len(),
+                results_directory.clone(),
+            ))
+        } else {
+            // otherwise, we can do our comparisons
+            let measurements = v
+                .iter()
+                // the way we're running these, the files will each contain exactly one measurement, hence `results[0]`
+                .map(|(p, ms)| (p, &ms.results[0]))
+                .collect::<Vec<(&PathBuf, &Measurement)>>();
+
+            calculate_regressions(&measurements[..])
+        }
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn detects_5_percent_regression() {
+        let dev = Measurement {
+            command: "some command".to_owned(),
+            mean: 1.06,
+            stddev: 1.06,
+            median: 1.06,
+            user: 1.06,
+            system: 1.06,
+            min: 1.06,
+            max: 1.06,
+            times: vec![],
+        };
+
+        let baseline = Measurement {
+            command: "some command".to_owned(),
+            mean: 1.00,
+            stddev: 1.00,
+            median: 1.00,
+            user: 1.00,
+            system: 1.00,
+            min: 1.00,
+            max: 1.00,
+            times: vec![],
+        };
+
+        let calculations = calculate("test_metric", &dev, &baseline);
+        let regressions: Vec<&Calculation> =
+            calculations.iter().filter(|calc| calc.regression).collect();
+
+        // expect one regression for median
+        println!("{:#?}", regressions);
+        assert_eq!(regressions.len(), 1);
+        assert_eq!(regressions[0].metric, "median_test_metric");
+    }
+}
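For reference, a result file produced by `hyperfine --export-json` (the input to `measurements_from_files` above) looks roughly like the literal below. This is a sketch with illustrative numbers, based on hyperfine v1.11.0's field set; newer hyperfine versions may emit extra fields, which serde ignores since `Measurement` does not set `deny_unknown_fields`:

```rust
// A sketch: deserializing one hyperfine result file into the types above.
fn parse_example() -> Measurements {
    let raw = r#"{
        "results": [{
            "command": "dbt parse --no-version-check --profiles-dir ../../project_config/",
            "mean": 1.29,
            "stddev": 0.02,
            "median": 1.28,
            "user": 1.10,
            "system": 0.14,
            "min": 1.26,
            "max": 1.33,
            "times": [1.26, 1.28, 1.33]
        }]
    }"#;
    serde_json::from_str(raw).expect("example should match the Measurements shape")
}
```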
diff --git a/performance/runner/src/exceptions.rs b/performance/runner/src/exceptions.rs
new file mode 100644
index 00000000000..f278da6c365
--- /dev/null
+++ b/performance/runner/src/exceptions.rs
@@ -0,0 +1,155 @@
+use crate::calculate::*;
+use std::io;
+#[cfg(test)]
+use std::path::Path;
+use std::path::PathBuf;
+use thiserror::Error;
+
+// Custom IO Error messages for the IO errors we encounter.
+// New constructors should be added to wrap any new IO errors.
+// The desired output of these errors is tested below.
+#[derive(Debug, Error)]
+pub enum IOError {
+    #[error("ReadErr: The file cannot be read.\nFilepath: {}\nOriginating Exception: {}", .0.to_string_lossy().into_owned(), .1.as_ref().map_or("None".to_owned(), |e| format!("{}", e)))]
+    ReadErr(PathBuf, Option<io::Error>),
+    #[error("MissingFilenameErr: The path provided does not specify a file.\nFilepath: {}", .0.to_string_lossy().into_owned())]
+    MissingFilenameErr(PathBuf),
+    #[error("FilenameNotUnicodeErr: The filename is not expressible in unicode. Consider renaming the file.\nFilepath: {}", .0.to_string_lossy().into_owned())]
+    FilenameNotUnicodeErr(PathBuf),
+    #[error("BadFileContentsErr: Check that the file exists and is readable.\nFilepath: {}\nOriginating Exception: {}", .0.to_string_lossy().into_owned(), .1.as_ref().map_or("None".to_owned(), |e| format!("{}", e)))]
+    BadFileContentsErr(PathBuf, Option<io::Error>),
+    #[error("CommandErr: System command failed to run.\nOriginating Exception: {}", .0.as_ref().map_or("None".to_owned(), |e| format!("{}", e)))]
+    CommandErr(Option<io::Error>),
+}
+
+// Custom Error messages for the error states we could encounter
+// during calculation, and are not prevented at compile time. New
+// constructors should be added for any new error situations that
+// come up. The desired output of these errors is tested below.
+#[derive(Debug, Error)]
+pub enum CalculateError {
+    #[error("BadJSONErr: JSON in file cannot be deserialized as expected.\nFilepath: {}\nOriginating Exception: {}", .0.to_string_lossy().into_owned(), .1.as_ref().map_or("None".to_owned(), |e| format!("{}", e)))]
+    BadJSONErr(PathBuf, Option<serde_json::Error>),
+    #[error("{}", .0)]
+    CalculateIOError(IOError),
+    #[error("NoResultsErr: The results directory has no json files in it.\nFilepath: {}", .0.to_string_lossy().into_owned())]
+    NoResultsErr(PathBuf),
+    #[error("OddResultsCountErr: The results directory has an odd number of results in it. Expected an even number.\nFile Count: {}\nFilepath: {}", .0, .1.to_string_lossy().into_owned())]
+    OddResultsCountErr(usize, PathBuf),
+    #[error("BadGroupSizeErr: Expected two results per group, one for each branch-project pair.\nCount: {}\nGroup: {:?}", .0, .1.into_iter().map(|group| (&group.version[..], &group.run[..])).collect::<Vec<(&str, &str)>>())]
+    BadGroupSizeErr(usize, Vec<MeasurementGroup>),
+    #[error("BadBranchNameErr: Branch names must be 'baseline' and 'dev'.\nFound: {}, {}", .0, .1)]
+    BadBranchNameErr(String, String),
+}
+
+// Tests for exceptions
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Tests the output of io error messages. There should be at least one per enum constructor.
+    #[test]
+    fn test_io_error_messages() {
+        let pairs = vec![
+            (
+                IOError::ReadErr(Path::new("dummy/path/file.json").to_path_buf(), None),
+                r#"ReadErr: The file cannot be read.
+Filepath: dummy/path/file.json
+Originating Exception: None"#,
+            ),
+            (
+                IOError::MissingFilenameErr(Path::new("dummy/path/no_file/").to_path_buf()),
+                r#"MissingFilenameErr: The path provided does not specify a file.
+Filepath: dummy/path/no_file/"#,
+            ),
+            (
+                IOError::FilenameNotUnicodeErr(Path::new("dummy/path/no_file/").to_path_buf()),
+                r#"FilenameNotUnicodeErr: The filename is not expressible in unicode. Consider renaming the file.
+Filepath: dummy/path/no_file/"#,
+            ),
+            (
+                IOError::BadFileContentsErr(
+                    Path::new("dummy/path/filenotexist.json").to_path_buf(),
+                    None,
+                ),
+                r#"BadFileContentsErr: Check that the file exists and is readable.
+Filepath: dummy/path/filenotexist.json
+Originating Exception: None"#,
+            ),
+            (
+                IOError::CommandErr(None),
+                r#"CommandErr: System command failed to run.
+Originating Exception: None"#,
+            ),
+        ];
+
+        for (err, msg) in pairs {
+            assert_eq!(format!("{}", err), msg)
+        }
+    }
+
+    // Tests the output of calculate error messages. There should be at least one per enum constructor.
+    #[test]
+    fn test_calculate_error_messages() {
+        let pairs = vec![
+            (
+                CalculateError::BadJSONErr(Path::new("dummy/path/file.json").to_path_buf(), None),
+                r#"BadJSONErr: JSON in file cannot be deserialized as expected.
+Filepath: dummy/path/file.json
+Originating Exception: None"#,
+            ),
+            (
+                CalculateError::BadJSONErr(Path::new("dummy/path/file.json").to_path_buf(), None),
+                r#"BadJSONErr: JSON in file cannot be deserialized as expected.
+Filepath: dummy/path/file.json
+Originating Exception: None"#,
+            ),
+            (
+                CalculateError::NoResultsErr(Path::new("dummy/path/no_file/").to_path_buf()),
+                r#"NoResultsErr: The results directory has no json files in it.
+Filepath: dummy/path/no_file/"#,
+            ),
+            (
+                CalculateError::OddResultsCountErr(
+                    3,
+                    Path::new("dummy/path/no_file/").to_path_buf(),
+                ),
+                r#"OddResultsCountErr: The results directory has an odd number of results in it. Expected an even number.
+File Count: 3
+Filepath: dummy/path/no_file/"#,
+            ),
+            (
+                CalculateError::BadGroupSizeErr(
+                    1,
+                    vec![MeasurementGroup {
+                        version: "dev".to_owned(),
+                        run: "some command".to_owned(),
+                        measurement: Measurement {
+                            command: "some command".to_owned(),
+                            mean: 1.0,
+                            stddev: 1.0,
+                            median: 1.0,
+                            user: 1.0,
+                            system: 1.0,
+                            min: 1.0,
+                            max: 1.0,
+                            times: vec![1.0, 1.1, 0.9, 1.0, 1.1, 0.9, 1.1],
+                        },
+                    }],
+                ),
+                r#"BadGroupSizeErr: Expected two results per group, one for each branch-project pair.
+Count: 1
+Group: [("dev", "some command")]"#,
+            ),
+            (
+                CalculateError::BadBranchNameErr("boop".to_owned(), "noop".to_owned()),
+                r#"BadBranchNameErr: Branch names must be 'baseline' and 'dev'.
+Found: boop, noop"#,
+            ),
+        ];
+
+        for (err, msg) in pairs {
+            assert_eq!(format!("{}", err), msg)
+        }
+    }
+}
diff --git a/performance/runner/src/main.rs b/performance/runner/src/main.rs
new file mode 100644
index 00000000000..28392d11c8d
--- /dev/null
+++ b/performance/runner/src/main.rs
@@ -0,0 +1,119 @@
+extern crate structopt;
+
+mod calculate;
+mod exceptions;
+mod measure;
+
+use crate::calculate::Calculation;
+use crate::exceptions::CalculateError;
+use std::fs::File;
+use std::io::Write;
+use std::path::PathBuf;
+use structopt::StructOpt;
+
+// This type defines the commandline interface and is generated
+// by `derive(StructOpt)`
+#[derive(Clone, Debug, StructOpt)]
+#[structopt(name = "performance", about = "performance regression testing runner")]
+enum Opt {
+    #[structopt(name = "measure")]
+    Measure {
+        #[structopt(parse(from_os_str))]
+        #[structopt(short)]
+        projects_dir: PathBuf,
+        #[structopt(short)]
+        branch_name: String,
+    },
+    #[structopt(name = "calculate")]
+    Calculate {
+        #[structopt(parse(from_os_str))]
+        #[structopt(short)]
+        results_dir: PathBuf,
+    },
+}
+
+// enables proper usage of exit() in main.
+// https://doc.rust-lang.org/std/process/fn.exit.html#examples
+//
+// This is where all the printing should happen. Exiting happens
+// in main, and module functions should only return values.
+fn run_app() -> Result<i32, CalculateError> {
+    // match what the user inputs from the cli
+    match Opt::from_args() {
+        // measure subcommand
+        Opt::Measure {
+            projects_dir,
+            branch_name,
+        } => {
+            // if there are any nonzero exit codes from the hyperfine runs,
+            // return the first one. otherwise return zero.
+            measure::measure(&projects_dir, &branch_name)
+                .or_else(|e| Err(CalculateError::CalculateIOError(e)))?
+                .iter()
+                .map(|status| status.code())
+                .flatten()
+                .filter(|code| *code != 0)
+                .collect::<Vec<i32>>()
+                .get(0)
+                .map_or(Ok(0), |x| {
+                    println!("Main: a child process exited with a nonzero status code.");
+                    Ok(*x)
+                })
+        }
+
+        // calculate subcommand
+        Opt::Calculate { results_dir } => {
+            // get all the calculations or gracefully show the user an exception
+            let calculations = calculate::regressions(&results_dir)?;
+
+            // print all calculations to stdout so they can be easily debugged
+            // via CI.
+ println!(":: All Calculations ::\n"); + for c in &calculations { + println!("{:#?}\n", c); + } + + // indented json string representation of the calculations array + let json_calcs = serde_json::to_string_pretty(&calculations) + .expect("Main: Failed to serialize calculations to json"); + + // create the empty destination file, and write the json string + let outfile = &mut results_dir.into_os_string(); + outfile.push("/final_calculations.json"); + let mut f = File::create(outfile).expect("Main: Unable to create file"); + f.write_all(json_calcs.as_bytes()) + .expect("Main: Unable to write data"); + + // filter for regressions + let regressions: Vec<&Calculation> = + calculations.iter().filter(|c| c.regression).collect(); + + // return a non-zero exit code if there are regressions + match regressions[..] { + [] => { + println!("congrats! no regressions :)"); + Ok(0) + } + _ => { + // print all calculations to stdout so they can be easily + // debugged via CI. + println!(":: Regressions Found ::\n"); + for r in regressions { + println!("{:#?}\n", r); + } + Ok(1) + } + } + } + } +} + +fn main() { + std::process::exit(match run_app() { + Ok(code) => code, + Err(err) => { + eprintln!("{}", err); + 1 + } + }); +} diff --git a/performance/runner/src/measure.rs b/performance/runner/src/measure.rs new file mode 100644 index 00000000000..35df2b997c7 --- /dev/null +++ b/performance/runner/src/measure.rs @@ -0,0 +1,89 @@ +use crate::exceptions::IOError; +use std::fs; +use std::path::PathBuf; +use std::process::{Command, ExitStatus}; + +// `Metric` defines a dbt command that we want to measure on both the +// baseline and dev branches. +#[derive(Debug, Clone)] +struct Metric<'a> { + name: &'a str, + prepare: &'a str, + cmd: &'a str, +} + +impl Metric<'_> { + // Returns the proper filename for the hyperfine output for this metric. + fn outfile(&self, project: &str, branch: &str) -> String { + [branch, "_", self.name, "_", project, ".json"].join("") + } +} + +// Calls hyperfine via system command, and returns all the exit codes for each hyperfine run. +pub fn measure<'a>( + projects_directory: &PathBuf, + dbt_branch: &str, +) -> Result, IOError> { + /* + Strategy of this function body: + 1. Read all directory names in `projects_directory` + 2. Pair `n` projects with `m` metrics for a total of n*m pairs + 3. Run hyperfine on each project-metric pair + */ + + // To add a new metric to the test suite, simply define it in this list: + // TODO: This could be read from a config file in a future version. + let metrics: Vec = vec![Metric { + name: "parse", + prepare: "rm -rf target/", + cmd: "dbt parse --no-version-check", + }]; + + fs::read_dir(projects_directory) + .or_else(|e| Err(IOError::ReadErr(projects_directory.to_path_buf(), Some(e))))? + .map(|entry| { + let path = entry + .or_else(|e| Err(IOError::ReadErr(projects_directory.to_path_buf(), Some(e))))? + .path(); + + let project_name: String = path + .file_name() + .ok_or_else(|| IOError::MissingFilenameErr(path.clone().to_path_buf())) + .and_then(|x| { + x.to_str() + .ok_or_else(|| IOError::FilenameNotUnicodeErr(path.clone().to_path_buf())) + })? + .to_owned(); + + // each project-metric pair we will run + let pairs = metrics + .iter() + .map(|metric| (path.clone(), project_name.clone(), metric)) + .collect::)>>(); + + Ok(pairs) + }) + .collect::)>>, IOError>>()? 
+        .concat()
+        .iter()
+        // run hyperfine on each pairing
+        .map(|(path, project_name, metric)| {
+            Command::new("hyperfine")
+                .current_dir(path)
+                // warms filesystem caches by running the command first without counting it.
+                // alternatively we could clear them before each run
+                .arg("--warmup")
+                .arg("1")
+                .arg("--prepare")
+                .arg(metric.prepare)
+                .arg([metric.cmd, " --profiles-dir ", "../../project_config/"].join(""))
+                .arg("--export-json")
+                .arg(["../../results/", &metric.outfile(project_name, dbt_branch)].join(""))
+                // this prevents hyperfine from capturing dbt's output.
+                // Noisy, but good for debugging when tests fail.
+                .arg("--show-output")
+                .status() // use spawn() here instead for more information
+                .or_else(|e| Err(IOError::CommandErr(Some(e))))
+        })
+        .collect()
+}
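One detail worth noting: the filename written by `Metric::outfile` is the contract between `measure` and `calculate`. `calculate_regressions` splits the filename on `_`, takes the first segment as the branch (`baseline` or `dev`), and uses everything after it, including the `.json` extension, as the run key it groups by. A sketch of the round trip, written as a unit test that could live in a tests module inside `measure.rs` (not part of this diff):

```rust
#[test]
fn outfile_matches_what_calculate_parses() {
    let metric = Metric {
        name: "parse",
        prepare: "rm -rf target/",
        cmd: "dbt parse --no-version-check",
    };

    // measure writes <branch>_<metric name>_<project>.json
    let name = metric.outfile("01_dummy_project", "dev");
    assert_eq!(name, "dev_parse_01_dummy_project.json");

    // calculate later splits on '_' and recovers:
    //   version = "dev"
    //   run     = "parse_01_dummy_project.json"
}
```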