diff --git a/.circleci/config.yml b/.circleci/config.yml index 81b17ad4..65c00440 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -5,7 +5,13 @@ jobs: build: docker: - image: cimg/python:3.9.9 - - image: circleci/postgres:9.6.5-alpine-ram + - image: cimg/postgres:14.0 + auth: + username: dbt-labs + password: '' + environment: + POSTGRES_USER: root + POSTGRES_DB: circle_test steps: - checkout diff --git a/.gitignore b/.gitignore index 49f147cb..a33e3f41 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ - target/ dbt_packages/ logs/ +logfile \ No newline at end of file diff --git a/README.md b/README.md index df0b9eef..9095cb7f 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,8 @@ Useful macros when performing data audits * [compare_queries](#compare_queries-source) * [compare_column_values](#compare_column_values-source) * [compare_relation_columns](#compare_relation_columns-source) +* [compare_all_columns](#compare_all_columns-source) +* [compare_column_values_verbose](#compare_column_values_verbose-source) # Installation instructions New to dbt packages? Read more about them [here](https://docs.getdbt.com/docs/building-a-dbt-project/package-management/). @@ -160,67 +162,6 @@ number of your records don't match. work as expected. -### Advanced usage: -Got a wide table, and want to iterate through all the columns? Try something -like this: -``` -{%- set columns_to_compare=adapter.get_columns_in_relation(ref('dim_product')) -%} - -{% set old_etl_relation_query %} - select * from public.dim_product - where is_latest -{% endset %} - -{% set new_etl_relation_query %} - select * from {{ ref('dim_product') }} -{% endset %} - -{% if execute %} - {% for column in columns_to_compare %} - {{ log('Comparing column "' ~ column.name ~'"', info=True) }} - - {% set audit_query = audit_helper.compare_column_values( - a_query=old_etl_relation_query, - b_query=new_etl_relation_query, - primary_key="product_id", - column_to_compare=column.name - ) %} - - {% set audit_results = run_query(audit_query) %} - {% do audit_results.print_table() %} - {{ log("", info=True) }} - - {% endfor %} -{% endif %} -``` - -This will give you an output like: -``` -Comparing column "name" -| match_status | count_records | percent_of_total | -| -------------------- | ------------- | ---------------- | -| ✅: perfect match | 41,573 | 99.43 | -| 🤷: missing from b | 26 | 0.06 | -| 🙅: ‍values do not... | 212 | 0.51 | - -Comparing column "msrp" -| match_status | count_records | percent_of_total | -| -------------------- | ------------- | ---------------- | -| ✅: perfect match | 31,145 | 74.49 | -| ✅: both are null | 10,557 | 25.25 | -| 🤷: missing from b | 22 | 0.05 | -| 🤷: value is null ... | 31 | 0.07 | -| 🤷: value is null ... | 4 | 0.01 | -| 🙅: ‍values do not... | 52 | 0.12 | - -Comparing column "status" -| match_status | count_records | percent_of_total | -| -------------------- | ------------- | ---------------- | -| ✅: perfect match | 37,715 | 90.20 | -| 🤷: missing from b | 26 | 0.06 | -| 🙅: ‍values do not... | 4,070 | 9.73 | -``` - ### Advanced usage - dbt Cloud: The ``.print_table()`` function is not compatible with dbt Cloud so an adjustment needs to be made in order to print the results. Replace the following section of code: ``` @@ -280,5 +221,125 @@ it is a date in our "b" relation. ``` +## compare_all_columns ([source](macros/compare_all_columns.sql)) +This macro is designed to be added to a dbt test suite as a custom test. A +`compare_all_columns` test monitors changes data values when code is changed +as part of a PR or during development. It sets up a test that will fail +if any column values do not match. + +Users can configure what exactly constitutes a value match or failure. If +there is a test failure, results can be inspected in the warehouse. The primary key +and the column name can be included in the test output that gets written to the warehouse. +This enables the user to join test results to relevant tables in your dev or prod schema to investigate the error. + +### Usage: + +_Note: this test should only be used on (and will only work on) models that have a primary key that is reliably `unique` and `not_null`. [Generic dbt tests](https://docs.getdbt.com/docs/building-a-dbt-project/tests#generic-tests) should be used to ensure the model being tested meets the requirements of `unique` and `not_null`._ + +To create a test for the `stg_customers` model, create a custom test +in the `tests` subdirectory of your dbt project that looks like this: + +``` +{{ + audit_helper.compare_all_columns( + a_relation=ref('stg_customers'), -- in a test, this ref will compile as your dev or PR schema. + b_relation=api.Relation.create(database='dbt_db', schema='analytics_prod', identifier='stg_customers'), -- you can explicitly write a relation to select your production schema, or any other db/schema/table you'd like to use for comparison testing. + exclude_columns=['updated_at'], + primary_key='id' + ) +}} +where not perfect_match +``` +The `where not perfect_match` statement is an example of a filter you can apply to define what +constitutes a test failure. The test will fail if any rows don't meet the +requirement of a perfect match. Failures would include: + +* If the primary key exists in both relations, but one model has a null value in a column. +* If a primary key is missing from one relation. +* If the primary key exists in both relations, but the value conflicts. + +If you'd like the test to only fail when there are conflicting values, you could configure it like this: + +``` +{{ + audit_helper.compare_all_columns( + a_relation=ref('stg_customers'), + b_relation=api.Relation.create(database='dbt_db', schema='analytics_prod', identifier='stg_customers'), + primary_key='id' + ) +}} +where conflicting_values +``` + +#### Arguemnts: + +* `a_relation` and `b_relation`: The [relations](https://docs.getdbt.com/reference#relation) + you want to compare. Any two relations that have the same columns can be used. In the + example above, two different approaches to writing relations, using `ref` and + using `api.Relation.create`, are demonstrated. (When writing one-off code, it might make sense to + hard-code a relation, like this: `analytics_prod.stg_customers`. A hard-coded relation + is not recommended when building this macro into a CI cycle.) +* `exclude_columns` (optional): Any columns you wish to exclude from the + validation. +* `primary_key`: The primary key of the model. Used to sort unmatched + results for row-by-row validation. + +If you want to create test results that include columns from the model itself +for easier inspection, that can be written into the test: + +``` +{{ + audit_helper.compare_all_columns( + a_relation=ref('stg_customers'), + b_relation=api.Relation.create(database='dbt_db', schema='analytics_prod', identifier='stg_customers'), + exclude_columns=['updated_at'], + primary_key='id' + ) +}} +left join {{ ref('stg_customers') }} using(id) +``` + +This structure also allows for the test to group or filter by any attribute in the model or in +the macro's output as part of the test, for example: + +``` +with base_test_cte as ( + {{ + audit_helper.compare_all_columns( + a_relation=ref('stg_customers'), + b_relation=api.Relation.create(database='dbt_db', schema='analytics_prod', identifier='stg_customers'), + exclude_columns=['updated_at'], + primary_key='id' + ) + }} + left join {{ ref('stg_customers') }} using(id) + where conflicting_values +) +select + status, -- assume there's a "status" column in stg_customers + count(distinct case when conflicting_values then id end) as conflicting_values +from base_test_cte +group by 1 +``` + +You can write a `compare_all_columns` test on individual table; and the test will be run +as part of a full test suite run. + +``` +dbt test --select stg_customers +``` + +If you want to [store results in the warehouse for further analysis](https://docs.getdbt.com/docs/building-a-dbt-project/tests#storing-test-failures), add the `--store-failures` +flag. + +``` +dbt test --select stg_customers --store-failures +``` + +## compare_column_values_verbose ([source](macros/compare_column_values_verbose.sql)) +This macro will return a query that, when executed, returns the same information as +`compare_column_values`, but not summarized. `compare_column_values_verbose` enables `compare_all_columns` to give the user more flexibility around what will result in a test failure. + + # To-do: * Macro to check if two schemas contain the same relations diff --git a/integration_tests/models/compare_all_columns_where_clause.sql b/integration_tests/models/compare_all_columns_where_clause.sql new file mode 100644 index 00000000..2afd1a17 --- /dev/null +++ b/integration_tests/models/compare_all_columns_where_clause.sql @@ -0,0 +1,11 @@ +{% set a_relation=ref('data_compare_all_columns__market_of_choice_produce')%} + +{% set b_relation=ref('data_compare_all_columns__albertsons_produce') %} + +{{ audit_helper.compare_all_columns( + a_relation=a_relation, + b_relation=b_relation, + primary_key="id", + summarize=false +) }} +where not perfect_match \ No newline at end of file diff --git a/integration_tests/models/compare_all_columns_with_summary.sql b/integration_tests/models/compare_all_columns_with_summary.sql new file mode 100644 index 00000000..49b20088 --- /dev/null +++ b/integration_tests/models/compare_all_columns_with_summary.sql @@ -0,0 +1,9 @@ +{% set a_relation=ref('data_compare_all_columns__market_of_choice_produce')%} + +{% set b_relation=ref('data_compare_all_columns__albertsons_produce') %} + +{{ audit_helper.compare_all_columns( + a_relation=a_relation, + b_relation=b_relation, + primary_key="id" +) }} diff --git a/integration_tests/models/compare_all_columns_with_summary_and_exclude.sql b/integration_tests/models/compare_all_columns_with_summary_and_exclude.sql new file mode 100644 index 00000000..83ecf7c9 --- /dev/null +++ b/integration_tests/models/compare_all_columns_with_summary_and_exclude.sql @@ -0,0 +1,10 @@ +{% set a_relation=ref('data_compare_all_columns__market_of_choice_produce')%} + +{% set b_relation=ref('data_compare_all_columns__albertsons_produce') %} + +{{ audit_helper.compare_all_columns( + a_relation=a_relation, + b_relation=b_relation, + primary_key="id", + exclude_columns=['ripeness'] +) }} diff --git a/integration_tests/models/compare_all_columns_without_summary.sql b/integration_tests/models/compare_all_columns_without_summary.sql new file mode 100644 index 00000000..510ce182 --- /dev/null +++ b/integration_tests/models/compare_all_columns_without_summary.sql @@ -0,0 +1,10 @@ +{% set a_relation=ref('data_compare_all_columns__market_of_choice_produce')%} + +{% set b_relation=ref('data_compare_all_columns__albertsons_produce') %} + +{{ audit_helper.compare_all_columns( + a_relation=a_relation, + b_relation=b_relation, + primary_key="id", + summarize=false +) }} diff --git a/integration_tests/models/schema.yml b/integration_tests/models/schema.yml index 49c6bc56..7e1957bd 100644 --- a/integration_tests/models/schema.yml +++ b/integration_tests/models/schema.yml @@ -35,3 +35,24 @@ models: tests: - dbt_utils.equality: compare_model: ref('expected_results__compare_relations_without_exclude') + + - name: compare_all_columns_with_summary + tests: + - dbt_utils.equality: + compare_model: ref('expected_results__compare_all_columns_with_summary') + + - name: compare_all_columns_without_summary + tests: + - dbt_utils.equality: + compare_model: ref('expected_results__compare_all_columns_without_summary') + + + - name: compare_all_columns_with_summary_and_exclude + tests: + - dbt_utils.equality: + compare_model: ref('expected_results__compare_all_columns_with_summary_and_exclude') + + - name: compare_all_columns_where_clause + tests: + - dbt_utils.equality: + compare_model: ref('expected_results__compare_all_columns_where_clause') \ No newline at end of file diff --git a/integration_tests/seeds/data_compare_all_columns__albertsons_produce.csv b/integration_tests/seeds/data_compare_all_columns__albertsons_produce.csv new file mode 100644 index 00000000..80da035f --- /dev/null +++ b/integration_tests/seeds/data_compare_all_columns__albertsons_produce.csv @@ -0,0 +1,9 @@ +id,fruit,ripeness +1,banana,yellow +2,banana,brown +3,banana,brown +4,orange,green +5,orange,orange +6,,brown +7,orange,orange +9,apple,mushy \ No newline at end of file diff --git a/integration_tests/seeds/data_compare_all_columns__market_of_choice_produce.csv b/integration_tests/seeds/data_compare_all_columns__market_of_choice_produce.csv new file mode 100644 index 00000000..4e092668 --- /dev/null +++ b/integration_tests/seeds/data_compare_all_columns__market_of_choice_produce.csv @@ -0,0 +1,9 @@ +id,fruit,ripeness +1,banana,yellow +2,banana,green +3,banana,brown +4,orange,green +5,orange,orange +6,orange,brown +7,orange, +8,apple,mushy \ No newline at end of file diff --git a/integration_tests/seeds/expected_results__compare_all_columns_where_clause.csv b/integration_tests/seeds/expected_results__compare_all_columns_where_clause.csv new file mode 100644 index 00000000..1092c0ff --- /dev/null +++ b/integration_tests/seeds/expected_results__compare_all_columns_where_clause.csv @@ -0,0 +1,10 @@ +primary_key,column_name,perfect_match,null_in_a,null_in_b,missing_from_a,missing_from_b,conflicting_values +8,ID,false,false,false,false,true,false +9,ID,false,false,false,true,false,false +6,FRUIT,false,false,true,false,false,false +8,FRUIT,false,false,false,false,true,false +9,FRUIT,false,false,false,true,false,false +2,RIPENESS,false,false,false,false,false,true +7,RIPENESS,false,true,false,false,false,false +8,RIPENESS,false,false,false,false,true,false +9,RIPENESS,false,false,false,true,false,false \ No newline at end of file diff --git a/integration_tests/seeds/expected_results__compare_all_columns_with_summary.csv b/integration_tests/seeds/expected_results__compare_all_columns_with_summary.csv new file mode 100644 index 00000000..5d879dce --- /dev/null +++ b/integration_tests/seeds/expected_results__compare_all_columns_with_summary.csv @@ -0,0 +1,4 @@ +column_name,perfect_match,null_in_a,null_in_b,missing_from_a,missing_from_b,conflicting_values +ID,7,0,0,1,1,0 +FRUIT,6,0,1,1,1,0 +RIPENESS,5,1,0,1,1,1 \ No newline at end of file diff --git a/integration_tests/seeds/expected_results__compare_all_columns_with_summary_and_exclude.csv b/integration_tests/seeds/expected_results__compare_all_columns_with_summary_and_exclude.csv new file mode 100644 index 00000000..f3ab63cd --- /dev/null +++ b/integration_tests/seeds/expected_results__compare_all_columns_with_summary_and_exclude.csv @@ -0,0 +1,3 @@ +column_name,perfect_match,null_in_a,null_in_b,missing_from_a,missing_from_b,conflicting_values +ID,7,0,0,1,1,0 +FRUIT,6,0,1,1,1,0 \ No newline at end of file diff --git a/integration_tests/seeds/expected_results__compare_all_columns_without_summary.csv b/integration_tests/seeds/expected_results__compare_all_columns_without_summary.csv new file mode 100644 index 00000000..fa3e319b --- /dev/null +++ b/integration_tests/seeds/expected_results__compare_all_columns_without_summary.csv @@ -0,0 +1,28 @@ +primary_key,column_name,perfect_match,null_in_a,null_in_b,missing_from_a,missing_from_b,conflicting_values +1,ID,true,false,false,false,false,false +2,ID,true,false,false,false,false,false +3,ID,true,false,false,false,false,false +4,ID,true,false,false,false,false,false +5,ID,true,false,false,false,false,false +6,ID,true,false,false,false,false,false +7,ID,true,false,false,false,false,false +8,ID,false,false,false,false,true,false +9,ID,false,false,false,true,false,false +1,FRUIT,true,false,false,false,false,false +2,FRUIT,true,false,false,false,false,false +3,FRUIT,true,false,false,false,false,false +4,FRUIT,true,false,false,false,false,false +5,FRUIT,true,false,false,false,false,false +6,FRUIT,false,false,true,false,false,false +7,FRUIT,true,false,false,false,false,false +8,FRUIT,false,false,false,false,true,false +9,FRUIT,false,false,false,true,false,false +1,RIPENESS,true,false,false,false,false,false +2,RIPENESS,false,false,false,false,false,true +3,RIPENESS,true,false,false,false,false,false +4,RIPENESS,true,false,false,false,false,false +5,RIPENESS,true,false,false,false,false,false +6,RIPENESS,true,false,false,false,false,false +7,RIPENESS,false,true,false,false,false,false +8,RIPENESS,false,false,false,false,true,false +9,RIPENESS,false,false,false,true,false,false \ No newline at end of file diff --git a/macros/compare_all_columns.sql b/macros/compare_all_columns.sql new file mode 100644 index 00000000..95d48b9d --- /dev/null +++ b/macros/compare_all_columns.sql @@ -0,0 +1,95 @@ +{% macro compare_all_columns( a_relation, b_relation, primary_key, exclude_columns=[],summarize=true ) -%} + {{ return(adapter.dispatch('compare_all_columns', 'audit_helper')( a_relation, b_relation, primary_key, exclude_columns, summarize )) }} +{%- endmacro %} + +{% macro default__compare_all_columns( a_relation, b_relation, primary_key, exclude_columns=[], summarize=true ) -%} + + {% set column_names = dbt_utils.get_filtered_columns_in_relation(from=a_relation, except=exclude_columns) %} + + {% set a_query %} + select + * + from {{ a_relation }} + {% endset %} + + {% set b_query %} + select + * + from {{ b_relation }} + {% endset %} + + {% for column_name in column_names %} + + {% set audit_query = audit_helper.compare_column_values_verbose( + a_query=a_query, + b_query=b_query, + primary_key=primary_key, + column_to_compare=column_name + ) %} + + /* Create a query combining results from all columns so that the user, or the + test suite, can examine all at once. + */ + + {% if loop.first %} + + /* Create a CTE that wraps all the unioned subqueries that are created + in this for loop + */ + with main as ( + + {% endif %} + + /* There will be one audit_query subquery for each column + */ + ( {{ audit_query }} ) + + {% if not loop.last %} + + union all + + {% else %} + + ), + + {%- if summarize %} + + final as ( + select + upper(column_name) as column_name, + sum(case when perfect_match then 1 else 0 end) as perfect_match, + sum(case when null_in_a then 1 else 0 end) as null_in_a, + sum(case when null_in_b then 1 else 0 end) as null_in_b, + sum(case when missing_from_a then 1 else 0 end) as missing_from_a, + sum(case when missing_from_b then 1 else 0 end) as missing_from_b, + sum(case when conflicting_values then 1 else 0 end) as conflicting_values + from main + group by 1 + order by column_name + ) + + {%- else %} + + final as ( + select + primary_key, + upper(column_name) as column_name, + perfect_match, + null_in_a, + null_in_b, + missing_from_a, + missing_from_b, + conflicting_values + from main + order by primary_key + ) + + {%- endif %} + + select * from final + + {% endif %} + + {% endfor %} + +{% endmacro %} \ No newline at end of file diff --git a/macros/compare_column_values_verbose.sql b/macros/compare_column_values_verbose.sql new file mode 100644 index 00000000..035b8c4b --- /dev/null +++ b/macros/compare_column_values_verbose.sql @@ -0,0 +1,41 @@ +{% macro compare_column_values_verbose(a_query, b_query, primary_key, column_to_compare) -%} + {{ return(adapter.dispatch('compare_column_values_verbose', 'audit_helper')(a_query, b_query, primary_key, column_to_compare)) }} +{%- endmacro %} + + +{% macro default__compare_column_values_verbose(a_query, b_query, primary_key, column_to_compare) -%} +with a_query as ( + {{ a_query }} +), + +b_query as ( + {{ b_query }} +) + select + coalesce(a_query.{{ primary_key }}, b_query.{{ primary_key }}) as primary_key, + + {% if target.name == 'postgres' or target.name == 'redshift' %} + '{{ column_to_compare }}'::text as column_name, + {% else %} + '{{ column_to_compare }}' as column_name, + {% endif %} + + coalesce(a_query.{{ column_to_compare }} = b_query.{{ column_to_compare }}, + (a_query.{{ column_to_compare }} is null and b_query.{{ column_to_compare }} is null), + false) as perfect_match, + a_query.{{ column_to_compare }} is null and a_query.{{ primary_key }} is not null as null_in_a, + b_query.{{ column_to_compare }} is null and b_query.{{ primary_key }} is not null as null_in_b, + a_query.{{ primary_key }} is null as missing_from_a, + b_query.{{ primary_key }} is null as missing_from_b, + coalesce(a_query.{{ column_to_compare }} != b_query.{{ column_to_compare }} and + (a_query.{{ column_to_compare }} is not null or b_query.{{ column_to_compare }} is not null), false) + as conflicting_values + -- considered a conflict if the values do not match AND at least one of the values is not null. + + from a_query + + full outer join b_query on (a_query.{{ primary_key }} = b_query.{{ primary_key }}) + + + +{% endmacro %} diff --git a/macros/compare_relations.sql b/macros/compare_relations.sql index 7f836ba2..b9b421fb 100644 --- a/macros/compare_relations.sql +++ b/macros/compare_relations.sql @@ -1,36 +1,30 @@ -{% macro pop_columns(columns, columns_to_pop) %} -{% set popped_columns=[] %} - -{% for column in columns %} - {% if column.name | lower not in columns_to_pop | lower %} - {% do popped_columns.append(column) %} - {% endif %} -{% endfor %} - -{{ return(popped_columns) }} -{% endmacro %} - - ----- - {% macro compare_relations(a_relation, b_relation, exclude_columns=[], primary_key=None, summarize=true) %} -{%- set a_columns = adapter.get_columns_in_relation(a_relation) -%} +{% set column_names = dbt_utils.get_filtered_columns_in_relation(from=a_relation, except=exclude_columns) %} + +{% set column_selection %} -{% set check_columns=audit_helper.pop_columns(a_columns, exclude_columns) %} + {% for column_name in column_names %} + {{ adapter.quote(column_name) }} + {% if not loop.last %} + , + {% endif %} + {% endfor %} -{% set check_cols_csv = check_columns | map(attribute='quoted') | join(', ') %} +{% endset %} {% set a_query %} select - {{ check_cols_csv }} + + {{ column_selection }} from {{ a_relation }} {% endset %} {% set b_query %} select - {{ check_cols_csv }} + + {{ column_selection }} from {{ b_relation }} {% endset %}