diff --git a/CHANGELOG.md b/CHANGELOG.md index f72afbd4..28a74cea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,32 @@ # Unreleased + +## 🚨 deduplicate ([#542](https://github.com/dbt-labs/dbt-utils/pull/542), [#548](https://github.com/dbt-labs/dbt-utils/pull/548)) + +The call signature of `deduplicate` has changed. The previous call signature is marked as deprecated and will be removed in the next minor version. + +- The `group_by` argument is now deprecated and replaced by `partition_by`. +- The `order_by` argument is now required. +- The `relation_alias` argument has been removed as the macro now supports `relation` as a string directly. If you were using `relation_alias` to point to a CTE previously then you can now pass the alias directly to `relation`. + +Before: +```jinja +{% macro deduplicate(relation, group_by, order_by=none, relation_alias=none) -%} +... +{% endmacro %} +``` + +After: +```jinja +{% macro deduplicate(relation, partition_by, order_by) -%} +... +{% endmacro %} +``` + ## New features -- Add an optional `where` clause parameter to `get_column_values()` to filter values returned ([#511](https://github.com/dbt-labs/dbt-utils/issues/511), [#583](https://github.com/dbt-labs/dbt-utils/pull/583)) +- Add an optional `where` clause parameter to `get_column_values()` to filter values returned ([#511](https://github.com/dbt-labs/dbt-utils/issues/511), [#583](https://github.com/dbt-labs/dbt-utils/pull/583)) - Add `where` parameter to `union_relations` macro ([#554](https://github.com/dbt-labs/dbt-utils/pull/554)) +- Add Postgres specific implementation of `deduplicate()` ([#548](https://github.com/dbt-labs/dbt-utils/pull/548)) +- Add Snowflake specific implementation of `deduplicate()` ([#543](https://github.com/dbt-labs/dbt-utils/issues/543), [#548](https://github.com/dbt-labs/dbt-utils/pull/548)) ## Fixes - Enable a negative part_number for `split_part()` ([#557](https://github.com/dbt-labs/dbt-utils/issues/557), [#559](https://github.com/dbt-labs/dbt-utils/pull/559)) @@ -10,6 +35,7 @@ - Documentation about listagg macro ([#544](https://github.com/dbt-labs/dbt-utils/issues/544), [#560](https://github.com/dbt-labs/dbt-utils/pull/560)) - Fix links to macro section in table of contents ([#555](https://github.com/dbt-labs/dbt-utils/pull/555)) - Contributing guide ([#574](https://github.com/dbt-labs/dbt-utils/pull/574)) +- Add better documentation for `deduplicate()` ([#542](https://github.com/dbt-labs/dbt-utils/pull/542), [#548](https://github.com/dbt-labs/dbt-utils/pull/548)) ## Under the hood - Fail integration tests appropriately ([#540](https://github.com/dbt-labs/dbt-utils/issues/540), [#545](https://github.com/dbt-labs/dbt-utils/pull/545)) @@ -34,7 +60,7 @@ ## Fixes - `get_column_values()` once more raises an error when the model doesn't exist and there is no default provided ([#531](https://github.com/dbt-labs/dbt-utils/issues/531), [#533](https://github.com/dbt-labs/dbt-utils/pull/533)) -- `get_column_values()` raises an error when used with an ephemeral model, instead of getting stuck in a compilation loop ([#358](https://github.com/dbt-labs/dbt-utils/issues/358), [#518](https://github.com/dbt-labs/dbt-utils/pull/518)) +- `get_column_values()` raises an error when used with an ephemeral model, instead of getting stuck in a compilation loop ([#358](https://github.com/dbt-labs/dbt-utils/issues/358), [#518](https://github.com/dbt-labs/dbt-utils/pull/518)) - BigQuery materialized views work correctly with `get_relations_by_pattern()` ([#525](https://github.com/dbt-labs/dbt-utils/pull/525)) ## Quality of life diff --git a/README.md b/README.md index 4c149a82..97660630 100644 --- a/README.md +++ b/README.md @@ -736,16 +736,44 @@ This macro returns the sql required to build a date spine. The spine will includ ``` #### deduplicate ([source](macros/sql/deduplicate.sql)) -This macro returns the sql required to remove duplicate rows from a model or source. +This macro returns the sql required to remove duplicate rows from a model, source, or CTE. + +**Args:** + - `relation` (required): a [Relation](https://docs.getdbt.com/reference/dbt-classes#relation) (a `ref` or `source`) or string which identifies the model to deduplicate. + - `partition_by` (required): column names (or expressions) to use to identify a set/window of rows out of which to select one as the deduplicated row. + - `order_by` (required): column names (or expressions) that determine the priority order of which row should be chosen if there are duplicates (comma-separated string). *NB.* if this order by clause results in ties then which row is returned may be nondeterministic across runs. **Usage:** ``` {{ dbt_utils.deduplicate( relation=source('my_source', 'my_table'), - group_by="user_id, cast(timestamp as day)", + partition_by='user_id, cast(timestamp as day)', order_by="timestamp desc", - relation_alias="my_cte" + ) +}} +``` + +``` +{{ dbt_utils.deduplicate( + relation=ref('my_model'), + partition_by='user_id', + order_by='effective_date desc, effective_sequence desc', + ) +}} +``` + +``` +with my_cte as ( + select * + from {{ source('my_source', 'my_table') }} + where user_id = 1 +) + +{{ dbt_utils.deduplicate( + relation='my_cte', + partition_by='user_id, cast(timestamp as day)', + order_by='timestamp desc', ) }} ``` diff --git a/integration_tests/models/sql/schema.yml b/integration_tests/models/sql/schema.yml index e5927fbf..a74b5d63 100644 --- a/integration_tests/models/sql/schema.yml +++ b/integration_tests/models/sql/schema.yml @@ -172,3 +172,8 @@ models: tests: - dbt_utils.equality: compare_model: ref('data_deduplicate_expected') + + - name: test_deduplicate_deprecated + tests: + - dbt_utils.equality: + compare_model: ref('data_deduplicate_expected') diff --git a/integration_tests/models/sql/test_deduplicate.sql b/integration_tests/models/sql/test_deduplicate.sql index 81fe81e7..226555b1 100644 --- a/integration_tests/models/sql/test_deduplicate.sql +++ b/integration_tests/models/sql/test_deduplicate.sql @@ -10,10 +10,9 @@ deduped as ( {{ dbt_utils.deduplicate( - ref('data_deduplicate'), - group_by='user_id', + 'source', + partition_by='user_id', order_by='version desc', - relation_alias="source" ) | indent }} diff --git a/integration_tests/models/sql/test_deduplicate_deprecated.sql b/integration_tests/models/sql/test_deduplicate_deprecated.sql new file mode 100644 index 00000000..b80db2cb --- /dev/null +++ b/integration_tests/models/sql/test_deduplicate_deprecated.sql @@ -0,0 +1,22 @@ +with + +source as ( + select * + from {{ ref('data_deduplicate') }} + where user_id = 1 +), + +deduped as ( + + {{ + dbt_utils.deduplicate( + ref('data_deduplicate'), + group_by='user_id', + order_by='version desc', + relation_alias='source', + ) | indent + }} + +) + +select * from deduped diff --git a/macros/sql/deduplicate.sql b/macros/sql/deduplicate.sql index 9a3571a2..64501740 100644 --- a/macros/sql/deduplicate.sql +++ b/macros/sql/deduplicate.sql @@ -1,23 +1,102 @@ -{%- macro deduplicate(relation, group_by, order_by=none, relation_alias=none) -%} - {{ return(adapter.dispatch('deduplicate', 'dbt_utils')(relation, group_by, order_by=order_by, relation_alias=relation_alias)) }} +{%- macro deduplicate(relation, partition_by, order_by=none, relation_alias=none) -%} + + {%- set error_message_group_by -%} +Warning: the `group_by` parameter of the `deduplicate` macro is no longer supported and will be deprecated in a future release of dbt-utils. +Use `partition_by` instead. +The {{ model.package_name }}.{{ model.name }} model triggered this warning. + {%- endset -%} + + {% if kwargs.get('group_by') %} + {%- do exceptions.warn(error_message_group_by) -%} + {%- endif -%} + + {%- set error_message_order_by -%} +Warning: `order_by` as an optional parameter of the `deduplicate` macro is no longer supported and will be deprecated in a future release of dbt-utils. +Supply a non-null value for `order_by` instead. +The {{ model.package_name }}.{{ model.name }} model triggered this warning. + {%- endset -%} + + {% if not order_by %} + {%- do exceptions.warn(error_message_order_by) -%} + {%- endif -%} + + {%- set error_message_alias -%} +Warning: the `relation_alias` parameter of the `deduplicate` macro is no longer supported and will be deprecated in a future release of dbt-utils. +If you were using `relation_alias` to point to a CTE previously then you can now pass the alias directly to `relation` instead. +The {{ model.package_name }}.{{ model.name }} model triggered this warning. + {%- endset -%} + + {% if relation_alias %} + {%- do exceptions.warn(error_message_alias) -%} + {%- endif -%} + + {% set partition_by = partition_by or kwargs.get('group_by') %} + {% set relation = relation_alias or relation %} + {% set order_by = order_by or "'1'" %} + + {{ return(adapter.dispatch('deduplicate', 'dbt_utils')(relation, partition_by, order_by)) }} {% endmacro %} -{%- macro default__deduplicate(relation, group_by, order_by=none, relation_alias=none) -%} +{%- macro default__deduplicate(relation, partition_by, order_by) -%} - select - {{ dbt_utils.star(relation, relation_alias='deduped') | indent }} - from ( + with row_numbered as ( select _inner.*, row_number() over ( - partition by {{ group_by }} - {% if order_by is not none -%} + partition by {{ partition_by }} order by {{ order_by }} - {%- endif %} ) as rn - from {{ relation if relation_alias is none else relation_alias }} as _inner - ) as deduped - where deduped.rn = 1 + from {{ relation }} as _inner + ) + + select + distinct data.* + from {{ relation }} as data + {# + -- Not all DBs will support natural joins but the ones that do include: + -- Oracle, MySQL, SQLite, Redshift, Teradata, Materialize, Databricks + -- Apache Spark, SingleStore, Vertica + -- Those that do not appear to support natural joins include: + -- SQLServer, Trino, Presto, Rockset, Athena + #} + natural join row_numbered + where row_numbered.rn = 1 + +{%- endmacro -%} + +{# Redshift should use default instead of Postgres #} +{% macro redshift__deduplicate(relation, partition_by, order_by) -%} + + {{ return(dbt_utils.default__deduplicate(relation, partition_by, order_by=order_by)) }} + +{% endmacro %} + +{# +-- Postgres has the `DISTINCT ON` syntax: +-- https://www.postgresql.org/docs/current/sql-select.html#SQL-DISTINCT +#} +{%- macro postgres__deduplicate(relation, partition_by, order_by) -%} + + select + distinct on ({{ partition_by }}) * + from {{ relation }} + order by {{ partition_by }}{{ ',' ~ order_by }} + +{%- endmacro -%} + +{# +-- Snowflake has the `QUALIFY` syntax: +-- https://docs.snowflake.com/en/sql-reference/constructs/qualify.html +#} +{%- macro snowflake__deduplicate(relation, partition_by, order_by) -%} + + select * + from {{ relation }} + qualify + row_number() over ( + partition by {{ partition_by }} + order by {{ order_by }} + ) = 1 {%- endmacro -%} @@ -26,21 +105,18 @@ -- clause in BigQuery: -- https://github.com/dbt-labs/dbt-utils/issues/335#issuecomment-788157572 #} -{%- macro bigquery__deduplicate(relation, group_by, order_by=none, relation_alias=none) -%} +{%- macro bigquery__deduplicate(relation, partition_by, order_by) -%} - select - {{ dbt_utils.star(relation, relation_alias='deduped') | indent }} + select unique.* from ( select array_agg ( original - {% if order_by is not none -%} order by {{ order_by }} - {%- endif %} limit 1 - )[offset(0)] as deduped - from {{ relation if relation_alias is none else relation_alias }} as original - group by {{ group_by }} + )[offset(0)] unique + from {{ relation }} original + group by {{ partition_by }} ) {%- endmacro -%}