diff --git a/.github/scripts/transform_dbt_test_results.py b/.github/scripts/transform_dbt_test_results.py
index 5503341aa..d2e679264 100644
--- a/.github/scripts/transform_dbt_test_results.py
+++ b/.github/scripts/transform_dbt_test_results.py
@@ -86,9 +86,6 @@
 import simplejson as json
 import yaml
 
-# Tests without a config.meta.category property will be grouped in
-# this default category
-DEFAULT_TEST_CATEGORY = "miscellaneous"
 # Prefix for the URL location of a test in the dbt docs
 DOCS_URL_PREFIX = "https://ccao-data.github.io/data-architecture/#!/test"
 # The S3 bucket where Athena query results are stored
@@ -108,8 +105,9 @@
 CLASS_FIELD = "class"
 WHO_FIELD = "who"
 WEN_FIELD = "wen"
-# Overrides for default display names for dbt tests
-CUSTOM_TEST_NAMES = {
+# Mapping that defines the category names that should be reported for tests
+# based on the generics they use
+TEST_CATEGORIES = {
     "macro.athena.test_accepted_range": "incorrect_values",
     "macro.dbt_utils.test_accepted_range": "incorrect_values",
     "macro.athena.test_accepted_values": "incorrect_values",
@@ -121,6 +119,9 @@
     "macro.athena.test_is_null": "missing_values",
     "macro.athena.test_res_class_matches_pardat": "class_mismatch_or_issue",
 }
 
+# Fallback for tests whose category we can't determine from the test name,
+# the `meta.category` attribute, or the TEST_CATEGORIES mapping
+DEFAULT_TEST_CATEGORY = "miscellaneous"
 # Directory to store failed test caches
 TEST_CACHE_DIR = "test_cache"
@@ -1183,7 +1184,7 @@ def get_category_from_node(node: typing.Dict) -> str:
         return meta_category
 
     for dependency_macro in node["depends_on"]["macros"]:
-        if custom_test_name := CUSTOM_TEST_NAMES.get(dependency_macro):
-            return custom_test_name
+        if test_category := TEST_CATEGORIES.get(dependency_macro):
+            return test_category
     # Custom generic tests are always formatted like
     # macro.dbt.test_<test_name>
diff --git a/dbt/README.md b/dbt/README.md
index ac52f1b2d..bbc5c64a4 100644
--- a/dbt/README.md
+++ b/dbt/README.md
@@ -11,6 +11,7 @@ This directory stores the configuration for building our data catalog using
 * [๐Ÿ”จ How to rebuild models using GitHub Actions](#-how-to-rebuild-models-using-github-actions)
 * [๐Ÿ’ป How to develop the catalog](#-how-to-develop-the-catalog)
 * [โž• How to add a new model](#-how-to-add-a-new-model)
+* [๐Ÿ“ How to add and run tests](#-how-to-add-and-run-tests)
 * [๐Ÿ› Debugging tips](#-debugging-tips)
 
 ### Outside this document
 
@@ -18,6 +19,7 @@ This directory stores the configuration for building our data catalog using
 * [๐Ÿ“– Data documentation](https://ccao-data.github.io/data-architecture)
 * [๐Ÿ“ Design doc for our decision to develop our catalog with dbt](../documentation/design-docs/data-catalog.md)
+* [๐Ÿงช Generic tests we use for testing](./tests/generic/README.md)
 
 ## ๐Ÿ–ผ๏ธ Background: What does the data catalog do?
@@ -409,29 +411,144 @@ We use the following pattern to determine where to define each column description
 
 ### Model tests
 
-Any assumptions underlying the new model should be documented in the form of
-[dbt tests](https://docs.getdbt.com/docs/build/tests). We prefer adding tests
-inline in `schema.yml` model properties files, as opposed to defining one-off
-tests in the `tests/` directory.
+New models should generally be added with accompanying tests to ensure the
+underlying data and transformations are correct. For more information on
+testing, see [๐Ÿ“ How to add and run tests](#-how-to-add-and-run-tests).
 
-Conceptually, there are two types of tests that we might consider for a new
-model:
+## ๐Ÿ“ How to add and run tests
 
-1. **Data tests** check that the assumptions that a model makes about the raw
-   data it is transforming are correct.
-   * For example: Test that the table is unique by `pin10` and `year`.
-2. **Unit tests** check that the transformation logic itself produces
-   the correct output on a hardcoded set of input data.
+We test our data and our transformations using [dbt
+tests](https://docs.getdbt.com/docs/build/tests). We prefer adding tests
+inline in `schema.yml` config files using [generic
+tests](https://docs.getdbt.com/best-practices/writing-custom-generic-tests),
+rather than [singular
+tests](https://docs.getdbt.com/docs/build/data-tests#singular-data-tests).
+
+### Data tests vs. unit tests
+
+There are two types of tests that we might consider for a model:
+
+1. **Data tests** check that our assumptions about our raw data are correct
+   * For example: Test that a table is unique by `parid` and `taxyr`
+2. **Unit tests** check that transformation logic inside a model definition
+   produces the correct output on a specific set of input data
+   * For example: Test that an enum column computed by a `CASE... WHEN`
-     expression produces the correct enum output for a given input string.
-
-The dbt syntax does not distinguish between data and unit tests, but it has
-emerged as a valuable distinction that we make on our side. Data tests are
-generally much easier to define and to implement, since they operate directly on
-source data and do not require hardcoded input and output values to execute.
-Due to this complexity, we currently do not have a way of supporting unit
-tests, although we plan to revisit this in the future; as such, when proposing
-new tests, check to ensure that they are in fact data tests and not unit tests.
+     expression produces the correct output for a given input string
+
+dbt tests are data tests by default, and a dedicated unit testing syntax
+[is coming soon](https://docs.getdbt.com/docs/build/unit-tests). Until unit
+tests are natively supported, however, we have no way of implementing them,
+so make sure that any new tests you propose are data tests and not unit tests.
+
+### Adding data tests
+
+There are two types of data tests that we support:
+
+1. **QC tests** check our assumptions about iasWorld data and run at
+   scheduled intervals to confirm that the data meets spec
+2. **Non-QC tests** check all other assumptions about data sources outside
+   of iasWorld, and run on an ad hoc basis depending on the needs of
+   the transformations that sit on top of the raw data
+
+#### Adding QC tests
+
+QC tests are run on a schedule by the [`test-dbt-models`
+workflow](https://github.com/ccao-data/data-architecture/actions/workflows/test_dbt_models.yaml)
+and their output is interpreted by the [`transform_dbt_test_results`
+script](https://github.com/ccao-data/data-architecture/blob/master/.github/scripts/transform_dbt_test_results.py).
+This script reads the metadata for a test run and outputs an Excel
+workbook with detailed information on each failure to aid in resolving
+any data problems that the tests reveal.
+
+There are a few specific modifications a test author needs to make to
+ensure that QC tests can be run by the workflow and interpreted by the script:
+
+* Either the test or the model that the test is defined on must be
+  [tagged](https://docs.getdbt.com/reference/resource-configs/tags) with
+  the tag `test_qc_iasworld`
+  * Prefer tagging the model, and fall back to tagging the test if for
+    some reason the model cannot be tagged (e.g. if it has some non-QC
+    tests defined on it)
+* The test definition must supply a few specific parameters:
+  * `name` must be set and follow the pattern
+    `iasworld_<table_name>_<description>`
+  * `additional_select_columns` must be set to an array of strings
+    representing any extra columns that need to be output by the test
+    for display in the workbook
+    * Generics typically select any columns mentioned by other parameters,
+      but if you are unsure which columns will be selected by default
+      (meaning they do not need to be included in `additional_select_columns`),
+      consult our [documentation](./tests/generic/README.md) for the generic
+      test you're using
+  * `config.where` should typically be set to provide a filter expression
+    that restricts the test to unique rows and to rows matching a date range
+    set by the `test_qc_year_start` and `test_qc_year_end`
+    [project variables](https://docs.getdbt.com/docs/build/project-variables)
+  * `meta` should be set with a few specific string attributes:
+    * `description` (required): A short human-readable description of the test
+    * `category` (optional): A workbook category for the test, required if
+      a category is not defined for the test's generic in the `TEST_CATEGORIES`
+      constant in the [`transform_dbt_test_results`
+      script](https://github.com/ccao-data/data-architecture/blob/master/.github/scripts/transform_dbt_test_results.py)
+    * `table_name` (optional): The name of the table to report in the output
+      workbook, if the workbook should report a different table name than the
+      name of the model that the test is defined on
+
+See the [`iasworld_pardat_class_in_ccao_class_dict`
+test](https://github.com/ccao-data/data-architecture/blob/bd4bc1769fe33fdba1dbe827791b5c41389cf6ec/dbt/models/iasworld/schema/iasworld.pardat.yml#L78-L96)
+for an example of a test that sets these attributes.
+
+Due to the similarity of parameters defined on QC tests, we make extensive use
+of YAML anchors and aliases to define symbols for commonly-used values.
+See [here](https://support.atlassian.com/bitbucket-cloud/docs/yaml-anchors/)
+for a brief explanation of the YAML anchor and alias syntax.
+
+#### Adding non-QC tests
+
+QC tests are much more common than non-QC tests in our test suite. If you
+are asked to add a test that appears to be a non-QC test, double-check with
+the person who assigned it to you and ask them when and how the test should
+be run so that its attributes can be set accordingly.
+
+### Choosing a generic test
+
+Writing a test in a `schema.yml` file requires a [generic
+test](https://docs.getdbt.com/best-practices/writing-custom-generic-tests)
+to define the underlying test logic. Our generic tests are defined
+in the `tests/generic/` directory. Before writing a test, look at
+[the documentation for our generics](./tests/generic/README.md) to see if
+any of them meet your needs.
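+
+As a reference, here is a rough sketch of what a test definition that uses a
+generic and follows the QC conventions described above might look like. The
+model, column, values, and filter expression below are hypothetical, not a
+real test from our suite:
+
+```yaml
+models:
+  - name: pardat
+    # Prefer tagging the model so the QC workflow picks up its tests
+    config:
+      tags:
+        - test_qc_iasworld
+    columns:
+      - name: class
+        tests:
+          - accepted_values:
+              # Follows the `iasworld_<table_name>_<description>` pattern
+              name: iasworld_pardat_class_in_expected_set
+              values: ["202", "203", "204"]
+              # Extra columns to display in the output workbook
+              additional_select_columns:
+                - parid
+                - taxyr
+              config:
+                # Restrict the test to the QC date range (illustrative)
+                where: CAST(taxyr AS int) BETWEEN {{ var('test_qc_year_start') }} AND {{ var('test_qc_year_end') }}
+              meta:
+                description: class should be in the expected set of values
+                # `category` is omitted here because `accepted_values` has
+                # a default category in the TEST_CATEGORIES constant
+```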
+
+If a generic test does not quite meet your needs but seems like it could
+easily be extended to do so (say, if it inner joins two tables but you would
+like to be able to configure it to left join those tables instead), you can
+modify the macro that defines the generic test as part of your PR to make
+the change that you need.
+
+If no generic test meets your needs and none can be easily modified to
+do so, you have two options:
+
+1. **Define a new model in the `models/qc/` directory that _can_ use a pre-existing generic**.
+   This is a good option if, say, you need to join two or more tables in a
+   complex way that is specific to your test and not easily generalizable.
+   With this approach, you can perform that join in the model, and then
+   the generic test doesn't need to know anything about it.
+2. **Write a new generic test**. This is a good option if you think that the
+   logic you need for your test will be easily generalizable to other models
+   and other tests. If you decide to take this approach, make sure to read
+   the docs on [writing custom generic
+   tests](https://docs.getdbt.com/best-practices/writing-custom-generic-tests).
+   You'll also need to follow a few extra steps that are specific
+   to our environment:
+   1. Add a default category for your generic test in
+      the `TEST_CATEGORIES` constant in the [`transform_dbt_test_results`
+      script](https://github.com/ccao-data/data-architecture/blob/master/.github/scripts/transform_dbt_test_results.py)
+   2. Make sure that your generic test supports the `additional_select_columns`
+      parameter that most of our generic tests support, making use
+      of the `format_additional_select_columns` macro to format the
+      parameter when applying it to your `SELECT` clause
 
 ## ๐Ÿ› Debugging tips
diff --git a/dbt/models/default/schema/default.vw_pin_universe.yml b/dbt/models/default/schema/default.vw_pin_universe.yml
index a6fa3168d..a4e6b36bc 100644
--- a/dbt/models/default/schema/default.vw_pin_universe.yml
+++ b/dbt/models/default/schema/default.vw_pin_universe.yml
@@ -81,6 +81,15 @@ models:
         description: '{{ doc("column_chicago_police_district_num") }}'
       - name: class
         description: '{{ doc("shared_column_class") }}'
+        tests:
+          - count_is_consistent:
+              name: default_vw_pin_universe_class_count_is_consistent_by_year
+              group_column: year
+              config:
+                error_if: ">25"
+          - not_accepted_values:
+              name: default_vw_pin_universe_class_no_hyphens
+              values: "2-99"
       - name: cook_board_of_review_district_data_year
         description: '{{ doc("shared_column_data_year") }}'
       - name: cook_board_of_review_district_num
@@ -209,6 +218,10 @@ models:
         description: '{{ doc("column_tax_district_num") }}'
       - name: township_code
         description: '{{ doc("shared_column_township_code") }}'
+        tests:
+          - count_is_consistent:
+              name: default_vw_pin_universe_town_count_is_consistent_by_year
+              group_column: year
       - name: township_name
         description: '{{ doc("shared_column_township_name") }}'
       - name: triad_code
@@ -233,18 +246,6 @@ models:
         description: ZIP code of the property
 
     tests:
-      # Number of classes is consistent over time
-      - count_is_consistent:
-          name: default_vw_pin_universe_class_count_is_consistent_by_year
-          group_column: year
-          count_column: class
-          config:
-            error_if: ">25"
-      # Number of towns is consistent over time
-      - count_is_consistent:
-          name: default_vw_pin_universe_town_count_is_consistent_by_year
-          group_column: year
-          count_column: township_code
       - unique_combination_of_columns:
           name: default_vw_pin_universe_unique_by_14_digit_pin_and_year
          combination_of_columns:
@@ -259,9 +260,5 @@ models:
           expression: REGEXP_COUNT(pin, '[0-9]') = 14 AND LENGTH(pin) = 14
           additional_select_columns:
             - pin
-      - not_accepted_values:
-          name: default_vw_pin_universe_class_no_hyphens
-          column_name: class
-          values: "2-99"
       # TODO: Data completeness correlates with availability of spatial data
       # by year
diff --git a/dbt/models/iasworld/schema/iasworld.legdat.yml b/dbt/models/iasworld/schema/iasworld.legdat.yml
index 4bde7cff4..56921f833 100644
--- a/dbt/models/iasworld/schema/iasworld.legdat.yml
+++ b/dbt/models/iasworld/schema/iasworld.legdat.yml
@@ -343,7 +343,7 @@ sources:
             description: adrno should be <= 5 characters long
       - column_length:
           name: iasworld_legdat_adrsuf_zip1_taxdist_length_lte_5
-          columns:
+          column_names:
            - adrsuf
            - zip1
            - taxdist
diff --git a/dbt/models/location/schema.yml b/dbt/models/location/schema.yml
index 5da4c6da7..75d3d9f34 100644
--- a/dbt/models/location/schema.yml
+++ b/dbt/models/location/schema.yml
@@ -250,7 +250,7 @@ models:
       - column_length:
           name: location_vw_pin10_location_7_digit_ids_are_correct_length
          length: 7
-          columns:
+          column_names:
           - census_place_geoid
           - census_puma_geoid
           - census_school_district_elementary_geoid
diff --git a/dbt/tests/generic/README.md b/dbt/tests/generic/README.md
new file mode 100644
index 000000000..c3bf4bf15
--- /dev/null
+++ b/dbt/tests/generic/README.md
@@ -0,0 +1,328 @@
+# Generic tests
+
+This directory stores the [custom generic
+tests](https://docs.getdbt.com/best-practices/writing-custom-generic-tests) that we use
+to define our test suite.
+
+## Available generic tests
+
+- [`test_accepted_range`](#test_accepted_range)
+- [`test_accepted_values`](#test_accepted_values)
+- [`test_column_is_subset_of_external_column`](#test_column_is_subset_of_external_column)
+- [`test_column_length`](#test_column_length)
+- [`test_columns_match`](#test_columns_match)
+- [`test_count_is_consistent`](#test_count_is_consistent)
+- [`test_expression_is_true`](#test_expression_is_true)
+- [`test_is_null`](#test_is_null)
+- [`test_no_extra_whitespace`](#test_no_extra_whitespace)
+- [`test_not_accepted_values`](#test_not_accepted_values)
+- [`test_not_null`](#test_not_null)
+- [`test_relationships`](#test_relationships)
+- [`test_res_class_matches_pardat`](#test_res_class_matches_pardat)
+- [`test_row_count`](#test_row_count)
+- [`test_row_values_match_after_join`](#test_row_values_match_after_join)
+- [`test_sequential_values`](#test_sequential_values)
+- [`test_unique_combination_of_columns`](#test_unique_combination_of_columns)
+- [`test_value_is_present`](#test_value_is_present)
+
+### `test_accepted_range`
+
+Asserts that a column's values fall inside an expected range. Any combination of `min_value`
+and `max_value` is allowed, and the range can be inclusive or exclusive.
+
+**Parameters**:
+
+* `min_value` (optional number): Lower bound for the range. Defaults to no lower bound.
+* `max_value` (optional number): Upper bound for the range. Defaults to no upper bound.
+* `inclusive` (optional boolean): Whether the range is inclusive. Defaults to true.
+* `additional_select_columns` (optional list of strings): Additional columns to select for
+  failure output. The column the test is defined on will always be selected regardless
+  of this value.
+
+### `test_accepted_values`
+
+Asserts that a column's values are all present in a canonical list of `values`. The
+opposite of [`test_not_accepted_values`](#test_not_accepted_values).
+
+**Parameters**:
+
+* `values` (required list of any value): Canonical list of allowed values.
+* `quote` (optional boolean): Whether to single-quote all elements of `values`, i.e.
+  whether to convert them to strings. Defaults to true.
+* `additional_select_columns` (optional list of strings): Additional columns to select for
+  failure output. The column the test is defined on will always be selected regardless
+  of this value.
+
+### `test_column_is_subset_of_external_column`
+
+Asserts that a column is a subset of a column in an external relation. Rows that have no
+match in the external relation's column will be flagged as failures.
+
+**Parameters**:
+
+* `external_model` (required string): The external relation to use for comparison. Use a
+  [`ref()`](https://docs.getdbt.com/reference/dbt-jinja-functions/ref) or
+  [`source()`](https://docs.getdbt.com/reference/dbt-jinja-functions/source) call to
+  specify this relation so that the DAG can understand the relationship.
+* `external_column` (required string): The name of the column on `external_model` to
+  use for comparison.
+* `additional_select_columns` (optional list of strings): Additional columns to select for
+  failure output. The column the test is defined on will always be selected regardless
+  of this value.
+
+### `test_column_length`
+
+Asserts that all columns in a list of `column_names` have the correct length. Returns columns with
+the naming pattern `len_<column_name>` representing the length of each column for all rows where
+one of the columns has an incorrect length, e.g. if `column_names = ["foo", "bar"]` the test will
+return two additional columns named `len_foo` and `len_bar`.
+
+Since this test operates on a list of `column_names` instead of a scalar `column_name`, it must
+be defined on the table level rather than on the column level.
+
+**Parameters**:
+
+* `column_names` (required list of strings): The list of columns to check for proper length.
+* `length` (required integer): The length that the column values should be.
+* `additional_select_columns` (optional list of strings): Additional columns to select for
+  failure output. The columns in `column_names` and the autogenerated `len_<column_name>`
+  columns will always be selected regardless of this value.
+
+### `test_columns_match`
+
+Asserts that two or more columns in the same relation have the same value for each row.
+
+**Parameters**:
+
+* `matching_column_names` (required list of strings): The list of columns to check for
+  identical values.
+* `additional_select_columns` (optional list of strings): Additional columns to select for
+  failure output. The column the test is defined on and the columns defined in
+  `matching_column_names` will always be selected regardless of this value.
+
+### `test_count_is_consistent`
+
+Asserts that the count of a given column is the same when grouped by another column, for
+example that the number of distinct township codes is the same across years. Returns
+the grouping column and a column called `count` with the count of rows for that group.
+
+**Parameters**:
+
+* `group_column` (required string): The column to use for grouping.
+
+### `test_expression_is_true`
+
+Asserts that a valid SQL expression is true for all rows. In other words, filters for
+rows where a given `expression` is false. Often useful for idiosyncratic comparisons
+across columns that are not easily generalized into generic tests.
+
+**Parameters**:
+
+* `expression` (required string): A valid SQL expression to apply to the column or table.
+* `additional_select_columns` (optional list of strings): Additional columns to select for
+  failure output. The column the test is defined on will always be selected regardless
+  of this value.
+
+### `test_is_null`
+
+Asserts that a column contains only null values. The opposite of [`test_not_null`](#test_not_null).
+
+**Parameters**:
+
+* `additional_select_columns` (optional list of strings): Additional columns to select for
+  failure output. The column the test is defined on will always be selected regardless
+  of this value.
+
+### `test_no_extra_whitespace`
+
+Asserts that one or more string columns do not contain extraneous whitespace. Returns all
+of the columns that are configured in `column_names`.
+
+Since this test operates on a list of `column_names` instead of a scalar `column_name`, it must
+be defined on the table level rather than on the column level.
+
+**Parameters**:
+
+* `column_names` (required list of strings): The list of columns to check for extra whitespace.
+* `allow_interior_space` (optional boolean): If true, only check for leading and trailing
+  whitespace; otherwise, also check for multiple consecutive spaces in the interior
+  of the string. Defaults to false.
+
+### `test_not_accepted_values`
+
+Asserts that there are no rows that match the given values. The opposite of
+[`test_accepted_values`](#test_accepted_values).
+
+**Parameters**:
+
+* `values` (required list of any value): Canonical list of disallowed values.
+* `quote` (optional boolean): Whether to single-quote all elements of `values`, i.e.
+  whether to convert them to strings. Defaults to true.
+* `additional_select_columns` (optional list of strings): Additional columns to select for
+  failure output. The column the test is defined on will always be selected regardless
+  of this value.
+
+### `test_not_null`
+
+Asserts that there are no null values present in a column. The opposite of
+[`test_is_null`](#test_is_null).
+
+**Parameters**:
+
+* `additional_select_columns` (optional list of strings): Additional columns to select for
+  failure output. The column the test is defined on will always be selected regardless
+  of this value.
+
+### `test_relationships`
+
+Asserts that all of the records in a child table have a corresponding record in a parent table.
+This property is referred to as "referential integrity".
+
+**Parameters**:
+
+* `to` (required string): The external relation to use for comparison. Use a
+  [`ref()`](https://docs.getdbt.com/reference/dbt-jinja-functions/ref) or
+  [`source()`](https://docs.getdbt.com/reference/dbt-jinja-functions/source) call to
+  specify this relation so that the DAG can understand the relationship.
+* `field` (required string): The name of the column on `to` to use for comparison.
+* `additional_select_columns` (optional list of strings): Additional columns to select for
+  failure output. The column the test is defined on will always be selected regardless
+  of this value.
+
+### `test_res_class_matches_pardat`
+
+For all residential parcels in a given model, test that there is at least one
+class code that matches a class code for that parcel in `iasworld.pardat`.
+
+The test filters for residential parcels by anti-joining the model against `comdat`
+using `parid` and `taxyr`; as a result, it filters out mixed-use parcels as well.
+
+**Parameters**:
+
+* `major_class_only` (optional boolean): Compare only the first digit of classes. When
+  set to false, compare the first three digits instead. Defaults to false.
+* `parid_column_name` (optional string): The name of the column on the base model that
+  corresponds to `pardat.parid`, in case the model uses a different name scheme. Defaults
+  to `parid`.
+* `taxyr_column_name` (optional string): The name of the column on the base model that
+  corresponds to `pardat.taxyr`, in case the model uses a different name scheme. Defaults
+  to `taxyr`.
+* `join_type` (optional string): The type of join to use when joining to `pardat`, e.g.
+  `"inner"` or `"left"`. Defaults to `"left"`.
+* `additional_pardat_filter` (optional string): A SQL string representing additional conditions
+  to apply in the `WHERE` clause of the subquery that selects from `pardat` to join to the model,
+  e.g. `"class != 'EX' AND class != 'RR'"`. Note that `cur = 'Y'` and `deactivat IS NULL` are
+  already set prior to this parameter being applied, hence the "additional" in the param name.
+* `additional_select_columns` (optional list of dictionaries): Additional columns to
+  select for failure output. The column the test is defined on and `pardat.class` will always be
+  selected regardless of this value. Columns must be represented as dictionaries with the following
+  attributes:
+  * `column` (required string): The name of the column to select.
+  * `agg_func` (required string): The aggregation function to use for aggregating column values,
+    like `max` or `array_agg`. Necessary because results are automatically grouped by `parid` and
+    `taxyr`.
+  * `alias` (optional string): The name of the column to use for output. Necessary because
+    aggregation functions as represented by `agg_func` require aliases in SQL. Defaults to
+    `<agg_func>_<column>`.
+
+### `test_row_count`
+
+Asserts that row counts for a model or column are above a certain value.
+
+**Parameters**:
+
+* `above` (required integer): The minimum row count (inclusive) for the model or column.
+
+### `test_row_values_match_after_join`
+
+Asserts that row values match after joining two tables.
+
+Row values can be a subset of the values in the joined table, e.g. if a PIN in
+table A has one row with `class = "212"` and the same PIN in table B has two
+rows, one with `class = "212"` and one with `class = "211"`, then table A passes
+the test.
+
+**Parameters**:
+
+* `external_model` (required string): The name of the model to join to.
+* `external_column_name` (required string): The name of the column in
+  `external_model` to join to.
+* `join_condition` (required string): The `ON` (or `USING`) portion of a
+  `JOIN` clause, represented as a string. Note that in the case where `ON` is
+  used, columns in the base model should be formatted like `model.<column_name>`
+  while columns in the external model should be formatted like
+  `external_model.<column_name>`, e.g. `ON model.pin = external_model.parid`.
+  This is not necessary in the case of a `USING` expression, since `USING` does
+  not need to refer to table names for the purposes of namespacing columns.
+* `group_by` (optional list of strings): The columns from the base model to
+  pass to the `GROUP BY` clause used in the test query. Unlike
+  `join_condition`, these column names do not have to be prefixed with
+  `model.*`, since they are assumed to come from the base model for the
+  test and not the external model.
+* `join_type` (optional string): The type of join to use, e.g. `"inner"` or
+  `"left"`. Defaults to `"inner"`.
+* `column_alias` (optional string): An alias to use when selecting the column
+  from the base model for output. An alias is required in this case because
+  the column must be aggregated. Defaults to `"model_col"`.
Defaults to `"model_col"`. +* `external_column_alias` (optional string): An alias to use when selecting the + column from the external model for output. Defaults to `"external_model_col"`. +* `additional_select_columns` (optional list of dictionaries): Additional columns to + select for failure output. `model.`, `external_model.`, + and the columns specified in the `group_by` parameter will always be selected regardless + of this value. Columns must be represented as dictionaries with the following + attributes: + * `column` (required string): The name of the column to select. + * `agg_func` (required string): The aggregation function to use for aggregating column values, + like `max` or `array_agg`. Necessary because results are automatically grouped by the + columns specified in the `group_by` parameter. + * `alias` (optional string): The name of the column to use for output. Necessary because + aggregation functions as represented by `agg_func` require aliases in SQL. Defaults to + `_`. + +### `test_sequential_values` + +Asserts that a column contains sequential values. Can be used for both numeric values and +datetime values. + +**Parameters**: + +* `interval` (optional integer): The expected gap in units between two sequential values. + Defaults to `1`. +* `datepart` (optional string): When present, indicates that values are datetimes and + describes the unit of dates that should be used by `interval` to establish expected gaps, + e.g. `"hour"` or `"day"`. +* `group_by_columns` (optional list of strings): The group of columns to use for partitioning + in the window function that is used to lag the base column. Defaults to an empty list. +* `additional_select_columns` (optional list of strings): Additional columns to select for + failure output. The column the test is defined on, the columns in `group_by_columns`, + and value for the column in the preceding value in the sequence (aliased to + `previous_`) will always be selected regardless of this value. + +### `test_unique_combination_of_columns` + +Asserts that the combination of columns always produces unique rows in a relation. For example, +the combination of `parid` and `taxyr` might produce unique rows even though neither column is +unique in isolation. + +Since this test operates on a `combination_of_columns` list instead of a scalar `column_name`, +it must be defined on the table level rather than on the column level. + +**Parameters**: + +* `combination_of_columns` (required list of strings): One or more columns that are unique + as a group. +* `allowed_duplicates` (optional integer): The maximum number of duplicates that is considered + acceptable for the purposes of uniqueness. Defaults to 1. +* `additional_select_columns` (optional list of strings): Additional columns to select for + failure output. Regardless of this value, the columns defined by `combination_of_columns` + along with an automatically generated column `num_duplicates` will always be selected. + +### `test_value_is_present` + +Asserts that a given expression returns a non-zero number of rows. + +Since this test operates on an `expression` instead of a `column_name`, +it must be defined on the table level rather than on the column level. + +**Parameters**: + +* `expression` (required string): A valid SQL string representing the expression that + should return a non-zero number of rows. 
diff --git a/dbt/tests/generic/test_column_length.sql b/dbt/tests/generic/test_column_length.sql
index c955f3670..598936fb6 100644
--- a/dbt/tests/generic/test_column_length.sql
+++ b/dbt/tests/generic/test_column_length.sql
@@ -6,14 +6,14 @@
 -- If additional columns should be returned along with the length columns,
 -- e.g. to add an identifiable key for the row, use the
 -- additional_select_columns argument.
-{% test column_length(model, columns, length, additional_select_columns=[]) %}
+{% test column_length(model, column_names, length, additional_select_columns=[]) %}
     {%- set additional_select_columns_csv = format_additional_select_columns(
         additional_select_columns
     ) %}
 
     {%- set length_columns = [] %}
-    {%- for column in columns %}
+    {%- for column in column_names %}
        {%- set length_columns = length_columns.append([column, "len_" + column]) %}
     {%- endfor %}
diff --git a/dbt/tests/generic/test_columns_match.sql b/dbt/tests/generic/test_columns_match.sql
index a07aea8d4..0fdd854f9 100644
--- a/dbt/tests/generic/test_columns_match.sql
+++ b/dbt/tests/generic/test_columns_match.sql
@@ -1,6 +1,8 @@
 -- Confirm that columns in the same table have the same value
-{% test columns_match(model, column_name, columns, additional_select_columns=[]) %}
-    {%- set columns_csv = columns | join(", ") -%}
+{% test columns_match(
+    model, column_name, matching_column_names, additional_select_columns=[]
+) %}
+    {%- set columns_csv = matching_column_names | join(", ") -%}
     {%- set columns_csv = column_name ~ ", " ~ columns_csv -%}
     {%- if additional_select_columns -%}
         {%- set additional_select_columns_csv = format_additional_select_columns(
@@ -12,7 +14,7 @@
 select {{ columns_csv }}
 from {{ model }}
 where
-    {%- for column in columns %}
+    {%- for column in matching_column_names %}
     {{ column_name }} != {{ column }} {%- if not loop.last -%} and{% endif %}
     {%- endfor %}
 {% endtest %}
diff --git a/dbt/tests/generic/test_count_is_consistent.sql b/dbt/tests/generic/test_count_is_consistent.sql
index 7c37bad0f..5f96ad110 100644
--- a/dbt/tests/generic/test_count_is_consistent.sql
+++ b/dbt/tests/generic/test_count_is_consistent.sql
@@ -1,11 +1,11 @@
 -- Test that the count of a given column is the same when grouped by another
 -- col, for example that the number of distinct township codes is the same
 -- across years
-{% test count_is_consistent(model, group_column, count_column) %}
+{% test count_is_consistent(model, column_name, group_column) %}
     with counts as (
-        select {{ group_column }}, count(distinct({{ count_column }})) as cnt
+        select {{ group_column }}, count(distinct({{ column_name }})) as cnt
         from {{ model }}
         group by {{ group_column }}
     ),