ccao-data · jeancochrane · May 7, 2024 · May 2, 2024 · May 3, 2024 · May 6, 2024
@@ -86,9 +86,6 @@
 import simplejson as json
 import yaml
 
-# Tests without a config.meta.category property will be grouped in
-# this default category
-DEFAULT_TEST_CATEGORY = "miscellaneous"
 # Prefix for the URL location of a test in the dbt docs
 DOCS_URL_PREFIX = "https://ccao-data.github.io/data-architecture/#!/test"
 # The S3 bucket where Athena query results are stored
@@ -108,8 +105,9 @@
 CLASS_FIELD = "class"
 WHO_FIELD = "who"
 WEN_FIELD = "wen"
-# Overrides for default display names for dbt tests
-CUSTOM_TEST_NAMES = {
+# Mapping that defines category names that should be reported for tests
+# based on their generics
+TEST_CATEGORIES = {
     "macro.athena.test_accepted_range": "incorrect_values",
     "macro.dbt_utils.test_accepted_range": "incorrect_values",
     "macro.athena.test_accepted_values": "incorrect_values",
@@ -121,6 +119,9 @@
     "macro.athena.test_is_null": "missing_values",
     "macro.athena.test_res_class_matches_pardat": "class_mismatch_or_issue",
 }
+# Fallback for tests whose category we can't determine from either the
+# test name, the `meta.category` attribute, or the TEST_CATEGORIES mapping
+DEFAULT_TEST_CATEGORY = "miscellaneous"
 # Directory to store failed test caches
 TEST_CACHE_DIR = "test_cache"
 
@@ -1183,7 +1184,7 @@ def get_category_from_node(node: typing.Dict) -> str:
         return meta_category
 
     for dependency_macro in node["depends_on"]["macros"]:
-        if custom_test_name := CUSTOM_TEST_NAMES.get(dependency_macro):
+        if custom_test_name := TEST_CATEGORIES.get(dependency_macro):
             return custom_test_name
         # Custom generic tests are always formatted like
         # macro.dbt.test_<generic_name>

@@ -11,13 +11,15 @@ This directory stores the configuration for building our data catalog using
 * [🔨 How to rebuild models using GitHub Actions](#-how-to-rebuild-models-using-github-actions)
 * [💻 How to develop the catalog](#-how-to-develop-the-catalog)
 * [➕ How to add a new model](#-how-to-add-a-new-model)
+* [📝 How to add and run tests](#-how-to-add-and-run-tests)
 * [🐛 Debugging tips](#-debugging-tips)
 
 ### Outside this document
 
 * [📖 Data documentation](https://ccao-data.github.io/data-architecture)
 * [📝 Design doc for our decision to develop our catalog with
   dbt](../documentation/design-docs/data-catalog.md)
+* [🧪 Generic tests we use for testing](./tests/generic/README.md)
 
 ## 🖼️ Background: What does the data catalog do?
 
@@ -409,29 +411,144 @@ We use the following pattern to determine where to define each column descriptio
 
 ### Model tests
 
-Any assumptions underlying the new model should be documented in the form of
-[dbt tests](https://docs.getdbt.com/docs/build/tests). We prefer adding tests
-inline in `schema.yml` model properties files, as opposed to defining one-off
-tests in the `tests/` directory.
+New models should generally be added with accompanying tests to ensure the
+underlying data and transformations are correct. For more information on
+testing, see [📝 How to add and run tests](#-how-to-add-and-run-tests).
 
-Conceptually, there are two types of tests that we might consider for a new
-model:
+## 📝 How to add and run tests
 
-1. **Data tests** check that the assumptions that a model makes about the raw
-   data it is transforming are correct.
-    * For example: Test that the table is unique by `pin10` and `year`.
-2. **Unit tests** check that the transformation logic itself produces
-   the correct output on a hardcoded set of input data.
+We test our data and our transformations using [dbt
+tests](https://docs.getdbt.com/docs/build/tests). We prefer adding tests
+inline in `schema.yml` config files using [generic
+tests](https://docs.getdbt.com/best-practices/writing-custom-generic-tests),
+rather than [singular
+tests](https://docs.getdbt.com/docs/build/data-tests#singular-data-tests).
+
+### Data tests vs. unit tests
+
+There are two types of tests that we might consider for a model:
+
+1. **Data tests** check that our assumptions about our raw data are correct
+    * For example: Test that a table is unique by `parid` and `taxyr`
+2. **Unit tests** check that transformation logic inside a model definition
+   produces the correct output on a specific set of input data
     * For example: Test that an enum column computed by a `CASE... WHEN`
-      expression produces the correct enum output for a given input string.
-
-The dbt syntax does not distinguish between data and unit tests, but it has
-emerged as a valuable distinction that we make on our side. Data tests are
-generally much easier to define and to implement, since they operate directly on
-source data and do not require hardcoded input and output values to execute.
-Due to this complexity, we currently do not have a way of supporting unit
-tests, although we plan to revisit this in the future; as such, when proposing
-new tests, check to ensure that they are in fact data tests and not unit tests.
+      expression produces the correct output for a given input string
+
+dbt tests are data tests by default, although a dedicated unit testing syntax
+[is coming soon](https://docs.getdbt.com/docs/build/unit-tests). Until unit
+tests are natively supported, however, we do not have a way of implementing them.
     tables: 
       - name: ratio_stats 
         description: '{{ doc("table_ratio_stats") }}' 
         tests: 
           - expression_is_true: 
               name: reporting_ratio_stats_no_nulls 
               expression: | 
                 year IS NOT NULL 
                 AND triad IS NOT NULL 
                 AND geography_type IS NOT NULL 
                 AND property_group IS NOT NULL 
     tables: 
       - name: ratio_stats 
         description: '{{ doc("table_ratio_stats") }}' 
         tests: 
           - expression_is_true: 
               name: reporting_ratio_stats_no_nulls 
               expression: | 
                 year IS NOT NULL 
                 AND triad IS NOT NULL 
                 AND geography_type IS NOT NULL 
                 AND property_group IS NOT NULL 
+We plan to change this once unit testing is released, but for now, make sure that
+any new tests you write are data tests and not unit tests.
+
+### Adding data tests
+
+There are two types of data tests that we support:
+
+1. **QC tests** confirm our assumptions about iasWorld data and are run at
+   scheduled intervals to confirm that iasWorld data meets spec
+2. **Non-QC tests** confirm all other assumptions about data sources outside
+   of iasWorld, and are run in an ad hoc fashion depending on the needs of
+   the transformations that sit on top of the raw data
+
+#### Adding QC tests
+
+QC tests are run on a schedule by the [`test-dbt-models`
+workflow](https://github.com/ccao-data/data-architecture/actions/workflows/test_dbt_models.yaml)
+and their output is interpreted by the [`transform_dbt_test_results`
+script](https://github.com/ccao-data/data-architecture/blob/master/.github/scripts/transform_dbt_test_results.py).
+This script reads the metadata for a test run and outputs an Excel
+workbook with detailed information on each failure to aid in resolving
+any data problems that the tests reveal.
+
+There are a few specific modifications a test author needs to make to
+ensure that QC tests can be run by the workflow and interpreted by the script:
+
+* One of either the test or the model that the test is defined on must be
+[tagged](https://docs.getdbt.com/reference/resource-configs/tags) with
+the tag `test_qc_iasworld`
+  * Prefer tagging the model, and fall back to tagging the test if for
+    some reason the model cannot be tagged (e.g. if it has some non-QC
+    tests defined on it)
+* The test definition must supply a few specific parameters:
+  * `name` must be set and follow the pattern
+    `iasworld_<table_name>_<test_description>`
+  * `additional_select_columns` must be set to an array of strings
+    representing any extra columns that need to be output by the test
+    for display in the workbook
+    * Generics typically select any columns mentioned by other parameters,
+      but if you are unsure which columns will be selected by default
+      (meaning they do not need to be included in `additional_select_columns`),
+      consult our [documentation](./tests/generic/README.md) for the generic
+      test you're using
+  * `config.where` should typically set to provide a filter expression
+    that restricts tests to unique rows and to rows matching a date range
+    set by the `test_qc_year_start` and `test_qc_year_end`
+    [project variables](https://docs.getdbt.com/docs/build/project-variables)
+  * `meta` should be set with a few specific string attributes:
+    * `description` (required): A short human-readable description of the test
+    * `category` (optional): A workbook category for the test, required if
+      a category is not defined for the test's generic in the `TEST_CATEGORIES`
+      constant in the [`transform_dbt_test_results`
+      script](https://github.com/ccao-data/data-architecture/blob/master/.github/scripts/transform_dbt_test_results.py)
+    * `table_name` (optional): The name of the table to report in the output
+      workbook, if the workbook should report a different table name than the
+      name of the model that the test is defined on
+
+See the [`iasworld_pardat_class_in_ccao_class_dict`
+test](https://github.com/ccao-data/data-architecture/blob/bd4bc1769fe33fdba1dbe827791b5c41389cf6ec/dbt/models/iasworld/schema/iasworld.pardat.yml#L78-L96)
+for an example of a test that sets these attributes.
+
+Due to the similarity of parameters defined on QC tests, we make extensive use
+of YAML anchors and aliases to define symbols for commonly-used values.
+See [here](https://support.atlassian.com/bitbucket-cloud/docs/yaml-anchors/)
+for a brief explanation of the YAML anchor and alias syntax.
+
+#### Adding non-QC tests 
+
+QC tests are much more common than non-QC tests in our test suite. If you
+are being asked to add a test that appears to be a non-QC test, double
+check with the person who assigned the test to you and ask them when
+and how the test should be run so that its attributes can be set
+accordingly.
+
+### Choosing a generic test
+
+Writing a test in a `schema.yml` file requires a [generic
+test](https://docs.getdbt.com/best-practices/writing-custom-generic-tests)
+to define the underlying test logic. Our generic tests are defined
+in the `tests/generic/` directory. Before writing a test, look at
+[the documentation for our generics](./tests/generic/README.md) to see if
+any of them meet your needs.
+
+If a generic test does not meet your needs but seems like it could be
+easily extended to meet your needs (say, if it inner joins two tables
+but you would like to be able to configure it to left join those tables
+instead) you can modify the macro that defines the generic test as part
+of your PR to make the change that you need.
+
+If no generic tests meet your needs and none can be easily modified to
+do so, you have two options:
+
+1. **Define a new model in the `models/qc/` directory that _can_ use a pre-existing generic**.
+   This is a good option if, say, you need to join two or more tables in a
+   complex way that is specific to your test and not easily generalizable.
+   With this approach, you can perform that join in the model, and then
+   the generic test doesn't need to know anything about it.
+2. **Write a new generic test**. If you decide to take this approach,
+   make sure to read the docs on [writing custom generic
+   tests](https://docs.getdbt.com/best-practices/writing-custom-generic-tests).
+   This is a good option if you think that the logic you need
+   for your test will be easily generalizable to other models
+   and other tests. You'll also need to follow a few extra steps that are specific
+   to our environment:
+     1. Add a default category for your generic test in
+        the `TEST_CATEGORIES` constant in the [`transform_dbt_test_results`
+        script](https://github.com/ccao-data/data-architecture/blob/master/.github/scripts/transform_dbt_test_results.py)
+     2. Make sure that your generic test supports the `additional_select_columns`
+        parameter that most of our generic tests support, making use
+        of the `format_additional_select_columns` macro to format the
+        parameter when applying it to your `SELECT` condition
 
 ## 🐛 Debugging tips
 

@@ -81,6 +81,15 @@ models:
         description: '{{ doc("column_chicago_police_district_num") }}'
       - name: class
         description: '{{ doc("shared_column_class") }}'
+        tests:
+          - count_is_consistent:
+              name: default_vw_pin_universe_class_count_is_consistent_by_year
+              group_column: year
+              config:
+                error_if: ">25"
+          - not_accepted_values:
+              name: default_vw_pin_universe_class_no_hyphens
+              values: "2-99"
       - name: cook_board_of_review_district_data_year
         description: '{{ doc("shared_column_data_year") }}'
       - name: cook_board_of_review_district_num
@@ -209,6 +218,10 @@ models:
         description: '{{ doc("column_tax_district_num") }}'
       - name: township_code
         description: '{{ doc("shared_column_township_code") }}'
+        tests:
+          - count_is_consistent:
+              name: default_vw_pin_universe_town_count_is_consistent_by_year
+              group_column: year
       - name: township_name
         description: '{{ doc("shared_column_township_name") }}'
       - name: triad_code
@@ -233,18 +246,6 @@ models:
         description: ZIP code of the property
 
     tests:
-      # Number of classes is consistent over time
-      - count_is_consistent:
-          name: default_vw_pin_universe_class_count_is_consistent_by_year
-          group_column: year
-          count_column: class
-          config:
-            error_if: ">25"
-      # Number of towns is consistent over time
-      - count_is_consistent:
-          name: default_vw_pin_universe_town_count_is_consistent_by_year
-          group_column: year
-          count_column: township_code
       - unique_combination_of_columns:
           name: default_vw_pin_universe_unique_by_14_digit_pin_and_year
           combination_of_columns:
@@ -259,9 +260,5 @@ models:
           expression: REGEXP_COUNT(pin, '[0-9]') = 14 AND LENGTH(pin) = 14
           additional_select_columns:
             - pin
-      - not_accepted_values:
-          name: default_vw_pin_universe_class_no_hyphens
-          column_name: class
-          values: "2-99"
       # TODO: Data completeness correlates with availability of spatial data
       # by year
@@ -343,7 +343,7 @@ sources:
                 description: adrno should be <= 5 characters long
           - column_length:
               name: iasworld_legdat_adrsuf_zip1_taxdist_length_lte_5
-              columns:
+              column_names:
                 - adrsuf
                 - zip1
                 - taxdist

@@ -250,7 +250,7 @@ models:
       - column_length:
           name: location_vw_pin10_location_7_digit_ids_are_correct_length
           length: 7
-          columns:
+          column_names:
             - census_place_geoid
             - census_puma_geoid
             - census_school_district_elementary_geoid