From 47c7fd058680f38e2bb5858e28ec7c5f32b51c31 Mon Sep 17 00:00:00 2001 From: Lawrence Adams Date: Fri, 11 Oct 2024 16:08:02 +0100 Subject: [PATCH 1/5] feat: build location model properly using patient and organization as a source - uses locations from patients and organizations and runs a distinct ontop of them - uses a mixture of address and city as a natural key to join on in downstream models to have a valid `organization_id` --- models/intermediate/int__person.sql | 6 ++- models/omop/care_site.sql | 9 ++++- models/omop/location.sql | 61 ++++++++++++++++++++++------- 3 files changed, 59 insertions(+), 17 deletions(-) diff --git a/models/intermediate/int__person.sql b/models/intermediate/int__person.sql index 8d78421..7f0fc8b 100644 --- a/models/intermediate/int__person.sql +++ b/models/intermediate/int__person.sql @@ -19,7 +19,7 @@ SELECT WHEN upper(p.ethnicity) = 'NONHISPANIC' THEN 38003564 ELSE 0 END AS ethnicity_concept_id - , {{ dbt.cast("NULL", api.Column.translate_type("integer")) }} AS location_id + , loc.location_id , {{ dbt.cast("NULL", api.Column.translate_type("integer")) }} AS provider_id , {{ dbt.cast("NULL", api.Column.translate_type("integer")) }} AS care_site_id , p.patient_id AS person_source_value @@ -30,4 +30,8 @@ SELECT , p.ethnicity AS ethnicity_source_value , 0 AS ethnicity_source_concept_id FROM {{ ref('stg_synthea__patients') }} AS p +LEFT JOIN {{ ref('location') }} loc + -- Address and city provides enough entropy to join on safely + ON (p.patient_address = loc.address_1 AND p.patient_city = loc.city) + WHERE p.patient_gender IS NOT NULL diff --git a/models/omop/care_site.sql b/models/omop/care_site.sql index b1cadab..ae480ce 100644 --- a/models/omop/care_site.sql +++ b/models/omop/care_site.sql @@ -2,7 +2,12 @@ SELECT ROW_NUMBER() OVER (ORDER BY organization_id) AS care_site_id , organization_name AS care_site_name , 0 AS place_of_service_concept_id - , {{ dbt.cast("null", api.Column.translate_type("integer")) }} AS location_id + , loc.location_id , organization_id AS care_site_source_value , {{ dbt.cast("null", api.Column.translate_type("varchar")) }} AS place_of_service_source_value -FROM {{ ref('stg_synthea__organizations') }} +FROM {{ ref('stg_synthea__organizations') }} org +LEFT JOIN {{ ref('location') }} loc + ON + org.organization_name = loc.address_1 + AND org.organization_address = loc.address_2 + AND org.organization_city = loc.city diff --git a/models/omop/location.sql b/models/omop/location.sql index 9a0b959..5013584 100644 --- a/models/omop/location.sql +++ b/models/omop/location.sql @@ -1,15 +1,48 @@ +WITH unioned_location_sources AS ( + SELECT DISTINCT + p.patient_address AS address_1 + , {{ dbt.cast("null", api.Column.translate_type("varchar")) }} AS address_2 + , p.patient_city AS city + , s.state_abbreviation AS state + , p.patient_zip AS zip + , p.patient_county AS county + , {{ dbt.cast("null", api.Column.translate_type("varchar")) }} AS location_source_value + , {{ dbt.cast("null", api.Column.translate_type("integer")) }} AS country_concept_id + , {{ dbt.cast("null", api.Column.translate_type("varchar")) }} AS country_source_value + , p.patient_latitude AS latitude + , p.patient_longitude AS longitude + FROM {{ ref("stg_synthea__patients") }} AS p + LEFT JOIN {{ ref('stg_map__states') }} AS s ON p.patient_state = s.state_name + + UNION + + SELECT DISTINCT + organization_name AS address_1 + , organization_address AS address_2 + , organization_city AS city + , organization_state AS state + , organization_zip AS zip + , {{ dbt.cast("null", api.Column.translate_type("varchar")) }} AS county + , {{ dbt.cast("null", api.Column.translate_type("varchar")) }} AS location_source_value + , {{ dbt.cast("null", api.Column.translate_type("integer")) }} AS country_concept_id + , {{ dbt.cast("null", api.Column.translate_type("varchar")) }} AS country_source_value + , organization_latitude AS latitude + , organization_longitude AS longitude + FROM + {{ ref("stg_synthea__organizations") }} +) + SELECT - ROW_NUMBER() OVER () AS location_id - , {{ dbt.cast("null", api.Column.translate_type("varchar")) }} AS address_1 - , {{ dbt.cast("null", api.Column.translate_type("varchar")) }} AS address_2 - , p.patient_city AS city - , s.state_abbreviation AS state - , {{ dbt.cast("null", api.Column.translate_type("varchar")) }} AS county - , p.patient_zip AS zip - , p.patient_zip AS location_source_value - , {{ dbt.cast("null", api.Column.translate_type("integer")) }} AS country_concept_id - , {{ dbt.cast("null", api.Column.translate_type("varchar")) }} AS country_source_value - , {{ dbt.cast("null", api.Column.translate_type("decimal")) }} AS latitude - , {{ dbt.cast("null", api.Column.translate_type("decimal")) }} AS longitude -FROM {{ ref('stg_synthea__patients') }} AS p -LEFT JOIN {{ ref('stg_map__states') }} AS s ON p.patient_state = s.state_name + row_number() OVER (ORDER BY state, city, address_1) AS location_id + , address_1 + , address_2 + , city + , state + , zip + , county + , location_source_value + , country_concept_id + , country_source_value + , latitude + , longitude +FROM unioned_location_sources From 889aa1c68b99312d2b02d8cd7d130b5f966ce8d5 Mon Sep 17 00:00:00 2001 From: Lawrence Adams Date: Sun, 13 Oct 2024 04:46:25 +0100 Subject: [PATCH 2/5] refactor: use address hash to join onto int__locations --- models/intermediate/int__locations.sql | 59 ++++++++++++++++++++++++++ models/intermediate/int__person.sql | 11 ++++- models/omop/care_site.sql | 13 +++--- models/omop/location.sql | 36 +--------------- 4 files changed, 77 insertions(+), 42 deletions(-) create mode 100644 models/intermediate/int__locations.sql diff --git a/models/intermediate/int__locations.sql b/models/intermediate/int__locations.sql new file mode 100644 index 0000000..d09f988 --- /dev/null +++ b/models/intermediate/int__locations.sql @@ -0,0 +1,59 @@ +{% set address_columns = [ + "address_1", + "address_2", + "city", + "state", + "zip", + "county" + ] +%} + +WITH unioned_location_sources AS ( + SELECT DISTINCT + p.patient_address AS address_1 + , {{ dbt.cast("null", api.Column.translate_type("varchar")) }} AS address_2 + , p.patient_city AS city + , s.state_abbreviation AS state + , p.patient_zip AS zip + , p.patient_county AS county + , {{ dbt.cast("null", api.Column.translate_type("integer")) }} AS country_concept_id + , {{ dbt.cast("null", api.Column.translate_type("varchar")) }} AS country_source_value + , p.patient_latitude AS latitude + , p.patient_longitude AS longitude + FROM {{ ref("stg_synthea__patients") }} AS p + LEFT JOIN {{ ref('stg_map__states') }} AS s ON p.patient_state = s.state_name + + UNION + + SELECT DISTINCT + organization_address AS address_1 + , {{ dbt.cast("null", api.Column.translate_type("varchar")) }} AS address_2 + , organization_city AS city + , organization_state AS state + , organization_zip AS zip + , {{ dbt.cast("null", api.Column.translate_type("varchar")) }} AS county + , {{ dbt.cast("null", api.Column.translate_type("integer")) }} AS country_concept_id + , {{ dbt.cast("null", api.Column.translate_type("varchar")) }} AS country_source_value + , organization_latitude AS latitude + , organization_longitude AS longitude + FROM + {{ ref("stg_synthea__organizations") }} +) + +SELECT + address_1 + , address_2 + , city + , state + , zip + , county + , md5( + {%- for col in address_columns -%} + coalesce({{col}}, '') {{ "|| " if not loop.last }} + {%- endfor -%} + ) AS location_source_value + , country_concept_id + , country_source_value + , latitude + , longitude +FROM unioned_location_sources diff --git a/models/intermediate/int__person.sql b/models/intermediate/int__person.sql index 7f0fc8b..6c87b4e 100644 --- a/models/intermediate/int__person.sql +++ b/models/intermediate/int__person.sql @@ -30,8 +30,15 @@ SELECT , p.ethnicity AS ethnicity_source_value , 0 AS ethnicity_source_concept_id FROM {{ ref('stg_synthea__patients') }} AS p -LEFT JOIN {{ ref('location') }} loc +LEFT JOIN {{ ref('stg_map__states') }} AS s ON p.patient_state = s.state_name +LEFT JOIN {{ ref('location') }} AS loc -- Address and city provides enough entropy to join on safely - ON (p.patient_address = loc.address_1 AND p.patient_city = loc.city) + ON loc.location_source_value = md5( + coalesce(p.patient_address, '') + || coalesce(p.patient_city, '') + || coalesce(s.state_abbreviation, '') + || coalesce(p.patient_zip, '') + || coalesce(p.patient_county, '') + ) WHERE p.patient_gender IS NOT NULL diff --git a/models/omop/care_site.sql b/models/omop/care_site.sql index ae480ce..ef517ba 100644 --- a/models/omop/care_site.sql +++ b/models/omop/care_site.sql @@ -5,9 +5,12 @@ SELECT , loc.location_id , organization_id AS care_site_source_value , {{ dbt.cast("null", api.Column.translate_type("varchar")) }} AS place_of_service_source_value -FROM {{ ref('stg_synthea__organizations') }} org -LEFT JOIN {{ ref('location') }} loc +FROM {{ ref('stg_synthea__organizations') }} +LEFT JOIN {{ ref('location') }} AS loc ON - org.organization_name = loc.address_1 - AND org.organization_address = loc.address_2 - AND org.organization_city = loc.city + loc.location_source_value = MD5( + COALESCE(organization_address, '') + || COALESCE(organization_city, '') + || COALESCE(organization_state, '') + || COALESCE(organization_zip, '') + ) diff --git a/models/omop/location.sql b/models/omop/location.sql index 5013584..3dca603 100644 --- a/models/omop/location.sql +++ b/models/omop/location.sql @@ -1,37 +1,3 @@ -WITH unioned_location_sources AS ( - SELECT DISTINCT - p.patient_address AS address_1 - , {{ dbt.cast("null", api.Column.translate_type("varchar")) }} AS address_2 - , p.patient_city AS city - , s.state_abbreviation AS state - , p.patient_zip AS zip - , p.patient_county AS county - , {{ dbt.cast("null", api.Column.translate_type("varchar")) }} AS location_source_value - , {{ dbt.cast("null", api.Column.translate_type("integer")) }} AS country_concept_id - , {{ dbt.cast("null", api.Column.translate_type("varchar")) }} AS country_source_value - , p.patient_latitude AS latitude - , p.patient_longitude AS longitude - FROM {{ ref("stg_synthea__patients") }} AS p - LEFT JOIN {{ ref('stg_map__states') }} AS s ON p.patient_state = s.state_name - - UNION - - SELECT DISTINCT - organization_name AS address_1 - , organization_address AS address_2 - , organization_city AS city - , organization_state AS state - , organization_zip AS zip - , {{ dbt.cast("null", api.Column.translate_type("varchar")) }} AS county - , {{ dbt.cast("null", api.Column.translate_type("varchar")) }} AS location_source_value - , {{ dbt.cast("null", api.Column.translate_type("integer")) }} AS country_concept_id - , {{ dbt.cast("null", api.Column.translate_type("varchar")) }} AS country_source_value - , organization_latitude AS latitude - , organization_longitude AS longitude - FROM - {{ ref("stg_synthea__organizations") }} -) - SELECT row_number() OVER (ORDER BY state, city, address_1) AS location_id , address_1 @@ -45,4 +11,4 @@ SELECT , country_source_value , latitude , longitude -FROM unioned_location_sources +FROM {{ ref('int__locations') }} From 48e261a9cddead23e129369ab9195307d6e899c4 Mon Sep 17 00:00:00 2001 From: Lawrence Adams Date: Sun, 13 Oct 2024 20:46:11 +0100 Subject: [PATCH 3/5] refactor: create and use a macro for safe hashing of columns - used for address joining - requires `colaesce` as joins with NULLs will not work --- macros/macros.yml | 7 +++++++ macros/safe_hash.sql | 9 +++++++++ models/intermediate/int__locations.sql | 9 ++------- models/intermediate/int__person.sql | 16 +++++++++------- models/omop/care_site.sql | 14 ++++++++------ 5 files changed, 35 insertions(+), 20 deletions(-) create mode 100644 macros/macros.yml create mode 100644 macros/safe_hash.sql diff --git a/macros/macros.yml b/macros/macros.yml new file mode 100644 index 0000000..e576eaf --- /dev/null +++ b/macros/macros.yml @@ -0,0 +1,7 @@ +macros: + - name: safe_hash + description: This macro allows concatenation and hashing of fields that may contain `NULL` elements. For example, fields in an address. + arguments: + - name: columns + type: list[str] + description: A list of column names \ No newline at end of file diff --git a/macros/safe_hash.sql b/macros/safe_hash.sql new file mode 100644 index 0000000..4149e9a --- /dev/null +++ b/macros/safe_hash.sql @@ -0,0 +1,9 @@ +{%- macro safe_hash(columns) -%} +{% set coalesced_columns = [] %} +{%- for column in columns -%} + {% do coalesced_columns.append("COALESCE(" + column + ", '')") %} +{%- endfor -%} + MD5( + {{ dbt.concat(coalesced_columns) }} + ) +{%- endmacro -%} diff --git a/models/intermediate/int__locations.sql b/models/intermediate/int__locations.sql index d09f988..ae52150 100644 --- a/models/intermediate/int__locations.sql +++ b/models/intermediate/int__locations.sql @@ -5,8 +5,7 @@ "state", "zip", "county" - ] -%} +] %} WITH unioned_location_sources AS ( SELECT DISTINCT @@ -47,11 +46,7 @@ SELECT , state , zip , county - , md5( - {%- for col in address_columns -%} - coalesce({{col}}, '') {{ "|| " if not loop.last }} - {%- endfor -%} - ) AS location_source_value + , {{ safe_hash(address_columns) }} AS location_source_value , country_concept_id , country_source_value , latitude diff --git a/models/intermediate/int__person.sql b/models/intermediate/int__person.sql index 6c87b4e..71aaae7 100644 --- a/models/intermediate/int__person.sql +++ b/models/intermediate/int__person.sql @@ -1,3 +1,11 @@ +{% set address_columns = [ + "p.patient_address", + "p.patient_city", + "s.state_abbreviation", + "p.patient_zip", + "p.patient_county" +] %} + SELECT row_number() OVER (ORDER BY p.patient_id) AS person_id , CASE upper(p.patient_gender) @@ -33,12 +41,6 @@ FROM {{ ref('stg_synthea__patients') }} AS p LEFT JOIN {{ ref('stg_map__states') }} AS s ON p.patient_state = s.state_name LEFT JOIN {{ ref('location') }} AS loc -- Address and city provides enough entropy to join on safely - ON loc.location_source_value = md5( - coalesce(p.patient_address, '') - || coalesce(p.patient_city, '') - || coalesce(s.state_abbreviation, '') - || coalesce(p.patient_zip, '') - || coalesce(p.patient_county, '') - ) + ON loc.location_source_value = {{ safe_hash(address_columns) }} WHERE p.patient_gender IS NOT NULL diff --git a/models/omop/care_site.sql b/models/omop/care_site.sql index ef517ba..2393ead 100644 --- a/models/omop/care_site.sql +++ b/models/omop/care_site.sql @@ -1,3 +1,10 @@ +{% set address_columns = [ + "organization_address", + "organization_city", + "organization_state", + "organization_zip", +] %} + SELECT ROW_NUMBER() OVER (ORDER BY organization_id) AS care_site_id , organization_name AS care_site_name @@ -8,9 +15,4 @@ SELECT FROM {{ ref('stg_synthea__organizations') }} LEFT JOIN {{ ref('location') }} AS loc ON - loc.location_source_value = MD5( - COALESCE(organization_address, '') - || COALESCE(organization_city, '') - || COALESCE(organization_state, '') - || COALESCE(organization_zip, '') - ) + loc.location_source_value = {{ safe_hash(address_columns) }} From b34e4d47a3b1e122ac71714eb9d21f0b9e84666e Mon Sep 17 00:00:00 2001 From: Lawrence Adams Date: Sun, 13 Oct 2024 20:55:19 +0100 Subject: [PATCH 4/5] chore: use jinja style string concatenation --- macros/safe_hash.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/macros/safe_hash.sql b/macros/safe_hash.sql index 4149e9a..c012c74 100644 --- a/macros/safe_hash.sql +++ b/macros/safe_hash.sql @@ -1,7 +1,7 @@ {%- macro safe_hash(columns) -%} {% set coalesced_columns = [] %} {%- for column in columns -%} - {% do coalesced_columns.append("COALESCE(" + column + ", '')") %} + {% do coalesced_columns.append("COALESCE(" ~ column ~ ", '')") %} {%- endfor -%} MD5( {{ dbt.concat(coalesced_columns) }} From acb3522f1e7bc3a4354b41df06bf3550ac034945 Mon Sep 17 00:00:00 2001 From: Lawrence Adams Date: Wed, 23 Oct 2024 22:28:30 +0100 Subject: [PATCH 5/5] fix: resolve review issues --- macros/safe_hash.sql | 2 +- models/intermediate/int__person.sql | 2 -- models/omop/location.sql | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/macros/safe_hash.sql b/macros/safe_hash.sql index c012c74..0c59184 100644 --- a/macros/safe_hash.sql +++ b/macros/safe_hash.sql @@ -1,7 +1,7 @@ {%- macro safe_hash(columns) -%} {% set coalesced_columns = [] %} {%- for column in columns -%} - {% do coalesced_columns.append("COALESCE(" ~ column ~ ", '')") %} + {% do coalesced_columns.append("COALESCE(" ~ column.lower() ~ ", '')") %} {%- endfor -%} MD5( {{ dbt.concat(coalesced_columns) }} diff --git a/models/intermediate/int__person.sql b/models/intermediate/int__person.sql index 71aaae7..ec97731 100644 --- a/models/intermediate/int__person.sql +++ b/models/intermediate/int__person.sql @@ -40,7 +40,5 @@ SELECT FROM {{ ref('stg_synthea__patients') }} AS p LEFT JOIN {{ ref('stg_map__states') }} AS s ON p.patient_state = s.state_name LEFT JOIN {{ ref('location') }} AS loc - -- Address and city provides enough entropy to join on safely ON loc.location_source_value = {{ safe_hash(address_columns) }} - WHERE p.patient_gender IS NOT NULL diff --git a/models/omop/location.sql b/models/omop/location.sql index 3dca603..4fba0f9 100644 --- a/models/omop/location.sql +++ b/models/omop/location.sql @@ -1,5 +1,5 @@ SELECT - row_number() OVER (ORDER BY state, city, address_1) AS location_id + row_number() OVER (ORDER BY state, city, address_1, location_source_value) AS location_id , address_1 , address_2 , city