From 97fc99f7cf29dd3ac774f6b303c93499110ec92c Mon Sep 17 00:00:00 2001 From: Valentin Matton Date: Tue, 14 Jan 2025 15:32:18 +0100 Subject: [PATCH] fix(pipeline): clean up soliguide timestamps --- .../soliguide/stg_soliguide__lieux.sql | 59 +++++++++---------- .../soliguide/stg_soliguide__services.sql | 59 ++++++++++--------- 2 files changed, 61 insertions(+), 57 deletions(-) diff --git a/pipeline/dbt/models/staging/sources/soliguide/stg_soliguide__lieux.sql b/pipeline/dbt/models/staging/sources/soliguide/stg_soliguide__lieux.sql index 823ab50d..e88f24aa 100644 --- a/pipeline/dbt/models/staging/sources/soliguide/stg_soliguide__lieux.sql +++ b/pipeline/dbt/models/staging/sources/soliguide/stg_soliguide__lieux.sql @@ -4,36 +4,35 @@ WITH source AS ( lieux AS ( SELECT - _di_source_id AS "_di_source_id", - CAST(data ->> 'updatedAt' AS DATE) AS "updated_at", - CAST(data #>> '{position,location,coordinates,0}' AS FLOAT) AS "position__coordinates__x", - CAST(data #>> '{position,location,coordinates,1}' AS FLOAT) AS "position__coordinates__y", - CAST(data #>> '{modalities,inconditionnel}' AS BOOLEAN) AS "modalities__inconditionnel", - CAST(data #>> '{modalities,appointment,checked}' AS BOOLEAN) AS "modalities__appointment__checked", - CAST(data #>> '{modalities,inscription,checked}' AS BOOLEAN) AS "modalities__inscription__checked", - CAST(data #>> '{modalities,orientation,checked}' AS BOOLEAN) AS "modalities__orientation__checked", - data ->> 'lieu_id' AS "id", - data ->> 'lieu_id' AS "lieu_id", - -- TODO: entity.phones - data ->> 'name' AS "name", - data #>> '{position,city}' AS "position__city", - data #>> '{position,cityCode}' AS "position__city_code", - data #>> '{position,country}' AS "position__country", - data ->> 'description' AS "description", - data ->> 'seo_url' AS "seo_url", - data #>> '{position,postalCode}' AS "position__postal_code", - data #>> '{position,address}' AS "position__address", - NULLIF(TRIM(data #>> '{position,additionalInformation}'), '') AS "position__additional_information", - data #>> '{position,department}' AS "position__department", - data #>> '{publics,age}' AS "publics__age", - CAST(data #>> '{publics,accueil}' AS INT) AS "publics__accueil", - NULLIF(data #>> '{entity,mail}', '') AS "entity_mail", - NULLIF(data #>> '{entity,website}', '') AS "entity_website", - data -> 'newhours' AS "newhours", - data #>> '{modalities,appointment,precisions}' AS "modalities__appointment__precisions", - data #>> '{modalities,inscription,precisions}' AS "modalities__inscription__precisions", - data #>> '{modalities,orientation,precisions}' AS "modalities__orientation__precisions", - data -> 'sources' AS "sources" + _di_source_id, + CAST(SUBSTRING(data ->> 'updatedAt' FROM '\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z') AS DATE) AS "updated_at", + CAST(data #>> '{position,location,coordinates,0}' AS FLOAT) AS "position__coordinates__x", + CAST(data #>> '{position,location,coordinates,1}' AS FLOAT) AS "position__coordinates__y", + CAST(data #>> '{modalities,inconditionnel}' AS BOOLEAN) AS "modalities__inconditionnel", + CAST(data #>> '{modalities,appointment,checked}' AS BOOLEAN) AS "modalities__appointment__checked", + CAST(data #>> '{modalities,inscription,checked}' AS BOOLEAN) AS "modalities__inscription__checked", + CAST(data #>> '{modalities,orientation,checked}' AS BOOLEAN) AS "modalities__orientation__checked", + data ->> 'lieu_id' AS "id", + data ->> 'lieu_id' AS "lieu_id", + data ->> 'name' AS "name", + data #>> '{position,city}' AS "position__city", + data #>> '{position,cityCode}' AS "position__city_code", + data #>> '{position,country}' AS "position__country", + data ->> 'description' AS "description", + data ->> 'seo_url' AS "seo_url", + data #>> '{position,postalCode}' AS "position__postal_code", + data #>> '{position,address}' AS "position__address", + NULLIF(TRIM(data #>> '{position,additionalInformation}'), '') AS "position__additional_information", + data #>> '{position,department}' AS "position__department", + data #>> '{publics,age}' AS "publics__age", + CAST(data #>> '{publics,accueil}' AS INT) AS "publics__accueil", + NULLIF(data #>> '{entity,mail}', '') AS "entity_mail", + NULLIF(data #>> '{entity,website}', '') AS "entity_website", + data -> 'newhours' AS "newhours", + data #>> '{modalities,appointment,precisions}' AS "modalities__appointment__precisions", + data #>> '{modalities,inscription,precisions}' AS "modalities__inscription__precisions", + data #>> '{modalities,orientation,precisions}' AS "modalities__orientation__precisions", + data -> 'sources' AS "sources" FROM source ), diff --git a/pipeline/dbt/models/staging/sources/soliguide/stg_soliguide__services.sql b/pipeline/dbt/models/staging/sources/soliguide/stg_soliguide__services.sql index ce3f3926..265b7f05 100644 --- a/pipeline/dbt/models/staging/sources/soliguide/stg_soliguide__services.sql +++ b/pipeline/dbt/models/staging/sources/soliguide/stg_soliguide__services.sql @@ -2,37 +2,42 @@ WITH source AS ( {{ stg_source_header('soliguide', 'lieux') }} ), +-- about timestamps : soliguide can have corrupted timestamps, +-- therefore timestamps are extracted from datetime fields and then casted to date + services AS ( SELECT - source._di_source_id AS "_di_source_id", - CAST(source.data ->> 'updatedAt' AS DATE) AS "updated_at", - source.data ->> 'lieu_id' AS "lieu_id", - inline_service.data ->> 'serviceObjectId' AS "id", - NULLIF(inline_service.data ->> 'name', '') AS "name", - inline_service.data ->> 'category' AS "category", - NULLIF(inline_service.data ->> 'description', '') AS "description", - inline_service.data -> 'hours' AS "hours", - CAST(inline_service.data ->> 'differentHours' AS BOOLEAN) AS "different_hours", - CAST(inline_service.data #>> '{close,actif}' AS BOOLEAN) AS "close__actif", - CAST(inline_service.data #>> '{close,dateDebut}' AS DATE) AS "close__date_debut", - CAST(inline_service.data #>> '{close,dateFin}' AS DATE) AS "close__date_fin", - CAST(inline_service.data #>> '{modalities,inconditionnel}' AS BOOLEAN) AS "modalities__inconditionnel", - CAST(inline_service.data #>> '{modalities,appointment,checked}' AS BOOLEAN) AS "modalities__appointment__checked", - inline_service.data #>> '{modalities,appointment,precisions}' AS "modalities__appointment__precisions", - CAST(inline_service.data #>> '{modalities,price,checked}' AS BOOLEAN) AS "modalities__price__checked", - CASE - WHEN inline_service.data #>> '{modalities,price,precisions}' IS NULL THEN source.data #>> '{modalities,price,precisions}' - WHEN source.data #>> '{modalities,price,precisions}' IS NULL THEN NULL - ELSE CONCAT(inline_service.data #>> '{modalities,price,precisions}', ' ', source.data #>> '{modalities,price,precisions}') - END AS "modalities__price__precisions", - CAST(inline_service.data #>> '{modalities,inscription,checked}' AS BOOLEAN) AS "modalities__inscription__checked", - inline_service.data #>> '{modalities,inscription,precisions}' AS "modalities__inscription__precisions", - CAST(inline_service.data #>> '{modalities,orientation,checked}' AS BOOLEAN) AS "modalities__orientation__checked", - inline_service.data #>> '{modalities,orientation,precisions}' AS "modalities__orientation__precisions", - source.data -> 'sources' AS "sources" + source._di_source_id AS "_di_source_id", + CAST(SUBSTRING(source.data ->> 'updatedAt' FROM '\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z') AS DATE) AS "updated_at", + source.data ->> 'lieu_id' AS "lieu_id", + services.data ->> 'serviceObjectId' AS "id", + NULLIF(services.data ->> 'name', '') AS "name", + services.data ->> 'category' AS "category", + NULLIF(services.data ->> 'description', '') AS "description", + services.data -> 'hours' AS "hours", + CAST(services.data ->> 'differentHours' AS BOOLEAN) AS "different_hours", + CAST(services.data #>> '{close,actif}' AS BOOLEAN) AS "close__actif", + CAST(SUBSTRING(services.data #>> '{close,dateDebut}' FROM '\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z') AS DATE) AS "close__date_debut", + CAST(SUBSTRING(services.data #>> '{close,dateFin}' FROM '\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z') AS DATE) AS "close__date_fin", + CAST(services.data #>> '{modalities,inconditionnel}' AS BOOLEAN) AS "modalities__inconditionnel", + CAST(services.data #>> '{modalities,appointment,checked}' AS BOOLEAN) AS "modalities__appointment__checked", + services.data #>> '{modalities,appointment,precisions}' AS "modalities__appointment__precisions", + CAST(services.data #>> '{modalities,price,checked}' AS BOOLEAN) AS "modalities__price__checked", + ARRAY_TO_STRING( + ARRAY_REMOVE( + ARRAY[services.data #>> '{modalities,price,precisions}', source.data #>> '{modalities,price,precisions}'], + NULL + ), + E'\n\n' + ) AS "modalities__price__precisions", + CAST(services.data #>> '{modalities,inscription,checked}' AS BOOLEAN) AS "modalities__inscription__checked", + services.data #>> '{modalities,inscription,precisions}' AS "modalities__inscription__precisions", + CAST(services.data #>> '{modalities,orientation,checked}' AS BOOLEAN) AS "modalities__orientation__checked", + services.data #>> '{modalities,orientation,precisions}' AS "modalities__orientation__precisions", + source.data -> 'sources' AS "sources" FROM source, - LATERAL (SELECT * FROM JSONB_PATH_QUERY(source.data, '$.services_all[*]')) AS inline_service (data) + LATERAL (SELECT services.* FROM JSONB_PATH_QUERY(source.data, '$.services_all[*]')) AS services (data) ), final AS (