From e4b3d62c9d5cefc7afc510fdca0b34025303d1d0 Mon Sep 17 00:00:00 2001 From: sophiely Date: Fri, 22 Mar 2024 17:53:37 +0100 Subject: [PATCH] adapt column lineage query for symlink dataset --- api/src/main/java/marquez/db/ColumnLineageDao.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/api/src/main/java/marquez/db/ColumnLineageDao.java b/api/src/main/java/marquez/db/ColumnLineageDao.java index d5fe0df74c..14faf16414 100644 --- a/api/src/main/java/marquez/db/ColumnLineageDao.java +++ b/api/src/main/java/marquez/db/ColumnLineageDao.java @@ -106,7 +106,7 @@ SELECT DISTINCT ON (output_dataset_field_uuid, input_dataset_field_uuid) * ORDER BY output_dataset_field_uuid, input_dataset_field_uuid, updated_at DESC, updated_at ), dataset_fields_view AS ( - SELECT d.namespace_name as namespace_name, d.name as dataset_name, df.name as field_name, df.type, df.uuid + SELECT d.namespace_name as namespace_name, d.name as dataset_name, df.name as field_name, df.type, df.uuid, d.namespace_uuid FROM dataset_fields df INNER JOIN datasets_view d ON d.uuid = df.dataset_uuid ), @@ -157,8 +157,10 @@ WHERE output_dataset_field_uuid IN () clr.output_dataset_version_uuid as dataset_version_uuid FROM column_lineage_recursive clr INNER JOIN dataset_fields_view output_fields ON clr.output_dataset_field_uuid = output_fields.uuid -- hidden datasets will be filtered + INNER JOIN dataset_symlinks ds_output ON ds_output.namespace_uuid = output_fields.namespace_uuid AND ds_output.name = output_fields.dataset_name LEFT JOIN dataset_fields_view input_fields ON clr.input_dataset_field_uuid = input_fields.uuid - WHERE NOT clr.is_cycle + INNER JOIN dataset_symlinks ds_input ON ds_input.namespace_uuid = input_fields.namespace_uuid AND ds_input.name = input_fields.dataset_name + WHERE NOT clr.is_cycle AND ds_output.is_primary is true AND ds_input.is_primary GROUP BY output_fields.namespace_name, output_fields.dataset_name, @@ -175,7 +177,7 @@ Set getLineage( @SqlQuery( """ WITH selected_column_lineage AS ( - SELECT DISTINCT ON (cl.output_dataset_field_uuid, cl.input_dataset_field_uuid) cl.* + SELECT DISTINCT ON (cl.output_dataset_field_uuid, cl.input_dataset_field_uuid) cl.*, dv.namespace_uuid FROM column_lineage cl JOIN dataset_fields df ON df.uuid = cl.output_dataset_field_uuid JOIN datasets_view dv ON dv.uuid = df.dataset_uuid @@ -203,7 +205,9 @@ dataset_fields_view AS ( null as dataset_version_uuid FROM selected_column_lineage c INNER JOIN dataset_fields_view output_fields ON c.output_dataset_field_uuid = output_fields.uuid + INNER JOIN dataset_symlinks ds ON ds.namespace_uuid = c.namespace_uuid and ds.name=output_fields.dataset_name LEFT JOIN dataset_fields_view input_fields ON c.input_dataset_field_uuid = input_fields.uuid + WHERE ds.is_primary is true GROUP BY output_fields.namespace_name, output_fields.dataset_name,