Skip to content

Commit

Permalink
adapt column lineage query for symlink dataset (#2775)
Browse files Browse the repository at this point in the history
Signed-off-by: sophiely <ly.sophie200@gmail.com>
Co-authored-by: Peter Hicks <phixMe@users.noreply.github.com>
  • Loading branch information
sophiely and phixMe authored Apr 16, 2024
1 parent d5daf50 commit 17909a2
Showing 1 changed file with 7 additions and 3 deletions.
10 changes: 7 additions & 3 deletions api/src/main/java/marquez/db/ColumnLineageDao.java
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ SELECT DISTINCT ON (output_dataset_field_uuid, input_dataset_field_uuid) *
ORDER BY output_dataset_field_uuid, input_dataset_field_uuid, updated_at DESC, updated_at
),
dataset_fields_view AS (
SELECT d.namespace_name as namespace_name, d.name as dataset_name, df.name as field_name, df.type, df.uuid
SELECT d.namespace_name as namespace_name, d.name as dataset_name, df.name as field_name, df.type, df.uuid, d.namespace_uuid
FROM dataset_fields df
INNER JOIN datasets_view d ON d.uuid = df.dataset_uuid
),
Expand Down Expand Up @@ -157,8 +157,10 @@ WHERE output_dataset_field_uuid IN (<datasetFieldUuids>)
clr.output_dataset_version_uuid as dataset_version_uuid
FROM column_lineage_recursive clr
INNER JOIN dataset_fields_view output_fields ON clr.output_dataset_field_uuid = output_fields.uuid -- hidden datasets will be filtered
INNER JOIN dataset_symlinks ds_output ON ds_output.namespace_uuid = output_fields.namespace_uuid AND ds_output.name = output_fields.dataset_name
LEFT JOIN dataset_fields_view input_fields ON clr.input_dataset_field_uuid = input_fields.uuid
WHERE NOT clr.is_cycle
INNER JOIN dataset_symlinks ds_input ON ds_input.namespace_uuid = input_fields.namespace_uuid AND ds_input.name = input_fields.dataset_name
WHERE NOT clr.is_cycle AND ds_output.is_primary is true AND ds_input.is_primary
GROUP BY
output_fields.namespace_name,
output_fields.dataset_name,
Expand All @@ -175,7 +177,7 @@ Set<ColumnLineageNodeData> getLineage(
@SqlQuery(
"""
WITH selected_column_lineage AS (
SELECT DISTINCT ON (cl.output_dataset_field_uuid, cl.input_dataset_field_uuid) cl.*
SELECT DISTINCT ON (cl.output_dataset_field_uuid, cl.input_dataset_field_uuid) cl.*, dv.namespace_uuid
FROM column_lineage cl
JOIN dataset_fields df ON df.uuid = cl.output_dataset_field_uuid
JOIN datasets_view dv ON dv.uuid = df.dataset_uuid
Expand Down Expand Up @@ -203,7 +205,9 @@ dataset_fields_view AS (
null as dataset_version_uuid
FROM selected_column_lineage c
INNER JOIN dataset_fields_view output_fields ON c.output_dataset_field_uuid = output_fields.uuid
INNER JOIN dataset_symlinks ds ON ds.namespace_uuid = c.namespace_uuid and ds.name=output_fields.dataset_name
LEFT JOIN dataset_fields_view input_fields ON c.input_dataset_field_uuid = input_fields.uuid
WHERE ds.is_primary is true
GROUP BY
output_fields.namespace_name,
output_fields.dataset_name,
Expand Down

0 comments on commit 17909a2

Please sign in to comment.