From d17a4696f06d3b0fdcbaba0c7f157141d76f99c0 Mon Sep 17 00:00:00 2001 From: Vinh Nguyen Date: Fri, 24 May 2024 10:19:07 +0700 Subject: [PATCH 1/2] Optimize Column Lineage Query Performance Signed-off-by: Vinh Nguyen --- api/src/main/java/marquez/db/ColumnLineageDao.java | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/api/src/main/java/marquez/db/ColumnLineageDao.java b/api/src/main/java/marquez/db/ColumnLineageDao.java index 14faf16414..8ad33c2134 100644 --- a/api/src/main/java/marquez/db/ColumnLineageDao.java +++ b/api/src/main/java/marquez/db/ColumnLineageDao.java @@ -184,11 +184,7 @@ SELECT DISTINCT ON (cl.output_dataset_field_uuid, cl.input_dataset_field_uuid) c WHERE ARRAY[]::DATASET_NAME[] && dv.dataset_symlinks -- array of string pairs is cast onto array of DATASET_NAME types to be checked if it has non-empty intersection with dataset symlinks ORDER BY output_dataset_field_uuid, input_dataset_field_uuid, updated_at DESC, updated_at ), - dataset_fields_view AS ( - SELECT d.namespace_name as namespace_name, d.name as dataset_name, df.name as field_name, df.type, df.uuid - FROM dataset_fields df - INNER JOIN datasets_view d ON d.uuid = df.dataset_uuid - ) + dataset_fields_view AS ( SELECT d.namespace_name as namespace_name, d.name as dataset_name, df.name as field_name, df.type, df.uuid FROM dataset_fields df INNER JOIN ( select * from datasets_view where current_version_uuid IN ( SELECT DISTINCT output_dataset_version_uuid FROM selected_column_lineage UNION SELECT DISTINCT input_dataset_version_uuid FROM selected_column_lineage ) ) d ON d.uuid = df.dataset_uuid ) SELECT output_fields.namespace_name, output_fields.dataset_name, From e14f060c5821bb9b9b2fdd01c30957bae754dc01 Mon Sep 17 00:00:00 2001 From: Vinh Nguyen Date: Mon, 3 Jun 2024 16:25:44 +0700 Subject: [PATCH 2/2] Optimize Column Lineage Query Performance - Format query - replace select * with uuid, namespace_name, name Signed-off-by: Vinh Nguyen --- .../java/marquez/db/ColumnLineageDao.java | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/api/src/main/java/marquez/db/ColumnLineageDao.java b/api/src/main/java/marquez/db/ColumnLineageDao.java index 8ad33c2134..8225588e0d 100644 --- a/api/src/main/java/marquez/db/ColumnLineageDao.java +++ b/api/src/main/java/marquez/db/ColumnLineageDao.java @@ -184,7 +184,26 @@ SELECT DISTINCT ON (cl.output_dataset_field_uuid, cl.input_dataset_field_uuid) c WHERE ARRAY[]::DATASET_NAME[] && dv.dataset_symlinks -- array of string pairs is cast onto array of DATASET_NAME types to be checked if it has non-empty intersection with dataset symlinks ORDER BY output_dataset_field_uuid, input_dataset_field_uuid, updated_at DESC, updated_at ), - dataset_fields_view AS ( SELECT d.namespace_name as namespace_name, d.name as dataset_name, df.name as field_name, df.type, df.uuid FROM dataset_fields df INNER JOIN ( select * from datasets_view where current_version_uuid IN ( SELECT DISTINCT output_dataset_version_uuid FROM selected_column_lineage UNION SELECT DISTINCT input_dataset_version_uuid FROM selected_column_lineage ) ) d ON d.uuid = df.dataset_uuid ) + dataset_fields_view AS ( + SELECT + d.namespace_name AS namespace_name, + d.name AS dataset_name, + df.name AS field_name, + df.type, + df.uuid + FROM dataset_fields df + INNER JOIN ( + SELECT uuid, namespace_name, name + FROM datasets_view + WHERE current_version_uuid IN ( + SELECT DISTINCT output_dataset_version_uuid + FROM selected_column_lineage + UNION + SELECT DISTINCT input_dataset_version_uuid + FROM selected_column_lineage + ) + ) d ON d.uuid = df.dataset_uuid + ) SELECT output_fields.namespace_name, output_fields.dataset_name,