feat(pdf): Test Result PDF report
rboni-dk committed Sep 30, 2024
1 parent 4ead21b commit dea50a0
Showing 8 changed files with 418 additions and 214 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
@@ -60,6 +60,7 @@ dependencies = [
"concurrent_log_handler==0.9.25",
"cryptography==42.0.8",
"validators==0.33.0",
"reportlab==4.2.2",
]

[project.optional-dependencies]
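The new reportlab==4.2.2 dependency provides the "platypus" flowable API that the report module added below is built on. As a minimal, standalone sketch of that pattern (the names and content here are illustrative, not part of this commit):

from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import Paragraph, SimpleDocTemplate, Table

# Build a list of flowables (paragraphs, tables, ...) and hand it to the
# document template, which lays them out and writes the PDF.
styles = getSampleStyleSheet()
flowables = [
    Paragraph("Example report", styles["Title"]),
    Table([["Date", "2024-09-30"], ["Status", "Failed"]]),
]
SimpleDocTemplate("example.pdf").build(flowables)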
34 changes: 34 additions & 0 deletions testgen/ui/components/widgets/download_dialog.py
@@ -0,0 +1,34 @@
from collections.abc import Callable
from typing import Any

import streamlit as st


def download_dialog(
dialog_title: str,
file_name: str,
mime_type: str,
file_content_func: Callable[[], Any],
):
"""Wrapping a dialog and a download button together to allow generating the file contents only when needed."""

def _dialog_content():
# Wrap the dialog content in a fixed-height container so the dialog's height
# does not change when the download button is rendered.
with st.container(height=55, border=False):
spinner_col, button_col, _ = st.columns([.3, .4, .3])

with spinner_col:
with st.spinner(text="Generating file..."):
data = file_content_func()

with button_col:
st.download_button(
label=":material/download: Download",
data=data,
file_name=file_name,
mime=mime_type,
use_container_width=True
)

return st.dialog(title=dialog_title, width="small")(_dialog_content)()
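A minimal usage sketch for the widget (the calling page, button label, and file below are assumptions for illustration; the actual call sites are in the changed files not expanded on this page). The point of the pattern is that the potentially slow content function only runs once the dialog is open, while the spinner is showing:

import streamlit as st

from testgen.ui.components.widgets.download_dialog import download_dialog


def make_csv() -> bytes:
    # Stand-in for an expensive export; only invoked after the dialog opens.
    return b"col_a,col_b\n1,2\n"


if st.button("Export results"):
    download_dialog(
        dialog_title="Download results",
        file_name="results.csv",
        mime_type="text/csv",
        file_content_func=make_csv,
    )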
Empty file added testgen/ui/pdf/__init__.py
130 changes: 130 additions & 0 deletions testgen/ui/pdf/test_result_report.py
@@ -0,0 +1,130 @@
from reportlab.lib import enums
from reportlab.lib.styles import ParagraphStyle
from reportlab.platypus import Paragraph, SimpleDocTemplate, Table, TableStyle

from testgen.ui.services.database_service import get_schema
from testgen.ui.services.test_results_service import (
do_source_data_lookup,
do_source_data_lookup_custom,
get_test_result_history,
)

PARA_STYLE_DEFAULT = ParagraphStyle(
"default",
fontSize=8,
)

PARA_STYLE_INFO = PARA_STYLE_DEFAULT


PARA_STYLE_ERROR = PARA_STYLE_DEFAULT


PARA_STYLE_MONO = ParagraphStyle(
"heading_1",
PARA_STYLE_DEFAULT,

)


PARA_STYLE_H1 = ParagraphStyle(
"heading_1",
PARA_STYLE_DEFAULT,
fontSize=12,
leading=16,
)

PARA_STYLE_TITLE = ParagraphStyle(
"title",
PARA_STYLE_DEFAULT,
fontSize=18,
leading=30,
alignment=enums.TA_CENTER,
)

TABLE_STYLE_SUMMARY = TableStyle(
(
# All cells
("ALIGN", (0, 0), (-1, -1), "LEFT"),
("VALIGN", (0, 0), (-1, -1), "TOP"),
("FONT", (0, 0), (-1, -1), "Helvetica", 7),

# Label column (first column acts as row headers)
("FONT", (0, 0), (0, -1), "Helvetica-Bold"),
("ALIGN", (0, 0), (0, -1), "RIGHT"),
)
)

def get_report_content(tr_data):

yield Paragraph(f"TestGen Issue Report: {tr_data['result_status']}", PARA_STYLE_TITLE)

yield Paragraph("Summary", PARA_STYLE_H1)

summary_table_data = [
("Date", tr_data["test_date"]),
("Database/Schema", tr_data["schema_name"]),
("Table", tr_data["table_name"]),
("Column", tr_data["column_names"]),
("Table Group", tr_data["table_groups_name"]),
("Test Suite", tr_data["test_suite"]),
("Issue Type", "Test Result"),
("Risk Level", tr_data["severity"]),
("Data Quality Dimension", tr_data["dq_dimension"]),
("Test", f"""{tr_data["test_name_short"]}: {tr_data["test_name_long"]}\n{tr_data["test_description"]}"""),
("Result Measure", tr_data["result_measure"]),
("Threshold Value", f"""{tr_data["threshold_value"]} {tr_data["threshold_description"]}"""),
]
if tr_data["measure_uom_description"]:
summary_table_data.append(("Units", tr_data["measure_uom_description"]))

yield Table(summary_table_data, style=TABLE_STYLE_SUMMARY, hAlign="LEFT")

yield Paragraph("Usage Notes", PARA_STYLE_H1)
yield Paragraph(tr_data["usage_notes"], PARA_STYLE_DEFAULT)

yield Paragraph("Result History", PARA_STYLE_H1)

history_data = get_test_result_history(get_schema(), tr_data)

history_table_data = [
(r["test_date"], r["threshold_value"], r["result_measure"], r["result_status"])
for _, r in history_data.iterrows()
]

yield Table(history_table_data)

yield Paragraph("Sample Data", PARA_STYLE_H1)

if tr_data["test_type"] == "CUSTOM":
bad_data_status, bad_data_msg, lookup_query, sample_data = do_source_data_lookup_custom(get_schema(), tr_data)
else:
bad_data_status, bad_data_msg, lookup_query, sample_data = do_source_data_lookup(get_schema(), tr_data)
if bad_data_status in {"ND", "NA"}:
yield Paragraph(bad_data_msg, style=PARA_STYLE_INFO)
elif bad_data_status == "ERR":
yield Paragraph(bad_data_msg, style=PARA_STYLE_ERROR)
elif sample_data is None:
yield Paragraph("An unknown error was encountered.", style=PARA_STYLE_ERROR)
else:
if bad_data_msg:
yield Paragraph(bad_data_msg, style=PARA_STYLE_DEFAULT)

sample_data.fillna("[NULL]", inplace=True)

yield Table(
(
[col.replace("_", " ").title() for col in sample_data.columns],
*(data for _, data in sample_data.iterrows()),
)
)


yield Paragraph("SQL Query", PARA_STYLE_H1)

yield Paragraph(lookup_query, PARA_STYLE_MONO)


def create_report(filename, tr_data):
doc = SimpleDocTemplate(filename)
doc.build(flowables=list(get_report_content(tr_data)))
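The page-level wiring lives in the remaining changed files that are not expanded on this page. A hedged sketch of how create_report and download_dialog could be combined — SimpleDocTemplate accepts a file-like object in place of a filename, so the PDF can be built in memory; the helper names below are illustrative, not the commit's actual code:

import io

from testgen.ui.components.widgets.download_dialog import download_dialog
from testgen.ui.pdf.test_result_report import create_report


def _report_bytes(tr_data) -> bytes:
    # Build the PDF into an in-memory buffer instead of a file on disk.
    buffer = io.BytesIO()
    create_report(buffer, tr_data)
    return buffer.getvalue()


def render_report_download_dialog(tr_data: dict) -> None:
    download_dialog(
        dialog_title="Issue Report",
        file_name="testgen_issue_report.pdf",
        mime_type="application/pdf",
        file_content_func=lambda: _report_bytes(tr_data),
    )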
2 changes: 1 addition & 1 deletion testgen/ui/services/form_service.py
@@ -11,7 +11,7 @@

import pandas as pd
import streamlit as st
import validators
from attrs import validators
from pandas.api.types import is_datetime64_any_dtype
from st_aggrid import AgGrid, ColumnsAutoSizeMode, DataReturnMode, GridOptionsBuilder, GridUpdateMode, JsCode
from streamlit_extras.no_default_selectbox import selectbox
21 changes: 21 additions & 0 deletions testgen/ui/services/test_definition_service.py
@@ -22,6 +22,27 @@ def get_test_definitions(
)


def get_test_definition(db_schema, test_def_id):
str_sql = f"""
SELECT d.id::VARCHAR, tt.test_name_short as test_name, tt.test_name_long as full_name,
tt.test_description as description, tt.usage_notes,
d.column_name,
d.baseline_value, d.baseline_ct, d.baseline_avg, d.baseline_sd, d.threshold_value,
d.subset_condition, d.groupby_names, d.having_condition, d.match_schema_name,
d.match_table_name, d.match_column_names, d.match_subset_condition,
d.match_groupby_names, d.match_having_condition,
d.window_date_column, d.window_days::VARCHAR as window_days,
d.custom_query,
d.severity, tt.default_severity,
d.test_active, d.lock_refresh, d.last_manual_update
FROM {db_schema}.test_definitions d
INNER JOIN {db_schema}.test_types tt
ON (d.test_type = tt.test_type)
WHERE d.id = '{test_def_id}';
"""
return database_service.retrieve_data(str_sql)


def delete(test_definition_ids, dry_run=False):
schema = st.session_state["dbschema"]
usage_result = test_definition_queries.get_test_definition_usage(schema, test_definition_ids)
184 changes: 184 additions & 0 deletions testgen/ui/services/test_results_service.py
@@ -0,0 +1,184 @@
import pandas as pd

from testgen.common import ConcatColumnList
from testgen.ui.services import database_service as db
from testgen.ui.services.string_service import empty_if_null
from testgen.ui.services.test_definition_service import get_test_definition


def get_test_result_history(db_schema, tr_data):
if tr_data["auto_gen"]:
str_where = f"""
WHERE test_suite_id = '{tr_data["test_suite_id"]}'
AND table_name = '{tr_data["table_name"]}'
AND column_names = '{tr_data["column_names"]}'
AND test_type = '{tr_data["test_type"]}'
AND auto_gen = TRUE
"""
else:
str_where = f"""
WHERE test_definition_id_runtime = '{tr_data["test_definition_id_runtime"]}'
"""

str_sql = f"""
SELECT test_date, test_type,
test_name_short, test_name_long, measure_uom, test_operator,
threshold_value::NUMERIC, result_measure, result_status
FROM {db_schema}.v_test_results {str_where}
ORDER BY test_date DESC;
"""

df = db.retrieve_data(str_sql)
# Clean Up
df["test_date"] = pd.to_datetime(df["test_date"])

return df


def do_source_data_lookup_custom(db_schema, tr_data):
# Define the query
str_sql = f"""
SELECT d.custom_query as lookup_query, tg.table_group_schema, c.project_qc_schema,
c.sql_flavor, c.project_host, c.project_port, c.project_db, c.project_user, c.project_pw_encrypted,
c.url, c.connect_by_url, c.connect_by_key, c.private_key, c.private_key_passphrase
FROM {db_schema}.test_definitions d
INNER JOIN {db_schema}.table_groups tg
ON ('{tr_data["table_groups_id"]}'::UUID = tg.id)
INNER JOIN {db_schema}.connections c
ON (tg.connection_id = c.connection_id)
WHERE d.id = '{tr_data["test_definition_id_current"]}';
"""

try:
# Retrieve SQL for customer lookup
lst_query = db.retrieve_data_list(str_sql)

# Retrieve and return data as df
if lst_query:
str_sql = lst_query[0]["lookup_query"]
str_sql = str_sql.replace("{DATA_SCHEMA}", empty_if_null(lst_query[0]["table_group_schema"]))
df = db.retrieve_target_db_df(
lst_query[0]["sql_flavor"],
lst_query[0]["project_host"],
lst_query[0]["project_port"],
lst_query[0]["project_db"],
lst_query[0]["project_user"],
lst_query[0]["project_pw_encrypted"],
str_sql,
lst_query[0]["url"],
lst_query[0]["connect_by_url"],
lst_query[0]["connect_by_key"],
lst_query[0]["private_key"],
lst_query[0]["private_key_passphrase"],
)
if df.empty:
return "ND", "Data that violates Test criteria is not present in the current dataset.", str_sql, None
else:
return "OK", None, str_sql, df
else:
return "NA", "A source data lookup for this Test is not available.", None, None

except Exception as e:
return "ERR", f"Source data lookup query caused an error:\n\n{e.args[0]}", str_sql, None


def do_source_data_lookup(db_schema, tr_data, sql_only=False):
# Define the query
str_sql = f"""
SELECT t.lookup_query, tg.table_group_schema, c.project_qc_schema,
c.sql_flavor, c.project_host, c.project_port, c.project_db, c.project_user, c.project_pw_encrypted,
c.url, c.connect_by_url,
c.connect_by_key, c.private_key, c.private_key_passphrase
FROM {db_schema}.target_data_lookups t
INNER JOIN {db_schema}.table_groups tg
ON ('{tr_data["table_groups_id"]}'::UUID = tg.id)
INNER JOIN {db_schema}.connections c
ON (tg.connection_id = c.connection_id)
AND (t.sql_flavor = c.sql_flavor)
WHERE t.error_type = 'Test Results'
AND t.test_id = '{tr_data["test_type_id"]}'
AND t.lookup_query > '';
"""

def replace_parms(df_test, str_query):
if df_test.empty:
raise ValueError("This test definition is no longer present.")

str_query = str_query.replace("{TARGET_SCHEMA}", empty_if_null(lst_query[0]["table_group_schema"]))
str_query = str_query.replace("{TABLE_NAME}", empty_if_null(tr_data["table_name"]))
str_query = str_query.replace("{COLUMN_NAME}", empty_if_null(tr_data["column_names"]))
str_query = str_query.replace("{DATA_QC_SCHEMA}", empty_if_null(lst_query[0]["project_qc_schema"]))
str_query = str_query.replace("{TEST_DATE}", str(empty_if_null(tr_data["test_date"])))

str_query = str_query.replace("{CUSTOM_QUERY}", empty_if_null(df_test.at[0, "custom_query"]))
str_query = str_query.replace("{BASELINE_VALUE}", empty_if_null(df_test.at[0, "baseline_value"]))
str_query = str_query.replace("{BASELINE_CT}", empty_if_null(df_test.at[0, "baseline_ct"]))
str_query = str_query.replace("{BASELINE_AVG}", empty_if_null(df_test.at[0, "baseline_avg"]))
str_query = str_query.replace("{BASELINE_SD}", empty_if_null(df_test.at[0, "baseline_sd"]))
str_query = str_query.replace("{THRESHOLD_VALUE}", empty_if_null(df_test.at[0, "threshold_value"]))

str_substitute = empty_if_null(df_test.at[0, "subset_condition"])
str_substitute = "1=1" if str_substitute == "" else str_substitute
str_query = str_query.replace("{SUBSET_CONDITION}", str_substitute)

str_query = str_query.replace("{GROUPBY_NAMES}", empty_if_null(df_test.at[0, "groupby_names"]))
str_query = str_query.replace("{HAVING_CONDITION}", empty_if_null(df_test.at[0, "having_condition"]))
str_query = str_query.replace("{MATCH_SCHEMA_NAME}", empty_if_null(df_test.at[0, "match_schema_name"]))
str_query = str_query.replace("{MATCH_TABLE_NAME}", empty_if_null(df_test.at[0, "match_table_name"]))
str_query = str_query.replace("{MATCH_COLUMN_NAMES}", empty_if_null(df_test.at[0, "match_column_names"]))

str_substitute = empty_if_null(df_test.at[0, "match_subset_condition"])
str_substitute = "1=1" if str_substitute == "" else str_substitute
str_query = str_query.replace("{MATCH_SUBSET_CONDITION}", str_substitute)

str_query = str_query.replace("{MATCH_GROUPBY_NAMES}", empty_if_null(df_test.at[0, "match_groupby_names"]))
str_query = str_query.replace("{MATCH_HAVING_CONDITION}", empty_if_null(df_test.at[0, "match_having_condition"]))
str_query = str_query.replace("{COLUMN_NAME_NO_QUOTES}", empty_if_null(tr_data["column_names"]))

str_query = str_query.replace("{WINDOW_DATE_COLUMN}", empty_if_null(df_test.at[0, "window_date_column"]))
str_query = str_query.replace("{WINDOW_DAYS}", empty_if_null(df_test.at[0, "window_days"]))

str_substitute = ConcatColumnList(tr_data["column_names"], "<NULL>")
str_query = str_query.replace("{CONCAT_COLUMNS}", str_substitute)
str_substitute = ConcatColumnList(df_test.at[0, "match_groupby_names"], "<NULL>")
str_query = str_query.replace("{CONCAT_MATCH_GROUPBY}", str_substitute)

if str_query is None or str_query == "":
raise ValueError("Lookup query is not defined for this Test Type.")
return str_query

try:
# Retrieve SQL for customer lookup
lst_query = db.retrieve_data_list(str_sql)

if sql_only:
return lst_query, replace_parms, None

# Retrieve and return data as df
if lst_query:
df_test = get_test_definition(db_schema, tr_data["test_definition_id_current"])

str_sql = replace_parms(df_test, lst_query[0]["lookup_query"])
df = db.retrieve_target_db_df(
lst_query[0]["sql_flavor"],
lst_query[0]["project_host"],
lst_query[0]["project_port"],
lst_query[0]["project_db"],
lst_query[0]["project_user"],
lst_query[0]["project_pw_encrypted"],
str_sql,
lst_query[0]["url"],
lst_query[0]["connect_by_url"],
lst_query[0]["connect_by_key"],
lst_query[0]["private_key"],
lst_query[0]["private_key_passphrase"],
)
if df.empty:
return "ND", "Data that violates Test criteria is not present in the current dataset.", str_sql, None
else:
return "OK", None, str_sql, df
else:
return "NA", "A source data lookup for this Test is not available.", None, None

except Exception as e:
return "ERR", f"Source data lookup query caused:\n\n{e.args[0]}", str_sql, None
