+ """,
+ unsafe_allow_html=True,
+ )
+
+ def reset_cache(self) -> None:
+ st.session_state.pop(self.get_field_key("private_key_uploader"), None)
+ st.session_state.pop(self.get_field_key("previous_private_key_file"), None)
+ return super().reset_cache()
From 372c12b6ae110e90a71155cdc453d400dfc06ab0 Mon Sep 17 00:00:00 2001
From: Luis Trinidad
Date: Fri, 1 Nov 2024 11:49:58 -0400
Subject: [PATCH 40/91] fix(ui): use index instead of value in flavor selector
---
.../frontend/js/components/flavor_selector.js | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/testgen/ui/components/frontend/js/components/flavor_selector.js b/testgen/ui/components/frontend/js/components/flavor_selector.js
index a4a1875..e5ff790 100644
--- a/testgen/ui/components/frontend/js/components/flavor_selector.js
+++ b/testgen/ui/components/frontend/js/components/flavor_selector.js
@@ -9,7 +9,7 @@
* @typedef Properties
* @type {object}
* @property {Array.} flavors
- * @property {string} selected
+ * @property {(number|null)} selected
* @property {(number|null)} columns
*/
@@ -29,7 +29,7 @@ const DatabaseFlavorSelector = (/** @type Properties */props) => {
const flavors = props.flavors?.val ?? props.flavors;
const numberOfColumns = props.columns?.val ?? props.columns ?? 3;
const numberOfRows = Math.ceil(flavors.length / numberOfColumns);
- const selectedFlavor = van.state(props.selected?.val ?? props.selected);
+ const selectedIndex = van.state(props.selected?.val ?? props.selected);
window.testgen.isPage = true;
Streamlit.setFrameHeight(
@@ -50,17 +50,17 @@ const DatabaseFlavorSelector = (/** @type Properties */props) => {
class: 'tg-flavor-selector',
style: `grid-template-columns: ${Array(numberOfColumns).fill(columnSize).join(' ')}; row-gap: ${rowGap}px;`
},
- flavors.map(flavor =>
+ flavors.map((flavor, idx) =>
DatabaseFlavor(
{
label: van.state(flavor.label),
value: van.state(flavor.value),
icon: van.state(flavor.icon),
- selected: van.derive(() => selectedFlavor.val === flavor.value),
+ selected: van.derive(() => selectedIndex.val == idx),
},
() => {
- selectedFlavor.val = flavor.value;
- Streamlit.sendData(flavor.value);
+ selectedIndex.val = idx;
+ Streamlit.sendData({index: idx, value: flavor.value});
},
)
),
From 1a167b8e794b5cdbc4d5859753777f27660aef81 Mon Sep 17 00:00:00 2001
From: Luis Trinidad
Date: Tue, 5 Nov 2024 18:17:58 -0400
Subject: [PATCH 41/91] fix: sleep to protect against multiple reruns
---
testgen/ui/views/connections/forms.py | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/testgen/ui/views/connections/forms.py b/testgen/ui/views/connections/forms.py
index e6890e1..bbaba0c 100644
--- a/testgen/ui/views/connections/forms.py
+++ b/testgen/ui/views/connections/forms.py
@@ -1,4 +1,5 @@
# type: ignore
+import time
import typing
import streamlit as st
@@ -114,6 +115,7 @@ def form_key(self):
return f"connection_form:{self.connection_id or 'new'}"
def render_input_ui(self, container: DeltaGenerator, data: dict) -> "BaseConnectionForm":
+ time.sleep(0.1)
main_fields_container, optional_fields_container = container.columns([0.7, 0.3])
if self.get_field_value("connect_by_url", latest=True):
@@ -154,6 +156,8 @@ def render_input_ui(self, container: DeltaGenerator, data: dict) -> "BaseConnect
self.render_field("url_prefix", container=url_override_left_column)
self.render_field("url", container=url_override_right_column)
+ time.sleep(0.1)
+
return self
def render_extra(
From d25bce83a2e15cd7c2b2639e3585f405e42110e3 Mon Sep 17 00:00:00 2001
From: Luis Trinidad
Date: Tue, 5 Nov 2024 18:31:16 -0400
Subject: [PATCH 42/91] misc: remove qc schema creation logic
---
testgen/ui/views/connections/forms.py | 8 --
testgen/ui/views/connections/page.py | 115 +-------------------------
2 files changed, 2 insertions(+), 121 deletions(-)
diff --git a/testgen/ui/views/connections/forms.py b/testgen/ui/views/connections/forms.py
index bbaba0c..ce7fc42 100644
--- a/testgen/ui/views/connections/forms.py
+++ b/testgen/ui/views/connections/forms.py
@@ -92,13 +92,6 @@ class BaseConnectionForm(BaseForm, ManualRender):
"unless test queries are failing."
),
)
- project_qc_schema: str = Field(
- default="qc",
- max_length=50,
- st_kwargs_label="QC Utility Schema",
- st_kwargs_max_chars=50,
- st_kwargs_help="The name of the schema on your database that will contain TestGen's profiling functions.",
- )
connection_id: int | None = Field(default=None)
@@ -131,7 +124,6 @@ def render_input_ui(self, container: DeltaGenerator, data: dict) -> "BaseConnect
self.render_field("project_db", container=main_fields_container)
self.render_field("project_user", container=main_fields_container)
- self.render_field("project_qc_schema", container=optional_fields_container)
self.render_field("max_threads", container=optional_fields_container)
self.render_field("max_query_chars", container=optional_fields_container)
diff --git a/testgen/ui/views/connections/page.py b/testgen/ui/views/connections/page.py
index 7dfa6a7..aeb939c 100644
--- a/testgen/ui/views/connections/page.py
+++ b/testgen/ui/views/connections/page.py
@@ -1,5 +1,4 @@
import logging
-import os
import time
import typing
from functools import partial
@@ -11,7 +10,6 @@
import testgen.ui.services.database_service as db
from testgen.commands.run_profiling_bridge import run_profiling_in_background
-from testgen.commands.run_setup_profiling_tools import get_setup_profiling_tools_queries
from testgen.common.database.database_service import empty_cache
from testgen.ui.components import widgets as testgen
from testgen.ui.navigation.menu import MenuItem
@@ -102,6 +100,7 @@ def show_connection_form(self, selected_connection: dict, _mode: str, project_co
)
data.update({
"project_code": project_code,
+ "project_qc_schema": "",
})
if "private_key" not in data:
data.update({
@@ -121,7 +120,7 @@ def show_connection_form(self, selected_connection: dict, _mode: str, project_co
LOG.exception("unexpected form validation error")
st.error("Unexpected error displaying the form. Try again")
- test_button_column, config_qc_column, _, save_button_column = st.columns([.2, .2, .4, .2])
+ test_button_column, _, save_button_column = st.columns([.2, .6, .2])
is_submitted, set_submitted = temp_value(f"connection_form-{connection_id or 'new'}:submit")
get_connection_status, set_connection_status = temp_value(
f"connection_form-{connection_id or 'new'}:test_conn"
@@ -144,16 +143,6 @@ def show_connection_form(self, selected_connection: dict, _mode: str, project_co
on_click=lambda: set_connection_status(self.test_connection(data)),
)
- with config_qc_column:
- testgen.button(
- type_="stroked",
- color="basic",
- label="Configure QC Utility Schema",
- key=f"connection_form:{connection_id or 'new'}:config-qc-schema",
- tooltip="Creates the required Utility schema and related functions in the target database",
- on_click=lambda: self.create_qc_schema_dialog(connection)
- )
-
if (connection_status := get_connection_status()):
single_element_container = st.empty()
single_element_container.info("Connecting ...")
@@ -235,106 +224,6 @@ def test_connection(self, connection: dict) -> "ConnectionStatus":
except Exception as error:
return ConnectionStatus(message="Error attempting the Connection.", details=error.args[0], successful=False)
- @st.dialog(title="Configure QC Utility Schema")
- def create_qc_schema_dialog(self, selected_connection):
- connection_id = selected_connection["connection_id"]
- project_qc_schema = selected_connection["project_qc_schema"]
- sql_flavor = selected_connection["sql_flavor"]
- user = selected_connection["project_user"]
-
- create_qc_schema = st.toggle("Create QC Utility Schema", value=True)
- grant_privileges = st.toggle("Grant access privileges to TestGen user", value=True)
-
- user_role = None
-
- # TODO ALEX: This textbox may be needed if we want to grant permissions to user role
- # if sql_flavor == "snowflake":
- # user_role_textbox_label = f"Primary role for database user {user}"
- # user_role = st.text_input(label=user_role_textbox_label, max_chars=100)
-
- admin_credentials_expander = st.expander("Admin credential options", expanded=True)
- with admin_credentials_expander:
- admin_connection_option_index = 0
- admin_connection_options = ["Do not use admin credentials", "Use admin credentials with Password"]
- if sql_flavor == "snowflake":
- admin_connection_options.append("Use admin credentials with Key-Pair")
-
- admin_connection_option = st.radio(
- "Admin credential options",
- label_visibility="hidden",
- options=admin_connection_options,
- index=admin_connection_option_index,
- horizontal=True,
- )
-
- st.markdown(" ", unsafe_allow_html=True)
-
- db_user = None
- db_password = None
- admin_private_key_passphrase = None
- admin_private_key = None
- if admin_connection_option == admin_connection_options[0]:
- st.markdown(":orange[User created in the connection dialog will be used.]")
- else:
- db_user = st.text_input(label="Admin db user", max_chars=40)
- if admin_connection_option == admin_connection_options[1]:
- db_password = st.text_input(
- label="Admin db password", max_chars=40, type="password"
- )
- st.markdown(":orange[Note: Admin credentials are not stored, are only used for this operation.]")
-
- if len(admin_connection_options) > 2 and admin_connection_option == admin_connection_options[2]:
- admin_private_key_passphrase = st.text_input(
- label="Private Key Passphrase",
- key="create-qc-schema-private-key-password",
- type="password",
- max_chars=200,
- help="Passphrase used while creating the private Key (leave empty if not applicable)",
- )
-
- admin_uploaded_file = st.file_uploader("Upload private key (rsa_key.p8)", key="admin-uploaded-file")
- if admin_uploaded_file:
- admin_private_key = admin_uploaded_file.getvalue().decode("utf-8")
-
- st.markdown(":orange[Note: Admin credentials are not stored, are only used for this operation.]")
-
- submit = st.button("Update Configuration")
-
- if submit:
- empty_cache()
- script_expander = st.expander("Script Details")
-
- operation_status = st.empty()
- operation_status.info(f"Configuring QC Utility Schema '{project_qc_schema}'...")
-
- try:
- skip_granting_privileges = not grant_privileges
- queries = get_setup_profiling_tools_queries(sql_flavor, create_qc_schema, skip_granting_privileges, project_qc_schema, user, user_role)
- with script_expander:
- st.code(
- os.linesep.join(queries),
- language="sql",
- line_numbers=True)
-
- connection_service.create_qc_schema(
- connection_id,
- create_qc_schema,
- db_user if db_user else None,
- db_password if db_password else None,
- skip_granting_privileges,
- admin_private_key_passphrase=admin_private_key_passphrase,
- admin_private_key=admin_private_key,
- user_role=user_role,
- )
- operation_status.empty()
- operation_status.success("Operation has finished successfully.")
-
- except Exception as e:
- operation_status.empty()
- operation_status.error("Error configuring QC Utility Schema.")
- error_message = e.args[0]
- st.text_area("Error Details", value=error_message)
-
@st.dialog(title="Data Configuration Setup")
def setup_data_configuration(self, project_code: str, connection: dict) -> None:
will_run_profiling = st.session_state.get("connection_form-new:run-profiling-toggle", True)
From 10c612e1668d532208e5ff3e8d36e05948157386 Mon Sep 17 00:00:00 2001
From: Luis Trinidad
Date: Wed, 6 Nov 2024 09:53:38 -0400
Subject: [PATCH 43/91] fix(profiling): add parenthesis to profiling issue
criteria
The missing parentheses caused the query that inserts into the profiling
anomalies table to include anomalies from other profiling runs.
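
As a hedged illustration of the precedence issue (the run filter below is an
assumed example, not the literal generated query): AND binds tighter than OR,
so a criterion appended without parentheses,

    WHERE r.profile_run_id = '<run_id>' AND p.filled_value_ct > 0 OR p.zero_length_ct > 0

is evaluated as

    WHERE (r.profile_run_id = '<run_id>' AND p.filled_value_ct > 0) OR p.zero_length_ct > 0

which matches rows from every run whenever zero_length_ct > 0. Wrapping the
anomaly criteria restores the intended scope:

    WHERE r.profile_run_id = '<run_id>' AND (p.filled_value_ct > 0 OR p.zero_length_ct > 0)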
---
testgen/template/dbsetup/050_populate_new_schema_metadata.sql | 2 +-
testgen/template/dbupgrade/0112_incremental_upgrade.sql | 3 +++
2 files changed, 4 insertions(+), 1 deletion(-)
create mode 100644 testgen/template/dbupgrade/0112_incremental_upgrade.sql
diff --git a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql
index f30d83c..c4ea048 100644
--- a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql
+++ b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql
@@ -16,7 +16,7 @@ INSERT INTO profile_anomaly_types (id, anomaly_type, data_object, anomaly_name,
VALUES ('1001', 'Suggested_Type', 'Column', 'Suggested Data Type', 'Data stored as text all meets criteria for a more suitable type. ', '(functional_data_type NOT IN (''Boolean'', ''Flag'') ) AND (column_type ILIKE ''%char%'' OR column_type ILIKE ''text'') AND NOT (datatype_suggestion ILIKE ''%char%'' OR datatype_suggestion ILIKE ''text'')', 'p.datatype_suggestion::VARCHAR(200)', 'Likely', 'Consider changing the column data type to tighten controls over data ingested and to make values more efficient, consistent and suitable for downstream analysis.'),
- ('1002', 'Non_Standard_Blanks', 'Column', 'Non-Standard Blank Values', 'Values representing missing data may be unexpected or inconsistent. Non-standard values may include empty strings as opposed to nulls, dummy entries such as "MISSING" or repeated characters that may have been used to bypass entry requirements, processing artifacts such as "NULL", or spreadsheet artifacts such as "NA", "ERROR".', 'p.filled_value_ct > 0 OR p.zero_length_ct > 0', '''Filled Values: '' || p.filled_value_ct::VARCHAR || '', Empty String: '' || p.zero_length_ct::VARCHAR || '', Null: '' || p.null_value_ct::VARCHAR || '', Records: '' || p.record_ct::VARCHAR', 'Definite', 'Consider cleansing the column upon ingestion to replace all variants of missing data with a standard designation, like Null.'),
+ ('1002', 'Non_Standard_Blanks', 'Column', 'Non-Standard Blank Values', 'Values representing missing data may be unexpected or inconsistent. Non-standard values may include empty strings as opposed to nulls, dummy entries such as "MISSING" or repeated characters that may have been used to bypass entry requirements, processing artifacts such as "NULL", or spreadsheet artifacts such as "NA", "ERROR".', '(p.filled_value_ct > 0 OR p.zero_length_ct > 0)', '''Filled Values: '' || p.filled_value_ct::VARCHAR || '', Empty String: '' || p.zero_length_ct::VARCHAR || '', Null: '' || p.null_value_ct::VARCHAR || '', Records: '' || p.record_ct::VARCHAR', 'Definite', 'Consider cleansing the column upon ingestion to replace all variants of missing data with a standard designation, like Null.'),
('1003', 'Invalid_Zip_USA', 'Column', 'Invalid USA Zip Code Format', 'Some values present do not conform with the expected format of USA Zip Codes.', 'p.std_pattern_match = ''ZIP_USA'' AND (p.general_type <> ''A'' OR p.filled_value_ct > 0 OR p.min_length >= 1 AND p.min_length <= 4 OR p.max_length > 10)', 'CASE WHEN p.general_type = ''N'' THEN ''Type: '' || p.column_type || '', '' ELSE '''' END || ''Min Length: '' || p.min_length::VARCHAR || '', Max Length: '' || p.max_length::VARCHAR || '', Filled Values: '' || p.filled_value_ct::VARCHAR', 'Definite', 'Consider correcting invalid column values or changing them to indicate a missing value if corrections cannot be made.'),
('1004', 'Multiple_Types_Minor', 'Multi-Col', 'Multiple Data Types per Column Name - Minor', 'Columns with the same name have the same general type across tables, but the types do not exactly match. Truncation issues may result if columns are commingled and assumed to be the same format.', 'm.general_type_ct = 1 AND m.type_ct > 1', '''Found '' || m.column_ct::VARCHAR || '' columns, '' || m.type_ct::VARCHAR(10) || '' types, '' || m.min_type || '' to '' || m.max_type', 'Possible', 'Consider changing the column data types to be fully consistent. This will tighten your standards at ingestion and assure that data is consistent between tables.'),
('1005', 'Multiple_Types_Major', 'Multi-Col', 'Multiple Data Types per Column Name - Major', 'Columns with the same name have broadly different types across tables. Differences could be significant enough to cause errors in downstream analysis, extra steps resulting in divergent business logic and inconsistencies in results.', 'm.general_type_ct > 1', '''Found '' || m.column_ct::VARCHAR || '' columns, '' || m.type_ct::VARCHAR(10) || '' types, '' || m.min_type || '' to '' || m.max_type', 'Likely', 'Ideally, you should change the column data types to be fully consistent. If the data is meant to be different, you should change column names so downstream users aren''t led astray.'),
diff --git a/testgen/template/dbupgrade/0112_incremental_upgrade.sql b/testgen/template/dbupgrade/0112_incremental_upgrade.sql
new file mode 100644
index 0000000..c81cccb
--- /dev/null
+++ b/testgen/template/dbupgrade/0112_incremental_upgrade.sql
@@ -0,0 +1,3 @@
+SET SEARCH_PATH TO {SCHEMA_NAME};
+
+UPDATE profile_anomaly_types SET anomaly_criteria = '(p.filled_value_ct > 0 OR p.zero_length_ct > 0)' WHERE id = '1002';
From e64d20a20ced85f42c7bfd0cf9a6ed8354c555a2 Mon Sep 17 00:00:00 2001
From: Luis Trinidad
Date: Thu, 7 Nov 2024 12:15:35 -0400
Subject: [PATCH 44/91] misc(profiling): remove qc schema creation logic
---
testgen/__main__.py | 80 ----------------
testgen/commands/run_quick_start.py | 11 ---
testgen/commands/run_setup_profiling_tools.py | 96 -------------------
testgen/ui/services/connection_service.py | 7 --
4 files changed, 194 deletions(-)
delete mode 100644 testgen/commands/run_setup_profiling_tools.py
diff --git a/testgen/__main__.py b/testgen/__main__.py
index 285e949..fd19379 100644
--- a/testgen/__main__.py
+++ b/testgen/__main__.py
@@ -1,4 +1,3 @@
-import getpass
import logging
import os
import subprocess
@@ -33,7 +32,6 @@
from testgen.commands.run_observability_exporter import run_observability_exporter
from testgen.commands.run_profiling_bridge import run_profiling_queries
from testgen.commands.run_quick_start import run_quick_start, run_quick_start_increment
-from testgen.commands.run_setup_profiling_tools import run_setup_profiling_tools
from testgen.commands.run_upgrade_db_config import get_schema_revision, is_db_revision_up_to_date, run_upgrade_db_config
from testgen.common import (
configure_logging,
@@ -450,84 +448,6 @@ def do_upgrade_system_version():
click.echo("System and services upgrade is not required.")
-@cli.command(
- "setup-target-db-functions", help="Use to set up the utility functions in the target database for running profiles."
-)
-@click.option(
- "-c",
- "--connection-id",
- help="The identifier for the connection. Use a connection_id shown in list-connections.",
- required=True,
- type=click.STRING,
-)
-@click.option(
- "-dr",
- "--dry-run",
- default=False,
- is_flag=True,
- required=False,
- help="Dry run to show which schema will be modified",
-)
-@click.option(
- "-cs",
- "--create-qc-schema",
- default=False,
- is_flag=True,
- required=False,
- help="Create the QC utility schema required in the target database",
-)
-@click.option("--yes", "-y", default=False, is_flag=True, required=False, help="Force yes")
-@click.option(
- "--skip-asking-credentials",
- "-s",
- default=False,
- is_flag=True,
- required=False,
- help="Skip request for special write credentials for target database, uses standard credentials instead",
-)
-@click.option(
- "--skip-granting-privileges",
- "-sgp",
- default=False,
- is_flag=True,
- required=False,
- help="Skip granting execute privileges to the user for the QC utility schema in the target database",
-)
-@pass_configuration
-def setup_profiling_tools(
- configuration: Configuration,
- connection_id: str,
- dry_run: bool,
- create_qc_schema: bool,
- yes: bool,
- skip_asking_credentials: bool,
- skip_granting_privileges: bool,
-):
- db_user = None
- db_password = None
- if not skip_asking_credentials:
- db_user = input("Admin DB User?")
- db_password = getpass.getpass("Admin DB Password?")
-
- if not yes and not dry_run:
- confirm = input(
- f"Are you sure you want to setup the utility functions to be able to run the profile for connection {connection_id}? [yes/No]"
- )
- if confirm.lower() != "yes":
- click.echo("Exiting without any operation performed.")
- return
- project_qc_schema = run_setup_profiling_tools(
- connection_id, dry_run, create_qc_schema, db_user, db_password, skip_granting_privileges
- )
- if not dry_run:
- message = f"Project DB has been set up. Modified schema: {project_qc_schema}"
- else:
- message = (
- f"Project DB dry run completed, no changes applied. Modified schema would have been: {project_qc_schema}"
- )
- click.echo(message)
-
-
@cli.command("get-test-results", help="Fetches results for a test run.")
@click.option(
"-tr",
diff --git a/testgen/commands/run_quick_start.py b/testgen/commands/run_quick_start.py
index 67a22b5..487c47d 100644
--- a/testgen/commands/run_quick_start.py
+++ b/testgen/commands/run_quick_start.py
@@ -5,7 +5,6 @@
from testgen import settings
from testgen.commands.run_get_entities import run_table_group_list
from testgen.commands.run_launch_db_config import run_launch_db_config
-from testgen.commands.run_setup_profiling_tools import run_setup_profiling_tools
from testgen.common.database.database_service import (
AssignConnectParms,
CreateDatabaseIfNotExists,
@@ -140,16 +139,6 @@ def run_quick_start(delete_target_db: bool) -> None:
rows, _ = run_table_group_list(project_key)
connection_id = str(rows[0][2])
- # run qc
- command = "testgen setup-target-db-functions --connection-id --create-qc-schema --yes"
- click.echo(f"Running CLI command: {command}")
- create_qc_schema = True
- db_user = params_mapping["TESTGEN_ADMIN_USER"]
- db_password = params_mapping["TESTGEN_ADMIN_PASSWORD"]
- dry_run = False
- project_qc_schema = run_setup_profiling_tools(connection_id, dry_run, create_qc_schema, db_user, db_password)
- click.echo(f"Schema {project_qc_schema} has been created in the target db")
-
def run_quick_start_increment(iteration):
params_mapping = _get_params_mapping(iteration)
diff --git a/testgen/commands/run_setup_profiling_tools.py b/testgen/commands/run_setup_profiling_tools.py
deleted file mode 100644
index c2d42f3..0000000
--- a/testgen/commands/run_setup_profiling_tools.py
+++ /dev/null
@@ -1,96 +0,0 @@
-import logging
-
-from testgen.commands.run_get_entities import run_get_connection
-from testgen.common import AssignConnectParms, RunActionQueryList
-from testgen.common.database.database_service import get_queries_for_command
-
-LOG = logging.getLogger("testgen")
-
-
-def _get_params_mapping(project_qc_schema: str, user: str, user_role: str | None) -> dict:
- return {
- "DATA_QC_SCHEMA": project_qc_schema,
- "DB_USER": user,
- "DB_USER_ROLE": user_role,
- }
-
-
-def get_setup_profiling_tools_queries(sql_flavor, create_qc_schema, skip_granting_privileges, project_qc_schema, user, user_role=None):
- queries = []
-
- params_mapping = _get_params_mapping(project_qc_schema, user, user_role)
-
- if create_qc_schema:
- queries.extend(
- get_queries_for_command(
- f"flavors/{sql_flavor}/setup_profiling_tools",
- params_mapping,
- mask=rf"^.*create_qc_schema_{sql_flavor}.sql$",
- )
- )
-
- queries.extend(
- get_queries_for_command(
- f"flavors/{sql_flavor}/setup_profiling_tools", params_mapping, mask=rf"^.*functions_{sql_flavor}.sql$"
- )
- )
-
- if not skip_granting_privileges:
- queries.extend(
- get_queries_for_command(
- f"flavors/{sql_flavor}/setup_profiling_tools",
- params_mapping,
- mask=rf"^.*grant_execute_privileges_{sql_flavor}.sql$",
- )
- )
-
- return queries
-
-
-def run_setup_profiling_tools(
- connection_id: str | int,
- dry_run: bool,
- create_qc_schema: bool = True,
- db_user: str | None = None,
- db_password: str | None = None,
- skip_granting_privileges: bool = False,
- admin_private_key_passphrase: str | None = None,
- admin_private_key: str | None = None,
- user_role: str | None = None,
-) -> str:
- connection = run_get_connection(str(connection_id))
-
- # Set Project Connection Parms in common.db_bridgers from retrieved parms
- LOG.info("CurrentStep: Assigning Connection Parms")
- user = db_user or connection["project_user"]
- connect_by_key = admin_private_key is not None or connection["connect_by_key"]
- private_key_passphrase = admin_private_key_passphrase if admin_private_key is not None else connection["private_key_passphrase"]
- private_key = admin_private_key if admin_private_key is not None else connection["private_key"]
-
- AssignConnectParms(
- connection["project_key"],
- connection["connection_id"],
- connection["project_host"],
- connection["project_port"],
- connection["project_db"],
- connection["project_qc_schema"],
- user,
- connection["sql_flavor"],
- connection["url"],
- connection["connect_by_url"],
- connect_by_key,
- private_key,
- private_key_passphrase,
- "PROJECT",
- )
-
- project_qc_schema = connection["project_qc_schema"]
- sql_flavor = connection["sql_flavor"]
- user = connection["project_user"]
-
- queries = get_setup_profiling_tools_queries(sql_flavor, create_qc_schema, skip_granting_privileges, project_qc_schema, user, user_role)
-
- if not dry_run:
- RunActionQueryList("PROJECT", queries, user_override=db_user, pwd_override=db_password)
-
- return project_qc_schema
diff --git a/testgen/ui/services/connection_service.py b/testgen/ui/services/connection_service.py
index 3fe3ecd..66796d4 100644
--- a/testgen/ui/services/connection_service.py
+++ b/testgen/ui/services/connection_service.py
@@ -3,7 +3,6 @@
import testgen.ui.queries.connection_queries as connection_queries
import testgen.ui.services.table_group_service as table_group_service
from testgen.commands.run_profiling_bridge import InitializeProfilingSQL
-from testgen.commands.run_setup_profiling_tools import run_setup_profiling_tools
from testgen.common.database.database_service import (
AssignConnectParms,
RetrieveDBResultsToList,
@@ -196,12 +195,6 @@ def test_qc_connection(project_code, connection, init_profiling=True):
return qc_results
-def create_qc_schema(connection_id, create_qc_schema, db_user, db_password, skip_granting_privileges, admin_private_key_passphrase=None, admin_private_key=None, user_role=None):
- dry_run = False
- empty_cache()
- run_setup_profiling_tools(connection_id, dry_run, create_qc_schema, db_user, db_password, skip_granting_privileges, admin_private_key_passphrase, admin_private_key, user_role)
-
-
def form_overwritten_connection_url(connection):
flavor = connection["sql_flavor"]
From 66ca55d5a6e8bf6d210afdfd1b8844ee69fa3643 Mon Sep 17 00:00:00 2001
From: Ricardo Boni
Date: Tue, 29 Oct 2024 18:36:29 -0400
Subject: [PATCH 45/91] feat(pdf): Hygiene Issues report
---
testgen/ui/pdf/hygiene_issue_report.py | 165 ++++++++++++++++++
testgen/ui/services/hygiene_issues_service.py | 87 +++++++++
.../views/dialogs/profiling_results_dialog.py | 12 +-
testgen/ui/views/profiling_anomalies.py | 143 +++++----------
testgen/ui/views/test_definitions.py | 9 +-
testgen/ui/views/test_results.py | 27 +--
6 files changed, 324 insertions(+), 119 deletions(-)
create mode 100644 testgen/ui/pdf/hygiene_issue_report.py
create mode 100644 testgen/ui/services/hygiene_issues_service.py
diff --git a/testgen/ui/pdf/hygiene_issue_report.py b/testgen/ui/pdf/hygiene_issue_report.py
new file mode 100644
index 0000000..4c23ec6
--- /dev/null
+++ b/testgen/ui/pdf/hygiene_issue_report.py
@@ -0,0 +1,165 @@
+import pandas
+from reportlab.lib import colors
+from reportlab.lib.colors import HexColor
+from reportlab.lib.enums import TA_CENTER
+from reportlab.lib.styles import ParagraphStyle
+from reportlab.platypus import CondPageBreak, KeepTogether, Paragraph, Table, TableStyle
+
+from testgen.ui.pdf.dataframe_table import DataFrameTableBuilder
+from testgen.ui.pdf.style import (
+ COLOR_GRAY_BG,
+ COLOR_GREEN_BG,
+ PARA_STYLE_CELL,
+ PARA_STYLE_FOOTNOTE,
+ PARA_STYLE_H1,
+ PARA_STYLE_INFO,
+ PARA_STYLE_MONO,
+ PARA_STYLE_TEXT,
+ PARA_STYLE_TITLE,
+ TABLE_STYLE_DEFAULT,
+)
+from testgen.ui.pdf.templates import DatakitchenTemplate
+from testgen.ui.services.hygiene_issues_service import get_source_data
+
+SECTION_MIN_AVAILABLE_HEIGHT = 120
+
+CLASS_COLORS = {
+ "Definite": HexColor(0xE94D4A),
+ "Likely": HexColor(0xFC8F2A),
+ "Possible": HexColor(0xFCD349),
+ "Potential PII": HexColor(0xFC8F2A),
+}
+
+def build_summary_table(document, hi_data):
+
+ summary_table_style = TableStyle(
+ (
+ # All-table styles
+ ("GRID", (0, 0), (-1, -1), 2, colors.white),
+ ("BACKGROUND", (0, 0), (-1, -1), COLOR_GRAY_BG),
+
+ # Empty cells
+ ("BACKGROUND", (2, 5), (-1, -1), colors.white),
+
+ # Header cells
+ *[
+ (cmd[0], *coords, *cmd[1:])
+ for coords in (
+ ((2, 2), (2, 4)),
+ ((0, 0), (0, -1))
+ )
+ for cmd in (
+ ("FONT", "Helvetica-Bold"),
+ ("ALIGN", "RIGHT"),
+ ("BACKGROUND", COLOR_GREEN_BG),
+ )
+ ],
+
+ # Layout
+ ("SPAN", (1, 0), (3, 0)),
+
+ ("SPAN", (1, 1), (4, 1)),
+
+ ("SPAN", (3, 2), (4, 2)),
+ ("SPAN", (3, 3), (4, 3)),
+ ("SPAN", (3, 4), (4, 4)),
+ ("SPAN", (3, 5), (4, 5)),
+
+
+ # Status cell
+ *[
+ (cmd[0], (4, 0), (4, 0), *cmd[1:])
+ for cmd in (
+ ("BACKGROUND", CLASS_COLORS.get(hi_data["issue_likelihood"], COLOR_GRAY_BG)),
+ ("ALIGNMENT", "CENTER"),
+ ("VALIGN", "MIDDLE"),
+ )
+ ],
+ ),
+ parent=TABLE_STYLE_DEFAULT,
+ )
+
+
+ profiling_timestamp = pandas.to_datetime(hi_data["profiling_starttime"]).strftime("%Y-%m-%d %H:%M:%S")
+ summary_table_data = [
+ (
+ "Hygiene Issue",
+ (
+ Paragraph(f"{hi_data["anomaly_name"]}:", style=PARA_STYLE_CELL),
+ Paragraph(hi_data["anomaly_description"], style=PARA_STYLE_CELL),
+ ),
+ None,
+ None,
+ Paragraph(
+ hi_data["issue_likelihood"],
+ style=ParagraphStyle("likelihood", textColor=colors.white, fontSize=10, parent=PARA_STYLE_CELL, alignment=TA_CENTER),
+ ),
+ ),
+ (
+ "Detail",
+ Paragraph(
+ hi_data["detail"],
+ style=ParagraphStyle("detail", fontName="Helvetica-Bold", parent=PARA_STYLE_CELL),
+ ),
+ ),
+
+ ("Database/Schema", hi_data["schema_name"], "Profiling Date", profiling_timestamp),
+ ("Table", hi_data["table_name"], "Table Group", hi_data["table_groups_name"]),
+ ("Column", hi_data["column_name"], "Disposition", hi_data["disposition"] or "No Decision"),
+ ("Column Type", hi_data["column_type"]),
+ ]
+
+ summary_table_col_widths = [n * document.width for n in (.15, .35, .15, .15, .20)]
+ return Table(summary_table_data, style=summary_table_style, hAlign="LEFT", colWidths=summary_table_col_widths)
+
+
+def build_sample_data_content(document, sample_data_tuple):
+ sample_data_status, sample_data_msg, lookup_query, sample_data = sample_data_tuple
+ if sample_data_status in ("ND", "NA"):
+ yield Paragraph(sample_data_msg, style=PARA_STYLE_INFO)
+ elif sample_data_status == "ERR" or sample_data is None:
+ yield Paragraph("It was not possible to fetch the sample data this time.", style=PARA_STYLE_INFO)
+ else:
+ sample_data.columns = [col.replace("_", " ").title() for col in sample_data.columns]
+ df_table_builder = DataFrameTableBuilder(sample_data, document.width)
+ table_flowables = [df_table_builder.build_table(hAlign="LEFT")]
+ if df_table_builder.omitted_columns:
+ omitted_columns = ", ".join(df_table_builder.omitted_columns)
+ sample_data_msg = f"Note: The following columns were omitted from this table: {omitted_columns}"
+ if sample_data_msg:
+ table_flowables.append(Paragraph(sample_data_msg, style=PARA_STYLE_FOOTNOTE))
+
+ yield from df_table_builder.split_in_columns(table_flowables)
+
+
+def build_sql_query_content(sample_data_tuple):
+ lookup_query = sample_data_tuple[2]
+ if lookup_query:
+ return Paragraph(lookup_query, PARA_STYLE_MONO)
+ else:
+ return Paragraph("No sample data lookup query registered for this issue.")
+
+
+def get_report_content(document, hi_data):
+ yield Paragraph("TestGen Issue Report", PARA_STYLE_TITLE)
+ yield build_summary_table(document, hi_data)
+
+ yield CondPageBreak(SECTION_MIN_AVAILABLE_HEIGHT)
+ yield Paragraph("Suggested Action", style=PARA_STYLE_H1)
+ yield Paragraph(hi_data["suggested_action"], style=PARA_STYLE_TEXT)
+
+ sample_data_tuple = get_source_data(hi_data)
+
+ yield CondPageBreak(SECTION_MIN_AVAILABLE_HEIGHT)
+ yield Paragraph("Sample Data", PARA_STYLE_H1)
+ yield from build_sample_data_content(document, sample_data_tuple)
+
+ yield KeepTogether([
+ Paragraph("SQL Query", PARA_STYLE_H1),
+ build_sql_query_content(sample_data_tuple)
+ ])
+
+
+def create_report(filename, hi_data):
+ doc = DatakitchenTemplate(filename)
+ doc.build(flowables=list(get_report_content(doc, hi_data)))
diff --git a/testgen/ui/services/hygiene_issues_service.py b/testgen/ui/services/hygiene_issues_service.py
new file mode 100644
index 0000000..0668876
--- /dev/null
+++ b/testgen/ui/services/hygiene_issues_service.py
@@ -0,0 +1,87 @@
+import streamlit as st
+
+from testgen.ui.services import database_service as db
+
+
+def get_source_data(hi_data):
+ str_schema = st.session_state["dbschema"]
+ # Define the query
+ str_sql = f"""
+ SELECT t.lookup_query, tg.table_group_schema, c.project_qc_schema,
+ c.sql_flavor, c.project_host, c.project_port, c.project_db, c.project_user, c.project_pw_encrypted,
+ c.url, c.connect_by_url, c.connect_by_key, c.private_key, c.private_key_passphrase
+ FROM {str_schema}.target_data_lookups t
+ INNER JOIN {str_schema}.table_groups tg
+ ON ('{hi_data["table_groups_id"]}'::UUID = tg.id)
+ INNER JOIN {str_schema}.connections c
+ ON (tg.connection_id = c.connection_id)
+ AND (t.sql_flavor = c.sql_flavor)
+ WHERE t.error_type = 'Profile Anomaly'
+ AND t.test_id = '{hi_data["anomaly_id"]}'
+ AND t.lookup_query > '';
+ """
+
+ def get_lookup_query(test_id, detail_exp, column_names):
+ if test_id in {"1019", "1020"}:
+ start_index = detail_exp.find("Columns: ")
+ if start_index == -1:
+ columns = [col.strip() for col in column_names.split(",")]
+ else:
+ start_index += len("Columns: ")
+ column_names_str = detail_exp[start_index:]
+ columns = [col.strip() for col in column_names_str.split(",")]
+ queries = [
+ f"SELECT '{column}' AS column_name, MAX({column}) AS max_date_available FROM {{TARGET_SCHEMA}}.{{TABLE_NAME}}"
+ for column in columns
+ ]
+ sql_query = " UNION ALL ".join(queries) + " ORDER BY max_date_available DESC;"
+ else:
+ sql_query = ""
+ return sql_query
+
+ def replace_parms(str_query):
+ str_query = (
+ get_lookup_query(hi_data["anomaly_id"], hi_data["detail"], hi_data["column_name"])
+ if lst_query[0]["lookup_query"] == "created_in_ui"
+ else lst_query[0]["lookup_query"]
+ )
+ str_query = str_query.replace("{TARGET_SCHEMA}", lst_query[0]["table_group_schema"])
+ str_query = str_query.replace("{TABLE_NAME}", hi_data["table_name"])
+ str_query = str_query.replace("{COLUMN_NAME}", hi_data["column_name"])
+ str_query = str_query.replace("{DATA_QC_SCHEMA}", lst_query[0]["project_qc_schema"])
+ str_query = str_query.replace("{DETAIL_EXPRESSION}", hi_data["detail"])
+ str_query = str_query.replace("{PROFILE_RUN_DATE}", hi_data["profiling_starttime"])
+ if str_query is None or str_query == "":
+ raise ValueError("Lookup query is not defined for this Anomaly Type.")
+ return str_query
+
+ try:
+ # Retrieve SQL for customer lookup
+ lst_query = db.retrieve_data_list(str_sql)
+
+ # Retrieve and return data as df
+ if lst_query:
+ str_sql = replace_parms(str_sql)
+ df = db.retrieve_target_db_df(
+ lst_query[0]["sql_flavor"],
+ lst_query[0]["project_host"],
+ lst_query[0]["project_port"],
+ lst_query[0]["project_db"],
+ lst_query[0]["project_user"],
+ lst_query[0]["project_pw_encrypted"],
+ str_sql,
+ lst_query[0]["url"],
+ lst_query[0]["connect_by_url"],
+ lst_query[0]["connect_by_key"],
+ lst_query[0]["private_key"],
+ lst_query[0]["private_key_passphrase"],
+ )
+ if df.empty:
+ return "ND", "Data that violates Hygiene Issue criteria is not present in the current dataset.", str_sql, None
+ else:
+ return "OK", None, str_sql, df
+ else:
+ return "NA", "Source data lookup is not available for this Issue.", None, None
+
+ except Exception as e:
+ return "ERR", f"Source data lookup query caused an error:\n\n{e.args[0]}", None, None
diff --git a/testgen/ui/views/dialogs/profiling_results_dialog.py b/testgen/ui/views/dialogs/profiling_results_dialog.py
index 26f3078..5cce9c6 100644
--- a/testgen/ui/views/dialogs/profiling_results_dialog.py
+++ b/testgen/ui/views/dialogs/profiling_results_dialog.py
@@ -12,13 +12,11 @@
BUTTON_HELP = "Review profiling for highlighted column"
-def view_profiling_button(button_container, str_table_name, str_column_name,
- str_profile_run_id=None, str_table_groups_id=None):
- with button_container:
- if st.button(
- BUTTON_TEXT, help=BUTTON_HELP, use_container_width=True
- ):
- profiling_results_dialog(str_table_name, str_column_name, str_profile_run_id, str_table_groups_id)
+def view_profiling_button(str_table_name, str_column_name, str_profile_run_id=None, str_table_groups_id=None):
+ if st.button(
+ BUTTON_TEXT, help=BUTTON_HELP, use_container_width=True
+ ):
+ profiling_results_dialog(str_table_name, str_column_name, str_profile_run_id, str_table_groups_id)
@st.dialog(title="Profiling Results")
diff --git a/testgen/ui/views/profiling_anomalies.py b/testgen/ui/views/profiling_anomalies.py
index 1450e42..4e70ce5 100644
--- a/testgen/ui/views/profiling_anomalies.py
+++ b/testgen/ui/views/profiling_anomalies.py
@@ -1,5 +1,7 @@
import typing
+from io import BytesIO
+import pandas as pd
import plotly.express as px
import streamlit as st
@@ -8,10 +10,12 @@
import testgen.ui.services.form_service as fm
import testgen.ui.services.query_service as dq
from testgen.common import date_service
-from testgen.common.read_file import replace_templated_functions
from testgen.ui.components import widgets as testgen
+from testgen.ui.components.widgets.download_dialog import FILE_DATA_TYPE, download_dialog, zip_multi_file_data
from testgen.ui.navigation.page import Page
+from testgen.ui.pdf.hygiene_issue_report import create_report
from testgen.ui.services import project_service
+from testgen.ui.services.hygiene_issues_service import get_source_data as get_source_data_uncached
from testgen.ui.session import session
from testgen.ui.views.dialogs.profiling_results_dialog import view_profiling_button
@@ -167,7 +171,7 @@ def render(self, run_id: str, issue_class: str | None = None, issue_type: str |
if not selected_row:
st.markdown(":orange[Select a record to see more information.]")
else:
- col1, col2 = st.columns([0.7, 0.3])
+ col1, col2 = st.columns([0.8, 0.2])
with col1:
fm.render_html_list(
selected_row,
@@ -185,17 +189,33 @@ def render(self, run_id: str, issue_class: str | None = None, issue_type: str |
int_data_width=700,
)
with col2:
- # _, v_col2 = st.columns([0.3, 0.7])
- v_col1, v_col2 = st.columns([0.5, 0.5])
- view_profiling_button(
- v_col1, selected_row["table_name"], selected_row["column_name"],
- str_profile_run_id=run_id
- )
- with v_col2:
+ view_profiling_button(
+ selected_row["table_name"], selected_row["column_name"], str_profile_run_id=run_id
+ )
+
if st.button(
"Source Data →", help="Review current source data for highlighted issue", use_container_width=True
):
source_data_dialog(selected_row)
+ if st.button(
+ ":material/file_save: Issue Report",
+ use_container_width=True,
+ help="Generate a PDF report for each selected issue",
+ ):
+ dialog_title = "Download Issue Report"
+ if len(selected) == 1:
+ download_dialog(
+ dialog_title=dialog_title,
+ file_content_func=get_report_file_data,
+ args=(selected[0],),
+ )
+ else:
+ zip_func = zip_multi_file_data(
+ "testgen_issue_reports.zip",
+ get_report_file_data,
+ [(arg,) for arg in selected],
+ )
+ download_dialog(dialog_title=dialog_title, file_content_func=zip_func)
cached_functions = [get_anomaly_disposition, get_profiling_anomaly_summary]
# Clear the list cache if the list is sorted by disposition/action
@@ -269,12 +289,16 @@ def get_profiling_anomalies(str_profile_run_id, str_likelihood, issue_type_id, s
WHEN t.issue_likelihood = 'Definite' THEN 4
END AS likelihood_order,
t.anomaly_description, r.detail, t.suggested_action,
- r.anomaly_id, r.table_groups_id::VARCHAR, r.id::VARCHAR, p.profiling_starttime
+ r.anomaly_id, r.table_groups_id::VARCHAR, r.id::VARCHAR, p.profiling_starttime,
+ tg.table_groups_name
FROM {str_schema}.profile_anomaly_results r
INNER JOIN {str_schema}.profile_anomaly_types t
ON r.anomaly_id = t.id
INNER JOIN {str_schema}.profiling_runs p
ON r.profile_run_id = p.id
+ INNER JOIN {str_schema}.table_groups tg
+ ON r.table_groups_id = tg.id
+
WHERE r.profile_run_id = '{str_profile_run_id}'
{str_criteria}
{str_order_by}
@@ -352,90 +376,8 @@ def get_profiling_anomaly_summary(str_profile_run_id):
@st.cache_data(show_spinner=False)
-def get_bad_data(selected_row):
- str_schema = st.session_state["dbschema"]
- # Define the query
- str_sql = f"""
- SELECT t.lookup_query, tg.table_group_schema, c.project_qc_schema,
- c.sql_flavor, c.project_host, c.project_port, c.project_db, c.project_user, c.project_pw_encrypted,
- c.url, c.connect_by_url, c.connect_by_key, c.private_key, c.private_key_passphrase
- FROM {str_schema}.target_data_lookups t
- INNER JOIN {str_schema}.table_groups tg
- ON ('{selected_row["table_groups_id"]}'::UUID = tg.id)
- INNER JOIN {str_schema}.connections c
- ON (tg.connection_id = c.connection_id)
- AND (t.sql_flavor = c.sql_flavor)
- WHERE t.error_type = 'Profile Anomaly'
- AND t.test_id = '{selected_row["anomaly_id"]}'
- AND t.lookup_query > '';
- """
-
- def get_lookup_query(test_id, detail_exp, column_names):
- if test_id in {"1019", "1020"}:
- start_index = detail_exp.find("Columns: ")
- if start_index == -1:
- columns = [col.strip() for col in column_names.split(",")]
- else:
- start_index += len("Columns: ")
- column_names_str = detail_exp[start_index:]
- columns = [col.strip() for col in column_names_str.split(",")]
- queries = [
- f"SELECT '{column}' AS column_name, MAX({column}) AS max_date_available FROM {{TARGET_SCHEMA}}.{{TABLE_NAME}}"
- for column in columns
- ]
- sql_query = " UNION ALL ".join(queries) + " ORDER BY max_date_available DESC;"
- else:
- sql_query = ""
- return sql_query
-
- def replace_parms(str_query):
- str_query: str = (
- get_lookup_query(selected_row["anomaly_id"], selected_row["detail"], selected_row["column_name"])
- if lst_query[0]["lookup_query"] == "created_in_ui"
- else lst_query[0]["lookup_query"]
- )
- str_query = str_query.replace("{TARGET_SCHEMA}", lst_query[0]["table_group_schema"])
- str_query = str_query.replace("{TABLE_NAME}", selected_row["table_name"])
- str_query = str_query.replace("{COLUMN_NAME}", selected_row["column_name"])
- str_query = str_query.replace("{DATA_QC_SCHEMA}", lst_query[0]["project_qc_schema"])
- str_query = str_query.replace("{DETAIL_EXPRESSION}", selected_row["detail"])
- str_query = str_query.replace("{PROFILE_RUN_DATE}", selected_row["profiling_starttime"])
- if "{{DKFN_" in str_query:
- str_query = replace_templated_functions(str_query, lst_query[0]["sql_flavor"])
- if str_query is None or str_query == "":
- raise ValueError("Lookup query is not defined for this Anomoly Type.")
- return str_query
-
- try:
- # Retrieve SQL for customer lookup
- lst_query = db.retrieve_data_list(str_sql)
-
- # Retrieve and return data as df
- if lst_query:
- str_sql = replace_parms(str_sql)
- df = db.retrieve_target_db_df(
- lst_query[0]["sql_flavor"],
- lst_query[0]["project_host"],
- lst_query[0]["project_port"],
- lst_query[0]["project_db"],
- lst_query[0]["project_user"],
- lst_query[0]["project_pw_encrypted"],
- str_sql,
- lst_query[0]["url"],
- lst_query[0]["connect_by_url"],
- lst_query[0]["connect_by_key"],
- lst_query[0]["private_key"],
- lst_query[0]["private_key_passphrase"],
- )
- if df.empty:
- return "ND", "Data that violates Hygiene Issue criteria is not present in the current dataset.", None
- else:
- return "OK", None, df
- else:
- return "NA", "A source data lookup for this Issue is not available.", None
-
- except Exception as e:
- return "ERR", f"Source data lookup query caused an error:\n\n{e.args[0]}", None
+def get_source_data(hi_data):
+ return get_source_data_uncached(hi_data)
def write_frequency_graph(df_tests):
@@ -466,7 +408,7 @@ def source_data_dialog(selected_row):
fm.render_html_list(selected_row, ["detail"], None, 700, ["Hygiene Issue Detail"])
with st.spinner("Retrieving source data..."):
- bad_data_status, bad_data_msg, df_bad = get_bad_data(selected_row)
+ bad_data_status, bad_data_msg, _, df_bad = get_source_data(selected_row)
if bad_data_status in {"ND", "NA"}:
st.info(bad_data_msg)
elif bad_data_status == "ERR":
@@ -496,3 +438,14 @@ def do_disposition_update(selected, str_new_status):
str_result = f":red[**The update {str_which} did not succeed.**]"
return str_result
+
+def get_report_file_data(update_progress, tr_data) -> FILE_DATA_TYPE:
+ hi_id = tr_data["anomaly_id"]
+ profiling_time = pd.Timestamp(tr_data["profiling_starttime"]).strftime("%Y%m%d_%H%M%S")
+ file_name = f"testgen_issue_report_{hi_id}_{profiling_time}.pdf"
+
+ with BytesIO() as buffer:
+ create_report(buffer, tr_data)
+ update_progress(1.0)
+ buffer.seek(0)
+ return file_name, "application/pdf", buffer.read()
diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py
index f8bc5ec..0f7a542 100644
--- a/testgen/ui/views/test_definitions.py
+++ b/testgen/ui/views/test_definitions.py
@@ -808,10 +808,11 @@ def show_test_defs_grid(
_, col_profile_button = right_column.columns([0.7, 0.3])
if selected_row["test_scope"] == "column":
- view_profiling_button(
- col_profile_button, selected_row["table_name"], selected_row["column_name"],
- str_table_groups_id=str_table_groups_id
- )
+ with col_profile_button:
+ view_profiling_button(
+ selected_row["table_name"], selected_row["column_name"],
+ str_table_groups_id=str_table_groups_id
+ )
with right_column:
st.write(generate_test_defs_help(row_selected["test_type"]))
diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py
index ed97aa9..9cc88eb 100644
--- a/testgen/ui/views/test_results.py
+++ b/testgen/ui/views/test_results.py
@@ -542,12 +542,21 @@ def show_result_detail(str_run_id, str_sel_test_status, test_type_id, sorting_co
v_col1, v_col2, v_col3, v_col4 = st.columns([.25, .25, .25, .25])
if authentication_service.current_user_has_edit_role():
view_edit_test(v_col1, selected_row["test_definition_id_current"])
+
if selected_row["test_scope"] == "column":
- view_profiling_button(
- v_col2, selected_row["table_name"], selected_row["column_names"],
- str_table_groups_id=selected_row["table_groups_id"]
- )
- view_bad_data(v_col3, selected_row)
+ with v_col2:
+ view_profiling_button(
+ selected_row["table_name"],
+ selected_row["column_names"],
+ str_table_groups_id=selected_row["table_groups_id"]
+ )
+
+ with v_col3:
+ if st.button(
+ "Source Data →", help="Review current source data for highlighted result",
+ use_container_width=True
+ ):
+ source_data_dialog(selected_row)
with v_col4:
@@ -694,14 +703,6 @@ def do_disposition_update(selected, str_new_status):
return str_result
-def view_bad_data(button_container, selected_row):
- with button_container:
- if st.button(
- "Source Data →", help="Review current source data for highlighted result", use_container_width=True
- ):
- source_data_dialog(selected_row)
-
-
@st.dialog(title="Source Data")
def source_data_dialog(selected_row):
st.markdown(f"#### {selected_row['test_name_short']}")
From a67078b7fd4e06b5edadee564f993901608f19a0 Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Mon, 4 Nov 2024 11:58:38 -0500
Subject: [PATCH 46/91] refactor(components): upgrade vanJS to latest version
---
.../frontend/js/components/breadcrumbs.js | 10 ++++--
.../frontend/js/components/button.js | 14 ++++----
.../components/frontend/js/components/link.js | 22 ++++++++----
.../frontend/js/components/select.js | 6 ++--
.../frontend/js/components/sidebar.js | 19 +++++++----
.../frontend/js/components/summary_bar.js | 34 ++++++++-----------
.../frontend/js/pages/profiling_runs.js | 20 +++++------
.../components/frontend/js/pages/test_runs.js | 16 ++++-----
testgen/ui/components/frontend/js/utils.js | 23 ++++++++-----
testgen/ui/components/frontend/js/van.min.js | 3 +-
testgen/ui/components/widgets/empty_state.py | 3 +-
11 files changed, 97 insertions(+), 73 deletions(-)
diff --git a/testgen/ui/components/frontend/js/components/breadcrumbs.js b/testgen/ui/components/frontend/js/components/breadcrumbs.js
index 949499c..52a18a9 100644
--- a/testgen/ui/components/frontend/js/components/breadcrumbs.js
+++ b/testgen/ui/components/frontend/js/components/breadcrumbs.js
@@ -11,7 +11,7 @@
*/
import van from '../van.min.js';
import { Streamlit } from '../streamlit.js';
-import { emitEvent, loadStylesheet } from '../utils.js';
+import { emitEvent, getValue, loadStylesheet } from '../utils.js';
const { a, div, span } = van.tags;
@@ -25,7 +25,7 @@ const Breadcrumbs = (/** @type Properties */ props) => {
return div(
{class: 'tg-breadcrumbs-wrapper'},
() => {
- const breadcrumbs = van.val(props.breadcrumbs);
+ const breadcrumbs = getValue(props.breadcrumbs) || [];
return div(
{ class: 'tg-breadcrumbs' },
@@ -33,7 +33,11 @@ const Breadcrumbs = (/** @type Properties */ props) => {
const isLastItem = idx === breadcrumbs.length - 1;
items.push(a({
class: `tg-breadcrumbs--${ isLastItem ? 'current' : 'active'}`,
- onclick: () => emitEvent('LinkClicked', { href: b.path, params: b.params }) },
+ onclick: (event) => {
+ event.preventDefault();
+ event.stopPropagation();
+ emitEvent('LinkClicked', { href: b.path, params: b.params });
+ }},
b.label,
));
if (!isLastItem) {
diff --git a/testgen/ui/components/frontend/js/components/button.js b/testgen/ui/components/frontend/js/components/button.js
index 893a1b1..ba2092a 100644
--- a/testgen/ui/components/frontend/js/components/button.js
+++ b/testgen/ui/components/frontend/js/components/button.js
@@ -11,7 +11,7 @@
* @property {(bool)} disabled
* @property {string?} style
*/
-import { emitEvent, enforceElementWidth, loadStylesheet } from '../utils.js';
+import { emitEvent, enforceElementWidth, getValue, loadStylesheet } from '../utils.js';
import van from '../van.min.js';
import { Streamlit } from '../streamlit.js';
@@ -31,7 +31,9 @@ const BUTTON_COLOR = {
const Button = (/** @type Properties */ props) => {
loadStylesheet('button', stylesheet);
- const isIconOnly = props.type === BUTTON_TYPE.ICON || (props.icon?.val && !props.label?.val);
+ const buttonType = getValue(props.type);
+ const width = getValue(props.width);
+ const isIconOnly = buttonType === BUTTON_TYPE.ICON || (getValue(props.icon) && !getValue(props.label));
if (!window.testgen.isPage) {
Streamlit.setFrameHeight(40);
@@ -39,8 +41,8 @@ const Button = (/** @type Properties */ props) => {
enforceElementWidth(window.frameElement, 40);
}
- if (props.width?.val) {
- enforceElementWidth(window.frameElement, props.width?.val);
+ if (width) {
+ enforceElementWidth(window.frameElement, width);
}
}
@@ -52,8 +54,8 @@ const Button = (/** @type Properties */ props) => {
const onClickHandler = props.onclick || (() => emitEvent('ButtonClicked'));
return button(
{
- class: `tg-button tg-${props.type.val}-button tg-${props.color?.val ?? 'basic'}-button ${props.type.val !== 'icon' && isIconOnly ? 'tg-icon-button' : ''}`,
- style: () => `width: ${props.width?.val ?? '100%'}; ${props.style?.val}`,
+ class: `tg-button tg-${buttonType}-button tg-${getValue(props.color) ?? 'basic'}-button ${buttonType !== 'icon' && isIconOnly ? 'tg-icon-button' : ''}`,
+ style: () => `width: ${isIconOnly ? '' : (width ?? '100%')}; ${getValue(props.style)}`,
onclick: onClickHandler,
disabled: props.disabled,
},
diff --git a/testgen/ui/components/frontend/js/components/link.js b/testgen/ui/components/frontend/js/components/link.js
index 8a0b09b..49c562a 100644
--- a/testgen/ui/components/frontend/js/components/link.js
+++ b/testgen/ui/components/frontend/js/components/link.js
@@ -13,7 +13,7 @@
* @property {number?} width
* @property {string?} style
*/
-import { emitEvent, enforceElementWidth, loadStylesheet } from '../utils.js';
+import { emitEvent, enforceElementWidth, getValue, loadStylesheet } from '../utils.js';
import van from '../van.min.js';
import { Streamlit } from '../streamlit.js';
@@ -23,17 +23,25 @@ const Link = (/** @type Properties */ props) => {
loadStylesheet('link', stylesheet);
if (!window.testgen.isPage) {
- Streamlit.setFrameHeight(props.height?.val || 24);
- if (props.width?.val) {
- enforceElementWidth(window.frameElement, props.width.val);
+ Streamlit.setFrameHeight(getValue(props.height) || 24);
+ const width = getValue(props.width);
+ if (width) {
+ enforceElementWidth(window.frameElement, width);
}
}
+ const href = getValue(props.href);
+ const params = getValue(props.params) || {};
+
return a(
{
- class: `tg-link ${props.underline?.val ? 'tg-link--underline' : ''}`,
+ class: `tg-link ${getValue(props.underline) ? 'tg-link--underline' : ''}`,
style: props.style,
- onclick: () => emitEvent('LinkClicked', { href: props.href.val, params: props.params.val }),
+ onclick: (event) => {
+ event.preventDefault();
+ event.stopPropagation();
+ emitEvent('LinkClicked', { href, params });
+ },
},
div(
{class: 'tg-link--wrapper'},
@@ -50,7 +58,7 @@ const LinkIcon = (
/** @type string */position,
) => {
return i(
- {class: `material-symbols-rounded tg-link--icon tg-link--icon-${position}`, style: `font-size: ${size?.val || 20}px;`},
+ {class: `material-symbols-rounded tg-link--icon tg-link--icon-${position}`, style: `font-size: ${getValue(size) || 20}px;`},
icon,
);
};
diff --git a/testgen/ui/components/frontend/js/components/select.js b/testgen/ui/components/frontend/js/components/select.js
index 6cd7c48..5f4f68c 100644
--- a/testgen/ui/components/frontend/js/components/select.js
+++ b/testgen/ui/components/frontend/js/components/select.js
@@ -13,7 +13,7 @@
*/
import van from '../van.min.js';
import { Streamlit } from '../streamlit.js';
-import { loadStylesheet } from '../utils.js';
+import { getValue, loadStylesheet } from '../utils.js';
const { div, label, option, select } = van.tags;
@@ -25,9 +25,9 @@ const Select = (/** @type {Properties} */ props) => {
const changeHandler = props.onChange || post;
return div(
{class: 'tg-select'},
- label({for: domId, class: 'tg-select--label'}, van.val(props.label)),
+ label({for: domId, class: 'tg-select--label'}, props.label),
() => {
- const options = van.val(props.options);
+ const options = getValue(props.options) || [];
return select(
{id: domId, class: 'tg-select--field', onchange: changeHandler},
options.map(op => option({class: 'tg-select--field--option', value: op.value, selected: op.selected}, op.label)),
diff --git a/testgen/ui/components/frontend/js/components/sidebar.js b/testgen/ui/components/frontend/js/components/sidebar.js
index 56c5650..5057a48 100644
--- a/testgen/ui/components/frontend/js/components/sidebar.js
+++ b/testgen/ui/components/frontend/js/components/sidebar.js
@@ -46,7 +46,7 @@ const Sidebar = (/** @type {Properties} */ props) => {
return div(
{class: 'menu'},
() => {
- const menuItems = van.val(props.menu).items;
+ const menuItems = props.menu?.val.items || [];
return div(
{class: 'content'},
menuItems.map(item =>
@@ -56,12 +56,12 @@ const Sidebar = (/** @type {Properties} */ props) => {
);
},
button(
- { class: `tg-button logout`, onclick: () => navigate(van.val(props.logout_path)) },
+ { class: `tg-button logout`, onclick: (event) => navigate(event, props.logout_path?.val) },
i({class: 'material-symbols-rounded'}, 'logout'),
span('Logout'),
),
span({class: 'menu--username'}, props.username),
- () => Version(van.val(props.menu).version),
+ () => Version(props.menu?.val.version),
);
};
@@ -78,14 +78,14 @@ const MenuSection = (/** @type {MenuItem} */ item, /** @type {string} */ current
const MenuItem = (/** @type {MenuItem} */ item, /** @type {string} */ currentPage) => {
const classes = van.derive(() => {
- if (isCurrentPage(item.page, van.val(currentPage))) {
+ if (isCurrentPage(item.page, currentPage?.val)) {
return 'menu--item active';
}
return 'menu--item';
});
return a(
- {class: classes, href: `/${item.page}`, onclick: () => navigate(item.page, van.val(currentPage))},
+ {class: classes, href: `/${item.page}`, onclick: (event) => navigate(event, item.page, currentPage?.val)},
i({class: 'menu--item--icon material-symbols-rounded'}, item.icon),
span({class: 'menu--item--label'}, item.label),
);
@@ -121,11 +121,16 @@ const VersionRow = (/** @type string */ label, /** @type string */ version, icon
);
};
-function navigate(/** @type string */ path, /** @type string */ currentPage = null) {
+function navigate(/** @type object */ event, /** @type string */ path, /** @type string */ currentPage = null) {
+ // Needed to prevent page refresh
+ // Returning false does not work because VanJS does not use inline handlers -> https://github.com/vanjs-org/van/discussions/246
+ event.preventDefault();
+ // Prevent Streamlit from reacting to event
+ event.stopPropagation();
+
if (Sidebar.StreamlitInstance && path !== currentPage) {
Sidebar.StreamlitInstance.sendData(path);
}
- return false;
}
function isCurrentPage(/** @type string */ itemPath, /** @type string */ currentPage) {
diff --git a/testgen/ui/components/frontend/js/components/summary_bar.js b/testgen/ui/components/frontend/js/components/summary_bar.js
index 152b589..000d2fb 100644
--- a/testgen/ui/components/frontend/js/components/summary_bar.js
+++ b/testgen/ui/components/frontend/js/components/summary_bar.js
@@ -13,7 +13,7 @@
* @property {number} width
*/
import van from '../van.min.js';
-import { loadStylesheet } from '../utils.js';
+import { getValue, loadStylesheet } from '../utils.js';
const { div, span } = van.tags;
const colorMap = {
@@ -26,32 +26,28 @@ const colorMap = {
brown: '#8D6E63',
grey: '#BDBDBD',
}
+const defaultHeight = 24;
const SummaryBar = (/** @type Properties */ props) => {
loadStylesheet('summaryBar', stylesheet);
-
- const height = props.height.val || 24;
- const width = props.width.val;
- const summaryItems = props.items.val;
- const label = props.label?.val;
- const total = summaryItems.reduce((sum, item) => sum + item.value, 0);
+ const total = van.derive(() => getValue(props.items).reduce((sum, item) => sum + item.value, 0));
return div(
- { class: 'tg-summary-bar-wrapper' },
- () => {
- return label ? div(
- { class: 'tg-summary-bar--label' },
- label,
- ) : null;
- },
- div(
+ { style: () => `max-width: ${props.width ? getValue(props.width) + 'px' : '100%'};` },
+ () => props.label ? div(
+ { class: 'tg-summary-bar--label' },
+ props.label,
+ ) : '',
+ () => div(
{
class: 'tg-summary-bar',
- style: `height: ${height}px; max-width: ${width ? width + 'px' : '100%'}`
+ style: () => `height: ${getValue(props.height) || defaultHeight}px;`
},
- summaryItems.map(item => span({
- class: `tg-summary-bar--item`,
- style: `width: ${item.value * 100 / total}%; background-color: ${colorMap[item.color] || item.color};`,
+ getValue(props.items).map(item => span({
+ class: 'tg-summary-bar--item',
+ style: () => `width: ${item.value * 100 / total.val}%;
+ ${item.value ? 'min-width: 1px;' : ''}
+ background-color: ${colorMap[item.color] || item.color};`,
})),
),
() => {
diff --git a/testgen/ui/components/frontend/js/pages/profiling_runs.js b/testgen/ui/components/frontend/js/pages/profiling_runs.js
index 531768d..9afe07c 100644
--- a/testgen/ui/components/frontend/js/pages/profiling_runs.js
+++ b/testgen/ui/components/frontend/js/pages/profiling_runs.js
@@ -9,7 +9,7 @@ import { SummaryBar } from '../components/summary_bar.js';
import { Link } from '../components/link.js';
import { Button } from '../components/button.js';
import { Streamlit } from '../streamlit.js';
-import { emitEvent, resizeFrameHeightToElement, wrapProps } from '../utils.js';
+import { emitEvent, resizeFrameHeightToElement } from '../utils.js';
import { formatTimestamp, formatDuration } from '../display_utils.js';
const { div, span, i } = van.tags;
@@ -77,12 +77,12 @@ const ProfilingRunItem = (item, /** @type string[] */ columns) => {
formatDuration(item.duration),
),
),
- item.status === 'Running' && item.process_id ? Button(wrapProps({
+ item.status === 'Running' && item.process_id ? Button({
type: 'stroked',
label: 'Cancel Run',
style: 'width: auto; height: 32px; color: var(--purple); margin-left: 16px;',
onclick: () => emitEvent('RunCanceled', { payload: item }),
- })) : null,
+ }) : null,
),
div(
{ style: `flex: ${columns[2]}` },
@@ -94,17 +94,17 @@ const ProfilingRunItem = (item, /** @type string[] */ columns) => {
},
`${item.table_ct || 0} tables, ${item.column_ct || 0} columns`,
),
- item.column_ct ? Link(wrapProps({
+ item.column_ct ? Link({
label: 'View results',
href: 'profiling-runs:results',
params: { 'run_id': item.profiling_run_id },
underline: true,
right_icon: 'chevron_right',
- })) : null,
+ }) : null,
),
div(
{ style: `flex: ${columns[3]}` },
- item.anomaly_ct ? SummaryBar(wrapProps({
+ item.anomaly_ct ? SummaryBar({
items: [
{ label: 'Definite', value: item.anomalies_definite_ct, color: 'red' },
{ label: 'Likely', value: item.anomalies_likely_ct, color: 'orange' },
@@ -112,16 +112,16 @@ const ProfilingRunItem = (item, /** @type string[] */ columns) => {
{ label: 'Dismissed', value: item.anomalies_dismissed_ct, color: 'grey' },
],
height: 10,
- width: 300,
- })) : '--',
- item.anomaly_ct ? Link(wrapProps({
+ width: 350,
+ }) : '--',
+ item.anomaly_ct ? Link({
label: `View ${item.anomaly_ct} issues`,
href: 'profiling-runs:hygiene',
params: { 'run_id': item.profiling_run_id },
underline: true,
right_icon: 'chevron_right',
style: 'margin-top: 8px;',
- })) : null,
+ }) : null,
),
);
}
diff --git a/testgen/ui/components/frontend/js/pages/test_runs.js b/testgen/ui/components/frontend/js/pages/test_runs.js
index 596e8a7..c5084b4 100644
--- a/testgen/ui/components/frontend/js/pages/test_runs.js
+++ b/testgen/ui/components/frontend/js/pages/test_runs.js
@@ -9,7 +9,7 @@ import { SummaryBar } from '../components/summary_bar.js';
import { Link } from '../components/link.js';
import { Button } from '../components/button.js';
import { Streamlit } from '../streamlit.js';
-import { emitEvent, resizeFrameHeightToElement, wrapProps } from '../utils.js';
+import { emitEvent, resizeFrameHeightToElement } from '../utils.js';
import { formatTimestamp, formatDuration } from '../display_utils.js';
const { div, span, i } = van.tags;
@@ -58,12 +58,12 @@ const TestRunItem = (item, /** @type string[] */ columns) => {
{ class: 'table-row flex-row' },
div(
{ style: `flex: ${columns[0]}` },
- Link(wrapProps({
+ Link({
label: formatTimestamp(item.test_starttime),
href: 'test-runs:results',
params: { 'run_id': item.test_run_id },
underline: true,
- })),
+ }),
div(
{ class: 'text-caption mt-1' },
`${item.table_groups_name} > ${item.test_suite}`,
@@ -78,16 +78,16 @@ const TestRunItem = (item, /** @type string[] */ columns) => {
formatDuration(item.duration),
),
),
- item.status === 'Running' && item.process_id ? Button(wrapProps({
+ item.status === 'Running' && item.process_id ? Button({
type: 'stroked',
label: 'Cancel Run',
style: 'width: auto; height: 32px; color: var(--purple); margin-left: 16px;',
onclick: () => emitEvent('RunCanceled', { payload: item }),
- })) : null,
+ }) : null,
),
div(
{ style: `flex: ${columns[2]}` },
- item.test_ct ? SummaryBar(wrapProps({
+ item.test_ct ? SummaryBar({
items: [
{ label: 'Passed', value: item.passed_ct, color: 'green' },
{ label: 'Warning', value: item.warning_ct, color: 'yellow' },
@@ -96,8 +96,8 @@ const TestRunItem = (item, /** @type string[] */ columns) => {
{ label: 'Dismissed', value: item.dismissed_ct, color: 'grey' },
],
height: 10,
- width: 300,
- })) : '--',
+ width: 400,
+ }) : '--',
),
);
}
diff --git a/testgen/ui/components/frontend/js/utils.js b/testgen/ui/components/frontend/js/utils.js
index 9b3bcb9..d8d712c 100644
--- a/testgen/ui/components/frontend/js/utils.js
+++ b/testgen/ui/components/frontend/js/utils.js
@@ -32,13 +32,6 @@ function loadStylesheet(
}
}
-function wrapProps(/** @type object */props) {
- for (const [key, value] of Object.entries(props)) {
- props[key] = van.state(value);
- }
- return props;
-}
-
function emitEvent(
/** @type string */event,
/** @type object */data = {},
@@ -46,4 +39,18 @@ function emitEvent(
Streamlit.sendData({ event, ...data, _id: Math.random() }) // Identify the event so its handler is called once
}
-export { emitEvent, enforceElementWidth, loadStylesheet, resizeFrameHeightToElement, wrapProps };
+// Replacement for van.val()
+// https://github.com/vanjs-org/van/discussions/280
+const stateProto = Object.getPrototypeOf(van.state());
+function getValue(/** @type object */ prop) { // van state or static value
+ const proto = Object.getPrototypeOf(prop ?? 0);
+ if (proto === stateProto) {
+ return prop.val;
+ }
+ if (proto === Function.prototype) {
+ return prop();
+ }
+ return prop;
+}
+
+export { emitEvent, enforceElementWidth, getValue, loadStylesheet, resizeFrameHeightToElement };
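A small sketch of the three prop shapes getValue is meant to normalize (plain value, van state, or derivation function); the sample values are made up:

    import van from './van.min.js';
    import { getValue } from './utils.js';

    const plainLabel = 'Tables';                   // static value: returned as-is
    const stateLabel = van.state('Tables');        // van state: .val is read
    const derivedLabel = () => stateLabel.val;     // function: called and its result returned

    [plainLabel, stateLabel, derivedLabel].map(getValue);    // ['Tables', 'Tables', 'Tables']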
diff --git a/testgen/ui/components/frontend/js/van.min.js b/testgen/ui/components/frontend/js/van.min.js
index a78d3da..7e23e03 100644
--- a/testgen/ui/components/frontend/js/van.min.js
+++ b/testgen/ui/components/frontend/js/van.min.js
@@ -1 +1,2 @@
-let e,t,l,r,o,f=Object,n=f.getPrototypeOf,s=document,a={isConnected:1},i={},d=n(a),u=n(n),_=(e,t,l,r)=>(e??(setTimeout(l,r),new Set)).add(t),h=(e,t,r)=>{let o=l;l=t;try{return e(r)}catch(e){return console.error(e),r}finally{l=o}},c=e=>e.filter(e=>e.t?.isConnected),g=t=>o=_(o,t,()=>{for(let e of o)e.l=c(e.l),e.o=c(e.o);o=e},1e3),w={get val(){return l?.add(this),this.i},get oldVal(){return l?.add(this),this.u},set val(l){let r=this;if(l!==r.i){r.i=l;let o=[...r.o=c(r.o)];for(let t of o)x(t.f,t.s,t.t),t.t=e;r.l.length?t=_(t,r,p):r.u=l}}},v=e=>({__proto__:w,i:e,u:e,l:[],o:[]}),S=e=>n(e??0)===w,y=(e,t)=>{let l=new Set,o={f:e},f=r;r=[];let n=h(e,l,t);n=(n??s).nodeType?n:new Text(n);for(let e of l)g(e),e.l.push(o);for(let e of r)e.t=n;return r=f,o.t=n},x=(e,t=v(),l)=>{let o=new Set,f={f:e,s:t};f.t=l??r?.push(f)??a,t.val=h(e,o);for(let e of o)g(e),e.o.push(f);return t},V=(t,...l)=>{for(let r of l.flat(1/0)){let l=n(r??0),o=l===w?y(()=>r.val):l===u?y(r):r;o!=e&&t.append(o)}return t},b=t=>new Proxy((l,...r)=>{let[o,...a]=n(r[0]??0)===d?r:[{},...r],_=t?s.createElementNS(t,l):s.createElement(l);for(let[t,r]of f.entries(o)){let o=l=>l?f.getOwnPropertyDescriptor(l,t)??o(n(l)):e,s=l+","+t,a=i[s]??(i[s]=o(n(_))?.set??0),d=a?a.bind(_):_.setAttribute.bind(_,t),h=n(r??0);h===w?y(()=>(d(r.val),_)):h!==u||t.startsWith("on")&&!r.h?d(r):y(()=>(d(r()),_))}return V(_,...a)},{get:(t,l)=>t.bind(e,l)}),m=(e,t)=>t?t!==e&&e.replaceWith(t):e.remove(),p=()=>{let l=[...t].filter(e=>e.i!==e.u);t=e;for(let t of new Set(l.flatMap(e=>e.l=c(e.l))))m(t.t,y(t.f,t.t)),t.t=e;for(let e of l)e.u=e.i};export default{add:V,_:e=>(e.h=1,e),tags:b(),tagsNS:b,state:v,val:e=>S(e)?e.val:e,oldVal:e=>S(e)?e.oldVal:e,derive:x,hydrate:(e,t)=>m(e,y(t,e))};
\ No newline at end of file
+// https://vanjs.org/code/van-1.5.2.min.js
+let e,t,r,o,l,n,s=Object.getPrototypeOf,f={isConnected:1},i={},h=s(f),a=s(s),d=(e,t,r,o)=>(e??(setTimeout(r,o),new Set)).add(t),u=(e,t,o)=>{let l=r;r=t;try{return e(o)}catch(e){return console.error(e),o}finally{r=l}},w=e=>e.filter(e=>e.t?.isConnected),_=e=>l=d(l,e,()=>{for(let e of l)e.o=w(e.o),e.l=w(e.l);l=n},1e3),c={get val(){return r?.i?.add(this),this.rawVal},get oldVal(){return r?.i?.add(this),this.h},set val(o){r?.u?.add(this),o!==this.rawVal&&(this.rawVal=o,this.o.length+this.l.length?(t?.add(this),e=d(e,this,v)):this.h=o)}},S=e=>({__proto__:c,rawVal:e,h:e,o:[],l:[]}),g=(e,t)=>{let r={i:new Set,u:new Set},l={f:e},n=o;o=[];let s=u(e,r,t);s=(s??document).nodeType?s:new Text(s);for(let e of r.i)r.u.has(e)||(_(e),e.o.push(l));for(let e of o)e.t=s;return o=n,l.t=s},y=(e,t=S(),r)=>{let l={i:new Set,u:new Set},n={f:e,s:t};n.t=r??o?.push(n)??f,t.val=u(e,l,t.rawVal);for(let e of l.i)l.u.has(e)||(_(e),e.l.push(n));return t},b=(e,...t)=>{for(let r of t.flat(1/0)){let t=s(r??0),o=t===c?g(()=>r.val):t===a?g(r):r;o!=n&&e.append(o)}return e},m=(e,t,...r)=>{let[o,...l]=s(r[0]??0)===h?r:[{},...r],f=e?document.createElementNS(e,t):document.createElement(t);for(let[e,r]of Object.entries(o)){let o=t=>t?Object.getOwnPropertyDescriptor(t,e)??o(s(t)):n,l=t+","+e,h=i[l]??=o(s(f))?.set??0,d=e.startsWith("on")?(t,r)=>{let o=e.slice(2);f.removeEventListener(o,r),f.addEventListener(o,t)}:h?h.bind(f):f.setAttribute.bind(f,e),u=s(r??0);e.startsWith("on")||u===a&&(r=y(r),u=c),u===c?g(()=>(d(r.val,r.h),f)):d(r)}return b(f,l)},x=e=>({get:(t,r)=>m.bind(n,e,r)}),j=(e,t)=>t?t!==e&&e.replaceWith(t):e.remove(),v=()=>{let r=0,o=[...e].filter(e=>e.rawVal!==e.h);do{t=new Set;for(let e of new Set(o.flatMap(e=>e.l=w(e.l))))y(e.f,e.s,e.t),e.t=n}while(++r<100&&(o=[...t]).length);let l=[...e].filter(e=>e.rawVal!==e.h);e=n;for(let e of new Set(l.flatMap(e=>e.o=w(e.o))))j(e.t,g(e.f,e.t)),e.t=n;for(let e of l)e.h=e.rawVal};export default{tags:new Proxy(e=>new Proxy(m,x(e)),x()),hydrate:(e,t)=>j(e,g(t,e)),add:b,state:S,derive:y};
\ No newline at end of file
diff --git a/testgen/ui/components/widgets/empty_state.py b/testgen/ui/components/widgets/empty_state.py
index 8b34df0..505d560 100644
--- a/testgen/ui/components/widgets/empty_state.py
+++ b/testgen/ui/components/widgets/empty_state.py
@@ -66,9 +66,10 @@ def empty_state(
elif button_onclick:
button(
type_="flat",
+ color="primary",
label=action_label,
icon=button_icon,
on_click=button_onclick,
- style="margin: auto; width: auto; background-color: var(--primary-color);",
+ style="margin: auto; width: auto;",
)
whitespace(5)
From 41b25c1a6e4af288ce5c84606ea37c70148a81ba Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Mon, 4 Nov 2024 12:49:14 -0500
Subject: [PATCH 47/91] refactor(tooltip): support tooltip positions - add to
button and tooltip-icon components
---
testgen/ui/components/frontend/css/shared.css | 6 +
.../frontend/js/components/button.js | 29 +++-
.../frontend/js/components/tooltip.js | 157 ++++++++++++++++++
.../frontend/js/components/tooltip_icon.js | 45 +++++
.../frontend/js/pages/profiling_runs.js | 2 +-
.../components/frontend/js/pages/test_runs.js | 2 +-
.../ui/components/frontend/js/van-tooltip.js | 52 ------
7 files changed, 231 insertions(+), 62 deletions(-)
create mode 100644 testgen/ui/components/frontend/js/components/tooltip.js
create mode 100644 testgen/ui/components/frontend/js/components/tooltip_icon.js
delete mode 100644 testgen/ui/components/frontend/js/van-tooltip.js
diff --git a/testgen/ui/components/frontend/css/shared.css b/testgen/ui/components/frontend/css/shared.css
index bcbe89c..3c3f07f 100644
--- a/testgen/ui/components/frontend/css/shared.css
+++ b/testgen/ui/components/frontend/css/shared.css
@@ -26,6 +26,7 @@ body {
--disabled-text-color: #00000042;
--caption-text-color: rgba(49, 51, 63, 0.6); /* Match Streamlit's caption color */
--border-color: rgba(0, 0, 0, .12);
+ --tooltip-color: #333d;
--dk-card-background: #fff;
--sidebar-background-color: white;
@@ -88,6 +89,11 @@ body {
}
}
+.hidden {
+ display: none !important;
+}
+
+
/* Table styles */
.table {
background-color: var(--dk-card-background);
diff --git a/testgen/ui/components/frontend/js/components/button.js b/testgen/ui/components/frontend/js/components/button.js
index ba2092a..858a588 100644
--- a/testgen/ui/components/frontend/js/components/button.js
+++ b/testgen/ui/components/frontend/js/components/button.js
@@ -14,6 +14,7 @@
import { emitEvent, enforceElementWidth, getValue, loadStylesheet } from '../utils.js';
import van from '../van.min.js';
import { Streamlit } from '../streamlit.js';
+import { Tooltip } from './tooltip.js';
const { button, i, span } = van.tags;
const BUTTON_TYPE = {
@@ -44,21 +45,29 @@ const Button = (/** @type Properties */ props) => {
if (width) {
enforceElementWidth(window.frameElement, width);
}
- }
-
- if (props.tooltip) {
- window.frameElement.parentElement.setAttribute('data-tooltip', props.tooltip.val);
- window.frameElement.parentElement.setAttribute('data-tooltip-position', props.tooltipPosition.val);
+ if (props.tooltip) {
+ window.frameElement.parentElement.setAttribute('data-tooltip', props.tooltip.val);
+ window.frameElement.parentElement.setAttribute('data-tooltip-position', props.tooltipPosition.val);
+ }
}
const onClickHandler = props.onclick || (() => emitEvent('ButtonClicked'));
+ const showTooltip = van.state(false);
+
return button(
{
class: `tg-button tg-${buttonType}-button tg-${getValue(props.color) ?? 'basic'}-button ${buttonType !== 'icon' && isIconOnly ? 'tg-icon-button' : ''}`,
style: () => `width: ${isIconOnly ? '' : (width ?? '100%')}; ${getValue(props.style)}`,
onclick: onClickHandler,
disabled: props.disabled,
+ onmouseenter: props.tooltip ? (() => showTooltip.val = true) : undefined,
+ onmouseleave: props.tooltip ? (() => showTooltip.val = false) : undefined,
},
+ props.tooltip ? Tooltip({
+ text: props.tooltip,
+ show: showTooltip,
+ position: props.tooltipPosition,
+ }) : undefined,
span({class: 'tg-button-focus-state-indicator'}, ''),
props.icon ? i({class: 'material-symbols-rounded'}, props.icon) : undefined,
!isIconOnly ? span(props.label) : undefined,
@@ -71,7 +80,6 @@ button.tg-button {
height: 40px;
position: relative;
- overflow: hidden;
display: flex;
flex-direction: row;
@@ -88,6 +96,11 @@ button.tg-button {
font-size: 14px;
}
+button.tg-button .tg-button-focus-state-indicator {
+ border-radius: inherit;
+ overflow: hidden;
+}
+
button.tg-button .tg-button-focus-state-indicator::before {
content: "";
opacity: 0;
@@ -113,7 +126,7 @@ button.tg-button:has(span) {
}
button.tg-button:not(.tg-icon-button):has(span):has(i) {
- padding-left: 8px;
+ padding-left: 12px;
}
button.tg-button[disabled] {
@@ -121,7 +134,7 @@ button.tg-button[disabled] {
cursor: not-allowed;
}
-button.tg-button.tg-icon-button > i {
+button.tg-button > i {
font-size: 18px;
}
diff --git a/testgen/ui/components/frontend/js/components/tooltip.js b/testgen/ui/components/frontend/js/components/tooltip.js
new file mode 100644
index 0000000..843e175
--- /dev/null
+++ b/testgen/ui/components/frontend/js/components/tooltip.js
@@ -0,0 +1,157 @@
+// Code modified from vanjs-ui
+// https://www.npmjs.com/package/vanjs-ui
+// https://cdn.jsdelivr.net/npm/vanjs-ui@0.10.0/dist/van-ui.nomodule.js
+
+/**
+ * @typedef Properties
+ * @type {object}
+ * @property {string} text
+ * @property {boolean} show
+ * @property {('top-left' | 'top' | 'top-right' | 'right' | 'bottom-right' | 'bottom' | 'bottom-left' | 'left')?} position
+ */
+import van from '../van.min.js';
+import { getValue, loadStylesheet } from '../utils.js';
+
+const { div, span } = van.tags;
+const defaultPosition = 'top';
+
+const Tooltip = (/** @type Properties */ props) => {
+ loadStylesheet('tooltip', stylesheet);
+
+ return span(
+ {
+ class: () => `tg-tooltip ${getValue(props.position) || defaultPosition} ${getValue(props.show) ? '' : 'hidden'}`,
+ style: () => `opacity: ${getValue(props.show) ? 1 : 0};`,
+ },
+ props.text,
+ div({ class: 'tg-tooltip--triangle' }),
+ );
+};
+
+const stylesheet = new CSSStyleSheet();
+stylesheet.replace(`
+.tg-tooltip {
+ width: max-content;
+ max-width: 400px;
+ position: absolute;
+ z-index: 1;
+ border-radius: 4px;
+ background-color: var(--tooltip-color);
+ padding: 4px 8px;
+ color: white;
+ font-size: 13px;
+ font-family: 'Roboto', 'Helvetica Neue', sans-serif;
+ text-align: center;
+ text-wrap: wrap;
+ transition: opacity 0.3s;
+}
+
+.tg-tooltip--triangle {
+ width: 0;
+ height: 0;
+ position: absolute;
+ border: solid transparent;
+}
+
+.tg-tooltip.top-left {
+ right: 50%;
+ bottom: 125%;
+ transform: translateX(20px);
+}
+.top-left .tg-tooltip--triangle {
+ bottom: -5px;
+ right: 20px;
+ margin-right: -5px;
+ border-width: 5px 5px 0;
+ border-top-color: var(--tooltip-color);
+}
+
+.tg-tooltip.top {
+ left: 50%;
+ bottom: 125%;
+ transform: translateX(-50%);
+}
+.top .tg-tooltip--triangle {
+ bottom: -5px;
+ left: 50%;
+ margin-left: -5px;
+ border-width: 5px 5px 0;
+ border-top-color: var(--tooltip-color);
+}
+
+.tg-tooltip.top-right {
+ left: 50%;
+ bottom: 125%;
+ transform: translateX(-20px);
+}
+.top-right .tg-tooltip--triangle {
+ bottom: -5px;
+ left: 20px;
+ margin-left: -5px;
+ border-width: 5px 5px 0;
+ border-top-color: var(--tooltip-color);
+}
+
+.tg-tooltip.right {
+ left: 125%;
+}
+.right .tg-tooltip--triangle {
+ top: 50%;
+ left: -5px;
+ margin-top: -5px;
+ border-width: 5px 5px 5px 0;
+ border-right-color: var(--tooltip-color);
+}
+
+.tg-tooltip.bottom-right {
+ left: 50%;
+ top: 125%;
+ transform: translateX(-20px);
+}
+.bottom-right .tg-tooltip--triangle {
+ top: -5px;
+ left: 20px;
+ margin-left: -5px;
+ border-width: 0 5px 5px;
+ border-bottom-color: var(--tooltip-color);
+}
+
+.tg-tooltip.bottom {
+ top: 125%;
+ left: 50%;
+ transform: translateX(-50%);
+}
+.bottom .tg-tooltip--triangle {
+ top: -5px;
+ left: 50%;
+ margin-left: -5px;
+ border-width: 0 5px 5px;
+ border-bottom-color: var(--tooltip-color);
+}
+
+.tg-tooltip.bottom-left {
+ right: 50%;
+ top: 125%;
+ transform: translateX(20px);
+}
+.bottom-left .tg-tooltip--triangle {
+ top: -5px;
+ right: 20px;
+ margin-right: -5px;
+ border-width: 0 5px 5px;
+ border-bottom-color: var(--tooltip-color);
+}
+
+.tg-tooltip.left {
+ right: 125%;
+}
+.left .tg-tooltip--triangle {
+ top: 50%;
+ right: -5px;
+ margin-top: -5px;
+ border-width: 5px 0 5px 5px;
+ border-left-color: var(--tooltip-color);
+}
+`);
+
+export { Tooltip };
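A minimal usage sketch for the new Tooltip, assuming the host element is positioned (position: relative) so the absolutely positioned tooltip anchors to it, as tooltip_icon.js does below; the button and text are made up:

    import van from '../van.min.js';
    import { Tooltip } from './tooltip.js';

    const { button } = van.tags;
    const show = van.state(false);

    const cancelButton = button(
        {
            style: 'position: relative;',                  // anchor for the absolutely positioned tooltip
            onmouseenter: () => show.val = true,
            onmouseleave: () => show.val = false,
        },
        Tooltip({ text: 'Cancel the running profiling job', show, position: 'bottom' }),
        'Cancel Run',
    );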
diff --git a/testgen/ui/components/frontend/js/components/tooltip_icon.js b/testgen/ui/components/frontend/js/components/tooltip_icon.js
new file mode 100644
index 0000000..7d3d5d3
--- /dev/null
+++ b/testgen/ui/components/frontend/js/components/tooltip_icon.js
@@ -0,0 +1,45 @@
+/**
+ * @typedef Properties
+ * @type {object}
+ * @property {string} icon
+ * @property {number?} iconSize
+ * @property {string} tooltip
+ * @property {('top-left' | 'top' | 'top-right' | 'right' | 'bottom-right' | 'bottom' | 'bottom-left' | 'left')?} tooltipPosition
+ * @property {string} classes
+ */
+import { getValue, loadStylesheet } from '../utils.js';
+import van from '../van.min.js';
+import { Tooltip } from './tooltip.js';
+
+const { i } = van.tags;
+const defaultIconSize = 20;
+
+const TooltipIcon = (/** @type Properties */ props) => {
+ loadStylesheet('tooltipIcon', stylesheet);
+ const showTooltip = van.state(false);
+
+ return i(
+ {
+ class: () => `material-symbols-rounded tg-tooltip-icon text-secondary ${getValue(props.classes)}`,
+ style: () => `font-size: ${getValue(props.iconSize) || defaultIconSize}px;`,
+ onmouseenter: () => showTooltip.val = true,
+ onmouseleave: () => showTooltip.val = false,
+ },
+ props.icon,
+ Tooltip({
+ text: props.tooltip,
+ show: showTooltip,
+ position: props.tooltipPosition,
+ }),
+ );
+};
+
+const stylesheet = new CSSStyleSheet();
+stylesheet.replace(`
+.tg-tooltip-icon {
+ position: relative;
+ cursor: default;
+}
+`);
+
+export { TooltipIcon };
diff --git a/testgen/ui/components/frontend/js/pages/profiling_runs.js b/testgen/ui/components/frontend/js/pages/profiling_runs.js
index 9afe07c..c434f37 100644
--- a/testgen/ui/components/frontend/js/pages/profiling_runs.js
+++ b/testgen/ui/components/frontend/js/pages/profiling_runs.js
@@ -4,7 +4,7 @@
* @property {array} items
*/
import van from '../van.min.js';
-import { Tooltip } from '../van-tooltip.js';
+import { Tooltip } from '../components/tooltip.js';
import { SummaryBar } from '../components/summary_bar.js';
import { Link } from '../components/link.js';
import { Button } from '../components/button.js';
diff --git a/testgen/ui/components/frontend/js/pages/test_runs.js b/testgen/ui/components/frontend/js/pages/test_runs.js
index c5084b4..d100f91 100644
--- a/testgen/ui/components/frontend/js/pages/test_runs.js
+++ b/testgen/ui/components/frontend/js/pages/test_runs.js
@@ -4,7 +4,7 @@
* @property {array} items
*/
import van from '../van.min.js';
-import { Tooltip } from '../van-tooltip.js';
+import { Tooltip } from '../components/tooltip.js';
import { SummaryBar } from '../components/summary_bar.js';
import { Link } from '../components/link.js';
import { Button } from '../components/button.js';
diff --git a/testgen/ui/components/frontend/js/van-tooltip.js b/testgen/ui/components/frontend/js/van-tooltip.js
deleted file mode 100644
index 565715b..0000000
--- a/testgen/ui/components/frontend/js/van-tooltip.js
+++ /dev/null
@@ -1,52 +0,0 @@
-// Code modified from vanjs-ui
-// https://www.npmjs.com/package/vanjs-ui
-// https://cdn.jsdelivr.net/npm/vanjs-ui@0.10.0/dist/van-ui.nomodule.js
-
-import van from './van.min.js';
-const { div, span } = van.tags;
-
-const toStyleStr = (style) => Object.entries(style).map(([k, v]) => `${k}: ${v};`).join("");
-
-const Tooltip = ({ text, show, backgroundColor = '#333D', fontColor = 'white', fadeInSec = 0.3, tooltipClass = '', tooltipStyleOverrides = {}, triangleClass = '', triangleStyleOverrides = {}, }) => {
- const tooltipStylesStr = toStyleStr({
- width: 'max-content',
- 'min-width': '100px',
- 'max-width': '400px',
- visibility: 'hidden',
- 'background-color': backgroundColor,
- color: fontColor,
- 'text-align': 'center',
- padding: '5px',
- 'border-radius': '5px',
- position: 'absolute',
- 'z-index': 1,
- bottom: '125%',
- left: '50%',
- transform: 'translateX(-50%)',
- opacity: 0,
- transition: `opacity ${fadeInSec}s`,
- 'font-size': '14px',
- 'font-family': `'Roboto', 'Helvetica Neue', sans-serif`,
- 'text-wrap': 'wrap',
- ...tooltipStyleOverrides,
- });
- const triangleStylesStr = toStyleStr({
- width: 0,
- height: 0,
- 'margin-left': '-5px',
- 'border-left': '5px solid transparent',
- 'border-right': '5px solid transparent',
- 'border-top': '5px solid #333',
- position: 'absolute',
- bottom: '-5px',
- left: '50%',
- ...triangleStyleOverrides,
- });
- const dom = span({ class: tooltipClass, style: tooltipStylesStr }, text, div({ class: triangleClass, style: triangleStylesStr }));
- van.derive(() => show.val ?
- (dom.style.opacity = '1', dom.style.visibility = 'visible') :
- (dom.style.opacity = '0', dom.style.visibility = 'hidden'));
- return dom;
-};
-
-export { Tooltip };
From eee533cc632bf3c844328f6ae3c881d55684a575 Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Mon, 4 Nov 2024 12:51:09 -0500
Subject: [PATCH 48/91] feat(link): support opening links in new tabs
---
.../ui/components/frontend/js/components/link.js | 16 +++++++++++++++-
testgen/ui/components/widgets/link.py | 2 ++
2 files changed, 17 insertions(+), 1 deletion(-)
diff --git a/testgen/ui/components/frontend/js/components/link.js b/testgen/ui/components/frontend/js/components/link.js
index 49c562a..b070b6f 100644
--- a/testgen/ui/components/frontend/js/components/link.js
+++ b/testgen/ui/components/frontend/js/components/link.js
@@ -4,6 +4,7 @@
* @property {string} href
* @property {object} params
* @property {string} label
+ * @property {boolean} open_new
* @property {boolean} underline
* @property {string?} left_icon
* @property {number?} left_icon_size
@@ -32,12 +33,15 @@ const Link = (/** @type Properties */ props) => {
const href = getValue(props.href);
const params = getValue(props.params) || {};
+ const open_new = !!getValue(props.open_new);
return a(
{
class: `tg-link ${getValue(props.underline) ? 'tg-link--underline' : ''}`,
style: props.style,
- onclick: (event) => {
+ href: `/${href}${getQueryFromParams(params)}`,
+ target: open_new ? '_blank' : '',
+ onclick: open_new ? null : (event) => {
event.preventDefault();
event.stopPropagation();
emitEvent('LinkClicked', { href, params });
@@ -63,6 +67,16 @@ const LinkIcon = (
);
};
+function getQueryFromParams(/** @type object */ params) {
+ const query = Object.entries(params).reduce((query, [ key, value ]) => {
+ if (key && value) {
+ return `${query}${query ? '&' : ''}${key}=${value}`;
+ }
+ return query;
+ }, '');
+ return query ? `?${query}` : '';
+}
+
const stylesheet = new CSSStyleSheet();
stylesheet.replace(`
.tg-link {
diff --git a/testgen/ui/components/widgets/link.py b/testgen/ui/components/widgets/link.py
index 7230edb..4e2bf28 100644
--- a/testgen/ui/components/widgets/link.py
+++ b/testgen/ui/components/widgets/link.py
@@ -7,6 +7,7 @@ def link(
label: str,
*,
params: dict = {}, # noqa: B006
+ open_new: bool = False,
underline: bool = True,
left_icon: str | None = None,
left_icon_size: float = 20.0,
@@ -22,6 +23,7 @@ def link(
"params": params,
"label": label,
"height": height,
+ "open_new": open_new,
"underline": underline,
}
if left_icon:
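A sketch of how the two Link modes are expected to render, with a made-up page name and params:

    import { Link } from './components/link.js';

    // open_new renders a real anchor that the browser opens in a new tab;
    // otherwise the click is intercepted and emitted to Streamlit as LinkClicked.
    Link({ label: 'View results', href: 'profiling-runs:results', params: { run_id: '42' }, open_new: true });
    // -> <a href="/profiling-runs:results?run_id=42" target="_blank" ...>View results</a>

    Link({ label: 'View results', href: 'profiling-runs:results', params: { run_id: '42' } });
    // -> same href, empty target, onclick calls preventDefault() and emits the event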
From ab08ba23012c105e9a8169906ceef0b2e4405b6f Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Mon, 4 Nov 2024 12:54:51 -0500
Subject: [PATCH 49/91] misc(summary): add legend colors to summary bar
component
---
testgen/ui/assets/style.css | 26 +++++++++++++
testgen/ui/components/frontend/css/shared.css | 8 ++++
.../frontend/js/components/summary_bar.js | 38 +++++++++++++------
testgen/ui/components/widgets/summary_bar.py | 2 +-
4 files changed, 62 insertions(+), 12 deletions(-)
diff --git a/testgen/ui/assets/style.css b/testgen/ui/assets/style.css
index 184a8c6..67266d7 100644
--- a/testgen/ui/assets/style.css
+++ b/testgen/ui/assets/style.css
@@ -262,8 +262,34 @@ Use as testgen.text("text", "extra_styles") */
.tg-summary-bar--caption {
margin-top: 4px;
+ display: flex;
+ flex-flow: row wrap;
+ align-items: center;
color: var(--caption-text-color);
+ font-size: 13px;
font-style: italic;
+ line-height: 1;
+}
+
+.tg-summary-bar--legend {
+ display: flex;
+ flex-flow: row nowrap;
+ align-items: center;
+ width: auto;
+}
+
+.tg-summary-bar--legend:not(:last-child) {
+ margin-right: 8px;
+}
+
+.tg-summary-bar--legend-dot {
+ margin-right: 2px;
+ font-size: 4px;
+ font-style: normal;
+}
+
+.tg-summary-bar--legend-dot::before {
+ content: '⬤';
}
/* */
diff --git a/testgen/ui/components/frontend/css/shared.css b/testgen/ui/components/frontend/css/shared.css
index 3c3f07f..7adb2bf 100644
--- a/testgen/ui/components/frontend/css/shared.css
+++ b/testgen/ui/components/frontend/css/shared.css
@@ -93,6 +93,14 @@ body {
display: none !important;
}
+.dot {
+ font-size: 10px;
+ font-style: normal;
+}
+
+.dot::before {
+ content: '⬤';
+}
/* Table styles */
.table {
diff --git a/testgen/ui/components/frontend/js/components/summary_bar.js b/testgen/ui/components/frontend/js/components/summary_bar.js
index 000d2fb..b73ea5c 100644
--- a/testgen/ui/components/frontend/js/components/summary_bar.js
+++ b/testgen/ui/components/frontend/js/components/summary_bar.js
@@ -8,9 +8,9 @@
* @typedef Properties
* @type {object}
* @property {Array.} items
- * @property {string} label
- * @property {number} height
- * @property {number} width
+ * @property {string?} label
+ * @property {number?} height
+ * @property {number?} width
*/
import van from '../van.min.js';
import { getValue, loadStylesheet } from '../utils.js';
@@ -50,12 +50,17 @@ const SummaryBar = (/** @type Properties */ props) => {
background-color: ${colorMap[item.color] || item.color};`,
})),
),
- () => {
- return total ? div(
- { class: `tg-summary-bar--caption` },
- summaryItems.map(item => `${item.label}: ${item.value || 0}`).join(', '),
- ) : null;
- },
+ () => total.val ? div(
+ { class: 'tg-summary-bar--caption flex-row fx-flex-wrap text-caption mt-1' },
+ getValue(props.items).map(item => div(
+ { class: 'tg-summary-bar--legend flex-row' },
+ span({
+ class: 'dot',
+ style: `color: ${colorMap[item.color] || item.color};`,
+ }),
+ `${item.label}: ${item.value || 0}`,
+ )),
+ ) : '',
);
};
@@ -80,10 +85,21 @@ stylesheet.replace(`
}
.tg-summary-bar--caption {
- margin-top: 4px;
- color: var(--caption-text-color);
font-style: italic;
}
+
+.tg-summary-bar--legend {
+ width: auto;
+}
+
+.tg-summary-bar--legend:not(:last-child) {
+ margin-right: 8px;
+}
+
+.tg-summary-bar--legend span {
+ margin-right: 2px;
+ font-size: 4px;
+}
`);
export { SummaryBar };
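A usage sketch of the reworked component with the new legend caption; the counts and colors are illustrative:

    import { SummaryBar } from './summary_bar.js';

    const bar = SummaryBar({
        items: [
            { label: 'Passed', value: 120, color: 'green' },
            { label: 'Failed', value: 3, color: 'red' },
            { label: 'Dismissed', value: 1, color: 'grey' },
        ],
        height: 10,
        width: 350,
    });
    // Renders the stacked bar plus a caption row with one colored dot per item:
    // ⬤ Passed: 120   ⬤ Failed: 3   ⬤ Dismissed: 1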
diff --git a/testgen/ui/components/widgets/summary_bar.py b/testgen/ui/components/widgets/summary_bar.py
index c4b636d..bf913c6 100644
--- a/testgen/ui/components/widgets/summary_bar.py
+++ b/testgen/ui/components/widgets/summary_bar.py
@@ -44,7 +44,7 @@ def summary_bar(
if total:
item_spans = "".join([ f'' for item in items ])
- caption = ", ".join([ f"{item['label']}: {item['value']}" for item in items ])
+ caption = "".join([ f'
{item["label"]}: {item["value"]}
' for item in items ])
caption_div = f"""
{caption}
From 58ca5a7d802cf90f35ab24b3405a57c3956798e7 Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Mon, 4 Nov 2024 13:01:33 -0500
Subject: [PATCH 50/91] feat(components): add BoxPlot, PercentBar and
FrequencyBars components
---
testgen/ui/components/frontend/css/shared.css | 36 ++-
.../ui/components/frontend/js/axis_utils.js | 54 ++++
.../frontend/js/components/box_plot.js | 290 ++++++++++++++++++
.../frontend/js/components/frequency_bars.js | 94 ++++++
.../frontend/js/components/percent_bar.js | 79 +++++
.../frontend/js/components/summary_bar.js | 11 +-
.../components/frontend/js/display_utils.js | 23 +-
7 files changed, 573 insertions(+), 14 deletions(-)
create mode 100644 testgen/ui/components/frontend/js/axis_utils.js
create mode 100644 testgen/ui/components/frontend/js/components/box_plot.js
create mode 100644 testgen/ui/components/frontend/js/components/frequency_bars.js
create mode 100644 testgen/ui/components/frontend/js/components/percent_bar.js
diff --git a/testgen/ui/components/frontend/css/shared.css b/testgen/ui/components/frontend/css/shared.css
index 7adb2bf..a4884ec 100644
--- a/testgen/ui/components/frontend/css/shared.css
+++ b/testgen/ui/components/frontend/css/shared.css
@@ -20,6 +20,8 @@ body {
--blue: #42A5F5;
--brown: #8D6E63;
--grey: #BDBDBD;
+ --empty: #EEEEEE;
+ --empty-light: #FAFAFA;
--primary-text-color: #000000de;
--secondary-text-color: #0000008a;
@@ -62,6 +64,9 @@ body {
@media (prefers-color-scheme: dark) {
body {
+ --empty: #424242;
+ --empty-light: #212121;
+
--primary-text-color: rgba(255, 255, 255);
--secondary-text-color: rgba(255, 255, 255, .7);
--disabled-text-color: rgba(255, 255, 255, .5);
@@ -150,15 +155,12 @@ body {
.flex-row {
display: flex;
flex-direction: row;
- flex-grow: 1;
- width: 100%;
align-items: center;
}
.flex-column {
display: flex;
flex-direction: column;
- flex-grow: 1;
}
.fx-flex {
@@ -209,6 +211,34 @@ body {
align-content: flex-start;
}
+.fx-gap-1 {
+ gap: 4px;
+}
+
+.fx-gap-2 {
+ gap: 8px;
+}
+
+.fx-gap-3 {
+ gap: 12px;
+}
+
+.fx-gap-4 {
+ gap: 16px;
+}
+
+.fx-gap-5 {
+ gap: 24px;
+}
+
+.fx-gap-6 {
+ gap: 32px;
+}
+
+.fx-gap-7 {
+ gap: 40px;
+}
+
/* */
/* Whitespace utilities */
diff --git a/testgen/ui/components/frontend/js/axis_utils.js b/testgen/ui/components/frontend/js/axis_utils.js
new file mode 100644
index 0000000..6c7e835
--- /dev/null
+++ b/testgen/ui/components/frontend/js/axis_utils.js
@@ -0,0 +1,54 @@
+// https://stackoverflow.com/a/4955179
+function niceNumber(value, round = false) {
+ const exponent = Math.floor(Math.log10(value));
+ const fraction = value / Math.pow(10, exponent);
+ let niceFraction;
+
+ if (round) {
+ if (fraction < 1.5) {
+ niceFraction = 1;
+ } else if (fraction < 3) {
+ niceFraction = 2;
+ } else if (fraction < 7) {
+ niceFraction = 5;
+ } else {
+ niceFraction = 10;
+ }
+ } else {
+ if (fraction <= 1) {
+ niceFraction = 1;
+ } else if (fraction <= 2) {
+ niceFraction = 2;
+ } else if (fraction <= 5) {
+ niceFraction = 5;
+ } else {
+ niceFraction = 10;
+ }
+ }
+
+ return niceFraction * Math.pow(10, exponent);
+}
+
+function niceBounds(axisStart, axisEnd, tickCount = 4) {
+ let axisWidth = axisEnd - axisStart;
+
+ if (axisWidth == 0) {
+ axisStart -= 0.5;
+ axisEnd += 0.5;
+ axisWidth = axisEnd - axisStart;
+ }
+
+ const niceRange = niceNumber(axisWidth);
+ const niceTick = niceNumber(niceRange / (tickCount - 1), true);
+ axisStart = Math.floor(axisStart / niceTick) * niceTick;
+ axisEnd = Math.ceil(axisEnd / niceTick) * niceTick;
+
+ return {
+ min: axisStart,
+ max: axisEnd,
+ step: niceTick,
+ range: axisEnd - axisStart,
+ };
+}
+
+export { niceBounds };
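A worked example of the axis rounding with the default tickCount of 4; the input extent is made up:

    import { niceBounds } from './axis_utils.js';

    // Raw extent 3..47: width 44 -> nice range 50 -> nice tick niceNumber(50 / 3, true) = 20
    niceBounds(3, 47);
    // { min: 0, max: 60, step: 20, range: 60 } -> ticks rendered at 0, 20, 40, 60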
diff --git a/testgen/ui/components/frontend/js/components/box_plot.js b/testgen/ui/components/frontend/js/components/box_plot.js
new file mode 100644
index 0000000..81447d3
--- /dev/null
+++ b/testgen/ui/components/frontend/js/components/box_plot.js
@@ -0,0 +1,290 @@
+/**
+ * @typedef Properties
+ * @type {object}
+ * @property {number} minimum
+ * @property {number} maximum
+ * @property {number} median
+ * @property {number} lowerQuartile
+ * @property {number} upperQuartile
+ * @property {number} average
+ * @property {number} standardDeviation
+ * @property {number?} width
+ */
+import van from '../van.min.js';
+import { getValue, loadStylesheet } from '../utils.js';
+import { colorMap } from '../display_utils.js';
+import { niceBounds } from '../axis_utils.js';
+
+const { div } = van.tags;
+const boxColor = colorMap.teal;
+const lineColor = colorMap.limeGreen;
+
+const BoxPlot = (/** @type Properties */ props) => {
+ loadStylesheet('boxPlot', stylesheet);
+
+ const { minimum, maximum, median, lowerQuartile, upperQuartile, average, standardDeviation, width } = props;
+ const axisTicks = van.derive(() => niceBounds(getValue(minimum), getValue(maximum)));
+
+ return div(
+ {
+ class: 'flex-row fx-flex-wrap fx-gap-6',
+ style: () => `max-width: ${width ? getValue(width) + 'px' : '100%'};`,
+ },
+ div(
+ { style: 'flex: 300px' },
+ div(
+ {
+ class: 'tg-box-plot--line',
+ style: () => {
+ const { min, range } = axisTicks.val;
+ return `left: ${(getValue(average) - getValue(standardDeviation) - min) * 100 / range}%;
+ width: ${getValue(standardDeviation) * 2 * 100 / range}%;`;
+ },
+ },
+ div({ class: 'tg-box-plot--dot' }),
+ ),
+ div(
+ {
+ class: 'tg-box-plot--grid',
+ style: () => {
+ const { min, max, range } = axisTicks.val;
+
+ return `grid-template-columns:
+ ${(getValue(minimum) - min) * 100 / range}%
+ ${(getValue(lowerQuartile) - getValue(minimum)) * 100 / range}%
+ ${(getValue(median) - getValue(lowerQuartile)) * 100 / range}%
+ ${(getValue(upperQuartile) - getValue(median)) * 100 / range}%
+ ${(getValue(maximum) - getValue(upperQuartile)) * 100 / range}%
+ ${(max - getValue(maximum)) * 100 / range}%;`;
+ },
+ },
+ div({ class: 'tg-box-plot--space-left' }),
+ div({ class: 'tg-box-plot--top-left' }),
+ div({ class: 'tg-box-plot--bottom-left' }),
+ div({ class: 'tg-box-plot--mid-left' }),
+ div({ class: 'tg-box-plot--mid-right' }),
+ div({ class: 'tg-box-plot--top-right' }),
+ div({ class: 'tg-box-plot--bottom-right' }),
+ div({ class: 'tg-box-plot--space-right' }),
+ ),
+ () => {
+ const { min, max, step, range } = axisTicks.val;
+ const ticks = [];
+ let currentTick = min;
+ while (currentTick <= max) {
+ ticks.push(currentTick);
+ currentTick += step;
+ }
+
+ return div(
+ { class: 'tg-box-plot--axis' },
+ ticks.map(position => div(
+ {
+ class: 'tg-box-plot--axis-tick',
+ style: `left: ${(position - min) * 100 / range}%;`
+ },
+ position,
+ )),
+ );
+ },
+ ),
+ div(
+ { class: 'flex-column fx-gap-2 text-caption', style: 'flex: 150px;' },
+ div(
+ { class: 'flex-row fx-gap-2' },
+ div({ class: 'tg-blox-plot--legend-line' }),
+ 'Average---Standard Deviation',
+ ),
+ div(
+ { class: 'flex-row fx-gap-2' },
+ div({ class: 'tg-blox-plot--legend-whisker' }),
+ 'Minimum---Maximum',
+ ),
+ div(
+ { class: 'flex-row fx-gap-2' },
+ div({ class: 'tg-blox-plot--legend-box' }),
+ '25th---Median---75th',
+ ),
+ ),
+ );
+};
+
+const stylesheet = new CSSStyleSheet();
+stylesheet.replace(`
+.tg-box-plot--line {
+ position: relative;
+ margin: 8px 0 24px 0;
+ border-top: 2px dotted ${lineColor};
+}
+
+.tg-box-plot--dot {
+ position: absolute;
+ top: -1px;
+ left: 50%;
+ transform: translateX(-50%) translateY(-50%);
+ width: 10px;
+ height: 10px;
+ border-radius: 5px;
+ background-color: ${lineColor};
+}
+
+.tg-box-plot--grid {
+ height: 24px;
+ display: grid;
+ grid-template-rows: 50% 50%;
+}
+
+.tg-box-plot--grid div {
+ border-color: var(--caption-text-color);
+ border-style: solid;
+}
+
+.tg-box-plot--space-left {
+ grid-column-start: 1;
+ grid-column-end: 2;
+ grid-row-start: 1;
+ grid-row-end: 3;
+ border: 0;
+}
+
+.tg-box-plot--top-left {
+ grid-column-start: 2;
+ grid-column-end: 3;
+ grid-row-start: 1;
+ grid-row-end: 2;
+ border-width: 0 0 1px 2px;
+}
+
+.tg-box-plot--bottom-left {
+ grid-column-start: 2;
+ grid-column-end: 3;
+ grid-row-start: 2;
+ grid-row-end: 3;
+ border-width: 1px 0 0 2px;
+}
+
+.tg-box-plot--mid-left {
+ grid-column-start: 3;
+ grid-column-end: 4;
+ grid-row-start: 1;
+ grid-row-end: 3;
+ border-width: 1px 2px 1px 1px;
+ border-radius: 4px 0 0 4px;
+ background-color: ${boxColor};
+}
+
+.tg-box-plot--mid-right {
+ grid-column-start: 4;
+ grid-column-end: 5;
+ grid-row-start: 1;
+ grid-row-end: 3;
+ border-width: 1px 1px 1px 2px;
+ border-radius: 0 4px 4px 0;
+ background-color: ${boxColor};
+}
+
+.tg-box-plot--top-right {
+ grid-column-start: 5;
+ grid-column-end: 6;
+ grid-row-start: 1;
+ grid-row-end: 2;
+ border-width: 0 2px 1px 0;
+}
+
+.tg-box-plot--bottom-right {
+ grid-column-start: 5;
+ grid-column-end: 6;
+ grid-row-start: 2;
+ grid-row-end: 3;
+ border-width: 1px 2px 0 0;
+}
+
+.tg-box-plot--space-right {
+ grid-column-start: 6;
+ grid-column-end: 7;
+ grid-row-start: 1;
+ grid-row-end: 3;
+ border: 0;
+}
+
+.tg-box-plot--axis {
+ position: relative;
+ margin: 24px 0;
+ width: 100%;
+ height: 2px;
+ background-color: var(--disabled-text-color);
+ color: var(--caption-text-color);
+}
+
+.tg-box-plot--axis-tick {
+ position: absolute;
+ top: 8px;
+ transform: translateX(-50%);
+}
+
+.tg-box-plot--axis-tick::before {
+ position: absolute;
+ top: -9px;
+ left: 50%;
+ transform: translateX(-50%);
+ width: 4px;
+ height: 4px;
+ border-radius: 2px;
+ background-color: var(--disabled-text-color);
+ content: '';
+}
+
+.tg-blox-plot--legend-line {
+ width: 26px;
+ border: 1px dotted ${lineColor};
+ position: relative;
+}
+
+.tg-blox-plot--legend-line::after {
+ position: absolute;
+ left: 50%;
+ transform: translateX(-50%) translateY(-50%);
+ width: 6px;
+ height: 6px;
+ border-radius: 6px;
+ background-color: ${lineColor};
+ content: '';
+}
+
+.tg-blox-plot--legend-whisker {
+ width: 24px;
+ height: 12px;
+ border: solid var(--caption-text-color);
+ border-width: 0 2px 0 2px;
+ position: relative;
+}
+
+.tg-blox-plot--legend-whisker::after {
+ position: absolute;
+ top: 5px;
+ width: 24px;
+ height: 2px;
+ background-color: var(--caption-text-color);
+ content: '';
+}
+
+.tg-blox-plot--legend-box {
+ width: 26px;
+ height: 12px;
+ border: 1px solid var(--caption-text-color);
+ border-radius: 4px;
+ background-color: ${boxColor};
+ position: relative;
+}
+
+.tg-blox-plot--legend-box::after {
+ position: absolute;
+ left: 12px;
+ width: 2px;
+ height: 12px;
+ background-color: var(--caption-text-color);
+ content: '';
+}
+`);
+
+export { BoxPlot };
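A usage sketch with made-up summary statistics; plain numbers are passed here, though van states would also work through getValue:

    import { BoxPlot } from './box_plot.js';

    const plot = BoxPlot({
        minimum: 2,
        lowerQuartile: 14,
        median: 25,
        upperQuartile: 61,
        maximum: 120,
        average: 38.5,
        standardDeviation: 22.1,
        width: 800,                  // optional; defaults to 100% of the container
    });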
diff --git a/testgen/ui/components/frontend/js/components/frequency_bars.js b/testgen/ui/components/frontend/js/components/frequency_bars.js
new file mode 100644
index 0000000..ed49bf5
--- /dev/null
+++ b/testgen/ui/components/frontend/js/components/frequency_bars.js
@@ -0,0 +1,94 @@
+/**
+ * @typedef FrequencyItem
+ * @type {object}
+ * @property {string} value
+ * @property {number} count
+ *
+ * @typedef Properties
+ * @type {object}
+ * @property {FrequencyItem[]} items
+ * @property {number} total
+ * @property {string} title
+ * @property {string?} color
+ */
+import van from '../van.min.js';
+import { getValue, loadStylesheet } from '../utils.js';
+import { colorMap } from '../display_utils.js';
+
+const { div, span } = van.tags;
+const defaultColor = 'teal';
+
+const FrequencyBars = (/** @type Properties */ props) => {
+ loadStylesheet('frequencyBars', stylesheet);
+
+ const total = van.derive(() => getValue(props.total));
+ const color = van.derive(() => {
+ const colorValue = getValue(props.color) || defaultColor;
+ return colorMap[colorValue] || colorValue;
+ });
+ const width = van.derive(() => {
+ const maxCount = getValue(props.items).reduce((max, { count }) => Math.max(max, count), 0);
+ return String(maxCount).length * 7;
+ });
+
+ return () => div(
+ div(
+ { class: 'mb-2 text-secondary' },
+ props.title,
+ ),
+ getValue(props.items).map(({ value, count }) => {
+ return div(
+ { class: 'flex-row fx-gap-2' },
+ div(
+ { class: 'tg-frequency-bars' },
+ span({ class: 'tg-frequency-bars--empty' }),
+ span({
+ class: 'tg-frequency-bars--fill',
+ style: () => `width: ${count * 100 / total.val}%;
+ ${count ? 'min-width: 1px;' : ''}
+ background-color: ${color.val};`,
+ }),
+ ),
+ div(
+ {
+ class: 'text-caption tg-frequency-bars--count',
+ style: () => `width: ${width.val}px;`,
+ },
+ count,
+ ),
+ div(value),
+ );
+ }),
+ );
+};
+
+const stylesheet = new CSSStyleSheet();
+stylesheet.replace(`
+.tg-frequency-bars {
+ width: 150px;
+ height: 15px;
+ flex-shrink: 0;
+ position: relative;
+}
+
+.tg-frequency-bars--empty {
+ position: absolute;
+ width: 100%;
+ height: 100%;
+ border-radius: 4px;
+ background-color: ${colorMap['emptyLight']};
+}
+
+.tg-frequency-bars--fill {
+ position: absolute;
+ border-radius: 4px;
+ height: 100%;
+}
+
+.tg-frequency-bars--count {
+ flex-shrink: 0;
+ text-align: right;
+}
+`);
+
+export { FrequencyBars };
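A usage sketch with made-up value counts; total is the denominator used for the fill widths:

    import { FrequencyBars } from './frequency_bars.js';

    const bars = FrequencyBars({
        title: 'Top Freq Values',
        total: 1000,
        items: [
            { value: 'NY', count: 620 },
            { value: 'CA', count: 280 },
            { value: 'TX', count: 100 },
        ],
        color: 'teal',               // optional; any colorMap key or CSS color
    });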
diff --git a/testgen/ui/components/frontend/js/components/percent_bar.js b/testgen/ui/components/frontend/js/components/percent_bar.js
new file mode 100644
index 0000000..e6a5321
--- /dev/null
+++ b/testgen/ui/components/frontend/js/components/percent_bar.js
@@ -0,0 +1,79 @@
+/**
+ * @typedef Properties
+ * @type {object}
+ * @property {string} label
+ * @property {number} value
+ * @property {number} total
+ * @property {string?} color
+ * @property {number?} height
+ * @property {number?} width
+ */
+import van from '../van.min.js';
+import { getValue, loadStylesheet } from '../utils.js';
+import { colorMap } from '../display_utils.js';
+
+const { div, span } = van.tags;
+const defaultHeight = 10;
+const defaultColor = 'purpleLight';
+
+const PercentBar = (/** @type Properties */ props) => {
+ loadStylesheet('percentBar', stylesheet);
+ const value = van.derive(() => getValue(props.value));
+ const total = van.derive(() => getValue(props.total));
+
+ return div(
+ { style: () => `max-width: ${props.width ? getValue(props.width) + 'px' : '100%'};` },
+ div(
+ { class: () => `tg-percent-bar--label ${value.val ? '' : 'text-secondary'}` },
+ () => `${getValue(props.label)}: ${value.val}`,
+ ),
+ div(
+ {
+ class: 'tg-percent-bar',
+ style: () => `height: ${getValue(props.height) || defaultHeight}px;`,
+ },
+ span({
+ class: 'tg-percent-bar--fill',
+ style: () => {
+ const color = getValue(props.color) || defaultColor;
+ return `width: ${value.val * 100 / total.val}%;
+ ${value.val ? 'min-width: 1px;' : ''}
+ background-color: ${colorMap[color] || color};`
+ },
+ }),
+ span({
+ class: 'tg-percent-bar--empty',
+ style: () => `width: ${(total.val - value.val) * 100 / total.val}%;
+ ${(total.val - value.val) ? 'min-width: 1px;' : ''};`,
+ }),
+ ),
+ );
+};
+
+const stylesheet = new CSSStyleSheet();
+stylesheet.replace(`
+.tg-percent-bar--label {
+ margin-bottom: 4px;
+}
+
+.tg-percent-bar {
+ height: 100%;
+ display: flex;
+ flex-flow: row nowrap;
+ align-items: flex-start;
+ justify-content: flex-start;
+ border-radius: 4px;
+ overflow: hidden;
+}
+
+.tg-percent-bar--fill {
+ height: 100%;
+}
+
+.tg-percent-bar--empty {
+ height: 100%;
+ background-color: ${colorMap['empty']};
+}
+`);
+
+export { PercentBar };
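A usage sketch; the value/total pair is made up:

    import { PercentBar } from './percent_bar.js';

    const nullsBar = PercentBar({
        label: 'Null Values',
        value: 37,
        total: 500,
        color: 'brownLight',         // optional; defaults to purpleLight
        width: 400,
    });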
diff --git a/testgen/ui/components/frontend/js/components/summary_bar.js b/testgen/ui/components/frontend/js/components/summary_bar.js
index b73ea5c..e331000 100644
--- a/testgen/ui/components/frontend/js/components/summary_bar.js
+++ b/testgen/ui/components/frontend/js/components/summary_bar.js
@@ -14,18 +14,9 @@
*/
import van from '../van.min.js';
import { getValue, loadStylesheet } from '../utils.js';
+import { colorMap } from '../display_utils.js';
const { div, span } = van.tags;
-const colorMap = {
- red: '#EF5350',
- orange: '#FF9800',
- yellow: '#FDD835',
- green: '#9CCC65',
- purple: '#AB47BC',
- blue: '#42A5F5',
- brown: '#8D6E63',
- grey: '#BDBDBD',
-}
const defaultHeight = 24;
const SummaryBar = (/** @type Properties */ props) => {
diff --git a/testgen/ui/components/frontend/js/display_utils.js b/testgen/ui/components/frontend/js/display_utils.js
index 512cc0f..a2d6384 100644
--- a/testgen/ui/components/frontend/js/display_utils.js
+++ b/testgen/ui/components/frontend/js/display_utils.js
@@ -26,4 +26,25 @@ function formatDuration(/** @type string */ duration) {
return formatted.trim() || '< 1s';
}
-export { formatTimestamp, formatDuration };
+// https://m2.material.io/design/color/the-color-system.html#tools-for-picking-colors
+const colorMap = {
+ red: '#EF5350', // Red 400
+ orange: '#FF9800', // Orange 500
+ yellow: '#FDD835', // Yellow 600
+ green: '#9CCC65', // Light Green 400
+ limeGreen: '#C0CA33', // Lime Green 600
+ purple: '#AB47BC', // Purple 400
+ purpleLight: '#CE93D8', // Purple 200
+ blue: '#2196F3', // Blue 500
+ blueLight: '#90CAF9', // Blue 200
+ indigo: '#5C6BC0', // Indigo 400
+ teal: '#26A69A', // Teal 400
+ brown: '#8D6E63', // Brown 400
+ brownLight: '#D7CCC8', // Brown 100
+ brownDark: '#4E342E', // Brown 800
+ grey: '#BDBDBD', // Gray 400
+ empty: 'var(--empty)', // Light: Gray 200, Dark: Gray 800
+ emptyLight: 'var(--empty-light)', // Light: Gray 50, Dark: Gray 900
+}
+
+export { formatTimestamp, formatDuration, colorMap };
From 6f1f5957b7e910abadd2603d7d93e3e21fa1eb11 Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Mon, 4 Nov 2024 13:03:55 -0500
Subject: [PATCH 51/91] feat(components): add Attribute, Card and EditableCard
components
---
testgen/ui/components/frontend/css/shared.css | 3 +
.../frontend/js/components/attribute.js | 39 +++++++++++
.../components/frontend/js/components/card.js | 47 ++++++++++++++
.../frontend/js/components/editable_card.js | 64 +++++++++++++++++++
4 files changed, 153 insertions(+)
create mode 100644 testgen/ui/components/frontend/js/components/attribute.js
create mode 100644 testgen/ui/components/frontend/js/components/card.js
create mode 100644 testgen/ui/components/frontend/js/components/editable_card.js
diff --git a/testgen/ui/components/frontend/css/shared.css b/testgen/ui/components/frontend/css/shared.css
index a4884ec..e387b81 100644
--- a/testgen/ui/components/frontend/css/shared.css
+++ b/testgen/ui/components/frontend/css/shared.css
@@ -149,6 +149,9 @@ body {
font-size: 12px;
color: var(--caption-text-color);
}
+.text-capitalize {
+ text-transform: capitalize;
+}
/* */
/* Flex utilities */
diff --git a/testgen/ui/components/frontend/js/components/attribute.js b/testgen/ui/components/frontend/js/components/attribute.js
new file mode 100644
index 0000000..5ca702f
--- /dev/null
+++ b/testgen/ui/components/frontend/js/components/attribute.js
@@ -0,0 +1,39 @@
+/**
+ * @typedef Properties
+ * @type {object}
+ * @property {string} label
+ * @property {string | number} value
+ * @property {number?} width
+ */
+import { getValue, loadStylesheet } from '../utils.js';
+import van from '../van.min.js';
+
+const { div } = van.tags;
+
+const Attribute = (/** @type Properties */ props) => {
+ loadStylesheet('attribute', stylesheet);
+
+ return div(
+ { style: () => `width: ${props.width ? getValue(props.width) + 'px' : 'auto'}` },
+ div(
+ { class: 'text-caption text-capitalize mb-1' },
+ props.label,
+ ),
+ div(
+ { class: 'attribute-value' },
+ () => {
+ const value = getValue(props.value);
+ return (value || value === 0) ? value : '--';
+ },
+ ),
+ );
+};
+
+const stylesheet = new CSSStyleSheet();
+stylesheet.replace(`
+.attribute-value {
+ word-wrap: break-word;
+}
+`);
+
+export { Attribute };
diff --git a/testgen/ui/components/frontend/js/components/card.js b/testgen/ui/components/frontend/js/components/card.js
new file mode 100644
index 0000000..66c6ebb
--- /dev/null
+++ b/testgen/ui/components/frontend/js/components/card.js
@@ -0,0 +1,47 @@
+/**
+ * @typedef Properties
+ * @type {object}
+ * @property {string} title
+ * @property {object} content
+ * @property {object?} actionContent
+ */
+import { loadStylesheet } from '../utils.js';
+import van from '../van.min.js';
+
+const { div, h3 } = van.tags;
+
+const Card = (/** @type Properties */ props) => {
+ loadStylesheet('card', stylesheet);
+
+ return div(
+ { class: 'tg-card mb-4' },
+ div(
+ { class: 'flex-row fx-justify-space-between fx-align-flex-start' },
+ h3(
+ { class: 'tg-card--title' },
+ props.title,
+ ),
+ props.actionContent,
+ ),
+ props.content,
+ );
+};
+
+const stylesheet = new CSSStyleSheet();
+stylesheet.replace(`
+.tg-card {
+ border-radius: 8px;
+ background-color: var(--dk-card-background);
+ padding: 16px;
+}
+
+.tg-card--title {
+ margin: 0 0 16px;
+ color: var(--secondary-text-color);
+ font-size: 16px;
+ font-weight: 500;
+ text-transform: capitalize;
+}
+`);
+
+export { Card };
diff --git a/testgen/ui/components/frontend/js/components/editable_card.js b/testgen/ui/components/frontend/js/components/editable_card.js
new file mode 100644
index 0000000..4dc8e54
--- /dev/null
+++ b/testgen/ui/components/frontend/js/components/editable_card.js
@@ -0,0 +1,64 @@
+/**
+ * @typedef Properties
+ * @type {object}
+ * @property {string} title
+ * @property {object} content
+ * @property {object} editingContent
+ * @property {function} onSave
+ * @property {function?} onCancel
+ * @property {function?} hasChanges
+ */
+import { getValue } from '../utils.js';
+import van from '../van.min.js';
+import { Card } from './card.js';
+import { Button } from './button.js';
+
+const { div } = van.tags;
+
+const EditableCard = (/** @type Properties */ props) => {
+ const editing = van.state(false);
+ const onCancel = van.derive(() => {
+ const cancelFunction = props.onCancel?.val ?? props.onCancel;
+ return () => {
+ editing.val = false;
+ cancelFunction?.();
+ }
+ });
+ const saveDisabled = van.derive(() => {
+ const hasChanges = props.hasChanges?.val ?? props.hasChanges;
+ return !hasChanges?.();
+ });
+
+ return Card({
+ title: props.title,
+ content: [
+ () => editing.val ? getValue(props.editingContent) : getValue(props.content),
+ () => editing.val ? div(
+ { class: 'flex-row fx-justify-content-flex-end fx-gap-3 mt-4' },
+ Button({
+ type: 'stroked',
+ label: 'Cancel',
+ width: 'auto',
+ onclick: onCancel,
+ }),
+ Button({
+ type: 'stroked',
+ color: 'primary',
+ label: 'Save',
+ width: 'auto',
+ disabled: saveDisabled,
+ onclick: props.onSave,
+ }),
+ ) : '',
+ ],
+ actionContent: () => !editing.val ? Button({
+ type: 'stroked',
+ label: 'Edit',
+ icon: 'edit',
+ width: 'auto',
+ onclick: () => editing.val = true,
+ }) : '',
+ });
+};
+
+export { EditableCard };
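A usage sketch wiring the edit/save cycle to a couple of van states; the field and its contents are made up:

    import van from '../van.min.js';
    import { EditableCard } from './editable_card.js';

    const { div, input } = van.tags;
    const saved = van.state('Orders table group');
    const draft = van.state(saved.val);

    const card = EditableCard({
        title: 'Description',
        content: () => div(saved.val),
        editingContent: () => input({
            value: draft,                              // state-bound input value
            oninput: (event) => draft.val = event.target.value,
        }),
        hasChanges: () => draft.val !== saved.val,     // Save stays disabled until something changes
        onSave: () => saved.val = draft.val,
        onCancel: () => draft.val = saved.val,
    });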
From b8cb63ab55fa0419c123f2bc9086eac26fcc4806 Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Mon, 4 Nov 2024 13:06:05 -0500
Subject: [PATCH 52/91] feat(components): add Input, Checkbox and RadioGroup
components
---
testgen/ui/components/frontend/css/shared.css | 8 +-
.../frontend/js/components/checkbox.js | 83 ++++++++++++++
.../frontend/js/components/input.js | 104 ++++++++++++++++++
.../frontend/js/components/radio_group.js | 104 ++++++++++++++++++
.../frontend/js/components/select.js | 4 +-
testgen/ui/components/frontend/js/utils.js | 18 ++-
6 files changed, 317 insertions(+), 4 deletions(-)
create mode 100644 testgen/ui/components/frontend/js/components/checkbox.js
create mode 100644 testgen/ui/components/frontend/js/components/input.js
create mode 100644 testgen/ui/components/frontend/js/components/radio_group.js
diff --git a/testgen/ui/components/frontend/css/shared.css b/testgen/ui/components/frontend/css/shared.css
index e387b81..4955a1b 100644
--- a/testgen/ui/components/frontend/css/shared.css
+++ b/testgen/ui/components/frontend/css/shared.css
@@ -27,7 +27,8 @@ body {
--secondary-text-color: #0000008a;
--disabled-text-color: #00000042;
--caption-text-color: rgba(49, 51, 63, 0.6); /* Match Streamlit's caption color */
- --border-color: rgba(0, 0, 0, .12);
+ --form-field-color: rgb(240, 242, 246); /* Match Streamlit's form field color */
+ --border-color: rgba(0, 0, 0, .12);
--tooltip-color: #333d;
--dk-card-background: #fff;
@@ -71,6 +72,7 @@ body {
--secondary-text-color: rgba(255, 255, 255, .7);
--disabled-text-color: rgba(255, 255, 255, .5);
--caption-text-color: rgba(250, 250, 250, .6); /* Match Streamlit's caption color */
+ --form-field-color: rgb(38, 39, 48); /* Match Streamlit's form field color */
--border-color: rgba(255, 255, 255, .25);
--dk-card-background: #14181f;
@@ -94,6 +96,10 @@ body {
}
}
+.clickable {
+ cursor: pointer;
+}
+
.hidden {
display: none !important;
}
diff --git a/testgen/ui/components/frontend/js/components/checkbox.js b/testgen/ui/components/frontend/js/components/checkbox.js
new file mode 100644
index 0000000..c7cf9a9
--- /dev/null
+++ b/testgen/ui/components/frontend/js/components/checkbox.js
@@ -0,0 +1,83 @@
+/**
+ * @typedef Properties
+ * @type {object}
+ * @property {string} label
+ * @property {boolean?} checked
+ * @property {function?} onChange
+ * @property {number?} width
+ */
+import van from '../van.min.js';
+import { getValue, loadStylesheet } from '../utils.js';
+
+const { input, label } = van.tags;
+
+const Checkbox = (/** @type Properties */ props) => {
+ loadStylesheet('checkbox', stylesheet);
+
+ return label(
+ {
+ class: 'flex-row fx-gap-2 clickable',
+ style: () => `width: ${props.width ? getValue(props.width) + 'px' : 'auto'}`,
+ },
+ input({
+ type: 'checkbox',
+ class: 'tg-checkbox--input clickable',
+ checked: props.checked,
+ onchange: van.derive(() => {
+ const onChange = props.onChange?.val ?? props.onChange;
+ return onChange ? (event) => onChange(event.target.checked) : null;
+ }),
+ }),
+ props.label,
+ );
+};
+
+const stylesheet = new CSSStyleSheet();
+stylesheet.replace(`
+.tg-checkbox--input {
+ appearance: none;
+ box-sizing: border-box;
+ margin: 0;
+ width: 18px;
+ height: 18px;
+ border: 1px solid var(--secondary-text-color);
+ border-radius: 4px;
+ position: relative;
+ transition-property: border-color, background-color;
+ transition-duration: 0.3s;
+}
+
+.tg-checkbox--input:focus,
+.tg-checkbox--input:focus-visible {
+ outline: none;
+}
+
+.tg-checkbox--input:focus-visible::before {
+ content: '';
+ box-sizing: border-box;
+ position: absolute;
+ top: -4px;
+ left: -4px;
+ width: 24px;
+ height: 24px;
+ border: 3px solid var(--border-color);
+ border-radius: 7px;
+}
+
+.tg-checkbox--input:checked {
+ border-color: transparent;
+ background-color: var(--primary-color);
+}
+
+.tg-checkbox--input:checked::after {
+ position: absolute;
+ top: -4px;
+ left: -3px;
+ content: 'check';
+ font-family: 'Material Symbols Rounded';
+ font-size: 22px;
+ color: white;
+}
+`);
+
+export { Checkbox };
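For reference, a minimal usage sketch of the new Checkbox component (illustrative only, not part of the diff; assumes it is imported from a sibling module and mounted with the bundled VanJS build):

// Hypothetical usage of Checkbox inside another VanJS module of this frontend.
import van from '../van.min.js';
import { Checkbox } from './checkbox.js';

const showDropped = van.state(false);

const Example = () => Checkbox({
    label: 'Show dropped columns',   // rendered after the styled input
    checked: showDropped,            // van state or plain boolean
    onChange: (checked) => showDropped.val = checked,
    width: 200,                      // optional fixed width in px
});

van.add(document.body, Example());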
diff --git a/testgen/ui/components/frontend/js/components/input.js b/testgen/ui/components/frontend/js/components/input.js
new file mode 100644
index 0000000..be2aa03
--- /dev/null
+++ b/testgen/ui/components/frontend/js/components/input.js
@@ -0,0 +1,104 @@
+/**
+ * @typedef Properties
+ * @type {object}
+ * @property {string?} label
+ * @property {(string | number)?} value
+ * @property {string?} placeholder
+ * @property {string?} icon
+ * @property {boolean?} clearable
+ * @property {function?} onChange
+ * @property {number?} width
+ */
+import van from '../van.min.js';
+import { debounce, getValue, loadStylesheet } from '../utils.js';
+
+const { input, label, i } = van.tags;
+
+const Input = (/** @type Properties */ props) => {
+ loadStylesheet('input', stylesheet);
+
+ const value = van.derive(() => getValue(props.value) ?? '');
+ van.derive(() => {
+ const onChange = props.onChange?.val ?? props.onChange;
+ onChange?.(value.val);
+ });
+
+ return label(
+ {
+ class: 'flex-column fx-gap-1 text-caption text-capitalize tg-input--label',
+ style: () => `width: ${props.width ? getValue(props.width) + 'px' : 'auto'}`,
+ },
+ props.label,
+ () => getValue(props.icon) ? i(
+ { class: 'material-symbols-rounded tg-input--icon' },
+ props.icon,
+ ) : '',
+ () => getValue(props.clearable) ? i(
+ {
+ class: () => `material-symbols-rounded tg-input--clear clickable ${value.val ? '' : 'hidden'}`,
+ onclick: () => value.val = '',
+ },
+ 'clear',
+ ) : '',
+ input({
+ class: 'tg-input--field',
+ value,
+ placeholder: () => getValue(props.placeholder) ?? '',
+ oninput: debounce(event => value.val = event.target.value, 300),
+ }),
+ );
+};
+
+const stylesheet = new CSSStyleSheet();
+stylesheet.replace(`
+.tg-input--label {
+ position: relative;
+}
+
+.tg-input--icon {
+ position: absolute;
+ bottom: 5px;
+ left: 4px;
+ font-size: 22px;
+}
+
+.tg-input--icon ~ .tg-input--field {
+ padding-left: 28px;
+}
+
+.tg-input--clear {
+ position: absolute;
+ bottom: 6px;
+ right: 4px;
+ font-size: 20px;
+}
+
+.tg-input--clear ~ .tg-input--field {
+ padding-right: 24px;
+}
+
+.tg-input--field {
+ box-sizing: border-box;
+ width: 100%;
+ height: 32px;
+ border-radius: 8px;
+ border: 1px solid transparent;
+ transition: border-color 0.3s;
+ background-color: var(--form-field-color);
+ padding: 4px 8px;
+ color: var(--primary-text-color);
+ font-size: 14px;
+}
+
+.tg-input--field::placeholder {
+ color: var(--disabled-text-color);
+}
+
+.tg-input--field:focus,
+.tg-input--field:focus-visible {
+ outline: none;
+ border-color: var(--primary-color);
+}
+`);
+
+export { Input };
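Likewise, a hypothetical usage sketch for Input; onChange receives the value debounced by 300ms, and the clear icon resets it to an empty string:

// Hypothetical usage of Input as a search box.
import van from '../van.min.js';
import { Input } from './input.js';

const Search = () => Input({
    label: 'Search tables',
    icon: 'search',                 // Material Symbols icon name
    clearable: true,
    placeholder: 'Table name',
    width: 250,
    onChange: (value) => console.log('search term:', value),
});

van.add(document.body, Search());

Because the onChange wiring is itself a van.derive, the callback appears to fire once with the initial value when the component is constructed; callers that only care about user edits may want to ignore that first call.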
diff --git a/testgen/ui/components/frontend/js/components/radio_group.js b/testgen/ui/components/frontend/js/components/radio_group.js
new file mode 100644
index 0000000..0c7f5e4
--- /dev/null
+++ b/testgen/ui/components/frontend/js/components/radio_group.js
@@ -0,0 +1,104 @@
+/**
+ * @typedef Option
+ * @type {object}
+ * @property {string} label
+ * @property {string | number | boolean | null} value
+ *
+ * @typedef Properties
+ * @type {object}
+ * @property {string} label
+ * @property {Option[]} options
+ * @property {string | number | boolean | null} value
+ * @property {function?} onChange
+ * @property {number?} width
+ */
+import van from '../van.min.js';
+import { getRandomId, getValue, loadStylesheet } from '../utils.js';
+
+const { div, input, label } = van.tags;
+
+const RadioGroup = (/** @type Properties */ props) => {
+ loadStylesheet('radioGroup', stylesheet);
+ const groupName = getRandomId();
+
+ return div(
+ { style: () => `width: ${props.width ? getValue(props.width) + 'px' : 'auto'}` },
+ div(
+ { class: 'text-caption text-capitalize mb-1' },
+ props.label,
+ ),
+ () => div(
+ { class: 'flex-row fx-gap-4 tg-radio-group' },
+ getValue(props.options).map(option => label(
+ { class: 'flex-row fx-gap-2 text-capitalize clickable' },
+ input({
+ type: 'radio',
+ name: groupName,
+ value: option.value,
+ checked: () => option.value === getValue(props.value),
+ onchange: van.derive(() => {
+ const onChange = props.onChange?.val ?? props.onChange;
+ return onChange ? () => onChange(option.value) : null;
+ }),
+ class: 'tg-radio-group--input',
+ }),
+ option.label,
+ )),
+ ),
+ );
+};
+
+const stylesheet = new CSSStyleSheet();
+stylesheet.replace(`
+.tg-radio-group {
+ height: 32px;
+}
+
+.tg-radio-group--input {
+ appearance: none;
+ box-sizing: border-box;
+ margin: 0;
+ width: 18px;
+ height: 18px;
+ border: 1px solid var(--secondary-text-color);
+ border-radius: 9px;
+ position: relative;
+ transition-property: border-color, background-color;
+ transition-duration: 0.3s;
+}
+
+.tg-radio-group--input:focus,
+.tg-radio-group--input:focus-visible {
+ outline: none;
+}
+
+.tg-radio-group--input:focus-visible::before {
+ content: '';
+ box-sizing: border-box;
+ position: absolute;
+ top: -4px;
+ left: -4px;
+ width: 24px;
+ height: 24px;
+ border: 3px solid var(--border-color);
+ border-radius: 12px;
+}
+
+.tg-radio-group--input:checked {
+ border-color: var(--primary-color);
+}
+
+.tg-radio-group--input:checked::after {
+ content: '';
+ box-sizing: border-box;
+ position: absolute;
+ top: 3px;
+ left: 3px;
+ width: 10px;
+ height: 10px;
+ background-color: var(--primary-color);
+ border-radius: 5px;
+}
+`);
+
+export { RadioGroup };
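A hypothetical RadioGroup usage sketch; option values may be strings, numbers, booleans, or null (the null option mirrors the "Inherit" choice the Data Hierarchy page adds later in this series):

// Hypothetical usage of RadioGroup.
import van from '../van.min.js';
import { RadioGroup } from './radio_group.js';

const Example = () => RadioGroup({
    label: 'Critical Data Element',
    options: [
        { label: 'Yes', value: true },
        { label: 'No', value: false },
        { label: 'Inherit', value: null },
    ],
    value: null,                                    // initial selection
    onChange: (value) => console.log('selected:', value),
    width: 300,
});

van.add(document.body, Example());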
diff --git a/testgen/ui/components/frontend/js/components/select.js b/testgen/ui/components/frontend/js/components/select.js
index 5f4f68c..f4fe618 100644
--- a/testgen/ui/components/frontend/js/components/select.js
+++ b/testgen/ui/components/frontend/js/components/select.js
@@ -13,7 +13,7 @@
*/
import van from '../van.min.js';
import { Streamlit } from '../streamlit.js';
-import { getValue, loadStylesheet } from '../utils.js';
+import { getRandomId, getValue, loadStylesheet } from '../utils.js';
const { div, label, option, select } = van.tags;
@@ -21,7 +21,7 @@ const Select = (/** @type {Properties} */ props) => {
loadStylesheet('select', stylesheet);
Streamlit.setFrameHeight();
- const domId = Math.random().toString(36).substring(2);
+ const domId = getRandomId();
const changeHandler = props.onChange || post;
return div(
{class: 'tg-select'},
diff --git a/testgen/ui/components/frontend/js/utils.js b/testgen/ui/components/frontend/js/utils.js
index d8d712c..b5bdc96 100644
--- a/testgen/ui/components/frontend/js/utils.js
+++ b/testgen/ui/components/frontend/js/utils.js
@@ -53,4 +53,20 @@ function getValue(/** @type object */ prop) { // van state or static value
return prop;
}
-export { emitEvent, enforceElementWidth, getValue, loadStylesheet, resizeFrameHeightToElement };
+function getRandomId() {
+ return Math.random().toString(36).substring(2);
+}
+
+// https://stackoverflow.com/a/75988895
+function debounce(
+ /** @type function */ callback,
+ /** @type number */ wait,
+) {
+ let timeoutId = null;
+ return (...args) => {
+ window.clearTimeout(timeoutId);
+ timeoutId = window.setTimeout(() => callback(...args), wait);
+ };
+}
+
+export { debounce, emitEvent, enforceElementWidth, getRandomId, getValue, loadStylesheet, resizeFrameHeightToElement };
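An illustrative, standalone example of the new debounce helper (not part of the patch): the wrapped callback runs only after calls have stopped for the given wait in milliseconds.

// Hypothetical usage of debounce from a module alongside utils.js.
import { debounce } from './utils.js';

const onResize = debounce(() => console.log('settled at', window.innerWidth), 200);
window.addEventListener('resize', onResize);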
From d87f69cafd8768e11cd86b79c080c19efd5d1733 Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Mon, 4 Nov 2024 13:07:21 -0500
Subject: [PATCH 53/91] feat(components): add Tree component
---
testgen/ui/components/frontend/css/shared.css | 4 +
.../components/frontend/js/components/tree.js | 211 ++++++++++++++++++
2 files changed, 215 insertions(+)
create mode 100644 testgen/ui/components/frontend/js/components/tree.js
diff --git a/testgen/ui/components/frontend/css/shared.css b/testgen/ui/components/frontend/css/shared.css
index 4955a1b..0d61aa6 100644
--- a/testgen/ui/components/frontend/css/shared.css
+++ b/testgen/ui/components/frontend/css/shared.css
@@ -104,6 +104,10 @@ body {
display: none !important;
}
+.invisible {
+ visibility: hidden !important;
+}
+
.dot {
font-size: 10px;
font-style: normal;
diff --git a/testgen/ui/components/frontend/js/components/tree.js b/testgen/ui/components/frontend/js/components/tree.js
new file mode 100644
index 0000000..d29dd2a
--- /dev/null
+++ b/testgen/ui/components/frontend/js/components/tree.js
@@ -0,0 +1,211 @@
+/**
+ * @typedef TreeNode
+ * @type {object}
+ * @property {string} id
+ * @property {string} label
+ * @property {string?} classes
+ * @property {string?} icon
+ * @property {number?} iconSize
+ * @property {TreeNode[]?} children
+ * @property {number?} level
+ * @property {boolean?} expanded
+ * @property {boolean?} hidden
+ *
+ * @typedef Properties
+ * @type {object}
+ * @property {TreeNode[]} nodes
+ * @property {string} selected
+ * @property {string} classes
+ */
+import van from '../van.min.js';
+import { emitEvent, getValue, loadStylesheet } from '../utils.js';
+import { Input } from './input.js';
+import { Button } from './button.js';
+
+const { div, i } = van.tags;
+const levelOffset = 14;
+
+const Tree = (/** @type Properties */ props) => {
+ loadStylesheet('tree', stylesheet);
+
+ // Use only initial prop value as default and maintain internal state
+ const initialSelection = props.selected?.rawVal || props.selected || null;
+ const selected = van.state(initialSelection);
+
+ const treeNodes = van.derive(() => {
+ const nodes = getValue(props.nodes) || [];
+ const treeSelected = initTreeState(nodes, initialSelection);
+ if (!treeSelected) {
+ selected.val = null;
+ }
+ return nodes;
+ });
+
+ return div(
+ { class: () => `flex-column ${getValue(props.classes)}` },
+ div(
+ { class: 'flex-row fx-gap-1 tg-tree--actions' },
+ Input({
+ icon: 'search',
+ clearable: true,
+ onChange: (value) => searchTree(treeNodes.val, value),
+ }),
+ Button({
+ type: 'icon',
+ icon: 'expand_all',
+ style: 'width: 24px; height: 24px; padding: 4px;',
+ tooltip: 'Expand All',
+ tooltipPosition: 'bottom',
+ onclick: () => expandOrCollapseTree(treeNodes.val, true),
+ }),
+ Button({
+ type: 'icon',
+ icon: 'collapse_all',
+ style: 'width: 24px; height: 24px; padding: 4px;',
+ tooltip: 'Collapse All',
+ tooltipPosition: 'bottom',
+ onclick: () => expandOrCollapseTree(treeNodes.val, false),
+ }),
+ ),
+ div(
+ { class: 'tg-tree' },
+ () => div(
+ { class: 'tg-tree--nodes' },
+ treeNodes.val.map(node => TreeNode(node, selected)),
+ ),
+ ),
+ );
+};
+
+const TreeNode = (
+ /** @type TreeNode */ node,
+ /** @type string */ selected,
+) => {
+ const hasChildren = !!node.children?.length;
+ return div(
+ div(
+ {
+ class: () => `tg-tree--row flex-row clickable ${node.classes || ''}
+ ${selected.val === node.id ? 'selected' : ''}
+ ${node.hidden.val ? 'hidden' : ''}`,
+ style: `padding-left: ${levelOffset * node.level}px;`,
+ onclick: () => {
+ selected.val = node.id;
+ emitEvent('TreeNodeSelected', { payload: node.id });
+ },
+ },
+ i(
+ {
+ class: `material-symbols-rounded text-secondary ${hasChildren ? '' : 'invisible'}`,
+ onclick: () => {
+ node.expanded.val = hasChildren ? !node.expanded.val : false;
+ },
+ },
+ () => node.expanded.val ? 'arrow_drop_down' : 'arrow_right',
+ ),
+ node.icon ? i(
+ {
+ class: 'material-symbols-rounded tg-tree--row-icon',
+ style: `font-size: ${node.iconSize || 24}px;`,
+ },
+ node.icon,
+ ) : null,
+ node.label,
+ ),
+ hasChildren ? div(
+ { class: () => node.expanded.val ? '' : 'hidden' },
+ node.children.map(node => TreeNode(node, selected)),
+ ) : null,
+ );
+};
+
+const initTreeState = (
+ /** @type TreeNode[] */ nodes,
+ /** @type string */ selected,
+ /** @type number */ level = 0,
+) => {
+ let treeExpanded = false;
+ nodes.forEach(node => {
+ node.level = level;
+ // Expand node if it is initial selection
+ let expanded = node.id === selected;
+ if (node.children) {
+ // Expand node if initial selection is a descendent
+ expanded = initTreeState(node.children, selected, level + 1) || expanded;
+ }
+ node.expanded = van.state(expanded);
+ node.hidden = van.state(false);
+ treeExpanded = treeExpanded || expanded;
+ });
+ return treeExpanded;
+};
+
+const searchTree = (
+ /** @type TreeNode[] */ nodes,
+ /** @type string */ search,
+) => {
+ nodes.forEach(node => {
+ let hidden = !node.label.includes(search);
+ if (node.children) {
+ searchTree(node.children, search);
+ hidden = hidden && node.children.every(child => child.hidden.rawVal);
+ }
+ node.hidden.val = hidden;
+ });
+};
+
+const expandOrCollapseTree = (
+ /** @type TreeNode[] */ nodes,
+ /** @type boolean */ expanded,
+) => {
+ nodes.forEach(node => {
+ if (node.children) {
+ expandOrCollapseTree(node.children, expanded);
+ node.expanded.val = expanded;
+ }
+ });
+}
+
+const stylesheet = new CSSStyleSheet();
+stylesheet.replace(`
+.tg-tree {
+ overflow: auto;
+}
+
+.tg-tree--actions {
+ margin: 4px;
+}
+
+.tg-tree--nodes {
+ width: fit-content;
+ min-width: 100%;
+}
+
+.tg-tree--row {
+ box-sizing: border-box;
+ width: auto;
+ min-width: fit-content;
+ border: solid transparent;
+ border-width: 1px 0;
+ padding-right: 8px;
+ transition: background-color 0.3s;
+}
+
+.tg-tree--row:hover {
+ background-color: var(--sidebar-item-hover-color);
+}
+
+.tg-tree--row.selected {
+ background-color: #06a04a17;
+ font-weight: 500;
+}
+
+.tg-tree--row-icon {
+ margin-right: 4px;
+ width: 24px;
+ color: #B0BEC5;
+ text-align: center;
+}
+`);
+
+export { Tree };
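For reference, a hypothetical Tree usage sketch (not part of the patch). Nodes are plain objects; level, expanded, and hidden are filled in internally by initTreeState, and selecting a row emits a 'TreeNodeSelected' event whose payload is the node id:

// Hypothetical usage of Tree with one table node and two column nodes.
import van from '../van.min.js';
import { Tree } from './tree.js';

const Example = () => Tree({
    classes: 'table-group-tree',
    selected: 'column-2',                       // initial selection by node id
    nodes: [
        {
            id: 'table-1',
            label: 'customers',
            icon: 'table',
            children: [
                { id: 'column-1', label: 'id', icon: '123' },
                { id: 'column-2', label: 'email', icon: 'abc' },
            ],
        },
    ],
});

van.add(document.body, Example());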
From fd343eb0b253c360c270b137b3435babe64d6509 Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Mon, 4 Nov 2024 13:10:13 -0500
Subject: [PATCH 54/91] feat(components): add ColumnProfile component
---
.../frontend/js/components/column_profile.js | 287 ++++++++++++++++++
.../components/frontend/js/display_utils.js | 31 +-
2 files changed, 308 insertions(+), 10 deletions(-)
create mode 100644 testgen/ui/components/frontend/js/components/column_profile.js
diff --git a/testgen/ui/components/frontend/js/components/column_profile.js b/testgen/ui/components/frontend/js/components/column_profile.js
new file mode 100644
index 0000000..bdbef62
--- /dev/null
+++ b/testgen/ui/components/frontend/js/components/column_profile.js
@@ -0,0 +1,287 @@
+/**
+ * @typedef ColumnProfile
+ * @type {object}
+ * @property {'A' | 'B' | 'D' | 'N' | 'T' | 'X'} general_type
+ * * Value Counts
+ * @property {number} record_ct
+ * @property {number} value_ct
+ * @property {number} distinct_value_ct
+ * @property {number} null_value_ct
+ * @property {number} zero_value_ct
+ * * Alpha
+ * @property {number} zero_length_ct
+ * @property {number} filled_value_ct
+ * @property {number} includes_digit_ct
+ * @property {number} numeric_ct
+ * @property {number} date_ct
+ * @property {number} quoted_value_ct
+ * @property {number} lead_space_ct
+ * @property {number} embedded_space_ct
+ * @property {number} avg_embedded_spaces
+ * @property {number} min_length
+ * @property {number} max_length
+ * @property {number} avg_length
+ * @property {string} min_text
+ * @property {string} max_text
+ * @property {number} distinct_std_value_ct
+ * @property {number} distinct_pattern_ct
+ * @property {'STREET_ADDR' | 'STATE_USA' | 'PHONE_USA' | 'EMAIL' | 'ZIP_USA' | 'FILE_NAME' | 'CREDIT_CARD' | 'DELIMITED_DATA' | 'SSN'} std_pattern_match
+ * @property {string} top_freq_values
+ * @property {string} top_patterns
+ * * Numeric
+ * @property {number} min_value
+ * @property {number} min_value_over_0
+ * @property {number} max_value
+ * @property {number} avg_value
+ * @property {number} stdev_value
+ * @property {number} percentile_25
+ * @property {number} percentile_50
+ * @property {number} percentile_75
+ * * Date
+ * @property {number} min_date
+ * @property {number} max_date
+ * @property {number} before_1yr_date_ct
+ * @property {number} before_5yr_date_ct
+ * @property {number} before_20yr_date_ct
+ * @property {number} within_1yr_date_ct
+ * @property {number} within_1mo_date_ct
+ * @property {number} future_date_ct
+ * * Boolean
+ * @property {number} boolean_true_ct
+ */
+import van from '../van.min.js';
+import { Attribute } from '../components/attribute.js';
+import { SummaryBar } from './summary_bar.js';
+import { PercentBar } from './percent_bar.js';
+import { FrequencyBars } from './frequency_bars.js';
+import { BoxPlot } from './box_plot.js';
+import { loadStylesheet } from '../utils.js';
+import { formatTimestamp, roundDigits } from '../display_utils.js';
+
+const { div } = van.tags;
+const columnTypeFunctionMap = {
+ A: AlphaColumn,
+ B: BooleanColumn,
+ D: DatetimeColumn,
+ N: NumericColumn,
+};
+const attributeWidth = 200;
+const percentWidth = 250;
+const summaryWidth = 400;
+const summaryHeight = 10;
+const boxPlotWidth = 800;
+
+const ColumnProfile = (/** @type ColumnProfile */ item) => {
+ loadStylesheet('column_profile', stylesheet);
+ const columnFunction = columnTypeFunctionMap[item.general_type];
+ return columnFunction ? columnFunction(item) : null;
+};
+
+function AlphaColumn(/** @type ColumnProfile */ item) {
+ const standardPatternLabels = {
+ STREET_ADDR: 'Street Address',
+ STATE_USA: 'State (USA)',
+ PHONE_USA: 'Phone (USA)',
+ EMAIL: 'Email',
+ ZIP_USA: 'Zip Code (USA)',
+ FILE_NAME: 'Filename',
+ CREDIT_CARD: 'Credit Card',
+ DELIMITED_DATA: 'Delimited Data',
+ SSN: 'SSN (USA)',
+ };
+ let standardPattern = standardPatternLabels[item.std_pattern_match];
+ if (!standardPattern) {
+ standardPattern = (item.std_pattern_match || '').split('_')
+ .map(word => word ? (word[0].toUpperCase() + word.substring(1)) : '')
+ .join(' ');
+ }
+
+ const total = item.record_ct;
+
+ return div(
+ { class: 'flex-column fx-gap-4' },
+ div(
+ { class: 'flex-row fx-flex-wrap fx-align-flex-start fx-gap-4 tg-profile--fx-basis-content' },
+ div(
+ {
+ class: 'flex-column fx-gap-5',
+ },
+ DistinctsBar(item),
+ SummaryBar({
+ height: summaryHeight,
+ width: summaryWidth,
+ label: `Missing Values: ${item.zero_length_ct + item.zero_value_ct + item.filled_value_ct + item.null_value_ct}`,
+ items: [
+ { label: 'Values', value: item.value_ct - item.zero_value_ct - item.filled_value_ct - item.zero_length_ct, color: 'green' },
+ { label: 'Zero Values', value: item.zero_value_ct, color: 'brown' },
+ { label: 'Dummy Values', value: item.filled_value_ct, color: 'orange' },
+ { label: 'Zero Length', value: item.zero_length_ct, color: 'yellow' },
+ { label: 'Null', value: item.null_value_ct, color: 'brownLight' },
+ ],
+ }),
+ ),
+ div(
+ {
+ class: 'flex-row fx-flex-wrap fx-align-flex-start fx-gap-3 mb-1 tg-profile--fx-grow-content',
+ },
+ div(
+ { class: 'flex-column fx-gap-3' },
+ PercentBar({ label: 'Includes Digits', value: item.includes_digit_ct, total, width: percentWidth }),
+ PercentBar({ label: 'Numeric Values', value: item.numeric_ct, total, width: percentWidth }),
+ PercentBar({ label: 'Date Values', value: item.date_ct, total, width: percentWidth }),
+ PercentBar({ label: 'Quoted Values', value: item.quoted_value_ct, total, width: percentWidth }),
+ ),
+ div(
+ { class: 'flex-column fx-gap-3' },
+ PercentBar({ label: 'Leading Spaces', value: item.lead_space_ct, total, width: percentWidth }),
+ PercentBar({ label: 'Embedded Spaces', value: item.embedded_space_ct ?? 0, total, width: percentWidth }),
+ Attribute({ label: 'Average Embedded Spaces', value: roundDigits(item.avg_embedded_spaces), width: attributeWidth }),
+ ),
+ ),
+ ),
+ div(
+ { class: 'flex-row fx-flex-wrap fx-align-flex-start fx-gap-4' },
+ Attribute({ label: 'Minimum Length', value: item.min_length, width: attributeWidth }),
+ Attribute({ label: 'Maximum Length', value: item.max_length, width: attributeWidth }),
+ Attribute({ label: 'Average Length', value: roundDigits(item.avg_length), width: attributeWidth }),
+ Attribute({ label: 'Minimum Text', value: item.min_text, width: attributeWidth }),
+ Attribute({ label: 'Maximum Text', value: item.max_text, width: attributeWidth }),
+ Attribute({ label: 'Distinct Standard Values', value: item.distinct_std_value_ct, width: attributeWidth }),
+ Attribute({ label: 'Distinct Patterns', value: item.distinct_pattern_ct, width: attributeWidth }),
+ Attribute({ label: 'Standard Pattern Match', value: standardPattern, width: attributeWidth }),
+ ),
+ item.top_freq_values || item.top_patterns ? div(
+ { class: 'flex-row fx-flex-wrap fx-align-flex-start fx-gap-4 mt-2 mb-2 tg-profile--fx-basis-content' },
+ item.top_freq_values ? FrequencyBars({
+ title: 'Frequent Values',
+ total: item.record_ct,
+ items: item.top_freq_values.substring(2).split('\n| ').map(parts => {
+ const [value, count] = parts.split(' | ');
+ return { value, count: Number(count) };
+ }),
+ }) : null,
+ item.top_patterns ? FrequencyBars({
+ title: 'Frequent Patterns',
+ total: item.record_ct,
+ items: item.top_patterns.split(' | ').reduce((array, item, index) => {
+ if (index % 2) {
+ array[(index - 1) / 2].value = item;
+ } else {
+ array.push({ count: Number(item) });
+ }
+ return array;
+ }, []),
+ }) : null,
+ ) : null,
+ );
+}
+
+function BooleanColumn(/** @type ColumnProfile */ item) {
+ return SummaryBar({
+ height: summaryHeight,
+ width: summaryWidth,
+ label: `Record count: ${item.record_ct}`,
+ items: [
+ { label: 'True', value: item.boolean_true_ct, color: 'brownLight' },
+ { label: 'False', value: item.value_ct - item.boolean_true_ct, color: 'brown' },
+ { label: 'Null', value: item.null_value_ct, color: 'brownDark' },
+ ],
+ });
+}
+
+function DatetimeColumn(/** @type ColumnProfile */ item) {
+ const total = item.record_ct;
+
+ return div(
+ { class: 'flex-row fx-flex-wrap fx-align-flex-start fx-gap-4 tg-profile--fx-basis-content' },
+ div(
+ DistinctsBar(item),
+ div(
+ { class: 'flex-row fx-flex-wrap fx-align-flex-start fx-gap-3 mt-5 tg-profile--fx-grow-content' },
+ Attribute({ label: 'Minimum Date', value: formatTimestamp(item.min_date, true) }),
+ Attribute({ label: 'Maximum Date', value: formatTimestamp(item.max_date, true) }),
+ ),
+ ),
+ div(
+ {
+ class: 'flex-row fx-flex-wrap fx-align-flex-start fx-gap-3 mb-1 tg-profile--fx-grow-content',
+ },
+ div(
+ { class: 'flex-column fx-gap-3' },
+ PercentBar({ label: 'Before 1 Year', value: item.before_1yr_date_ct, total, width: percentWidth }),
+ PercentBar({ label: 'Before 5 Year', value: item.before_5yr_date_ct, total, width: percentWidth }),
+ PercentBar({ label: 'Before 20 Year', value: item.before_20yr_date_ct, total, width: percentWidth }),
+ ),
+ div(
+ { class: 'flex-column fx-gap-3' },
+ PercentBar({ label: 'Within 1 Year', value: item.within_1yr_date_ct, total, width: percentWidth }),
+ PercentBar({ label: 'Within 1 Month', value: item.within_1mo_date_ct, total, width: percentWidth }),
+ PercentBar({ label: 'Future Dates', value: item.future_date_ct, total, width: percentWidth }),
+ ),
+ ),
+ );
+}
+
+function NumericColumn(/** @type ColumnProfile */ item) {
+ return [
+ div(
+ { class: 'flex-row fx-flex-wrap fx-align-flex-start fx-gap-4 mb-5 tg-profile--fx-basis-content tg-profile--fx-grow-content' },
+ div(
+ DistinctsBar(item),
+ ),
+ div(
+ PercentBar({ label: 'Zero Values', value: item.zero_value_ct, total: item.record_ct, width: percentWidth }),
+ ),
+ ),
+ div(
+ { class: 'flex-row fx-flex-wrap fx-align-flex-start fx-gap-4' },
+ Attribute({ label: 'Minimum Value', value: item.min_value, width: attributeWidth }),
+ Attribute({ label: 'Minimum Value > 0', value: item.min_value_over_0, width: attributeWidth }),
+ Attribute({ label: 'Maximum Value', value: item.max_value, width: attributeWidth }),
+ Attribute({ label: 'Average Value', value: roundDigits(item.avg_value), width: attributeWidth }),
+ Attribute({ label: 'Standard Deviation', value: roundDigits(item.stdev_value), width: attributeWidth }),
+ Attribute({ label: '25th Percentile', value: roundDigits(item.percentile_25), width: attributeWidth }),
+ Attribute({ label: 'Median Value', value: roundDigits(item.percentile_50), width: attributeWidth }),
+ Attribute({ label: '75th Percentile', value: roundDigits(item.percentile_75), width: attributeWidth }),
+ ),
+ div(
+ { class: 'flex-row fx-justify-center mt-5 tg-profile--fx-grow-content' },
+ BoxPlot({
+ minimum: item.min_value,
+ maximum: item.max_value,
+ median: item.percentile_50,
+ lowerQuartile: item.percentile_25,
+ upperQuartile: item.percentile_75,
+ average: item.avg_value,
+ standardDeviation: item.stdev_value,
+ width: boxPlotWidth,
+ }),
+ ),
+ ];
+}
+
+const DistinctsBar = (/** @type ColumnProfile */ item) => {
+ return SummaryBar({
+ height: summaryHeight,
+ width: summaryWidth,
+ label: `Record count: ${item.record_ct}`,
+ items: [
+ { label: 'Distinct', value: item.distinct_value_ct, color: 'blue' },
+ { label: 'Non-Distinct', value: item.value_ct - item.distinct_value_ct, color: 'blueLight' },
+ { label: 'Null', value: item.null_value_ct, color: 'brownLight' },
+ ],
+ });
+};
+
+const stylesheet = new CSSStyleSheet();
+stylesheet.replace(`
+.tg-profile--fx-grow-content > * {
+ flex-grow: 1;
+}
+
+.tg-profile--fx-basis-content > * {
+ flex: 300px;
+}
+`);
+
+export { ColumnProfile };
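An illustrative sketch of ColumnProfile with a boolean column; general_type 'B' takes the simplest branch and only needs the count fields shown here, while the alpha, numeric, and date branches use the larger field set documented in the typedef above:

// Hypothetical usage of ColumnProfile for a boolean column.
import van from '../van.min.js';
import { ColumnProfile } from './column_profile.js';

const Example = () => ColumnProfile({
    general_type: 'B',
    record_ct: 1000,
    value_ct: 970,
    null_value_ct: 30,
    boolean_true_ct: 618,
});

van.add(document.body, Example());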
diff --git a/testgen/ui/components/frontend/js/display_utils.js b/testgen/ui/components/frontend/js/display_utils.js
index a2d6384..1be340b 100644
--- a/testgen/ui/components/frontend/js/display_utils.js
+++ b/testgen/ui/components/frontend/js/display_utils.js
@@ -1,13 +1,17 @@
-function formatTimestamp(/** @type number */ timestamp) {
- if (!timestamp) {
- return '--';
+function formatTimestamp(
+    /** @type number | string */ timestamp,
+    /** @type boolean */ show_year,
+) {
+    if (timestamp) {
+        const date = new Date(timestamp);
+        if (!isNaN(date)) {
+            const months = [ 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec' ];
+            const hours = date.getHours();
+            const minutes = date.getMinutes();
+            return `${months[date.getMonth()]} ${date.getDate()}, ${show_year ? date.getFullYear() + ' at ': ''}${hours % 12 || 12}:${String(minutes).padStart(2, '0')} ${hours >= 12 ? 'PM' : 'AM'}`;
+ }
}
-
- const date = new Date(timestamp);
- const months = [ 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec' ];
- const hours = date.getHours();
- const minutes = date.getMinutes();
- return `${months[date.getMonth()]} ${date.getDate()}, ${hours % 12}:${String(minutes).padStart(2, '0')} ${hours / 12 > 1 ? 'PM' : 'AM'}`;
+ return '--';
}
function formatDuration(/** @type string */ duration) {
@@ -26,6 +30,13 @@ function formatDuration(/** @type string */ duration) {
return formatted.trim() || '< 1s';
}
+function roundDigits(/** @type number | string */ number, /** @type number */ precision = 3) {
+ if (!['number', 'string'].includes(typeof number) || isNaN(number)) {
+ return '--';
+ }
+ return parseFloat(Number(number).toPrecision(precision));
+}
+
// https://m2.material.io/design/color/the-color-system.html#tools-for-picking-colors
const colorMap = {
red: '#EF5350', // Red 400
@@ -47,4 +58,4 @@ const colorMap = {
emptyLight: 'var(--empty-light)', // Light: Gray 50, Dark: Gray 900
}
-export { formatTimestamp, formatDuration, colorMap };
+export { formatTimestamp, formatDuration, roundDigits, colorMap };
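Hypothetical spot-checks of the updated display helpers (outputs assume the local timezone resolves the string as written):

// Illustrative calls to the exported helpers.
import { formatTimestamp, roundDigits } from './display_utils.js';

console.log(formatTimestamp('2024-11-04 13:07:00', true)); // "Nov 4, 2024 at 1:07 PM"
console.log(formatTimestamp(null));                        // "--"
console.log(roundDigits(3.14159));                         // 3.14 (3 significant digits by default)
console.log(roundDigits(0.000123456, 4));                  // 0.0001235
console.log(roundDigits(undefined));                       // "--"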
From b1bf3eaefe044b629e8439d21c5e5b4d76e91b11 Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Mon, 4 Nov 2024 13:10:42 -0500
Subject: [PATCH 55/91] feat(ui): add Data Hierarchy page
---
testgen/ui/assets/style.css | 9 +
testgen/ui/bootstrap.py | 2 +
testgen/ui/components/frontend/css/shared.css | 29 +-
testgen/ui/components/frontend/js/main.js | 2 +
.../frontend/js/pages/data_hierarchy.js | 663 ++++++++++++++++++
.../components/widgets/testgen_component.py | 33 +-
testgen/ui/views/data_hierarchy.py | 487 +++++++++++++
testgen/utils/__init__.py | 10 +
8 files changed, 1224 insertions(+), 11 deletions(-)
create mode 100644 testgen/ui/components/frontend/js/pages/data_hierarchy.js
create mode 100644 testgen/ui/views/data_hierarchy.py
diff --git a/testgen/ui/assets/style.css b/testgen/ui/assets/style.css
index 67266d7..3122291 100644
--- a/testgen/ui/assets/style.css
+++ b/testgen/ui/assets/style.css
@@ -51,6 +51,7 @@ footer {
/* Sidebar */
section[data-testid="stSidebar"] {
+ width: 250px;
z-index: 999;
background-color: var(--sidebar-background-color);
}
@@ -86,6 +87,14 @@ div[data-testid="stDialog"] div[role="dialog"] {
}
/* */
+div[data-testid="stSpinner"] {
+ background: transparent;
+}
+
+div[data-testid="stSpinner"] > div > i {
+ border-color: var(--primary-color) rgba(49, 51, 63, 0.2) rgba(49, 51, 63, 0.2);
+}
+
/* Theming for buttons, tabs and form inputs */
button[data-testid="stBaseButton-secondary"]:hover,
button[data-testid="stBaseButton-secondary"]:focus:not(:active),
diff --git a/testgen/ui/bootstrap.py b/testgen/ui/bootstrap.py
index e3a99a6..414f7e5 100644
--- a/testgen/ui/bootstrap.py
+++ b/testgen/ui/bootstrap.py
@@ -11,6 +11,7 @@
from testgen.ui.navigation.router import Router
from testgen.ui.session import session
from testgen.ui.views.connections import ConnectionsPage
+from testgen.ui.views.data_hierarchy import DataHierarchyPage
from testgen.ui.views.login import LoginPage
from testgen.ui.views.overview import OverviewPage
from testgen.ui.views.profiling_anomalies import ProfilingAnomaliesPage
@@ -27,6 +28,7 @@
BUILTIN_PAGES: list[type[Page]] = [
LoginPage,
OverviewPage,
+ DataHierarchyPage,
DataProfilingPage,
ProfilingResultsPage,
ProfilingAnomaliesPage,
diff --git a/testgen/ui/components/frontend/css/shared.css b/testgen/ui/components/frontend/css/shared.css
index 0d61aa6..460fc3a 100644
--- a/testgen/ui/components/frontend/css/shared.css
+++ b/testgen/ui/components/frontend/css/shared.css
@@ -155,10 +155,23 @@ body {
color: var(--secondary-text-color);
}
+.text-disabled {
+ color: var(--disabled-text-color);
+}
+
.text-caption {
font-size: 12px;
color: var(--caption-text-color);
}
+
+.text-error {
+ color: var(--error-color);
+}
+
+.text-green {
+ color: var(--primary-color);
+}
+
.text-capitalize {
text-transform: capitalize;
}
@@ -256,7 +269,7 @@ body {
/* Whitespace utilities */
.mt-0 {
- margin-top: 2px;
+ margin-top: 0;
}
.mt-1 {
@@ -288,7 +301,7 @@ body {
}
.mr-0 {
- margin-right: 2px;
+ margin-right: 0;
}
.mr-1 {
@@ -320,7 +333,7 @@ body {
}
.mb-0 {
- margin-bottom: 2px;
+ margin-bottom: 0;
}
.mb-1 {
@@ -352,7 +365,7 @@ body {
}
.ml-0 {
- margin-left: 2px;
+ margin-left: 0;
}
.ml-1 {
@@ -384,7 +397,7 @@ body {
}
.pt-0 {
- padding-top: 2px;
+ padding-top: 0;
}
.pt-1 {
@@ -416,7 +429,7 @@ body {
}
.pr-0 {
- padding-right: 2px;
+ padding-right: 0;
}
.pr-1 {
@@ -448,7 +461,7 @@ body {
}
.pb-0 {
- padding-bottom: 2px;
+ padding-bottom: 0;
}
.pb-1 {
@@ -480,7 +493,7 @@ body {
}
.pl-0 {
- padding-left: 2px;
+ padding-left: 0;
}
.pl-1 {
diff --git a/testgen/ui/components/frontend/js/main.js b/testgen/ui/components/frontend/js/main.js
index 3dc7f62..bc75e9a 100644
--- a/testgen/ui/components/frontend/js/main.js
+++ b/testgen/ui/components/frontend/js/main.js
@@ -17,6 +17,7 @@ import { SortingSelector } from './components/sorting_selector.js';
import { TestRuns } from './pages/test_runs.js';
import { ProfilingRuns } from './pages/profiling_runs.js';
import { DatabaseFlavorSelector } from './components/flavor_selector.js';
+import { DataHierarchy } from './pages/data_hierarchy.js';
let currentWindowVan = van;
let topWindowVan = window.top.van;
@@ -34,6 +35,7 @@ const TestGenComponent = (/** @type {string} */ id, /** @type {object} */ props)
test_runs: TestRuns,
profiling_runs: ProfilingRuns,
database_flavor_selector: DatabaseFlavorSelector,
+ data_hierarchy: DataHierarchy,
};
if (Object.keys(componentById).includes(id)) {
diff --git a/testgen/ui/components/frontend/js/pages/data_hierarchy.js b/testgen/ui/components/frontend/js/pages/data_hierarchy.js
new file mode 100644
index 0000000..a1d09ce
--- /dev/null
+++ b/testgen/ui/components/frontend/js/pages/data_hierarchy.js
@@ -0,0 +1,663 @@
+/**
+ * @typedef ColumnPath
+ * @type {object}
+ * @property {string} column_id
+ * @property {string} table_id
+ * @property {string} column_name
+ * @property {string} table_name
+ * @property {'A' | 'B' | 'D' | 'N' | 'T' | 'X'} general_type
+ * @property {number} column_drop_date
+ * @property {number} table_drop_date
+ *
+ * @typedef Anomaly
+ * @type {object}
+ * @property {string} column_name
+ * @property {string} anomaly_name
+ * @property {'Definite' | 'Likely' | 'Possible' | 'Potential PII'} issue_likelihood
+ * @property {string} detail
+ * @property {'High' | 'Moderate'} pii_risk
+ *
+ * @typedef TestIssue
+ * @type {object}
+ * @property {string} column_name
+ * @property {string} test_name
+ * @property {'Failed' | 'Warning' | 'Error' } result_status
+ * @property {string} result_message
+ * @property {string} test_suite
+ * @property {string} test_run_id
+ * @property {number} test_run_date
+ *
+ * @typedef Column
+ * @type {ColumnProfile}
+ * @property {string} id
+ * @property {'column'} type
+ * @property {string} column_name
+ * @property {string} table_name
+ * @property {string} table_group_id
+ * * Characteristics
+ * @property {string} column_type
+ * @property {string} functional_data_type
+ * @property {string} datatype_suggestion
+ * @property {number} add_date
+ * @property {number} last_mod_date
+ * @property {number} drop_date
+ * * Column Metadata
+ * @property {boolean} critical_data_element
+ * @property {string} data_source
+ * @property {string} source_system
+ * @property {string} source_process
+ * @property {string} business_domain
+ * @property {string} stakeholder_group
+ * @property {string} transform_level
+ * @property {string} aggregation_level
+ * * Table Metadata
+ * @property {boolean} table_critical_data_element
+ * @property {string} table_cdata_source
+ * @property {string} table_csource_system
+ * @property {string} table_csource_process
+ * @property {string} table_cbusiness_domain
+ * @property {string} table_cstakeholder_group
+ * @property {string} table_ctransform_level
+ * @property {string} table_caggregation_level
+ * * Latest Profile & Test Runs
+ * @property {string} latest_profile_id
+ * @property {number} latest_profile_date
+ * @property {number} latest_test_run_date
+ * * Issues
+ * @property {Anomaly[]} latest_anomalies
+ * @property {TestIssue[]} latest_test_issues
+ *
+ * @typedef Table
+ * @type {object}
+ * @property {string} id
+ * @property {'table'} type
+ * @property {string} table_name
+ * @property {string} table_group_id
+ * * Characteristics
+ * @property {string} functional_table_type
+ * @property {number} record_ct
+ * @property {number} column_ct
+ * @property {number} data_point_ct
+ * @property {number} add_date
+ * @property {number} drop_date
+ * * Metadata
+ * @property {boolean} critical_data_element
+ * @property {string} data_source
+ * @property {string} source_system
+ * @property {string} source_process
+ * @property {string} business_domain
+ * @property {string} stakeholder_group
+ * @property {string} transform_level
+ * @property {string} aggregation_level
+ * * Latest Profile & Test Runs
+ * @property {string} latest_profile_id
+ * @property {number} latest_profile_date
+ * @property {number} latest_test_run_date
+ * * Issues
+ * @property {Anomaly[]} latest_anomalies
+ * @property {TestIssue[]} latest_test_issues
+ *
+ * @typedef Properties
+ * @type {object}
+ * @property {ColumnPath[]} columns
+ * @property {Table | Column} selected
+ */
+import van from '../van.min.js';
+import { Tree } from '../components/tree.js';
+import { Card } from '../components/card.js';
+import { EditableCard } from '../components/editable_card.js';
+import { Link } from '../components/link.js';
+import { Attribute } from '../components/attribute.js';
+import { Input } from '../components/input.js';
+import { TooltipIcon } from '../components/tooltip_icon.js';
+import { Streamlit } from '../streamlit.js';
+import { emitEvent, getValue, loadStylesheet } from '../utils.js';
+import { formatTimestamp } from '../display_utils.js';
+import { ColumnProfile } from '../components/column_profile.js';
+import { RadioGroup } from '../components/radio_group.js';
+
+const { div, h2, span, i } = van.tags;
+
+const tableIcon = { icon: 'table', iconSize: 20 };
+const columnIcons = {
+ A: { icon: 'abc' },
+ B: { icon: 'toggle_off', iconSize: 20 },
+ D: { icon: 'calendar_clock', iconSize: 20 },
+ N: { icon: '123' },
+ T: { icon: 'calendar_clock', iconSize: 20 },
+ X: { icon: 'question_mark', iconSize: 18 },
+};
+
+const DataHierarchy = (/** @type Properties */ props) => {
+ loadStylesheet('data_hierarchy', stylesheet);
+ Streamlit.setFrameHeight(1); // Non-zero value is needed to render
+ window.frameElement.style.setProperty('height', 'calc(100vh - 200px)');
+ window.testgen.isPage = true;
+
+ const treeNodes = van.derive(() => {
+ let columns = [];
+ try {
+ columns = JSON.parse(getValue(props.columns));
+ } catch { }
+
+ const tables = {};
+ columns.forEach(({ column_id, table_id, column_name, table_name, general_type, column_drop_date, table_drop_date }) => {
+ if (!tables[table_id]) {
+ tables[table_id] = {
+ id: table_id,
+ label: table_name,
+ classes: table_drop_date ? 'text-disabled' : '',
+ ...tableIcon,
+ children: [],
+ };
+ }
+ tables[table_id].children.push({
+ id: column_id,
+ label: column_name,
+ classes: column_drop_date ? 'text-disabled' : '',
+ ...columnIcons[general_type || 'X'],
+ });
+ });
+ return Object.values(tables);
+ });
+
+ const selectedItem = van.derive(() => {
+ try {
+ return JSON.parse(getValue(props.selected));
+ } catch (e) {
+ console.error(e)
+ return null;
+ }
+ });
+
+ return div(
+ { class: 'flex-row tg-dh' },
+ Tree({
+ nodes: treeNodes,
+ // Use .rawVal, so only initial value from query params is passed to tree
+ selected: selectedItem.rawVal?.id,
+ classes: 'tg-dh--tree',
+ }),
+ () => {
+ const item = selectedItem.val;
+ if (item) {
+ return div(
+ { class: 'tg-dh--details' },
+ h2(
+ { class: 'tg-dh--title' },
+ item.type === 'column' ? [
+ span(
+ { class: 'text-secondary' },
+ `${item.table_name}: `,
+ ),
+ item.column_name,
+ ] : item.table_name,
+ ),
+ span(
+ { class: 'flex-row fx-gap-1 fx-justify-content-flex-end mb-2 text-secondary' },
+ '* as of latest profiling run on ',
+ Link({
+ href: 'profiling-runs:results',
+ params: {
+ run_id: item.latest_profile_id,
+ table_name: item.table_name,
+ column_name: item.column_name,
+ },
+ open_new: true,
+ label: formatTimestamp(item.latest_profile_date),
+ }),
+ ),
+ CharacteristicsCard(item),
+ item.type === 'column' ? Card({
+ title: 'Value Distribution *',
+ content: ColumnProfile(item),
+ }) : null,
+ MetadataCard(item),
+ PotentialPIICard(item),
+ HygieneIssuesCard(item),
+ TestIssuesCard(item),
+ );
+ }
+
+ return div(
+ { class: 'flex-column fx-align-flex-center fx-justify-center tg-dh--no-selection' },
+ i(
+ { class: 'material-symbols-rounded text-disabled mb-5' },
+ 'quick_reference_all',
+ ),
+ span(
+ { class: 'text-secondary' },
+ 'Select a table or column on the left to view its details.',
+ ),
+ );
+ },
+ );
+};
+
+const CharacteristicsCard = (/** @type Table | Column */ item) => {
+ let attributes = [];
+ if (item.type === 'column') {
+ attributes.push(
+ { key: 'column_type', label: 'Data Type' },
+ { key: 'datatype_suggestion', label: 'Suggested Data Type' },
+ { key: 'functional_data_type', label: 'Semantic Data Type' },
+ { key: 'add_date', label: 'First Detected' },
+ );
+ if (item.last_mod_date !== item.add_date) {
+ attributes.push({ key: 'last_mod_date', label: 'Modification Detected' });
+ }
+ } else {
+ attributes.push(
+ { key: 'functional_table_type', label: 'Semantic Table Type' },
+ { key: 'record_ct', label: 'Row Count' },
+ { key: 'column_ct', label: 'Column Count' },
+ { key: 'data_point_ct', label: 'Data Point Count' },
+ { key: 'add_date', label: 'First Detected' },
+ );
+ }
+ if (item.drop_date) {
+ attributes.push({ key: 'drop_date', label: 'Drop Detected' });
+ }
+
+ return Card({
+ title: `${item.type} Characteristics *`,
+ content: div(
+ { class: 'flex-row fx-flex-wrap fx-gap-4' },
+ attributes.map(({ key, label }) => {
+ let value = item[key];
+ if (key === 'column_type') {
+ const { icon, iconSize } = columnIcons[item.general_type || 'X'];
+ value = div(
+ { class: 'flex-row' },
+ i(
+ {
+ class: 'material-symbols-rounded tg-dh--column-icon',
+ style: `font-size: ${iconSize || 24}px;`,
+ },
+ icon,
+ ),
+ (value || 'unknown').toLowerCase(),
+ );
+ } else if (key === 'datatype_suggestion') {
+ value = (value || '').toLowerCase();
+ } else if (key === 'functional_table_type') {
+ value = (value || '').split('-')
+ .map(word => word ? (word[0].toUpperCase() + word.substring(1)) : '')
+ .join(' ');
+ } else if (['add_date', 'last_mod_date', 'drop_date'].includes(key)) {
+ value = formatTimestamp(value, true);
+ if (key === 'drop_date') {
+ label = span({ class: 'text-error' }, label);
+ }
+ }
+
+ return Attribute({ label, value, width: 300 });
+ }),
+ ),
+ });
+};
+
+const MetadataCard = (/** @type Table | Column */ item) => {
+ const attributes = [
+ 'critical_data_element',
+ 'data_source',
+ 'source_system',
+ 'source_process',
+ 'business_domain',
+ 'stakeholder_group',
+ 'transform_level',
+ 'aggregation_level',
+ ].map(key => ({
+ key,
+ label: key.replaceAll('_', ' '),
+ state: van.state(item[key]),
+ inherited: item[`table_${key}`], // Table values inherited by column
+ }));
+
+ const InheritedIcon = () => TooltipIcon({
+ icon: 'layers',
+ iconSize: 18,
+ classes: 'text-disabled',
+ tooltip: 'Inherited from table metadata',
+ tooltipPosition: 'top-right',
+ });
+ const width = 300;
+
+ const content = div(
+ { class: 'flex-row fx-flex-wrap fx-gap-4' },
+ attributes.map(({ key, label, state, inherited }) => {
+ let value = state.rawVal ?? inherited;
+ const isInherited = item.type === 'column' && state.rawVal === null;
+
+ if (key === 'critical_data_element') {
+ return span(
+ { class: 'flex-row fx-gap-1', style: `width: ${width}px` },
+ i(
+ { class: `material-symbols-rounded ${value ? 'text-green' : 'text-disabled'}` },
+ value ? 'check_circle' : 'cancel',
+ ),
+ span(
+ { class: value ? 'text-capitalize' : 'text-secondary' },
+ value ? label : `Not a ${label}`,
+ ),
+ isInherited ? InheritedIcon() : null,
+ );
+ }
+
+ if (isInherited && value) {
+ value = span(
+ { class: 'flex-row fx-gap-1' },
+ InheritedIcon(),
+ value,
+ );
+ }
+ return Attribute({ label, value, width });
+ }),
+ );
+
+ const editingContent = div(
+ { class: 'flex-row fx-flex-wrap fx-gap-4' },
+ attributes.map(({ key, label, state, inherited }) => {
+ if (key === 'critical_data_element') {
+ const options = [
+ { label: 'Yes', value: true },
+ { label: 'No', value: false },
+ ];
+ if (item.type === 'column') {
+ options.push({ label: 'Inherit', value: null });
+ }
+ return RadioGroup({
+ label, width, options,
+ value: item.type === 'column' ? state.rawVal : !!state.rawVal, // Coerce null to false for tables
+ onChange: (value) => state.val = value,
+ });
+ };
+
+ return Input({
+ label, width,
+ value: state.rawVal,
+ placeholder: inherited ? `Inherited: ${inherited}` : null,
+ onChange: (value) => state.val = value || null,
+ });
+ }),
+ );
+
+ return EditableCard({
+ title: `${item.type} Metadata`,
+ content,
+ // Pass as function so the block is re-rendered with reset values when re-editing after a cancel
+ editingContent: () => editingContent,
+ onSave: () => {
+ const payload = attributes.reduce((object, { key, state }) => {
+ object[key] = state.rawVal;
+ return object;
+ }, { id: item.id });
+ emitEvent('MetadataChanged', { payload })
+ },
+ // Reset states to original values on cancel
+ onCancel: () => attributes.forEach(({ key, state }) => state.val = item[key]),
+ hasChanges: () => attributes.some(({ key, state }) => state.val !== item[key]),
+ });
+};
+
+const PotentialPIICard = (/** @type Table | Column */ item) => {
+ const riskColors = {
+ High: 'red',
+ Moderate: 'orange',
+ };
+
+ const attributes = [
+ {
+ key: 'detail', width: 150, label: 'Type',
+ value_function: (issue) => (issue.detail || '').split('Type: ')[1],
+ },
+ {
+ key: 'pii_risk', width: 100, label: 'Risk', classes: 'text-secondary',
+ value_function: (issue) => div(
+ { class: 'flex-row' },
+ span({ class: 'dot mr-2', style: `color: var(--${riskColors[issue.pii_risk]});` }),
+ issue.pii_risk,
+ ),
+ },
+ ];
+ if (item.type === 'table') {
+ attributes.unshift(
+ { key: 'column_name', width: 150, label: 'Column' },
+ );
+ }
+
+ const potentialPII = item.latest_anomalies.filter(({ issue_likelihood }) => issue_likelihood === 'Potential PII');
+ const linkProps = {
+ href: 'profiling-runs:hygiene',
+ params: { run_id: item.latest_profile_id, issue_class: 'Potential PII' },
+ };
+
+ return IssuesCard('Potential PII', potentialPII, attributes, linkProps, 'No potential PII detected');
+};
+
+const HygieneIssuesCard = (/** @type Table | Column */ item) => {
+ const likelihoodColors = {
+ Definite: 'red',
+ Likely: 'orange',
+ Possible: 'yellow',
+ };
+
+ const attributes = [
+ { key: 'anomaly_name', width: 200, label: 'Issue' },
+ {
+ key: 'issue_likelihood', width: 80, label: 'Likelihood', classes: 'text-secondary',
+ value_function: (issue) => div(
+ { class: 'flex-row' },
+ span({ class: 'dot mr-2', style: `color: var(--${likelihoodColors[issue.issue_likelihood]});` }),
+ issue.issue_likelihood,
+ ),
+ },
+ { key: 'detail', width: 300, label: 'Detail' },
+ ];
+ if (item.type === 'table') {
+ attributes.unshift(
+ { key: 'column_name', width: 150, label: 'Column' },
+ );
+ }
+
+ const hygieneIssues = item.latest_anomalies.filter(({ issue_likelihood }) => issue_likelihood !== 'Potential PII');
+ const linkProps = {
+ href: 'profiling-runs:hygiene',
+ params: { run_id: item.latest_profile_id },
+ };
+
+ return IssuesCard('Hygiene Issues', hygieneIssues, attributes, linkProps, 'No hygiene issues detected');
+};
+
+const TestIssuesCard = (/** @type Table | Column */ item) => {
+ const statusColors = {
+ Failed: 'red',
+ Warning: 'yellow',
+ Error: 'brown',
+ };
+
+ const attributes = [
+ { key: 'test_name', width: 150, label: 'Test' },
+ {
+ key: 'result_status', width: 80, label: 'Status', classes: 'text-secondary',
+ value_function: (issue) => div(
+ { class: 'flex-row' },
+ span({ class: 'dot mr-2', style: `color: var(--${statusColors[issue.result_status]});` }),
+ issue.result_status,
+ ),
+ },
+ { key: 'result_message', width: 300, label: 'Details' },
+ {
+ key: 'test_run_id', width: 150, label: 'Test Suite | Start Time',
+ value_function: (issue) => div(
+ div(
+ { class: 'text-secondary' },
+ issue.test_suite,
+ ),
+ Link({
+ href: 'test-runs:results',
+ params: { run_id: issue.test_run_id },
+ open_new: true,
+ label: formatTimestamp(issue.test_run_date),
+ style: 'font-size: 12px; margin-top: 2px;',
+ }),
+ ),
+ },
+ ];
+ if (item.type === 'table') {
+ attributes.unshift(
+ { key: 'column_name', width: 150, label: 'Column' },
+ );
+ }
+
+ let noneContent = 'No test issues detected';
+ if (!item.latest_test_run_date) {
+ if (item.drop_date) {
+ noneContent = span({ class: 'text-secondary' }, `No test results for ${item.type}`);
+ } else {
+ noneContent = span(
+ { class: 'text-secondary flex-row fx-gap-1 fx-justify-content-flex-end' },
+ `No test results yet for ${item.type}.`,
+ Link({
+ href: 'test-suites',
+ open_new: true,
+ label: 'Go to Test Suites',
+ right_icon: 'chevron_right',
+ }),
+ );
+ }
+ }
+
+ return IssuesCard('Test Issues', item.latest_test_issues, attributes, null, noneContent);
+};
+
+/**
+ * @typedef Attribute
+ * @type {object}
+ * @property {string} key
+ * @property {number} width
+ * @property {string} label
+ * @property {string} classes
+ * @property {function?} value_function
+ */
+const IssuesCard = (
+ /** @type string */ title,
+ /** @type (Anomaly | TestIssue)[] */ items,
+ /** @type Attribute[] */ attributes,
+ /** @type object? */ linkProps,
+ /** @type (string | object)? */ noneContent,
+) => {
+ const gap = 8;
+ const minWidth = attributes.reduce((sum, { width }) => sum + width, attributes.length * gap);
+
+ let content = null;
+ let actionContent = null;
+ if (items.length) {
+ content = div(
+ { style: 'overflow: auto; max-height: 300px;' },
+ div(
+ {
+ class: 'flex-row table-row text-caption pt-0',
+ style: `gap: ${gap}px; min-width: ${minWidth}px;`,
+ },
+ attributes.map(({ label, width }) => span(
+ { style: `flex: 1 0 ${width}px;` },
+ label,
+ )),
+ ),
+ items.map(item => div(
+ {
+ class: 'flex-row table-row pt-2 pb-2',
+ style: `gap: ${gap}px; min-width: ${minWidth}px;`,
+ },
+ attributes.map(({ key, width, value_function, classes }) => {
+ const value = value_function ? value_function(item) : item[key];
+ return span(
+ {
+ class: classes || '',
+ style: `flex: 1 0 ${width}px; word-break: break-word;`,
+ },
+ value || '--',
+ );
+ }),
+ )),
+ );
+
+ if (linkProps) {
+ actionContent = Link({
+ ...linkProps,
+ open_new: true,
+ label: 'View details',
+ right_icon: 'chevron_right',
+ });
+ }
+ } else {
+ actionContent = typeof noneContent === 'string' ? span(
+ { class: 'text-secondary flex-row fx-gap-1' },
+ noneContent,
+ i({ class: 'material-symbols-rounded text-green' }, 'check_circle'),
+ ) : (noneContent || null);
+ }
+
+ return Card({
+ title: `${title} (${items.length})`,
+ content,
+ actionContent,
+ });
+}
+
+const stylesheet = new CSSStyleSheet();
+stylesheet.replace(`
+.tg-dh {
+ height: 100%;
+ align-items: stretch;
+}
+
+.tg-dh--tree {
+ min-width: 250px;
+ border-radius: 8px;
+ border: 1px solid var(--border-color);
+ background-color: var(--sidebar-background-color);
+}
+
+.tg-dh--details {
+ padding: 8px 0 0 20px;
+ overflow: auto;
+ flex-grow: 1;
+}
+
+.tg-dh--title {
+ margin: 0;
+ color: var(--primary-text-color);
+ font-size: 20px;
+ font-weight: 500;
+}
+
+.tg-dh--details > .tg-card {
+ min-width: 400px;
+}
+
+.tg-dh--column-icon {
+ margin-right: 4px;
+ width: 24px;
+ color: #B0BEC5;
+ text-align: center;
+}
+
+.tg-dh--no-selection {
+ flex: auto;
+ max-height: 400px;
+ padding: 16px;
+}
+
+.tg-dh--no-selection > i {
+ font-size: 80px;
+}
+
+.tg-dh--no-selection > span {
+ font-size: 18px;
+ text-align: center;
+}
+`);
+
+export { DataHierarchy };
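For orientation, a hypothetical sketch of the props this page expects from the Python side (see data_hierarchy.py below); both values arrive as JSON strings, and ids carry 'table_' / 'column_' prefixes so selections can be routed back to the right characteristics query. The ids below are made up for illustration:

// Hypothetical props shape for the data_hierarchy page component.
const props = {
    columns: JSON.stringify([
        {
            column_id: 'column_<uuid>',        // placeholder id, for illustration
            table_id: 'table_<uuid>',
            column_name: 'email',
            table_name: 'customers',
            general_type: 'A',
            column_drop_date: null,
            table_drop_date: null,
        },
    ]),
    selected: JSON.stringify(null),            // nothing selected yet
};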
diff --git a/testgen/ui/components/widgets/testgen_component.py b/testgen/ui/components/widgets/testgen_component.py
index 7fb2be2..89b8ef0 100644
--- a/testgen/ui/components/widgets/testgen_component.py
+++ b/testgen/ui/components/widgets/testgen_component.py
@@ -1,20 +1,47 @@
import typing
+import streamlit as st
+
from testgen.ui.components.utils.component import component
from testgen.ui.navigation.router import Router
from testgen.ui.session import session
def testgen_component(
- component_id: typing.Literal["profiling_runs", "test_runs", "database_flavor_selector"],
+ component_id: typing.Literal["profiling_runs", "test_runs", "database_flavor_selector", "data_hierarchy"],
props: dict,
- event_handlers: dict | None,
+ on_change_handlers: dict[str, typing.Callable] | None = None,
+ event_handlers: dict[str, typing.Callable] | None = None,
) -> dict | None:
+ """
+ Testgen component to display a VanJS page.
+
+ # Parameters
+ :param component_id: name of page
+ :param props: properties expected by the page
+ :param on_change_handlers: event handlers to be called during on_change callback (recommended, but does not support calling st.rerun())
+ :param event_handlers: event handlers to be called on next run (supports calling st.rerun())
+
+ For both on_change_handlers and event_handlers, the "payload" data from the event is passed as the only argument to the callback function
+ """
+
+ key = f"testgen:{component_id}"
+
+ def on_change():
+ event_data = st.session_state[key]
+ if event_data and (event := event_data.get("event")):
+ if on_change_handlers and (handler := on_change_handlers.get(event)):
+ # Prevent handling the same event multiple times
+ event_id = f"{component_id}:{event_data.get('_id', '')}"
+ if event_id != session.testgen_event_id:
+ session.testgen_event_id = event_id
+ handler(event_data.get("payload"))
event_data = component(
id_=component_id,
- key=f"testgen:{component_id}",
+ key=key,
props=props,
+ on_change=on_change if on_change_handlers else None,
)
if event_data and (event := event_data.get("event")):
if event == "LinkClicked":
diff --git a/testgen/ui/views/data_hierarchy.py b/testgen/ui/views/data_hierarchy.py
new file mode 100644
index 0000000..59421b3
--- /dev/null
+++ b/testgen/ui/views/data_hierarchy.py
@@ -0,0 +1,487 @@
+import json
+import typing
+from functools import partial
+
+import pandas as pd
+import streamlit as st
+
+import testgen.ui.services.database_service as db
+import testgen.ui.services.query_service as dq
+from testgen.ui.components import widgets as testgen
+from testgen.ui.components.widgets import testgen_component
+from testgen.ui.navigation.menu import MenuItem
+from testgen.ui.navigation.page import Page
+from testgen.ui.queries import project_queries
+from testgen.ui.session import session
+from testgen.ui.views.dialogs.run_profiling_dialog import run_profiling_dialog
+from testgen.utils import is_uuid4
+
+PAGE_ICON = "dataset"
+
+class DataHierarchyPage(Page):
+ path = "data-hierarchy"
+ can_activate: typing.ClassVar = [
+ lambda: session.authentication_status,
+ ]
+ menu_item = MenuItem(icon=PAGE_ICON, label="Data Hierarchy", order=1)
+
+ def render(self, project_code: str | None = None, table_group_id: str | None = None, selected: str | None = None, **_kwargs) -> None:
+ testgen.page_header(
+ "Data Hierarchy",
+ )
+
+ project_code = project_code or session.project
+
+ if render_empty_state(project_code):
+ return
+
+ group_filter_column, _, loading_column = st.columns([.3, .5, .2], vertical_alignment="center")
+
+ with group_filter_column:
+ table_groups_df = get_table_group_options(project_code)
+ table_group_id = testgen.select(
+ options=table_groups_df,
+ value_column="id",
+ display_column="table_groups_name",
+ default_value=table_group_id,
+ required=True,
+ label="Table Group",
+ bind_to_query="table_group_id",
+ )
+
+ with loading_column:
+ columns_df = get_table_group_columns(table_group_id)
+ selected_item = get_selected_item(selected, table_group_id)
+ if not selected_item:
+ self.router.set_query_params({ "selected": None })
+
+ if columns_df.empty:
+ table_group = table_groups_df.loc[table_groups_df["id"] == table_group_id].iloc[0]
+ testgen.empty_state(
+ label="No profiling data yet",
+ icon=PAGE_ICON,
+ message=testgen.EmptyStateMessage.Profiling,
+ action_label="Run Profiling",
+ button_onclick=partial(run_profiling_dialog, project_code, table_group),
+ button_icon="play_arrow",
+ )
+ else:
+ def on_tree_node_select(node_id):
+ self.router.set_query_params({ "selected": node_id })
+
+ testgen_component(
+ "data_hierarchy",
+ props={ "columns": columns_df.to_json(orient="records"), "selected": json.dumps(selected_item) },
+ on_change_handlers={ "TreeNodeSelected": on_tree_node_select },
+ event_handlers={ "MetadataChanged": on_metadata_changed },
+ )
+
+
+def on_metadata_changed(metadata: dict) -> None:
+ schema = st.session_state["dbschema"]
+ item_type, item_id = metadata["id"].split("_", 2)
+
+ if item_type == "table":
+ update_table = "data_table_chars"
+ id_column = "table_id"
+ else:
+ update_table = "data_column_chars"
+ id_column = "column_id"
+
+ attributes = [
+ "data_source",
+ "source_system",
+ "source_process",
+ "business_domain",
+ "stakeholder_group",
+ "transform_level",
+ "aggregation_level"
+ ]
+ cde_value_map = {
+ True: "TRUE",
+ False: "FALSE",
+ None: "NULL",
+ }
+ set_attributes = [ f"{key} = NULLIF('{metadata.get(key) or ''}', '')" for key in attributes ]
+ set_attributes.append(f"critical_data_element = {cde_value_map[metadata.get('critical_data_element')]}")
+
+ sql = f"""
+ UPDATE {schema}.{update_table}
+ SET {', '.join(set_attributes)}
+ WHERE {id_column} = '{item_id}';
+ """
+ db.execute_sql(sql)
+ get_selected_item.clear()
+ st.rerun()
+
+
+def render_empty_state(project_code: str) -> bool:
+ project_summary_df = project_queries.get_summary_by_code(project_code)
+ if project_summary_df["profiling_runs_ct"]: # Without profiling, we don't have any table and column information in db
+ return False
+
+ label="Your project is empty"
+ testgen.whitespace(5)
+ if not project_summary_df["connections_ct"]:
+ testgen.empty_state(
+ label=label,
+ icon=PAGE_ICON,
+ message=testgen.EmptyStateMessage.Connection,
+ action_label="Go to Connections",
+ link_href="connections",
+ )
+ else:
+ testgen.empty_state(
+ label=label,
+ icon=PAGE_ICON,
+ message=testgen.EmptyStateMessage.Profiling if project_summary_df["table_groups_ct"] else testgen.EmptyStateMessage.TableGroup,
+ action_label="Go to Table Groups",
+ link_href="connections:table-groups",
+ link_params={ "connection_id": str(project_summary_df["default_connection_id"]) }
+ )
+ return True
+
+
+@st.cache_data(show_spinner=False)
+def get_table_group_options(project_code):
+ schema = st.session_state["dbschema"]
+ return dq.run_table_groups_lookup_query(schema, project_code)
+
+
+@st.cache_data(show_spinner="Loading data ...")
+def get_table_group_columns(table_group_id: str) -> pd.DataFrame:
+ schema = st.session_state["dbschema"]
+ sql = f"""
+ SELECT CONCAT('column_', column_chars.column_id) AS column_id,
+ CONCAT('table_', table_chars.table_id) AS table_id,
+ column_chars.column_name,
+ table_chars.table_name,
+ column_chars.general_type,
+ column_chars.drop_date AS column_drop_date,
+ table_chars.drop_date AS table_drop_date
+ FROM {schema}.data_column_chars column_chars
+ LEFT JOIN {schema}.data_table_chars table_chars ON (
+ column_chars.table_id = table_chars.table_id
+ )
+ WHERE column_chars.table_groups_id = '{table_group_id}'
+ ORDER BY table_name, column_name;
+ """
+ return db.retrieve_data(sql)
+
+
+@st.cache_data(show_spinner="Loading data ...")
+def get_selected_item(selected: str, table_group_id: str) -> dict | None:
+ if not selected:
+ return None
+
+ schema = st.session_state["dbschema"]
+    item_type, item_id = selected.split("_", 1)
+
+ if item_type not in ["table", "column"] or not is_uuid4(item_id):
+ return None
+
+ if item_type == "table":
+ sql = f"""
+ WITH latest_profile_dates AS (
+ SELECT table_name,
+ profiling_runs.table_groups_id,
+ MAX(profiling_starttime) AS profiling_starttime
+ FROM {schema}.profile_results
+ LEFT JOIN {schema}.profiling_runs ON (
+ profile_results.profile_run_id = profiling_runs.id
+ )
+ GROUP BY profiling_runs.table_groups_id, table_name
+ ),
+ latest_test_run_dates AS (
+ SELECT table_name,
+ test_results.table_groups_id,
+ MAX(test_starttime) AS test_starttime
+ FROM {schema}.test_results
+ LEFT JOIN {schema}.test_runs ON (
+ test_results.test_run_id = test_runs.id
+ )
+ GROUP BY test_results.table_groups_id, table_name
+ )
+ SELECT table_chars.table_name,
+ table_chars.table_groups_id::VARCHAR(50) AS table_group_id,
+ -- Characteristics
+ functional_table_type,
+ record_ct,
+ table_chars.column_ct,
+ data_point_ct,
+ add_date AS add_date,
+ drop_date AS drop_date,
+ -- Metadata
+ critical_data_element,
+ data_source,
+ source_system,
+ source_process,
+ business_domain,
+ stakeholder_group,
+ transform_level,
+ aggregation_level,
+ -- Latest Profile & Test Runs
+ profiling_runs.id::VARCHAR(50) AS latest_profile_id,
+ lpd.profiling_starttime AS latest_profile_date,
+ lrd.test_starttime AS latest_test_run_date
+ FROM {schema}.data_table_chars table_chars
+ LEFT JOIN latest_profile_dates lpd ON (
+ table_chars.table_groups_id = lpd.table_groups_id
+ AND table_chars.table_name = lpd.table_name
+ )
+ LEFT JOIN latest_test_run_dates lrd ON (
+ table_chars.table_groups_id = lrd.table_groups_id
+ AND table_chars.table_name = lrd.table_name
+ )
+ LEFT JOIN {schema}.profiling_runs ON (
+ lpd.table_groups_id = profiling_runs.table_groups_id
+ AND lpd.profiling_starttime = profiling_runs.profiling_starttime
+ )
+ WHERE table_id = '{item_id}'
+ AND table_chars.table_groups_id = '{table_group_id}';
+ """
+ else:
+ sql = f"""
+ WITH latest_profile_dates AS (
+ SELECT column_name,
+ table_name,
+ profiling_runs.table_groups_id,
+ MAX(profiling_starttime) AS profiling_starttime
+ FROM {schema}.profile_results
+ LEFT JOIN {schema}.profiling_runs ON (
+ profile_results.profile_run_id = profiling_runs.id
+ )
+ GROUP BY profiling_runs.table_groups_id, table_name, column_name
+ ),
+ latest_test_run_dates AS (
+ SELECT column_names,
+ table_name,
+ test_results.table_groups_id,
+ MAX(test_starttime) AS test_starttime
+ FROM {schema}.test_results
+ LEFT JOIN {schema}.test_runs ON (
+ test_results.test_run_id = test_runs.id
+ )
+ GROUP BY test_results.table_groups_id, table_name, column_names
+ )
+ SELECT column_chars.column_name,
+ column_chars.table_name,
+ column_chars.table_groups_id::VARCHAR(50) AS table_group_id,
+ -- Characteristics
+ column_chars.general_type,
+ column_chars.column_type,
+ column_chars.functional_data_type,
+ datatype_suggestion,
+ column_chars.add_date AS add_date,
+ column_chars.last_mod_date AS last_mod_date,
+ column_chars.drop_date AS drop_date,
+ -- Column Metadata
+ column_chars.critical_data_element,
+ column_chars.data_source,
+ column_chars.source_system,
+ column_chars.source_process,
+ column_chars.business_domain,
+ column_chars.stakeholder_group,
+ column_chars.transform_level,
+ column_chars.aggregation_level,
+ -- Table Metadata
+ table_chars.critical_data_element AS table_critical_data_element,
+ table_chars.data_source AS table_data_source,
+ table_chars.source_system AS table_source_system,
+ table_chars.source_process AS table_source_process,
+ table_chars.business_domain AS table_business_domain,
+ table_chars.stakeholder_group AS table_stakeholder_group,
+ table_chars.transform_level AS table_transform_level,
+ table_chars.aggregation_level AS table_aggregation_level,
+ -- Latest Profile & Test Runs
+ profiling_runs.id::VARCHAR(50) AS latest_profile_id,
+ lpd.profiling_starttime AS latest_profile_date,
+ lrd.test_starttime AS latest_test_run_date,
+ -- Value Counts
+ profile_results.record_ct,
+ value_ct,
+ distinct_value_ct,
+ null_value_ct,
+ zero_value_ct,
+ -- Alpha
+ zero_length_ct,
+ filled_value_ct,
+ includes_digit_ct,
+ numeric_ct,
+ date_ct,
+ quoted_value_ct,
+ lead_space_ct,
+ embedded_space_ct,
+ avg_embedded_spaces,
+ min_length,
+ max_length,
+ avg_length,
+ min_text,
+ max_text,
+ distinct_std_value_ct,
+ distinct_pattern_ct,
+ std_pattern_match,
+ top_freq_values,
+ top_patterns,
+ -- Numeric
+ min_value,
+ min_value_over_0,
+ max_value,
+ avg_value,
+ stdev_value,
+ percentile_25,
+ percentile_50,
+ percentile_75,
+ -- Date
+ min_date,
+ max_date,
+ before_1yr_date_ct,
+ before_5yr_date_ct,
+ before_20yr_date_ct,
+ within_1yr_date_ct,
+ within_1mo_date_ct,
+ future_date_ct,
+ -- Boolean
+ boolean_true_ct
+ FROM {schema}.data_column_chars column_chars
+ LEFT JOIN {schema}.data_table_chars table_chars ON (
+ column_chars.table_id = table_chars.table_id
+ )
+ LEFT JOIN latest_profile_dates lpd ON (
+ column_chars.table_groups_id = lpd.table_groups_id
+ AND column_chars.table_name = lpd.table_name
+ AND column_chars.column_name = lpd.column_name
+ )
+ LEFT JOIN latest_test_run_dates lrd ON (
+ column_chars.table_groups_id = lrd.table_groups_id
+ AND column_chars.table_name = lrd.table_name
+ AND column_chars.column_name = lrd.column_names
+ )
+ LEFT JOIN {schema}.profiling_runs ON (
+ lpd.table_groups_id = profiling_runs.table_groups_id
+ AND lpd.profiling_starttime = profiling_runs.profiling_starttime
+ )
+ LEFT JOIN {schema}.profile_results ON (
+ profiling_runs.id = profile_results.profile_run_id
+ AND column_chars.column_name = profile_results.column_name
+ )
+ WHERE column_id = '{item_id}'
+ AND column_chars.table_groups_id = '{table_group_id}';;
+ """
+
+ item_df = db.retrieve_data(sql)
+ if not item_df.empty:
+        # to_json converts datetimes, NaN, etc., to JSON-safe values (Note: to_dict does not)
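+        # e.g. a pandas NaT/NaN cell becomes null in the to_json output, whereas to_dict
+        # would keep the raw NaT/NaN objects, which json and the frontend cannot handle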
+ item = json.loads(item_df.to_json(orient="records"))[0]
+ item["id"] = selected
+ item["type"] = item_type
+ item["latest_anomalies"] = get_profile_anomalies(item["latest_profile_id"], item["table_name"], item.get("column_name"))
+ item["latest_test_issues"] = get_latest_test_issues(item["table_group_id"], item["table_name"], item.get("column_name"))
+ return item
+
+
+@st.cache_data(show_spinner=False)
+def get_profile_anomalies(profile_run_id: str, table_name: str, column_name: str | None = None) -> list[dict]:
+ schema = st.session_state["dbschema"]
+
+ column_condition = ""
+ if column_name:
+ column_condition = f"AND column_name = '{column_name}'"
+
+ sql = f"""
+ WITH pii_results AS (
+ SELECT id,
+ CASE
+ WHEN detail LIKE 'Risk: HIGH%%' THEN 'High'
+ WHEN detail LIKE 'Risk: MODERATE%%' THEN 'Moderate'
+ ELSE null
+ END AS pii_risk
+ FROM {schema}.profile_anomaly_results
+ )
+ SELECT column_name,
+ anomaly_name,
+ issue_likelihood,
+ detail,
+ pii_risk
+ FROM {schema}.profile_anomaly_results anomaly_results
+ LEFT JOIN {schema}.profile_anomaly_types anomaly_types ON (
+ anomaly_types.id = anomaly_results.anomaly_id
+ )
+ LEFT JOIN pii_results ON (
+ anomaly_results.id = pii_results.id
+ )
+ WHERE profile_run_id = '{profile_run_id}'
+ AND table_name = '{table_name}'
+ {column_condition}
+ AND COALESCE(disposition, 'Confirmed') = 'Confirmed'
+ ORDER BY
+ CASE issue_likelihood
+ WHEN 'Definite' THEN 1
+ WHEN 'Likely' THEN 2
+ WHEN 'Possible' THEN 3
+ ELSE 4
+ END,
+ CASE pii_risk
+ WHEN 'High' THEN 1
+ WHEN 'Moderate' THEN 2
+ ELSE 3
+ END,
+ column_name;
+ """
+
+ df = db.retrieve_data(sql)
+ return json.loads(df.to_json(orient="records"))
+
+
+@st.cache_data(show_spinner=False)
+def get_latest_test_issues(table_group_id: str, table_name: str, column_name: str | None = None) -> list[dict]:
+ schema = st.session_state["dbschema"]
+
+ column_condition = ""
+ if column_name:
+ column_condition = f"AND column_names = '{column_name}'"
+
+ sql = f"""
+ WITH latest_run_dates AS (
+ SELECT test_suite_id,
+ MAX(test_starttime) AS test_starttime
+ FROM {schema}.test_runs
+ GROUP BY test_suite_id
+ )
+ SELECT column_names AS column_name,
+ test_name_short AS test_name,
+ result_status,
+ result_message,
+ test_suite,
+ test_results.test_run_id::VARCHAR(50),
+ lrd.test_starttime AS test_run_date
+ FROM latest_run_dates lrd
+ LEFT JOIN {schema}.test_runs ON (
+ lrd.test_suite_id = test_runs.test_suite_id
+ AND lrd.test_starttime = test_runs.test_starttime
+ )
+ LEFT JOIN {schema}.test_results ON (
+ test_runs.id = test_results.test_run_id
+ )
+ LEFT JOIN {schema}.test_types ON (
+ test_results.test_type = test_types.test_type
+ )
+ LEFT JOIN {schema}.test_suites ON (
+ lrd.test_suite_id = test_suites.id
+ )
+ WHERE test_suites.table_groups_id = '{table_group_id}'
+ AND table_name = '{table_name}'
+ {column_condition}
+ AND result_status <> 'Passed'
+ AND COALESCE(disposition, 'Confirmed') = 'Confirmed'
+ ORDER BY
+ CASE result_status
+ WHEN 'Failed' THEN 1
+ WHEN 'Warning' THEN 2
+ ELSE 3
+ END,
+ column_name;
+ """
+
+ df = db.retrieve_data(sql)
+ return json.loads(df.to_json(orient="records"))
diff --git a/testgen/utils/__init__.py b/testgen/utils/__init__.py
index bd4bda8..db58739 100644
--- a/testgen/utils/__init__.py
+++ b/testgen/utils/__init__.py
@@ -1,4 +1,5 @@
import math
+from uuid import UUID
import pandas as pd
@@ -13,3 +14,12 @@ def truncate(value: float) -> int:
if 0 < value < 1:
return 1
return math.trunc(value)
+
+
+def is_uuid4(value: str) -> bool:
+ try:
+ uuid = UUID(value, version=4)
+ except Exception:
+ return False
+
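+    # UUID() also accepts non-canonical forms (uppercase, braces, "urn:uuid:" prefix); comparing
+    # against str(uuid) ensures the input is already in canonical lowercase-hyphenated form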
+ return str(uuid) == value
From 5ce267f125b9fb48fc947aff1cd73785d41a8ee8 Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Mon, 4 Nov 2024 13:11:11 -0500
Subject: [PATCH 56/91] misc(ui): fixes and typing improvements
---
.../frontend/js/pages/profiling_runs.js | 26 ++++++++++++++++---
.../components/frontend/js/pages/test_runs.js | 23 +++++++++++++---
testgen/ui/navigation/router.py | 3 +++
testgen/ui/services/javascript_service.py | 1 -
4 files changed, 45 insertions(+), 8 deletions(-)
diff --git a/testgen/ui/components/frontend/js/pages/profiling_runs.js b/testgen/ui/components/frontend/js/pages/profiling_runs.js
index c434f37..6b98d38 100644
--- a/testgen/ui/components/frontend/js/pages/profiling_runs.js
+++ b/testgen/ui/components/frontend/js/pages/profiling_runs.js
@@ -1,7 +1,25 @@
/**
+ * @typedef ProfilingRun
+ * @type {object}
+ * @property {string} profiling_run_id
+ * @property {number} start_time
+ * @property {string} table_groups_name
+ * @property {'Running'|'Complete'|'Error'|'Cancelled'} status
+ * @property {string} log_message
+ * @property {string} duration
+ * @property {string} process_id
+ * @property {string} schema_name
+ * @property {number} column_ct
+ * @property {number} table_ct
+ * @property {number} anomaly_ct
+ * @property {number} anomalies_definite_ct
+ * @property {number} anomalies_likely_ct
+ * @property {number} anomalies_possible_ct
+ * @property {number} anomalies_dismissed_ct
+ *
* @typedef Properties
* @type {object}
- * @property {array} items
+ * @property {ProfilingRun[]} items
*/
import van from '../van.min.js';
import { Tooltip } from '../components/tooltip.js';
@@ -57,7 +75,7 @@ const ProfilingRuns = (/** @type Properties */ props) => {
);
}
-const ProfilingRunItem = (item, /** @type string[] */ columns) => {
+const ProfilingRunItem = (/** @type ProfilingRun */ item, /** @type string[] */ columns) => {
return div(
{ class: 'table-row flex-row' },
div(
@@ -92,7 +110,7 @@ const ProfilingRunItem = (item, /** @type string[] */ columns) => {
class: 'text-caption mt-1 mb-1',
style: item.status === 'Complete' && !item.column_ct ? 'color: var(--red);' : '',
},
- `${item.table_ct || 0} tables, ${item.column_ct || 0} columns`,
+ item.status === 'Complete' ? `${item.table_ct || 0} tables, ${item.column_ct || 0} columns` : null,
),
item.column_ct ? Link({
label: 'View results',
@@ -126,7 +144,7 @@ const ProfilingRunItem = (item, /** @type string[] */ columns) => {
);
}
-function ProfilingRunStatus(/** @type object */ item) {
+function ProfilingRunStatus(/** @type ProfilingRun */ item) {
const attributeMap = {
Running: { label: 'Running', color: 'blue' },
Complete: { label: 'Completed', color: '' },
diff --git a/testgen/ui/components/frontend/js/pages/test_runs.js b/testgen/ui/components/frontend/js/pages/test_runs.js
index d100f91..c5656a6 100644
--- a/testgen/ui/components/frontend/js/pages/test_runs.js
+++ b/testgen/ui/components/frontend/js/pages/test_runs.js
@@ -1,7 +1,24 @@
/**
+ * @typedef TestRun
+ * @type {object}
+ * @property {string} test_run_id
+ * @property {number} test_starttime
+ * @property {string} table_groups_name
+ * @property {string} test_suite
+ * @property {'Running'|'Complete'|'Error'|'Cancelled'} status
+ * @property {string} log_message
+ * @property {string} duration
+ * @property {string} process_id
+ * @property {number} test_ct
+ * @property {number} passed_ct
+ * @property {number} warning_ct
+ * @property {number} failed_ct
+ * @property {number} error_ct
+ * @property {number} dismissed_ct
+ *
* @typedef Properties
* @type {object}
- * @property {array} items
+ * @property {TestRun[]} items
*/
import van from '../van.min.js';
import { Tooltip } from '../components/tooltip.js';
@@ -53,7 +70,7 @@ const TestRuns = (/** @type Properties */ props) => {
);
}
-const TestRunItem = (item, /** @type string[] */ columns) => {
+const TestRunItem = (/** @type TestRun */ item, /** @type string[] */ columns) => {
return div(
{ class: 'table-row flex-row' },
div(
@@ -102,7 +119,7 @@ const TestRunItem = (item, /** @type string[] */ columns) => {
);
}
-function TestRunStatus(/** @type object */ item) {
+function TestRunStatus(/** @type TestRun */ item) {
const attributeMap = {
Running: { label: 'Running', color: 'blue' },
Complete: { label: 'Completed', color: '' },
diff --git a/testgen/ui/navigation/router.py b/testgen/ui/navigation/router.py
index 011ebb8..3b812a3 100644
--- a/testgen/ui/navigation/router.py
+++ b/testgen/ui/navigation/router.py
@@ -37,6 +37,9 @@ def run(self, hide_sidebar=False) -> None:
if not session.cookies_ready:
session.cookies_ready = 1
session.page_pending_cookies = current_page
+ # Set this anyway so that sidebar displays initial selection correctly
+ session.current_page = current_page.url_path
+ st.rerun()
# Sometimes the cookie is ready on the second rerun and other times only on the third -_-
# so we have to make sure the page renders correctly in both cases
diff --git a/testgen/ui/services/javascript_service.py b/testgen/ui/services/javascript_service.py
index 7b4ea32..93eae90 100644
--- a/testgen/ui/services/javascript_service.py
+++ b/testgen/ui/services/javascript_service.py
@@ -38,7 +38,6 @@ def get_browser_locale_timezone():
return st_javascript(
"""await (async () => {
const userTimezone = Intl.DateTimeFormat().resolvedOptions().timeZone;
- console.log(userTimezone)
return userTimezone
})().then(returnValue => returnValue)"""
)
From 9fa43f13be9df29fb0e12775d001fe9ff8ba5d5f Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Wed, 6 Nov 2024 17:12:23 -0500
Subject: [PATCH 57/91] misc(ui): improve data hierarchy query performance
---
testgen/ui/views/data_hierarchy.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/testgen/ui/views/data_hierarchy.py b/testgen/ui/views/data_hierarchy.py
index 59421b3..445f6c4 100644
--- a/testgen/ui/views/data_hierarchy.py
+++ b/testgen/ui/views/data_hierarchy.py
@@ -245,13 +245,13 @@ def get_selected_item(selected: str, table_group_id: str) -> dict | None:
WITH latest_profile_dates AS (
SELECT column_name,
table_name,
- profiling_runs.table_groups_id,
+ profile_results.table_groups_id,
MAX(profiling_starttime) AS profiling_starttime
FROM {schema}.profile_results
LEFT JOIN {schema}.profiling_runs ON (
profile_results.profile_run_id = profiling_runs.id
)
- GROUP BY profiling_runs.table_groups_id, table_name, column_name
+ GROUP BY profile_results.table_groups_id, table_name, column_name
),
latest_test_run_dates AS (
SELECT column_names,
@@ -366,7 +366,7 @@ def get_selected_item(selected: str, table_group_id: str) -> dict | None:
AND column_chars.column_name = profile_results.column_name
)
WHERE column_id = '{item_id}'
- AND column_chars.table_groups_id = '{table_group_id}';;
+ AND column_chars.table_groups_id = '{table_group_id}';
"""
item_df = db.retrieve_data(sql)
From fc6cb63662be60113e8f79250391643d95c4d237 Mon Sep 17 00:00:00 2001
From: Astor
Date: Thu, 7 Nov 2024 16:04:37 -0300
Subject: [PATCH 58/91] fix(threshold error count fix): code fixes
Refs: TG-806
---
testgen/ui/views/test_definitions.py | 25 ++++++++++++-------------
1 file changed, 12 insertions(+), 13 deletions(-)
diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py
index c0eaf09..58fc022 100644
--- a/testgen/ui/views/test_definitions.py
+++ b/testgen/ui/views/test_definitions.py
@@ -529,20 +529,19 @@ def show_test_form(
if dynamic_attribute in ["custom_query"]:
show_custom_query = True
+ elif dynamic_attribute in ["threshold"]:
+ test_definition[dynamic_attribute] = current_column.number_input(
+ label=actual_dynamic_attributes_labels,
+ value=value,
+ help=actual_dynamic_attributes_help,
+ )
else:
- if "threshold" in dynamic_attribute:
- test_definition[dynamic_attribute] = current_column.number_input(
- label=actual_dynamic_attributes_labels,
- value=value,
- help=actual_dynamic_attributes_help,
- )
- else:
- test_definition[dynamic_attribute] = current_column.text_input(
- label=actual_dynamic_attributes_labels,
- max_chars=4000 if dynamic_attribute in ["match_column_names", "match_groupby_names", "groupby_names"] else 1000,
- value=value,
- help=actual_dynamic_attributes_help,
- )
+ test_definition[dynamic_attribute] = current_column.text_input(
+ label=actual_dynamic_attributes_labels,
+ max_chars=4000 if dynamic_attribute in ["match_column_names", "match_groupby_names", "groupby_names"] else 1000,
+ value=value,
+ help=actual_dynamic_attributes_help,
+ )
# Custom Query
if show_custom_query:
From 1113bcd516c2a5ea461662cde6cfcd55c43900c0 Mon Sep 17 00:00:00 2001
From: Astor
Date: Thu, 7 Nov 2024 16:30:22 -0300
Subject: [PATCH 59/91] fix: threshold error count
Refs: TG-806
---
testgen/ui/views/test_definitions.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py
index 58fc022..d36465d 100644
--- a/testgen/ui/views/test_definitions.py
+++ b/testgen/ui/views/test_definitions.py
@@ -529,7 +529,7 @@ def show_test_form(
if dynamic_attribute in ["custom_query"]:
show_custom_query = True
- elif dynamic_attribute in ["threshold"]:
+ elif dynamic_attribute in ["threshold_value"]:
test_definition[dynamic_attribute] = current_column.number_input(
label=actual_dynamic_attributes_labels,
value=value,
From 00692706bf78f9b1ca4b75e5fa9f7747032ac692 Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Thu, 7 Nov 2024 14:37:45 -0500
Subject: [PATCH 60/91] ci(docker): add git to dockerfile
---
Dockerfile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Dockerfile b/Dockerfile
index e436ca4..cdab57c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,7 +2,7 @@ FROM python:3.10-slim-bookworm AS build-image
RUN mkdir -p /dk && \
apt-get update && \
- apt-get install -y gcc libpcre3 libpcre3-dev g++
+ apt-get install -y gcc libpcre3 libpcre3-dev g++ git
COPY ./pyproject.toml /tmp/dk/
RUN python3 -m pip install /tmp/dk --prefix=/dk
From 776d2b3f09d6a93ed57f141879e1d518504cb053 Mon Sep 17 00:00:00 2001
From: Luis Trinidad
Date: Thu, 7 Nov 2024 12:48:08 -0400
Subject: [PATCH 61/91] fix(profiling): add parenthesis to the formatted
anomaly criteria
---
testgen/template/profiling/profile_anomalies_screen_column.sql | 2 +-
.../profiling/profile_anomalies_screen_multi_column.sql | 2 +-
.../template/profiling/profile_anomalies_screen_variants.sql | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/testgen/template/profiling/profile_anomalies_screen_column.sql b/testgen/template/profiling/profile_anomalies_screen_column.sql
index e0d9e34..cb9c4c1 100644
--- a/testgen/template/profiling/profile_anomalies_screen_column.sql
+++ b/testgen/template/profiling/profile_anomalies_screen_column.sql
@@ -19,4 +19,4 @@ LEFT JOIN v_inactive_anomalies i
AND '{ANOMALY_ID}' = i.anomaly_id)
WHERE p.profile_run_id = '{PROFILE_RUN_ID}'::UUID
AND i.anomaly_id IS NULL
- AND {ANOMALY_CRITERIA};
+ AND ({ANOMALY_CRITERIA});
diff --git a/testgen/template/profiling/profile_anomalies_screen_multi_column.sql b/testgen/template/profiling/profile_anomalies_screen_multi_column.sql
index 7a61561..6451eaf 100644
--- a/testgen/template/profiling/profile_anomalies_screen_multi_column.sql
+++ b/testgen/template/profiling/profile_anomalies_screen_multi_column.sql
@@ -44,7 +44,7 @@ WITH mults AS ( SELECT p.project_code,
AND '{ANOMALY_ID}' = i.anomaly_id)
WHERE p.profile_run_id = '{PROFILE_RUN_ID}'::UUID
AND i.anomaly_id IS NULL
- AND {ANOMALY_CRITERIA}
+ AND ({ANOMALY_CRITERIA})
)
INSERT INTO profile_anomaly_results
(project_code, table_groups_id, profile_run_id, anomaly_id,
diff --git a/testgen/template/profiling/profile_anomalies_screen_variants.sql b/testgen/template/profiling/profile_anomalies_screen_variants.sql
index cec9bdb..266e73e 100644
--- a/testgen/template/profiling/profile_anomalies_screen_variants.sql
+++ b/testgen/template/profiling/profile_anomalies_screen_variants.sql
@@ -22,7 +22,7 @@ WITH all_matches
AND p.column_name = i.column_name
AND '{ANOMALY_ID}' = i.anomaly_id)
WHERE p.profile_run_id = '{PROFILE_RUN_ID}'::UUID
- AND {ANOMALY_CRITERIA}
+ AND ({ANOMALY_CRITERIA})
AND p.top_freq_values > ''
AND i.anomaly_id IS NULL
AND fn_count_intersecting_items(LOWER(fn_extract_top_values(p.top_freq_values)), v.check_values, '|') > 1
From 9521759a3f7b3bb9ad150f21b7b159e6723d7ae3 Mon Sep 17 00:00:00 2001
From: Ricardo Boni
Date: Mon, 11 Nov 2024 11:14:44 -0500
Subject: [PATCH 62/91] misc(pdf): Code review feedback
---
testgen/ui/pdf/hygiene_issue_report.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/testgen/ui/pdf/hygiene_issue_report.py b/testgen/ui/pdf/hygiene_issue_report.py
index 4c23ec6..b228231 100644
--- a/testgen/ui/pdf/hygiene_issue_report.py
+++ b/testgen/ui/pdf/hygiene_issue_report.py
@@ -85,7 +85,7 @@ def build_summary_table(document, hi_data):
(
"Hygiene Issue",
(
- Paragraph(f"{hi_data["anomaly_name"]}:", style=PARA_STYLE_CELL),
+ Paragraph(f"{hi_data['anomaly_name']}:", style=PARA_STYLE_CELL),
Paragraph(hi_data["anomaly_description"], style=PARA_STYLE_CELL),
),
None,
From f5ba8790ef3dde24403ed93379e67b68cd8115fd Mon Sep 17 00:00:00 2001
From: "Chip.Bloche"
Date: Fri, 8 Nov 2024 15:14:19 -0500
Subject: [PATCH 63/91] feat(cli): add scoring infrastructure and default score
roll-ups
---
.../queries/execute_cat_tests_query.py | 11 ++
testgen/commands/queries/profiling_query.py | 17 +-
testgen/commands/run_execute_cat_tests.py | 6 +-
testgen/commands/run_profiling_bridge.py | 29 ++-
.../commands/run_test_parameter_validation.py | 12 +-
.../020_create_standard_functions_sprocs.sql | 83 +++++++++
.../030_initialize_new_schema_structure.sql | 120 ++++++++-----
.../050_populate_new_schema_metadata.sql | 165 +++++++++---------
.../dbsetup/060_create_standard_views.sql | 2 +-
.../dbupgrade/0120_incremental_upgrade.sql | 133 ++++++++++++++
.../ex_finalize_test_run_results.sql | 66 +++++++
.../execution/ex_update_test_suite.sql | 13 ++
.../execution/test_scoring_rollup.sql | 123 +++++++++++++
.../project_profiling_query_mssql.yaml | 27 +++
.../project_profiling_query_postgresql.yaml | 27 +++
.../project_profiling_query_redshift.yaml | 27 +++
.../project_profiling_query_snowflake.yaml | 27 +++
.../project_profiling_query_trino.yaml | 27 +++
.../template/parms/parms_test_execution.sql | 1 +
.../profiling/functional_datatype.sql | 1 +
.../profiling/profile_anomaly_scoring.sql | 10 ++
.../profile_anomaly_scoring_rollup.sql | 109 ++++++++++++
.../profiling/profile_anomaly_types_get.sql | 2 +-
.../ex_get_test_column_list_tg.sql | 25 ++-
.../ex_write_test_val_errors.sql | 4 +-
25 files changed, 913 insertions(+), 154 deletions(-)
create mode 100644 testgen/template/dbupgrade/0120_incremental_upgrade.sql
create mode 100644 testgen/template/execution/ex_update_test_suite.sql
create mode 100644 testgen/template/execution/test_scoring_rollup.sql
create mode 100644 testgen/template/profiling/profile_anomaly_scoring.sql
create mode 100644 testgen/template/profiling/profile_anomaly_scoring_rollup.sql
diff --git a/testgen/commands/queries/execute_cat_tests_query.py b/testgen/commands/queries/execute_cat_tests_query.py
index ac905d3..89e8ff8 100644
--- a/testgen/commands/queries/execute_cat_tests_query.py
+++ b/testgen/commands/queries/execute_cat_tests_query.py
@@ -12,6 +12,7 @@ class CCATExecutionSQL:
test_suite = ""
run_date = ""
test_run_id = ""
+ table_groups_id = ""
max_query_chars = ""
exception_message = ""
@@ -39,6 +40,7 @@ def _ReplaceParms(self, strInputString):
strInputString = strInputString.replace("{PROJECT_CODE}", self.project_code)
strInputString = strInputString.replace("{TEST_SUITE}", self.test_suite)
strInputString = strInputString.replace("{TEST_SUITE_ID}", self.test_suite_id)
+ strInputString = strInputString.replace("{TABLE_GROUPS_ID}", self.table_groups_id)
# NOTE: REPLACE_QC_SCHEMA is parm replaced to run build query: sets the actual value to replace.
# DATA_QC_SCHEMA is parm in cat_test_conditions that build query replaces via SQL.
strInputString = strInputString.replace("{REPLACE_QC_SCHEMA}", self.replace_qc_schema)
@@ -99,3 +101,12 @@ def FinalizeTestResultsSQL(self):
def PushTestRunStatusUpdateSQL(self):
strQ = self._ReplaceParms(read_template_sql_file("ex_update_test_record_in_testrun_table.sql", "execution"))
return strQ
+
+ def FinalizeTestSuiteUpdateSQL(self):
+ strQ = self._ReplaceParms(read_template_sql_file("ex_update_test_suite.sql", "execution"))
+ return strQ
+
+
+ def TestScoringRollupSQL(self):
+ strQ = self._ReplaceParms(read_template_sql_file("test_scoring_rollup.sql", "execution"))
+ return strQ
diff --git a/testgen/commands/queries/profiling_query.py b/testgen/commands/queries/profiling_query.py
index 84cc50f..ed35c0c 100644
--- a/testgen/commands/queries/profiling_query.py
+++ b/testgen/commands/queries/profiling_query.py
@@ -144,11 +144,16 @@ def GetPIIFlagUpdateQuery(self):
strQ = self.ReplaceParms(read_template_sql_file("pii_flag.sql", sub_directory="profiling"))
return strQ
- def GetAnomalyRefreshQuery(self):
+ def GetAnomalyStatsRefreshQuery(self):
# Runs on DK Postgres Server
strQ = self.ReplaceParms(read_template_sql_file("refresh_anomalies.sql", sub_directory="profiling"))
return strQ
+ def GetAnomalyScoringRollupQuery(self):
+ # Runs on DK Postgres Server
+ strQ = self.ReplaceParms(read_template_sql_file("profile_anomaly_scoring_rollup.sql", sub_directory="profiling"))
+ return strQ
+
def GetAnomalyTestTypesQuery(self):
# Runs on DK Postgres Server
strQ = self.ReplaceParms(read_template_sql_file("profile_anomaly_types_get.sql", sub_directory="profiling"))
@@ -178,6 +183,16 @@ def GetAnomalyTestQuery(self, dct_test_type):
return strQ
+ def GetAnomalyScoringQuery(self, dct_test_type):
+ # Runs on DK Postgres Server
+ strQ = read_template_sql_file("profile_anomaly_scoring.sql", sub_directory="profiling")
+ if strQ:
+ strQ = strQ.replace("{PROFILE_RUN_ID}", self.profile_run_id)
+ strQ = strQ.replace("{ANOMALY_ID}", dct_test_type["id"])
+ strQ = strQ.replace("{PREV_FORMULA}", dct_test_type["dq_score_prevalence_formula"])
+ strQ = strQ.replace("{RISK}", dct_test_type["dq_score_risk_factor"])
+ return strQ
+
def GetDataCharsRefreshQuery(self):
# Runs on DK Postgres Server
strQ = self.ReplaceParms(
diff --git a/testgen/commands/run_execute_cat_tests.py b/testgen/commands/run_execute_cat_tests.py
index 9ca8de5..23e20a5 100644
--- a/testgen/commands/run_execute_cat_tests.py
+++ b/testgen/commands/run_execute_cat_tests.py
@@ -61,7 +61,10 @@ def ParseCATResults(clsCATExecute):
def FinalizeTestRun(clsCATExecute):
- lstQueries = [clsCATExecute.FinalizeTestResultsSQL(), clsCATExecute.PushTestRunStatusUpdateSQL()]
+ lstQueries = [clsCATExecute.FinalizeTestResultsSQL(),
+ clsCATExecute.PushTestRunStatusUpdateSQL(),
+ clsCATExecute.FinalizeTestSuiteUpdateSQL(),
+ clsCATExecute.TestScoringRollupSQL()]
RunActionQueryList(("DKTG"), lstQueries)
@@ -80,6 +83,7 @@ def run_cat_test_queries(
)
clsCATExecute.test_run_id = strTestRunID
clsCATExecute.run_date = strTestTime
+ clsCATExecute.table_groups_id = dctParms["table_groups_id"]
clsCATExecute.exception_message += error_msg
# Set Project Connection Params in common.db_bridgers from retrieved params
diff --git a/testgen/commands/run_profiling_bridge.py b/testgen/commands/run_profiling_bridge.py
index c141c76..4dd42b3 100644
--- a/testgen/commands/run_profiling_bridge.py
+++ b/testgen/commands/run_profiling_bridge.py
@@ -29,10 +29,8 @@ def InitializeProfilingSQL(strProject, strSQLFlavor):
return CProfilingSQL(strProject, strSQLFlavor)
-def CompileAnomalyTestQueries(clsProfiling):
- str_query = clsProfiling.GetAnomalyTestTypesQuery()
- lst_tests = RetrieveDBResultsToDictList("DKTG", str_query)
-
+def CompileAnomalyTestQueries(clsProfiling, lst_tests):
+ # Get queries for each test
lst_queries = []
for dct_test_type in lst_tests:
str_query = clsProfiling.GetAnomalyTestQuery(dct_test_type)
@@ -42,6 +40,18 @@ def CompileAnomalyTestQueries(clsProfiling):
return lst_queries
+def CompileAnomalyScoringQueries(clsProfiling, lst_tests):
+ # Get queries for each test
+ lst_queries = []
+ for dct_test_type in lst_tests:
+ if dct_test_type["dq_score_prevalence_formula"]:
+ str_query = clsProfiling.GetAnomalyScoringQuery(dct_test_type)
+ if str_query:
+ lst_queries.append(str_query)
+
+ return lst_queries
+
+
def save_contingency_rules(df_merged, threshold_ratio):
# Prep rows to save
lst_rules = []
@@ -434,6 +444,7 @@ def run_profiling_queries(strTableGroupsID, spinner=None):
LOG.info("CurrentStep: Generating profiling update queries")
lstQueries = []
+ lstAnomalyTypes = []
if lstUpdates:
# Run single update query, then delete from staging
@@ -451,9 +462,14 @@ def run_profiling_queries(strTableGroupsID, spinner=None):
lstQueries.append(strQuery)
strQuery = clsProfiling.GetPIIFlagUpdateQuery()
lstQueries.append(strQuery)
- lstQueries.extend(CompileAnomalyTestQueries(clsProfiling))
- strQuery = clsProfiling.GetAnomalyRefreshQuery()
+
+ strQuery = clsProfiling.GetAnomalyTestTypesQuery()
+ lstAnomalyTypes = RetrieveDBResultsToDictList("DKTG", strQuery)
+ lstQueries.extend(CompileAnomalyTestQueries(clsProfiling, lstAnomalyTypes))
+ lstQueries.extend(CompileAnomalyScoringQueries(clsProfiling, lstAnomalyTypes))
+ strQuery = clsProfiling.GetAnomalyStatsRefreshQuery()
lstQueries.append(strQuery)
+
# Always runs last
strQuery = clsProfiling.GetDataCharsRefreshQuery()
lstQueries.append(strQuery)
@@ -475,6 +491,7 @@ def run_profiling_queries(strTableGroupsID, spinner=None):
finally:
LOG.info("Updating the profiling run record")
lstProfileRunQuery = [clsProfiling.GetProfileRunInfoRecordUpdateQuery()]
+ lstProfileRunQuery.append(clsProfiling.GetAnomalyScoringRollupQuery())
RunActionQueryList("DKTG", lstProfileRunQuery)
if booErrors:
str_error_status = "with errors. Check log for details."
diff --git a/testgen/commands/run_test_parameter_validation.py b/testgen/commands/run_test_parameter_validation.py
index 8e93148..f93ac32 100644
--- a/testgen/commands/run_test_parameter_validation.py
+++ b/testgen/commands/run_test_parameter_validation.py
@@ -65,8 +65,8 @@ def run_parameter_validation_queries(
strSchemas = ", ".join([f"'{value}'" for value in setSchemas])
LOG.debug("Test column list successfully retrieved")
- # Retrieve Project Column list
- LOG.info("CurrentStep: Retrieve Test Columns for Validation")
+ # Retrieve Current Project Column list
+ LOG.info("CurrentStep: Retrieve Current Columns for Validation")
clsExecute.test_schemas = strSchemas
strProjectColumnList = clsExecute.GetProjectTestValidationColumns()
if "where table_schema in ()" in strProjectColumnList:
@@ -74,9 +74,9 @@ def run_parameter_validation_queries(
lstProjectTestColumns = RetrieveDBResultsToDictList("PROJECT", strProjectColumnList)
if len(lstProjectTestColumns) == 0:
- LOG.info("Project Test Column list is empty")
+ LOG.info("Current Test Column list is empty")
- LOG.debug("Project column list successfully received")
+ LOG.debug("Current column list successfully received")
LOG.info("CurrentStep: Compare column sets")
# load results into sets
result_set1 = {col.lower() for col, _ in test_columns}
@@ -86,7 +86,7 @@ def run_parameter_validation_queries(
missing_columns = result_set1.difference(result_set2)
if len(missing_columns) == 0:
- LOG.info("No missing column in Project Column list.")
+ LOG.info("No missing column in Current Column list.")
if missing_columns:
LOG.debug("Test Columns are missing in target database: %s", ", ".join(missing_columns))
@@ -143,7 +143,7 @@ def run_parameter_validation_queries(
# when run_parameter_validation_queries() is called from execute_tests_query.py:
# we disable tests and write validation errors to test_results table.
if booRunFromTestExec:
- # Copy test results to DK DB, using temporary flagged -1 value to identify
+ # Copy test results to DK DB, using temporary flagged D value to identify
LOG.info("CurrentStep: Saving error results for invalid tests")
strReportValErrors = clsExecute.ReportTestValidationErrors()
RunActionQueryList("DKTG", [strReportValErrors])
diff --git a/testgen/template/dbsetup/020_create_standard_functions_sprocs.sql b/testgen/template/dbsetup/020_create_standard_functions_sprocs.sql
index f21925e..c0bad4d 100644
--- a/testgen/template/dbsetup/020_create_standard_functions_sprocs.sql
+++ b/testgen/template/dbsetup/020_create_standard_functions_sprocs.sql
@@ -177,3 +177,86 @@ FROM (
) AS t
WHERE trim(value) <> ''
$$ LANGUAGE sql;
+
+
+CREATE OR REPLACE FUNCTION {SCHEMA_NAME}.fn_normal_cdf(z_score DOUBLE PRECISION)
+RETURNS DOUBLE PRECISION AS
+$$
+/*
+ This function calculates the cumulative distribution function (CDF)
+ for the standard normal distribution for a given Z-score using
+ the Abramowitz and Stegun approximation method. It returns the
+ probability that a standard normal variable is less than or equal
+ to the given Z-score.
+
+ The approximation formula uses a series expansion to estimate the
+ CDF, which is accurate for most practical purposes.
+
+ To estimate the count of observations that fall outside a certain Z-score
+ (both above and below), you can use the `normal_cdf()` function. For a
+ total number of observations N, the proportion of values outside the Z-score
+ is given by: 2 * (1 - normal_cdf(ABS(Z)))
+
+ This gives the proportion of values greater than the positive Z-score and
+ less than the negative Z-score combined. To get the estimated count of
+ observations, multiply this proportion by N: N * 2 * (1 - normal_cdf(ABS(Z)))
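+
+    As a rough worked example (illustrative numbers only): for N = 10,000 records
+    and Z = 2, the estimate is 10,000 * 2 * (1 - normal_cdf(2)) ≈ 10,000 * 0.0455 ≈ 455
+    observations beyond two standard deviations in either direction.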
+*/
+DECLARE
+ t DOUBLE PRECISION;
+ cdf DOUBLE PRECISION;
+BEGIN
+ t := 1.0 / (1.0 + 0.2316419 * ABS(z_score));
+
+ cdf := (1.0 / SQRT(2 * PI())) * EXP(-0.5 * z_score * z_score) *
+ (0.319381530 * t
+ - 0.356563782 * t * t
+ + 1.781477937 * t * t * t
+ - 1.821255978 * t * t * t * t
+ + 1.330274429 * t * t * t * t * t);
+
+ IF z_score >= 0 THEN
+ RETURN 1.0 - cdf;
+ ELSE
+ RETURN cdf;
+ END IF;
+END;
+$$ LANGUAGE plpgsql;
+
+
+CREATE OR REPLACE FUNCTION {SCHEMA_NAME}.fn_eval(expression TEXT) RETURNS FLOAT
+AS
+$$
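+/*
+    Validates and evaluates an arithmetic expression string, returning the result as FLOAT.
+    The expression is first screened for unexpected characters and for dangerous SQL
+    keywords; then every allowed token (numbers, arithmetic operators, and the functions
+    GREATEST, LEAST, ABS, FN_NORMAL_CDF, DATEDIFF, DAY, plus the FLOAT keyword) is stripped
+    out, and any leftover text raises an exception instead of being executed.
+*/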
+DECLARE
+ result FLOAT;
+ invalid_parts TEXT;
+BEGIN
+ -- Check the modified expression for invalid characters, allowing colons
+ IF expression ~* E'[^0-9+\\-*/(),.\\sA-Z_:e\\\'"]' THEN
+ RAISE EXCEPTION 'Invalid characters detected in expression: %', expression;
+ END IF;
+
+ -- Check for dangerous PostgreSQL-specific keywords
+ IF expression ~* E'\b(DROP|ALTER|INSERT|UPDATE|DELETE|TRUNCATE|GRANT|REVOKE|COPY|EXECUTE|CREATE|COMMENT|SECURITY|WITH|SET ROLE|SET SESSION|DO|CALL|--|/\\*|;|pg_read_file|pg_write_file|pg_terminate_backend)\b' THEN
+ RAISE EXCEPTION 'Invalid expression: dangerous statement detected';
+ END IF;
+
+ -- Remove all allowed tokens from the validation expression, treating 'FLOAT' as a keyword
+ invalid_parts := regexp_replace(
+ expression,
+ E'(\\mGREATEST|LEAST|ABS|FN_NORMAL_CDF|DATEDIFF|DAY|FLOAT)\\M|[0-9]+(\\.[0-9]+)?([eE][+-]?[0-9]+)?|[+\\-*/(),\\\'":]+|\\s+',
+ '',
+ 'gi'
+ );
+
+ -- If anything is left in the validation expression, it's invalid
+ IF invalid_parts <> '' THEN
+ RAISE EXCEPTION 'Invalid expression contains invalid tokens "%" in expression: %', invalid_parts, expression;
+ END IF;
+
+ -- Use the original expression (with ::FLOAT) for execution
+ EXECUTE format('SELECT (%s)::FLOAT', expression) INTO result;
+
+ RETURN result;
+END;
+$$
+LANGUAGE plpgsql;
diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql
index 8c14348..4e6a7be 100644
--- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql
+++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql
@@ -30,13 +30,13 @@ CREATE TABLE stg_functional_table_updates (
);
CREATE TABLE projects (
- id UUID DEFAULT gen_random_uuid(),
- project_code VARCHAR(30) NOT NULL
+ id UUID DEFAULT gen_random_uuid(),
+ project_code VARCHAR(30) NOT NULL
CONSTRAINT projects_project_code_pk
PRIMARY KEY,
- project_name VARCHAR(50),
- effective_from_date DATE,
- effective_thru_date DATE,
+ project_name VARCHAR(50),
+ effective_from_date DATE,
+ effective_thru_date DATE,
observability_api_key TEXT,
observability_api_url TEXT DEFAULT ''
);
@@ -94,26 +94,32 @@ CREATE TABLE table_groups
source_process VARCHAR(40),
business_domain VARCHAR(40),
stakeholder_group VARCHAR(40),
- transform_level VARCHAR(40)
+ transform_level VARCHAR(40),
+ last_complete_profile_run_id UUID,
+ dq_score_profiling FLOAT,
+ dq_score_testing FLOAT
);
CREATE TABLE profiling_runs (
- id UUID
+ id UUID
CONSTRAINT pk_prun_id
PRIMARY KEY,
- project_code VARCHAR(30) NOT NULL,
- connection_id BIGINT NOT NULL,
- table_groups_id UUID NOT NULL,
- profiling_starttime TIMESTAMP,
- profiling_endtime TIMESTAMP,
- status VARCHAR(100) DEFAULT 'Running',
- log_message VARCHAR,
- table_ct BIGINT,
- column_ct BIGINT,
- anomaly_ct BIGINT,
- anomaly_table_ct BIGINT,
- anomaly_column_ct BIGINT,
- process_id INTEGER
+ project_code VARCHAR(30) NOT NULL,
+ connection_id BIGINT NOT NULL,
+ table_groups_id UUID NOT NULL,
+ profiling_starttime TIMESTAMP,
+ profiling_endtime TIMESTAMP,
+ status VARCHAR(100) DEFAULT 'Running',
+ log_message VARCHAR,
+ table_ct BIGINT,
+ column_ct BIGINT,
+ anomaly_ct BIGINT,
+ anomaly_table_ct BIGINT,
+ anomaly_column_ct BIGINT,
+ dq_affected_data_points BIGINT,
+ dq_total_data_points BIGINT,
+ dq_score_profiling FLOAT,
+ process_id INTEGER
);
CREATE TABLE test_suites (
@@ -128,16 +134,12 @@ CREATE TABLE test_suites (
test_action VARCHAR(100),
severity VARCHAR(10),
export_to_observability VARCHAR(5) DEFAULT 'Y',
--- email_list VARCHAR(200),
--- email_slack VARCHAR(100),
--- wiki_link VARCHAR(200),
--- variation_link VARCHAR(200),
--- wiki_page_id BIGINT,
--- confluence_space VARCHAR(10),
test_suite_schema VARCHAR(100),
component_key VARCHAR(100),
component_type VARCHAR(100),
component_name VARCHAR(100),
+ last_complete_test_run_id UUID,
+ dq_score_exclude BOOLEAN default FALSE,
CONSTRAINT test_suites_id_pk
PRIMARY KEY (id)
);
@@ -230,6 +232,10 @@ CREATE TABLE profile_results (
filled_value_ct BIGINT,
min_text VARCHAR(1000),
max_text VARCHAR(1000),
+ upper_case_ct BIGINT,
+ lower_case_ct BIGINT,
+ non_alpha_ct BIGINT,
+ mixed_case_ct BIGINT GENERATED ALWAYS AS ( value_ct - upper_case_ct - lower_case_ct - non_alpha_ct ) STORED,
numeric_ct BIGINT,
date_ct BIGINT,
top_patterns VARCHAR(1000),
@@ -249,9 +255,11 @@ CREATE TABLE profile_results (
before_1yr_date_ct BIGINT,
before_5yr_date_ct BIGINT,
before_20yr_date_ct BIGINT,
+ before_100yr_date_ct BIGINT,
within_1yr_date_ct BIGINT,
within_1mo_date_ct BIGINT,
future_date_ct BIGINT,
+ distant_future_date_ct BIGINT,
date_days_present BIGINT,
date_weeks_present BIGINT,
date_months_present BIGINT,
@@ -275,13 +283,15 @@ CREATE TABLE profile_anomaly_types (
CONSTRAINT pk_anomaly_types_id
PRIMARY KEY,
anomaly_type VARCHAR(200) NOT NULL,
- data_object VARCHAR(10), -- Table, Dates, Column
+ data_object VARCHAR(10), -- Column, Multi-Col, Dates, Variant
anomaly_name VARCHAR(100),
anomaly_description VARCHAR(500),
anomaly_criteria VARCHAR(2000),
detail_expression VARCHAR(2000),
issue_likelihood VARCHAR(50), -- Potential, Likely, Certain
- suggested_action VARCHAR(1000) -- Consider, Investigate, Correct
+ suggested_action VARCHAR(1000),
+ dq_score_prevalence_formula TEXT,
+ dq_score_risk_factor TEXT
);
CREATE TABLE profile_anomaly_results (
@@ -298,7 +308,8 @@ CREATE TABLE profile_anomaly_results (
column_type VARCHAR(50),
anomaly_id VARCHAR(10),
detail VARCHAR,
- disposition VARCHAR(20) -- Confirmed, Dismissed, Inactive
+ disposition VARCHAR(20), -- Confirmed, Dismissed, Inactive
+ dq_prevalence FLOAT
);
@@ -350,7 +361,10 @@ CREATE TABLE data_table_chars (
drop_date TIMESTAMP,
record_ct BIGINT,
column_ct BIGINT,
- data_point_ct BIGINT
+ data_point_ct BIGINT,
+ last_complete_profile_run_id UUID,
+ dq_score_profiling FLOAT,
+ dq_score_testing FLOAT
);
CREATE TABLE data_column_chars (
@@ -384,7 +398,10 @@ CREATE TABLE data_column_chars (
fails_30_days_prior INTEGER,
warnings_last_run INTEGER,
warnings_7_days_prior INTEGER,
- warnings_30_days_prior INTEGER
+ warnings_30_days_prior INTEGER,
+ last_complete_profile_run_id UUID,
+ dq_score_profiling FLOAT,
+ dq_score_testing FLOAT
);
CREATE TABLE test_types (
@@ -399,6 +416,8 @@ CREATE TABLE test_types (
measure_uom VARCHAR(100),
measure_uom_description VARCHAR(200),
selection_criteria TEXT,
+ dq_score_prevalence_formula TEXT,
+ dq_score_risk_factor TEXT,
column_name_prompt TEXT,
column_name_help TEXT,
default_parm_columns TEXT,
@@ -434,25 +453,28 @@ CREATE TABLE generation_sets (
);
CREATE TABLE test_runs (
- id UUID NOT NULL
+ id UUID NOT NULL
CONSTRAINT test_runs_id_pk
PRIMARY KEY,
- test_suite_id UUID NOT NULL,
- test_starttime TIMESTAMP,
- test_endtime TIMESTAMP,
- status VARCHAR(100) DEFAULT 'Running',
- log_message TEXT,
- duration VARCHAR(50),
- test_ct INTEGER,
- passed_ct INTEGER,
- failed_ct INTEGER,
- warning_ct INTEGER,
- error_ct INTEGER,
- table_ct INTEGER,
- column_ct INTEGER,
- column_failed_ct INTEGER,
- column_warning_ct INTEGER,
- process_id INTEGER,
+ test_suite_id UUID NOT NULL,
+ test_starttime TIMESTAMP,
+ test_endtime TIMESTAMP,
+ status VARCHAR(100) DEFAULT 'Running',
+ log_message TEXT,
+ duration VARCHAR(50),
+ test_ct INTEGER,
+ passed_ct INTEGER,
+ failed_ct INTEGER,
+ warning_ct INTEGER,
+ error_ct INTEGER,
+ table_ct INTEGER,
+ column_ct INTEGER,
+ column_failed_ct INTEGER,
+ column_warning_ct INTEGER,
+ dq_affected_data_points BIGINT,
+ dq_total_data_points BIGINT,
+ dq_score_test_run FLOAT,
+ process_id INTEGER,
CONSTRAINT test_runs_test_suites_fk
FOREIGN KEY (test_suite_id) REFERENCES test_suites
);
@@ -488,6 +510,8 @@ CREATE TABLE test_results (
test_description VARCHAR(1000),
test_run_id UUID NOT NULL,
table_groups_id UUID,
+ dq_prevalence FLOAT,
+ dq_record_ct BIGINT,
observability_status VARCHAR(10),
CONSTRAINT test_results_test_suites_project_code_test_suite_fk
FOREIGN KEY (test_suite_id) REFERENCES test_suites
diff --git a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql
index c4ea048..2524edc 100644
--- a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql
+++ b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql
@@ -12,15 +12,16 @@ ALTER TABLE cat_test_conditions DROP CONSTRAINT cat_test_conditions_cat_tests_te
TRUNCATE TABLE profile_anomaly_types;
-INSERT INTO profile_anomaly_types (id, anomaly_type, data_object, anomaly_name, anomaly_description, anomaly_criteria, detail_expression, issue_likelihood, suggested_action)
+INSERT INTO profile_anomaly_types
+ (id, anomaly_type, data_object, anomaly_name, anomaly_description, anomaly_criteria, detail_expression, issue_likelihood, suggested_action, dq_score_prevalence_formula, dq_score_risk_factor)
VALUES ('1001', 'Suggested_Type', 'Column', 'Suggested Data Type', 'Data stored as text all meets criteria for a more suitable type. ', '(functional_data_type NOT IN (''Boolean'', ''Flag'') ) AND (column_type ILIKE ''%ch
ar%'' OR column_type ILIKE ''text'') AND NOT (datatype_suggestion ILIKE ''%char%'' OR datatype_suggestion ILIKE ''text'')', 'p.datatype_suggestion::VARCHAR(200)', 'Likely', 'Consider changing the column data type to tighte
-n controls over data ingested and to make values more efficient, consistent and suitable for downstream analysis.'),
- ('1002', 'Non_Standard_Blanks', 'Column', 'Non-Standard Blank Values', 'Values representing missing data may be unexpected or inconsistent. Non-standard values may include empty strings as opposed to nulls, dummy entries such as "MISSING" or repeated characters that may have been used to bypass entry requirements, processing artifacts such as "NULL", or spreadsheet artifacts such as "NA", "ERROR".', '(p.filled_value_ct > 0 OR p.zero_length_ct > 0)', '''Filled Values: '' || p.filled_value_ct::VARCHAR || '', Empty String: '' || p.zero_length_ct::VARCHAR || '', Null: '' || p.null_value_ct::VARCHAR || '', Records: '' || p.record_ct::VARCHAR', 'Definite', 'Consider cleansing the column upon ingestion to replace all variants of missing data with a standard designation, like Null.'),
- ('1003', 'Invalid_Zip_USA', 'Column', 'Invalid USA Zip Code Format', 'Some values present do not conform with the expected format of USA Zip Codes.', 'p.std_pattern_match = ''ZIP_USA'' AND (p.general_type <> ''A'' OR p.filled_value_ct > 0 OR p.min_length >= 1 AND p.min_length <= 4 OR p.max_length > 10)', 'CASE WHEN p.general_type = ''N'' THEN ''Type: '' || p.column_type || '', '' ELSE '''' END || ''Min Length: '' || p.min_length::VARCHAR || '', Max Length: '' || p.max_length::VARCHAR || '', Filled Values: '' || p.filled_value_ct::VARCHAR', 'Definite', 'Consider correcting invalid column values or changing them to indicate a missing value if corrections cannot be made.'),
- ('1004', 'Multiple_Types_Minor', 'Multi-Col', 'Multiple Data Types per Column Name - Minor', 'Columns with the same name have the same general type across tables, but the types do not exactly match. Truncation issues may result if columns are commingled and assumed to be the same format.', 'm.general_type_ct = 1 AND m.type_ct > 1', '''Found '' || m.column_ct::VARCHAR || '' columns, '' || m.type_ct::VARCHAR(10) || '' types, '' || m.min_type || '' to '' || m.max_type', 'Possible', 'Consider changing the column data types to be fully consistent. This will tighten your standards at ingestion and assure that data is consistent between tables.'),
- ('1005', 'Multiple_Types_Major', 'Multi-Col', 'Multiple Data Types per Column Name - Major', 'Columns with the same name have broadly different types across tables. Differences could be significant enough to cause errors in downstream analysis, extra steps resulting in divergent business logic and inconsistencies in results.', 'm.general_type_ct > 1', '''Found '' || m.column_ct::VARCHAR || '' columns, '' || m.type_ct::VARCHAR(10) || '' types, '' || m.min_type || '' to '' || m.max_type', 'Likely', 'Ideally, you should change the column data types to be fully consistent. If the data is meant to be different, you should change column names so downstream users aren''t led astray.'),
- ('1006', 'No_Values', 'Column', 'No Column Values Present', 'This column is present in the table, but no values have been ingested or assigned in any records. This could indicate missing data or a processing error. Note that this considers dummy values and zero-length values as missing data. ', '(p.null_value_ct + p.filled_value_ct + p.zero_length_ct) = p.record_ct', '''Null: '' || p.null_value_ct::VARCHAR(10) || '', Filled: '' || p.filled_value_ct::VARCHAR(10) || '', Zero Len: '' || p.zero_length_ct::VARCHAR(10)', 'Possible', 'Review your source data, ingestion process, and any processing steps that update this column.'),
+n controls over data ingested and to make values more efficient, consistent and suitable for downstream analysis.', NULL, NULL),
+ ('1002', 'Non_Standard_Blanks', 'Column', 'Non-Standard Blank Values', 'Values representing missing data may be unexpected or inconsistent. Non-standard values may include empty strings as opposed to nulls, dummy entries such as "MISSING" or repeated characters that may have been used to bypass entry requirements, processing artifacts such as "NULL", or spreadsheet artifacts such as "NA", "ERROR".', '(p.filled_value_ct > 0 OR p.zero_length_ct > 0)', '''Filled Values: '' || p.filled_value_ct::VARCHAR || '', Empty String: '' || p.zero_length_ct::VARCHAR || '', Null: '' || p.null_value_ct::VARCHAR || '', Records: '' || p.record_ct::VARCHAR', 'Definite', 'Consider cleansing the column upon ingestion to replace all variants of missing data with a standard designation, like Null.', 'p.filled_value_ct::FLOAT/p.record_ct::FLOAT', '1.0'),
+ ('1003', 'Invalid_Zip_USA', 'Column', 'Invalid USA Zip Code Format', 'Some values present do not conform with the expected format of USA Zip Codes.', 'p.std_pattern_match = ''ZIP_USA'' AND (p.general_type <> ''A'' OR p.filled_value_ct > 0 OR p.min_length >= 1 AND p.min_length <= 4 OR p.max_length > 10)', 'CASE WHEN p.general_type = ''N'' THEN ''Type: '' || p.column_type || '', '' ELSE '''' END || ''Min Length: '' || p.min_length::VARCHAR || '', Max Length: '' || p.max_length::VARCHAR || '', Filled Values: '' || p.filled_value_ct::VARCHAR', 'Definite', 'Consider correcting invalid column values or changing them to indicate a missing value if corrections cannot be made.', NULL, '1.0'),
+ ('1004', 'Multiple_Types_Minor', 'Multi-Col', 'Multiple Data Types per Column Name - Minor', 'Columns with the same name have the same general type across tables, but the types do not exactly match. Truncation issues may result if columns are commingled and assumed to be the same format.', 'm.general_type_ct = 1 AND m.type_ct > 1', '''Found '' || m.column_ct::VARCHAR || '' columns, '' || m.type_ct::VARCHAR(10) || '' types, '' || m.min_type || '' to '' || m.max_type', 'Possible', 'Consider changing the column data types to be fully consistent. This will tighten your standards at ingestion and assure that data is consistent between tables.', NULL, NULL),
+ ('1005', 'Multiple_Types_Major', 'Multi-Col', 'Multiple Data Types per Column Name - Major', 'Columns with the same name have broadly different types across tables. Differences could be significant enough to cause errors in downstream analysis, extra steps resulting in divergent business logic and inconsistencies in results.', 'm.general_type_ct > 1', '''Found '' || m.column_ct::VARCHAR || '' columns, '' || m.type_ct::VARCHAR(10) || '' types, '' || m.min_type || '' to '' || m.max_type', 'Likely', 'Ideally, you should change the column data types to be fully consistent. If the data is meant to be different, you should change column names so downstream users aren''t led astray.', NULL, NULL),
+ ('1006', 'No_Values', 'Column', 'No Column Values Present', 'This column is present in the table, but no values have been ingested or assigned in any records. This could indicate missing data or a processing error. Note that this considers dummy values and zero-length values as missing data. ', '(p.null_value_ct + p.filled_value_ct + p.zero_length_ct) = p.record_ct', '''Null: '' || p.null_value_ct::VARCHAR(10) || '', Filled: '' || p.filled_value_ct::VARCHAR(10) || '', Zero Len: '' || p.zero_length_ct::VARCHAR(10)', 'Possible', 'Review your source data, ingestion process, and any processing steps that update this column.', '1.0', '0.33'),
('1007', 'Column_Pattern_Mismatch', 'Column', 'Pattern Inconsistency Within Column', 'Alpha-numeric string data within this column conforms to 2-4 different patterns, with 95% matching the first pattern. This could indicate data errors in the remaining values. ', 'p.general_type = ''A''
AND p.max_length > 3
AND p.value_ct > (p.numeric_ct + p.filled_value_ct)
@@ -31,127 +32,121 @@ n controls over data ingested and to make values more efficient, consistent and
AND SPLIT_PART(p.top_patterns, ''|'', 3)::NUMERIC / SPLIT_PART(p.top_patterns, ''|'', 1)::NUMERIC < 0.05)
OR
SPLIT_PART(p.top_patterns, ''|'', 3)::NUMERIC / SPLIT_PART(p.top_patterns, ''|'', 1)::NUMERIC < 0.1
- )', '''Patterns: '' || p.top_patterns', 'Likely', 'Review the values for any data that doesn''t conform to the most common pattern and correct any data errors.'),
+ )', '''Patterns: '' || p.top_patterns', 'Likely', 'Review the values for any data that doesn''t conform to the most common pattern and correct any data errors.', '(p.record_ct - SPLIT_PART(p.top_patterns, ''|'', 1)::INT)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66'),
('1008', 'Table_Pattern_Mismatch', 'Multi-Col', 'Pattern Inconsistency Across Tables', 'Alpha-numeric string data within this column matches a single pattern, but other columns with the same name have data that matches a different single pattern. Inconsistent formatting may contradict user assumptions and cause downstream errors, extra steps and inconsistent business logic.', 'p.general_type = ''A''
AND p.max_length > 3
AND p.value_ct > (p.numeric_ct + p.filled_value_ct)
AND m.max_pattern_ct = 1
AND m.column_ct > 1
AND SPLIT_PART(p.top_patterns, ''|'', 2) <> SPLIT_PART(m.very_top_pattern, ''|'', 2)
- AND SPLIT_PART(p.top_patterns, ''|'', 1)::NUMERIC / SPLIT_PART(m.very_top_pattern, ''|'', 1)::NUMERIC < 0.1', '''Patterns: '' || SPLIT_PART(p.top_patterns, ''|'', 2) || '', '' || SPLIT_PART(ltrim(m.very_top_pattern, ''0''), ''|'', 2)', 'Likely', 'Review the profiled patterns for the same column in other tables. You may want to add a hygiene step to your processing to make patterns consistent.'),
- ('1009', 'Leading_Spaces', 'Column', 'Leading Spaces Found in Column Values', 'Spaces were found before data at the front of column string values. This likely contradicts user expectations and could be a sign of broader ingestion or processing errors.', 'p.lead_space_ct > 0', '''Cases Found: '' || p.lead_space_ct::VARCHAR(10)', 'Likely', 'Review your source data, ingestion process, and any processing steps that update this column.'),
- ('1010', 'Quoted_Values', 'Column', 'Quoted Values Found in Column Values', 'Column values were found within quotes. This likely contradicts user expectations and could be a sign of broader ingestion or processing errors.', 'p.quoted_value_ct > 0', '''Cases Found: '' || p.quoted_value_ct::VARCHAR(10)', 'Likely', 'Review your source data, ingestion process, and any processing steps that update this column.'),
+ AND SPLIT_PART(p.top_patterns, ''|'', 1)::NUMERIC / SPLIT_PART(m.very_top_pattern, ''|'', 1)::NUMERIC < 0.1', '''Patterns: '' || SPLIT_PART(p.top_patterns, ''|'', 2) || '', '' || SPLIT_PART(ltrim(m.very_top_pattern, ''0''), ''|'', 2)', 'Likely', 'Review the profiled patterns for the same column in other tables. You may want to add a hygiene step to your processing to make patterns consistent.', NULL, NULL),
+ ('1009', 'Leading_Spaces', 'Column', 'Leading Spaces Found in Column Values', 'Spaces were found before data at the front of column string values. This likely contradicts user expectations and could be a sign of broader ingestion or processing errors.', 'p.lead_space_ct > 0', '''Cases Found: '' || p.lead_space_ct::VARCHAR(10)', 'Likely', 'Review your source data, ingestion process, and any processing steps that update this column.', 'p.lead_space_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66'),
+ ('1010', 'Quoted_Values', 'Column', 'Quoted Values Found in Column Values', 'Column values were found within quotes. This likely contradicts user expectations and could be a sign of broader ingestion or processing errors.', 'p.quoted_value_ct > 0', '''Cases Found: '' || p.quoted_value_ct::VARCHAR(10)', 'Likely', 'Review your source data, ingestion process, and any processing steps that update this column.', 'p.quoted_value_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66'),
('1011', 'Char_Column_Number_Values', 'Column', 'Character Column with Mostly Numeric Values', 'This column is defined as alpha, but more than 95% of its values are numeric. Numbers in alpha columns won''t sort correctly, and might contradict user expectations downstream. It''s also possible that more than one type of information is stored in the column, making it harder to retrieve.', 'p.general_type = ''A''
AND p.column_name NOT ILIKE ''%zip%''
AND p.functional_data_type NOT ILIKE ''id%''
AND p.value_ct > p.numeric_ct
- AND p.numeric_ct::NUMERIC > (0.95 * p.value_ct::NUMERIC)', '''Numeric Ct: '' || p.numeric_ct || '' of '' || p.value_ct || '' (Numeric Percent: '' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5) / p.value_ct::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)', 'Likely', 'Review your source data and ingestion process. Consider whether it might be better to store the numeric data in a numeric column. If the alpha data is significant, you could store it in a different column.'),
+ AND p.numeric_ct::NUMERIC > (0.95 * p.value_ct::NUMERIC)', '''Numeric Ct: '' || p.numeric_ct || '' of '' || p.value_ct || '' (Numeric Percent: '' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5) / p.value_ct::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)', 'Likely', 'Review your source data and ingestion process. Consider whether it might be better to store the numeric data in a numeric column. If the alpha data is significant, you could store it in a different column.', 'p.numeric_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66'),
('1012', 'Char_Column_Date_Values', 'Column', 'Character Column with Mostly Date Values', 'This column is defined as alpha, but more than 95% of its values are dates. Dates in alpha columns might not sort correctly, and might contradict user expectations downstream. It''s also possible that more than one type of information is stored in the column, making it harder to retrieve. ', 'p.general_type = ''A''
AND p.value_ct > p.date_ct
- AND p.date_ct::NUMERIC > (0.95 * p.value_ct::NUMERIC)', ''' Date Ct: '' || p.date_ct || '' of '' || p.value_ct || '' (Date Percent: '' || ROUND(100.0 * p.date_ct::NUMERIC(18, 5) / p.value_ct::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)' , 'Likely', 'Review your source data and ingestion process. Consider whether it might be better to store the date values as a date or datetime column. If the alpha data is also significant, you could store it in a different column.'),
+ AND p.date_ct::NUMERIC > (0.95 * p.value_ct::NUMERIC)', ''' Date Ct: '' || p.date_ct || '' of '' || p.value_ct || '' (Date Percent: '' || ROUND(100.0 * p.date_ct::NUMERIC(18, 5) / p.value_ct::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)', 'Likely', 'Review your source data and ingestion process. Consider whether it might be better to store the date values as a date or datetime column. If the alpha data is also significant, you could store it in a different column.', 'p.date_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66'),
('1013', 'Small Missing Value Ct', 'Column', 'Small Percentage of Missing Values Found', 'Under 3% of values in this column were found to be null, zero-length or dummy values, but values are not universally present. This could indicate unexpected missing values in a required column.', '(p.value_ct - p.zero_length_ct - p.filled_value_ct)::FLOAT / p.record_ct::FLOAT > 0.97
AND (p.value_ct - p.zero_length_ct - p.filled_value_ct) < p.record_ct', '(p.record_ct - (p.value_ct - p.zero_length_ct - p.filled_value_ct))::VARCHAR(20) ||
'' of '' || p.record_ct::VARCHAR(20) || '' blank values: '' ||
ROUND(100.0 * (p.record_ct - (p.value_ct - p.zero_length_ct - p.filled_value_ct))::NUMERIC(18, 5)
- / NULLIF(p.value_ct, 0)::NUMERIC(18, 5), 2)::VARCHAR(40) || ''%''', 'Possible', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected, supplemented or excluded.'),
+ / NULLIF(p.value_ct, 0)::NUMERIC(18, 5), 2)::VARCHAR(40) || ''%''', 'Possible', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected, supplemented or excluded.', '(p.null_value_ct + filled_value_ct + zero_length_ct)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.33'),
('1014', 'Small Divergent Value Ct', 'Column', 'Small Percentage of Divergent Values Found', 'Under 3% of values in this column were found to be different from the most common value. This could indicate a data error.', '(100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT /
p.value_ct::FLOAT) > 97::FLOAT
AND (100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT /
NULLIF(p.value_ct, 0)::FLOAT) < 100::FLOAT', '''Single Value Pct: '' || ROUND(100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT
/ NULLIF(p.value_ct, 0)::FLOAT)::VARCHAR(40)
- || '', Value | Freq: '' || top_freq_values', 'Possible', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected.'),
+ || '', Value | Freq: '' || top_freq_values', 'Possible', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected.', '(p.record_ct - fn_parsefreq(p.top_freq_values, 1, 2)::INT)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.33'),
('1015', 'Boolean_Value_Mismatch', 'Column', 'Unexpected Boolean Values Found', 'This column appears to contain boolean (True/False) data, but unexpected values were found. This could indicate inconsistent coding for the same intended values, potentially leading to downstream errors or inconsistent business logic. ', '(distinct_value_ct > 1 AND
((lower(top_freq_values) ILIKE ''| true |%'' OR lower(top_freq_values) ILIKE ''| false |%'') AND NOT (lower(top_freq_values) ILIKE ''%| true |%'' AND lower(top_freq_values) ILIKE ''%| false |%''))
OR ((lower(top_freq_values) ILIKE ''| yes |%'' OR lower(top_freq_values) ILIKE ''| no |%'' ) AND NOT (lower(top_freq_values) ILIKE ''%| yes |%'' AND lower(top_freq_values) ILIKE ''%| no |%'')) )', 'CASE WHEN p.top_freq_values IS NULL THEN ''Min: '' || p.min_text || '', Max: '' || p.max_text
- ELSE ''Top Freq: '' || p.top_freq_values END', 'Likely', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected. '),
+ ELSE ''Top Freq: '' || p.top_freq_values END', 'Likely', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected. ', NULL, '0.66'),
('1016', 'Potential_Duplicates', 'Column', 'Potential Duplicate Values Found', 'This column is largely unique, but some duplicate values are present. This pattern is uncommon and could indicate inadvertent duplication. ', 'p.distinct_value_ct > 1000
- AND fn_parsefreq(p.top_freq_values, 1, 2)::BIGINT BETWEEN 2 AND 4', '''Top Freq: '' || p.top_freq_values', 'Possible', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected. '),
+ AND fn_parsefreq(p.top_freq_values, 1, 2)::BIGINT BETWEEN 2 AND 4', '''Top Freq: '' || p.top_freq_values', 'Possible', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected. ', '(p.value_ct - p.distinct_value_ct)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.33'),
('1017', 'Standardized_Value_Matches', 'Column', 'Similar Values Match When Standardized', 'When column values are standardized (removing spaces, single-quotes, periods and dashes), matching values are found in other records. This may indicate that formats should be further standardized to allow consistent comparisons for merges, joins and roll-ups. It could also indicate the presence of unintended duplicates.', 'p.general_type = ''A'' AND p.distinct_std_value_ct <> p.distinct_value_ct', '''Distinct Values: '' || p.distinct_value_ct::VARCHAR
- || '', Standardized: '' || p.distinct_std_value_ct::VARCHAR', 'Likely', 'Review standardized vs. raw data values for all matches. Correct data if values should be consistent.'),
+ || '', Standardized: '' || p.distinct_std_value_ct::VARCHAR', 'Likely', 'Review standardized vs. raw data values for all matches. Correct data if values should be consistent.', '(p.distinct_value_ct - p.distinct_std_value_ct)::FLOAT/NULLIF(p.value_ct, 0)', '0.66'),
('1018', 'Unlikely_Date_Values', 'Column', 'Unlikely Dates out of Typical Range', 'Some date values in this column are earlier than 1900-01-01 or later than 30 years after Profiling date.', 'p.general_type = ''D''
AND (p.min_date BETWEEN ''0001-01-02''::DATE AND ''1900-01-01''::DATE
- OR p.max_date > CURRENT_DATE + INTERVAL ''30 year'')', '''Date Range: '' || p.min_date::VARCHAR || '' thru '' || p.max_date::VARCHAR', 'Likely', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected or removed.'),
- ('1019', 'Recency_One_Year', 'Dates', 'Recency - No Table Dates within 1 Year', 'Among all date columns present in the table, none fall inside of one year from Profile date.', 'MAX(p.max_date) < CURRENT_DATE - INTERVAL ''1 year''', '''Most Recent Date: '' || MAX(p.max_date)::VARCHAR', 'Possible', 'Review your source data and follow-up with data owners to determine whether dates in table should be more recent.'),
- ('1020', 'Recency_Six_Months', 'Dates', 'Recency - No Table Dates within 6 Months', 'Among all date columns present in the table, the most recent date falls 6 months to 1 year back from Profile date. ', 'MAX(p.max_date) >= CURRENT_DATE - INTERVAL ''1 year'' AND MAX(p.max_date) < CURRENT_DATE - INTERVAL ''6 months''', '''Most Recent Date: '' || MAX(p.max_date)::VARCHAR', 'Possible', 'Review your source data and follow-up with data owners to determine whether dates in table should be more recent.'),
+ OR p.max_date > CURRENT_DATE + INTERVAL ''30 year'')', '''Date Range: '' || p.min_date::VARCHAR || '' thru '' || p.max_date::VARCHAR', 'Likely', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected or removed.', '(COALESCE(p.before_100yr_date_ct,0)+COALESCE(p.distant_future_date_ct, 0))::FLOAT/NULLIF(p.record_ct, 0)', '0.66'),
+ ('1019', 'Recency_One_Year', 'Dates', 'Recency - No Table Dates within 1 Year', 'Among all date columns present in the table, none fall inside of one year from Profile date.', 'MAX(p.max_date) < CURRENT_DATE - INTERVAL ''1 year''', '''Most Recent Date: '' || MAX(p.max_date)::VARCHAR', 'Possible', 'Review your source data and follow-up with data owners to determine whether dates in table should be more recent.', NULL, NULL),
+ ('1020', 'Recency_Six_Months', 'Dates', 'Recency - No Table Dates within 6 Months', 'Among all date columns present in the table, the most recent date falls 6 months to 1 year back from Profile date. ', 'MAX(p.max_date) >= CURRENT_DATE - INTERVAL ''1 year'' AND MAX(p.max_date) < CURRENT_DATE - INTERVAL ''6 months''', '''Most Recent Date: '' || MAX(p.max_date)::VARCHAR', 'Possible', 'Review your source data and follow-up with data owners to determine whether dates in table should be more recent.', NULL, NULL),
('1021', 'Unexpected US States', 'Column', 'Unexpected Column Contains US States', 'This column is not labeled as a state, but contains mostly US State abbreviations. This could indicate shifted or switched source data columns.', 'p.std_pattern_match = ''STATE_USA''
AND p.distinct_value_ct > 5
- AND NOT (p.column_name ILIKE ''%state%'' OR p.column_name ILIKE ''%_st'')', '''Value Range: '' || p.min_text || '' thru '' || max_text || CASE WHEN p.top_freq_values > '''' THEN ''Top Freq Values: '' || REPLACE(p.top_freq_values, CHR(10), '' ; '') ELSE '''' END ', 'Possible', 'Review your source data and follow-up with data owners to determine whether column should be populated with US states.'),
+ AND NOT (p.column_name ILIKE ''%state%'' OR p.column_name ILIKE ''%_st'')', '''Value Range: '' || p.min_text || '' thru '' || max_text || CASE WHEN p.top_freq_values > '''' THEN ''Top Freq Values: '' || REPLACE(p.top_freq_values, CHR(10), '' ; '') ELSE '''' END ', 'Possible', 'Review your source data and follow-up with data owners to determine whether column should be populated with US states.', NULL, '0.33'),
('1022', 'Unexpected Emails', 'Column', 'Unexpected Column Contains Emails', 'This column is not labeled as email, but contains mostly email addresses. This could indicate shifted or switched source data columns.', 'p.std_pattern_match = ''EMAIL''
- AND NOT (p.column_name ILIKE ''%email%'' OR p.column_name ILIKE ''%addr%'')', '''Value Range: '' || p.min_text || '' thru '' || max_text', 'Possible', 'Review your source data and follow-up with data owners to determine whether column should be populated with email addresses.'),
- ('1023', 'Small_Numeric_Value_Ct', 'Column', 'Unexpected Numeric Values Found',
- 'Under 3% of values in this column were found to be numeric. This could indicate a data error.', e'p.general_type = \'A\'
+ AND NOT (p.column_name ILIKE ''%email%'' OR p.column_name ILIKE ''%addr%'')', '''Value Range: '' || p.min_text || '' thru '' || max_text', 'Possible', 'Review your source data and follow-up with data owners to determine whether column should be populated with email addresses.', NULL, '0.33'),
+ ('1023', 'Small_Numeric_Value_Ct', 'Column', 'Unexpected Numeric Values Found', 'Under 3% of values in this column were found to be numeric. This could indicate a data error.', 'p.general_type = ''A''
AND p.numeric_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT < 0.03
- AND p.numeric_ct > 0',
- '''Numeric Ct: '' || p.numeric_ct || '' of '' || p.value_ct || '' (Numeric Percent: '' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5)/NULLIF(p.value_ct, 0)::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)',
- 'Likely', 'Review your source data and follow-up with data owners to determine whether numeric values are invalid entries here.'),
- ('1024', 'Invalid_Zip3_USA', 'Column', 'Invalid USA ZIP-3 Format', 'The majority of values in this column are 3-digit zips, but divergent patterns were found. This could indicate an incorrect roll-up category or a PII concern.', 'p.distinct_pattern_ct > 1
+ AND p.numeric_ct > 0', '''Numeric Ct: '' || p.numeric_ct || '' of '' || p.value_ct || '' (Numeric Percent: '' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5)/NULLIF(p.value_ct, 0)::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)', 'Likely', 'Review your source data and follow-up with data owners to determine whether numeric values are invalid entries here.', 'p.numeric_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66'),
+ ('1024', 'Invalid_Zip3_USA', 'Column', 'Invalid USA ZIP-3 Format', 'The majority of values in this column are 3-digit zips, but divergent patterns were found. This could indicate an incorrect roll-up category or a PII concern.', 'p.distinct_pattern_ct > 1
AND (p.column_name ilike ''%zip%'' OR p.column_name ILIKE ''%postal%'')
AND SPLIT_PART(p.top_patterns, '' | '', 2) = ''NNN''
- AND SPLIT_PART(p.top_patterns, '' | '', 1)::FLOAT/NULLIF(value_ct, 0)::FLOAT > 0.50', '''Pattern: '' || p.top_patterns', 'Definite', 'Review your source data, ingestion process, and any processing steps that update this column.'),
- ('1025', 'Delimited_Data_Embedded', 'Column', 'Delimited Data Embedded in Column', 'Delimited data, separated by a common delimiter (comma, tab, pipe or caret) is present in over 80% of column values. This could indicate data that was incorrectly ingested, or data that would be better represented in parsed form.', 'p.std_pattern_match = ''DELIMITED_DATA''', 'CASE WHEN p.top_freq_values IS NULL THEN ''Min: '' || p.min_text || '', Max: '' || p.max_text ELSE ''Top Freq: '' || p.top_freq_values END', 'Likely', 'Review your source data and follow-up with data consumers to determine the most useful representation of this data.'),
- ('1026', 'Char_Column_Number_Units', 'Column', 'Character Column with Numbers and Units',
- 'This column is defined as alpha, but values include numbers with percents or common units. Embedded measures in alpha columns are harder to access, won''t sort correctly, and might contradict user expectations downstream. Consider parsing into numeric and UOM columns to improve usability.',
- 'p.includes_digit_ct::FLOAT/NULLIF(p.value_ct, 0)::FLOAT > 0.5 AND TRIM(fn_parsefreq(p.top_freq_values, 1, 1)) ~ ''(?i)^[0-9]+(\.[0-9]+)? ?(%|lb|oz|kg|g|mg|km|m|cm|mm|mi|ft|in)$''',
- '''Top Freq: '' || p.top_freq_values', 'Possible',
- 'Review your source data and ingestion process. Consider whether it might be better to parse the numeric and unit data and store in separate columns.'),
- ('1027', 'Variant_Coded_Values', 'Variant', 'Variant Codings for Same Values', 'This column contains more than one common variants that represent a single value or state. This can occur when data is integrated from multiple sources with different standards, or when free entry is permitted without validation. The variations can cause confusion and error for downstream data users and multiple versions of the truth. ', 'p.distinct_value_ct <= 20', '''Variants Found: '' || intersect_list', 'Definite', 'Review your source data and ingestion process. Consider cleansing this data to standardize on a single set of definitive codes.'),
- ('1100', 'Potential_PII', 'Column', 'Personally Identifiable Information', 'This column contains data that could be Personally Identifiable Information (PII)', 'p.pii_flag > ''''', '''Risk: '' || CASE LEFT(p.pii_flag, 1) WHEN ''A'' THEN ''HIGH'' WHEN ''B'' THEN ''MODERATE'' WHEN ''C'' THEN ''LOW'' END || '', PII Type: '' || SUBSTRING(p.pii_flag, 3)', 'Potential PII', 'PII may require steps to ensure data security and compliance with relevant privacy regulations and legal requirements. You may have to classify and inventory PII, implement appropriate access controls, encrypt data, and monitor for unauthorized access. Your organization might be required to update privacy policies and train staff on data protection practices. Note that PII that is lower-risk in isolation might be high-risk in conjunction with other data.');
+ AND SPLIT_PART(p.top_patterns, '' | '', 1)::FLOAT/NULLIF(value_ct, 0)::FLOAT > 0.50', '''Pattern: '' || p.top_patterns', 'Definite', 'Review your source data, ingestion process, and any processing steps that update this column.', '(NULLIF(p.record_ct, 0)::INT - SPLIT_PART(p.top_patterns, '' | '', 1)::INT)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '1'),
+ ('1025', 'Delimited_Data_Embedded', 'Column', 'Delimited Data Embedded in Column', 'Delimited data, separated by a common delimiter (comma, tab, pipe or caret) is present in over 80% of column values. This could indicate data that was incorrectly ingested, or data that would be better represented in parsed form.', 'p.std_pattern_match = ''DELIMITED_DATA''', 'CASE WHEN p.top_freq_values IS NULL THEN ''Min: '' || p.min_text || '', Max: '' || p.max_text ELSE ''Top Freq: '' || p.top_freq_values END', 'Likely', 'Review your source data and follow-up with data consumers to determine the most useful representation of this data.', NULL, '0.66'),
+ ('1026', 'Char_Column_Number_Units', 'Column', 'Character Column with Numbers and Units', 'This column is defined as alpha, but values include numbers with percents or common units. Embedded measures in alpha columns are harder to access, won''t sort correctly, and might contradict user expectations downstream. Consider parsing into numeric and UOM columns to improve usability.', 'p.includes_digit_ct::FLOAT/NULLIF(p.value_ct, 0)::FLOAT > 0.5 AND TRIM(fn_parsefreq(p.top_freq_values, 1, 1)) ~ ''(?i)^[0-9]+(\.[0-9]+)? ?(%|lb|oz|kg|g|mg|km|m|cm|mm|mi|ft|in)$''', '''Top Freq: '' || p.top_freq_values', 'Possible', 'Review your source data and ingestion process. Consider whether it might be better to parse the numeric and unit data and store in separate columns.', NULL, '0.33'),
+ ('1027', 'Variant_Coded_Values', 'Variant', 'Variant Codings for Same Values', 'This column contains more than one common variant that represents a single value or state. This can occur when data is integrated from multiple sources with different standards, or when free entry is permitted without validation. The variations can cause confusion and error for downstream data users and multiple versions of the truth. ', 'p.distinct_value_ct <= 20', '''Variants Found: '' || intersect_list', 'Definite', 'Review your source data and ingestion process. Consider cleansing this data to standardize on a single set of definitive codes.', NULL, NULL),
+ ('1100', 'Potential_PII', 'Column', 'Personally Identifiable Information', 'This column contains data that could be Personally Identifiable Information (PII)', 'p.pii_flag > ''''', '''Risk: '' || CASE LEFT(p.pii_flag, 1) WHEN ''A'' THEN ''HIGH'' WHEN ''B'' THEN ''MODERATE'' WHEN ''C'' THEN ''LOW'' END || '', PII Type: '' || SUBSTRING(p.pii_flag, 3)', 'Potential PII', 'PII may require steps to ensure data security and compliance with relevant privacy regulations and legal requirements. You may have to classify and inventory PII, implement appropriate access controls, encrypt data, and monitor for unauthorized access. Your organization might be required to update privacy policies and train staff on data protection practices. Note that PII that is lower-risk in isolation might be high-risk in conjunction with other data.', NULL, 'CASE LEFT(p.pii_flag, 1) WHEN ''A'' THEN 1 WHEN ''B'' THEN 0.66 WHEN ''C'' THEN 0.33 END')
+;
TRUNCATE TABLE test_types;
INSERT INTO test_types
- (id, test_type, test_name_short, test_name_long, test_description, except_message, measure_uom, measure_uom_description, selection_criteria, column_name_prompt, column_name_help, default_parm_columns, default_parm_values, default_parm_prompts, default_parm_help, default_severity, run_type, test_scope, dq_dimension, health_dimension, threshold_description, usage_notes, active)
-VALUES ('1004', 'Alpha_Trunc', 'Alpha Truncation', 'Maximum character count consistent', 'Tests that the maximum count of characters in a column value has not dropped vs. baseline data', 'Maximum length of values has dropped from prior expected length.', 'Values over max', NULL, 'general_type =''A'' AND max_length > 0 AND ( (min_length = avg_length AND max_length = avg_length) OR (numeric_ct <> value_ct ) ) AND functional_table_type NOT LIKE ''%window%'' /* The conditions below are to eliminate overlap with : LOV_Match (excluded selection criteria for this test_type), Pattern_Match (excluded selection criteria for this test_type), Constant (excluded functional_data_type Constant and Boolean) */ AND ( (distinct_value_ct NOT BETWEEN 2 AND 10 AND functional_data_type NOT IN ( ''Constant'', ''Boolean'') ) AND NOT ( fn_charcount(top_patterns, E'' \| '' ) = 1 AND fn_charcount(top_patterns, E'' \| '' ) IS NOT NULL AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > ''''))', NULL, NULL, 'threshold_value', 'max_length', 'Maximum String Length at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Maximum length expected', 'Alpha Truncation tests that the longest text value in a column hasn''t become shorter than the longest value at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset.', 'Y'),
- ('1005', 'Avg_Shift', 'Average Shift', 'Column mean is consistent with reference', 'Tests for statistically-significant shift in mean value for column from average calculated at baseline.', 'Standardized difference between averages is over the selected threshold level.', 'Difference Measure', 'Cohen''s D Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', NULL, NULL, 'baseline_value_ct,baseline_avg,baseline_sd,threshold_value', 'value_ct,avg_value,stdev_value,0.5::VARCHAR', 'Value Ct at Baseline,Mean at Baseline,Std Deviation at Baseline,Threshold Difference Measure ', NULL, 'Warning', 'CAT', 'column', 'Consistency', 'Data Drift', 'Standardized Difference Measure', 'Average Shift tests that the average of a numeric column has not significantly changed since baseline, when profiling was done. A significant shift may indicate errors in processing, differences in source data, or valid changes that may nevertheless impact assumptions in downstream data products. The test uses Cohen''s D, a statistical technique to identify significant shifts in a value. Cohen''s D measures the difference between the two averages, reporting results on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. Depending on your data, some difference may be expected, so it''s reasonable to adjust the threshold value that triggers test failure. This test works well for measures, or even for identifiers if you expect them to increment consistently. You may want to periodically adjust the expected threshold, or even the expected average value if you expect shifting over time. Consider this test along with Variability Increase. If variability rises too, process or measurement flaws could be at work. If variability remains consistent, the issue is more likely to be with the source data itself. ', 'Y'),
- ('1007', 'Constant', 'Constant Match', 'All column values match constant value', 'Tests that all values in the column match the constant value identified in baseline data', 'A constant value is expected for this column.', 'Mismatched values', NULL, 'TEMPLATE', NULL, NULL, 'baseline_value,threshold_value', NULL, 'Constant Value at Baseline,Threshold Error Count', 'The single, unchanging value of the column, per baseline|The number of errors that are acceptable before test fails.', 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Count of records with unexpected values', 'Constant Match tests that a single value determined to be a constant in baseline profiling is still the only value for the column that appears in subsequent versions of the dataset. Sometimes new data or business knowledge may reveal that the value is not a constant at all, even though only one value was present at profiling. In this case, you will want to disable this test. Alternatively, you can use the Value Match test to provide a limited number of valid values for the column.', 'Y'),
- ('1009', 'Daily_Record_Ct', 'Daily Records', 'All dates present within date range', 'Tests for presence of every calendar date within min/max date range, per baseline data', 'Not every date value between min and max dates is present, unlike at baseline.', 'Missing dates', NULL, 'general_type= ''D'' AND date_days_present > 21 AND date_days_present - (DATEDIFF(''day'', ''1800-01-05''::DATE, max_date) - DATEDIFF(''day'', ''1800-01-05''::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', NULL, NULL, 'threshold_value', '0', 'Threshold Missing Calendar Days', NULL, 'Warning', 'CAT', 'column', 'Completeness', 'Volume', 'Missing calendar days within min/max range', 'Daily Records tests that at least one record is present for every day within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each day. A failure here would suggest missing records for the number of days identified without data. You can adjust the threshold to accept a number of days that you know legitimately have no records. ', 'Y'),
- ('1011', 'Dec_Trunc', 'Decimal Truncation', 'Sum of fractional values at or above reference', 'Tests for decimal truncation by confirming that the sum of fractional values in data is no less than the sum at baseline', 'The sum of fractional values is under baseline, which may indicate decimal truncation', 'Fractional sum', 'The sum of all decimal values from all data for this column', 'fractional_sum IS NOT NULL AND functional_table_type LIKE''%cumulative%''', NULL, NULL, 'threshold_value', 'ROUND(fractional_sum, 0)', 'Sum of Fractional Values at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Minimum expected sum of all fractional values', 'Decimal Truncation tests that the fractional (decimal) part of a numeric column has not been truncated since Baseline. This works by summing all the fractional values after the decimal point and confirming that the total is at least equal to the fractional total at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset.', 'Y'),
- ('1012', 'Distinct_Date_Ct', 'Date Count', 'Count of distinct dates at or above reference', 'Tests that the count of distinct dates referenced in the column has not dropped vs. baseline data', 'Drop in count of unique dates recorded in column.', 'Unique dates', 'Count of unique dates in transactional date column', 'general_type=''D'' and date_days_present IS NOT NULL AND functional_table_type NOT LIKE ''%window%''', NULL, NULL, 'baseline_value,threshold_value', 'date_days_present,date_days_present', 'Distinct Date Count at Baseline,Min Expected Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Minimum distinct date count expected', 'Date Count tests that the count of distinct dates present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained. A failure here would indicate missing records, which could be caused by a processing error or changed upstream data sources.', 'Y'),
- ('1013', 'Distinct_Value_Ct', 'Value Count', 'Count of distinct values has not dropped', 'Tests that the count of unique values in the column has not changed from baseline.', 'Count of unique values in column has changed from baseline.', 'Unique Values', NULL, 'distinct_value_ct between 2 and 10 AND value_ct > 0 AND NOT (coalesce(top_freq_values,'''') > '''' AND distinct_value_ct BETWEEN 2 and 10) AND NOT (lower(functional_data_type) LIKE ''%sequence%'' OR lower(functional_data_type) LIKE ''%measurement%'' OR functional_data_type LIKE ''%date%'' OR general_type = ''D'')', NULL, NULL, 'baseline_value_ct,threshold_value', 'distinct_value_ct,distinct_value_ct', 'Distinct Value Count at Baseline,Min Expected Value Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected distinct value count', 'Value Count tests that the count of unique values present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained, or for any dataset where you would expect a set number of distinct values should be present. A failure here would indicate missing records or a change in categories or value assignment.', 'Y'),
- ('1014', 'Email_Format', 'Email Format', 'Email is correctly formatted', 'Tests that non-blank, non-empty email addresses match the standard format', 'Invalid email address formats found.', 'Invalid emails', 'Number of emails that do not match standard format', 'std_pattern_match=''EMAIL''', NULL, NULL, 'threshold_value', '0', 'Maximum Invalid Email Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of invalid email addresses', NULL, 'Y'),
- ('1015', 'Future_Date', 'Past Dates', 'Latest date is prior to test run date', 'Tests that the maximum date referenced in the column is no greater than the test date, consistent with baseline data', 'Future date found when absent in baseline data.', 'Future dates', NULL, 'general_type=''D''AND future_date_ct = 0', NULL, NULL, 'threshold_value', '0', 'Maximum Future Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected count of future dates', NULL, 'Y'),
- ('1016', 'Future_Date_1Y', 'Future Year', 'Future dates within year of test run date', 'Tests that the maximum date referenced in the column is no greater than one year beyond the test date, consistent with baseline data', 'Future date beyond one-year found when absent in baseline.', 'Future dates post 1 year', NULL, 'general_type=''D''AND future_date_ct > 0 AND max_date <=''{AS_OF_DATE}''::DATE + INTERVAL''365 DAYS''', NULL, NULL, 'threshold_value', '0', 'Maximum Post 1-Year Future Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected count of future dates beyond one year', 'Future Year looks for date values in the column that extend beyond one year after the test date. This would be appropriate for transactional dates where you would expect to find dates in the near future, but not beyond one year ahead. Errors could indicate invalid entries or possibly dummy dates representing blank values.', 'Y'),
- ('1017', 'Incr_Avg_Shift', 'New Shift', 'New record mean is consistent with reference', 'Tests for statistically-significant shift in mean of new values for column compared to average calculated at baseline.', 'Significant shift in average of new values vs. baseline avg', 'Z-score of mean shift', 'Absolute Z-score (number of SD''s outside mean) of prior avg - incremental avg', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', NULL, NULL, 'baseline_value_ct,baseline_sum,baseline_avg,baseline_sd,threshold_value', 'value_ct,(avg_value * value_ct)::FLOAT,avg_value,stdev_value,2', 'Value Count at Baseline,Sum at Baseline,Mean Value at Baseline,Std Deviation at Baseline,Threshold Max Z-Score', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Maximum Z-Score (number of SD''s beyond mean) expected', 'This is a more sensitive test than Average Shift, because it calculates an incremental difference in the average of new values compared to the average of values at baseline. This is appropriate for a cumulative dataset only, because it calculates the average of new entries based on the assumption that the count and average of records present at baseline are still present at the time of the test. This test compares the mean of new values with the standard deviation of the baseline average to calculate a Z-score. If the new mean falls outside the Z-score threshold, a shift is detected. Potential Z-score thresholds may range from 0 to 3, depending on the sensitivity you prefer. A failed test could indicate a quality issue or a legitimate shift in new data that should be noted and assessed by business users. Consider this test along with Variability Increase. If variability rises too, process, methodology or measurement flaws could be at issue. If variability remains consistent, the problem is more likely to be with the source data itself.', 'Y'),
- ('1018', 'LOV_All', 'Value Match All', 'List of expected values all present in column', 'Tests that all values match a pipe-delimited list of expected values and that all expected values are present', 'Column values found don''t exactly match the expected list of values', 'Values found', NULL, NULL, NULL, NULL, 'threshold_value', NULL, 'List of Expected Values', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'List of values expected, in form (''Val1'',''Val2)', 'This is a more restrictive form of Value Match, testing that all values in the dataset match the list provided, and also that all values present in the list appear at least once in the dataset. This would be appropriate for tables where all category values in the column are represented at least once.', 'Y'),
- ('1019', 'LOV_Match', 'Value Match', 'All column values present in expected list', 'Tests that all values in the column match the list-of-values identified in baseline data.', 'Values not matching expected List-of-Values from baseline.', 'Non-matching records', NULL, 'top_freq_values > '''' AND distinct_value_ct BETWEEN 2 and 10 AND NOT (functional_data_type LIKE ''%date%'' OR lower(datatype_suggestion) LIKE ''%date%'' OR general_type = ''D'' OR lower(column_name) IN (''file_name'', ''filename''))', NULL, NULL, 'baseline_value,threshold_value', '''('' || SUBSTRING( CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 2) > '''' THEN '','''''' || TRIM( REPLACE ( SPLIT_PART(top_freq_values, ''|'' , 2), '''''''' , '''''''''''' ) ) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 4) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 4), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 6) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 6), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 8) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 8), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 10) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 10), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 12) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 12), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 14) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 14), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 16) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 16), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 18) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 18), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 20) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 20), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END, 2, 999) || '')'',0', 'List of Expected Values,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'List of values expected, in form (''Val1'',''Val2)', 'This tests that all values in the column match the hard-coded list provided. This is relevant when the list of allowable values is small and not expected to change often. Even if new values might occasionally be added, this test is useful for downstream data products to provide warning that assumptions and logic may need to change.', 'Y'),
- ('1020', 'Min_Date', 'Minimum Date', 'All dates on or after set minimum', 'Tests that the earliest date referenced in the column is no earlier than baseline data', 'The earliest date value found is before the earliest value at baseline.', 'Dates prior to limit', NULL, 'general_type=''D''and min_date IS NOT NULL AND distinct_value_ct > 1', NULL, NULL, 'baseline_value,threshold_value', 'min_date,0', 'Minimum Date at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of dates prior to minimum', 'This test is appropriate for a cumulative dataset only, because it assumes all prior values are still present. It''s appropriate where new records are added with more recent dates, but old dates dates do not change.', 'Y'),
- ('1021', 'Min_Val', 'Minimum Value', 'All values at or above set minimum', 'Tests that the minimum value present in the column is no lower than the minimum value in baseline data', 'Minimum column value less than baseline.', 'Values under limit', NULL, 'general_type=''N''and min_value IS NOT NULL AND (distinct_value_ct >= 2 OR (distinct_value_ct=2 and min_value<>0 and max_value<>1))', NULL, NULL, 'baseline_value,threshold_value', 'min_value,0', 'Minimum Value at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of values under limit', 'This test is appropriate for a cumulative dataset only, assuming all prior values are still present. It is also appropriate for any measure that has an absolute, definable minimum value, or a heuristic that makes senes for valid data.', 'Y'),
- ('1022', 'Missing_Pct', 'Percent Missing', 'Consistent ratio of missing values', 'Tests for statistically-significant shift in percentage of missing values in column vs. baseline data', 'Significant shift in percent of missing values vs. baseline.', 'Difference measure', 'Cohen''s H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'record_ct <> value_ct', NULL, NULL, 'baseline_ct,baseline_value_ct,threshold_value', 'record_ct,value_ct,2::VARCHAR(10)', 'Baseline Record Count,Baseline Value Count,Standardized Difference Measure', NULL, 'Warning', 'CAT', 'column', 'Completeness', 'Data Drift', 'Expected maximum Cohen''s H Difference', 'This test uses Cohen''s H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. An uptick in missing data may indicate a collection issue at the source. A larger change may indicate a processing failure. A drop in missing data may also be significant, if it affects assumptions built into analytic products downstream. You can refine the expected threshold value as you view legitimate results of the measure over time.', 'Y'),
- ('1023', 'Monthly_Rec_Ct', 'Monthly Records', 'At least one date per month present within date range', 'Tests for presence of at least one date per calendar month within min/max date range, per baseline data', 'At least one date per month expected in min/max date range.', 'Missing months', 'Calendar months without date values present', 'general_type= ''D'' AND date_days_present > 1 AND date_months_present > 2 AND date_months_present - (datediff( ''MON'' , min_date, max_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', NULL, NULL, 'threshold_value', '0', 'Threshold Count of Months without Dates', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Volume', 'Expected maximum count of calendar months without dates present', 'Monthly Records tests that at least one record is present for every calendar month within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each month. A failure here would suggest missing records for the number of months identified without data. You can adjust the threshold to accept a number of month that you know legitimately have no records.', 'Y'),
- ('1024', 'Outlier_Pct_Above', 'Outliers Above', 'Consistent outlier counts over 2 SD above mean', 'Tests that percent of outliers over 2 SD above Mean doesn''t exceed threshold', 'Percent of outliers exceeding 2 SD above the mean is greater than expected threshold.', 'Pct records over limit', NULL, 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', NULL, NULL, 'baseline_avg,baseline_sd,threshold_value', 'avg_value,stdev_value,0.05', 'Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct records over upper 2 SD limit', 'This test counts the number of data points that may be considered as outliers, determined by whether their value exceeds 2 standard deviations above the mean at baseline. Assuming a normal distribution, a small percentage (defaulted to 5%) of outliers is expected. The actual number may vary for different distributions. The expected threshold reflects the maximum percentage of outliers you expect to see. This test uses the baseline mean rather than the mean for the latest dataset to capture systemic shift as well as individual outliers. ', 'Y'),
- ('1025', 'Outlier_Pct_Below', 'Outliers Below', 'Consistent outlier counts under 2 SD below mean', 'Tests that percent of outliers over 2 SD below Mean doesn''t exceed threshold', 'Percent of outliers exceeding 2 SD below the mean is greater than expected threshold.', 'Pct records under limit', NULL, 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', NULL, NULL, 'baseline_avg,baseline_sd,threshold_value', 'avg_value,stdev_value,0.05', 'Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct records over lower 2 SD limit', 'This test counts the number of data points that may be considered as outliers, determined by whether their value exceeds 2 standard deviations below the mean at baseline. Assuming a normal distribution, a small percentage (defaulted to 5%) of outliers is expected. The actual number may vary for different distributions. The expected threshold reflects the maximum percentage of outliers you expect to see. This test uses the baseline mean rather than the mean for the latest dataset to capture systemic shift as well as individual outliers. ', 'Y'),
- ('1026', 'Pattern_Match', 'Pattern Match', 'Column values match alpha-numeric pattern', 'Tests that all values in the column match the same alpha-numeric pattern identified in baseline data', 'Alpha values do not match consistent pattern in baseline.', 'Pattern Mismatches', NULL, 'fn_charcount(top_patterns, E'' \| '' ) = 1 AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > '''' AND distinct_value_ct > 10', NULL, NULL, 'baseline_value,threshold_value', 'trim(REPLACE(REPLACE(REPLACE(SPLIT_PART(top_patterns, '' | '', 2), ''A'', ''[A-Z]''), ''N'', ''[0-9]''), ''a'', ''[a-z]'')),0', 'Pattern at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of pattern mismatches', 'This test is appropriate for character fields that are expected to appear in a consistent format. It uses pattern matching syntax as appropriate for your database: REGEX matching if available, otherwise LIKE expressions. The expected threshold is the number of records that fail to match the defined pattern.', 'Y'),
- ('1028', 'Recency', 'Recency', 'Latest date within expected range of test date', 'Tests that the latest date in column is within a set number of days of the test date', 'Most recent date value not within expected days of test date.', 'Days before test', 'Number of days that most recent date precedes the date of test', 'general_type= ''D'' AND max_date <= run_date AND NOT column_name IN ( ''filedate'' , ''file_date'' ) AND NOT functional_data_type IN (''Future Date'', ''Schedule Date'') AND DATEDIFF( ''DAY'' , max_date, run_date) <= 62', NULL, NULL, 'threshold_value', 'CASE WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 3 THEN DATEDIFF(''DAY'', max_date, run_date) + 3 WHEN DATEDIFF(''DAY'', max_date, run_date) <= 7 then DATEDIFF(''DAY'', max_date, run_date) + 7 WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 7.0) * 7 WHEN DATEDIFF( ''DAY'' , max_date, run_date) > 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 30.0) * 30 END', 'Threshold Maximum Days before Test', NULL, 'Warning', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected maximum count of days preceding test date', 'This test evaluates recency based on the latest referenced dates in the column. The test is appropriate for transactional dates and timestamps. The test can be especially valuable because timely data deliveries themselves may not assure that the most recent data is present. You can adjust the expected threshold to the maximum number of days that you expect the data to age before the dataset is refreshed. ', 'Y'),
- ('1030', 'Required', 'Required Entry', 'Required non-null value present', 'Tests that a non-null value is present in each record for the column, consistent with baseline data', 'Every record for this column is expected to be filled, but some are missing.', 'Missing values', NULL, 'record_ct = value_ct', NULL, NULL, 'threshold_value', '0', 'Threshold Missing Value Count', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Schema Drift', 'Expected count of missing values', NULL, 'Y'),
- ('1033', 'Street_Addr_Pattern', 'Street Address', 'Enough street address entries match defined pattern', 'Tests for percent of records matching standard street address pattern.', 'Percent of values matching standard street address format is under expected threshold.', 'Percent matches', 'Percent of records that match street address pattern', '(std_pattern_match=''STREET_ADDR'') AND (avg_length <> round(avg_length)) AND (avg_embedded_spaces BETWEEN 2 AND 6) AND (avg_length < 35)', NULL, NULL, 'threshold_value', '75', 'Threshold Pct that Match Address Pattern', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected percent of records that match standard street address pattern', 'The street address pattern used in this test should match the vast majority of USA addresses. You can adjust the threshold percent of matches based on the results you are getting -- you may well want to tighten it to make the test more sensitive to invalid entries.', 'Y'),
- ('1034', 'Unique', 'Unique Values', 'Each column value is unique', 'Tests that no values for the column are repeated in multiple records.', 'Column values should be unique per row.', 'Duplicate values', 'Count of non-unique values', 'record_ct > 500 and record_ct = distinct_value_ct and value_ct > 0', NULL, NULL, 'threshold_value', '0', 'Threshold Duplicate Value Count', NULL, 'Fail', 'CAT', 'column', 'Uniqueness', 'Schema Drift', 'Expected count of duplicate values', 'This test is ideal when the database itself does not enforce a primary key constraint on the table. It serves as an independent check on uniqueness. If''s also useful when there are a small number of exceptions to uniqueness, which can be reflected in the expected threshold count of duplicates.', 'Y'),
- ('1035', 'Unique_Pct', 'Percent Unique', 'Consistent ratio of unique values', 'Tests for statistically-significant shift in percentage of unique values vs. baseline data.', 'Significant shift in percent of unique values vs. baseline.', 'Difference measure', 'Cohen''s H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'distinct_value_ct > 10', NULL, NULL, 'baseline_value_ct,baseline_unique_ct,threshold_value', 'value_ct,distinct_value_ct,0.5', 'Value Count at Baseline,Distinct Value Count at Baseline,Standardized Difference Measure (0 to 1)', NULL, 'Warning', 'CAT', 'column', 'Uniqueness', 'Data Drift', 'Expected maximum Cohen''s H Difference', 'You can think of this as a test of similarity that measures whether the percentage of unique values is consistent with the percentage at baseline. A significant change might indicate duplication or a telling shift in cardinality between entities. The test uses Cohen''s H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. You can refine the expected threshold value as you view legitimate results of the measure over time.', 'Y'),
- ('1036', 'US_State', 'US State', 'Column value is two-letter US state code', 'Tests that the recorded column value is a valid US state.', 'Column Value is not a valid US state.', 'Not US States', 'Values that doo not match 2-character US state abbreviations.', 'general_type= ''A'' AND column_name ILIKE ''%state%'' AND distinct_value_ct < 70 AND max_length = 2', NULL, NULL, 'threshold_value', '0', 'Threshold Count not Matching State Abbreviations', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of values that are not US state abbreviations', 'This test validates entries against a fixed list of two-character US state codes and related Armed Forces codes.', 'Y'),
- ('1037', 'Weekly_Rec_Ct', 'Weekly Records', 'At least one date per week present within date range', 'Tests for presence of at least one date per calendar week within min/max date range, per baseline data', 'At least one date per week expected in min/max date range.', 'Missing weeks', 'Calendar weeks without date values present', 'general_type= ''D'' AND date_days_present > 1 AND date_weeks_present > 3 AND date_weeks_present - (DATEDIFF(''week'', ''1800-01-05''::DATE, max_date) - DATEDIFF(''week'', ''1800-01-05''::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', NULL, NULL, 'threshold_value', '0', 'Threshold Weeks without Dates', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Volume', 'Expected maximum count of calendar weeks without dates present', 'Weekly Records tests that at least one record is present for every calendar week within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each week. A failure here would suggest missing records for the number of weeks identified without data. You can adjust the threshold to accept a number of weeks that you know legitimately have no records.', 'Y'),
- ('1040', 'Variability_Increase', 'Variability Increase', 'Variability has increased above threshold', 'Tests that the spread or dispersion of column values has increased significantly over baseline, indicating a drop in stability of the measure.', 'The Standard Deviation of the measure has increased beyond the defined threshold. This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', NULL, NULL, 'baseline_sd,threshold_value', 'stdev_value,120', 'Std Deviation at Baseline,Expected Maximum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. An increase in particular could mark new problems in measurement, a more heterogeneous cohort, or that significant outliers have been introduced. Consider this test along with Average Shift and New Shift. If the average shifts as well, there may be a fundamental shift in the dataset or process used to collect the data point. This might suggest a data shift that should be noted and assessed by business users. If the average does not shift, this may point to a data quality or data collection problem. ', 'Y'),
- ('1041', 'Variability_Decrease', 'Variability Decrease', 'Variability has decreased below threshold', 'Tests that the spread or dispersion of column values has decreased significantly over baseline, indicating a shift in stability of the measure. This could signal a change in a process or a data quality issue.', 'The Standard Deviation of the measure has decreased below the defined threshold. This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', NULL, NULL, 'baseline_sd,threshold_value', 'stdev_value, 80', 'Std Deviation at Baseline,Expected Minimum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected minimum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. A decrease in particular could indicate an improved process, better precision in measurement, the elimination of outliers, or a more homogeneous cohort. ', 'Y'),
- ('1042', 'Valid_Month', 'Valid Month', 'Valid calendar month in expected format', 'Tests for the presence of a valid representation of a calendar month consistent with the format at baseline.', 'Column values are not a valid representation of a calendar month consistent with the format at baseline.', 'Invalid months', NULL, 'functional_data_type = ''Period Month''', NULL, NULL, 'threshold_value,baseline_value', '0,CASE WHEN max_length > 3 AND initcap(min_text) = min_text THEN ''''''January'''',''''February'''',''''March'''',''''April'''',''''May'''',''''June'''',''''July'''',''''August'''',''''September'''',''''October'''',''''November'''',''''December'''''' WHEN max_length > 3 AND upper(min_text) = min_text THEN ''''''JANUARY'''',''''FEBRUARY'''',''''MARCH'''',''''APRIL'''',''''MAY'''',''''JUNE'''',''''JULY'''',''''AUGUST'''',''''SEPTEMBER'''',''''OCTOBER'''',''''NOVEMBER'''',''''DECEMBER'''''' WHEN max_length > 3 AND lower(min_text) = min_text THEN ''''''january'''',''''february'''',''''march'''',''''april'''',''''may'''',''''june'''',''''july'''',''''august'''',''''september'''',''''october'''',''''november'''',''''december'''''' WHEN max_length = 3 AND initcap(min_text) = min_text THEN ''''''Jan'''',''''Feb'''',''''Mar'''',''''Apr'''',''''May'''',''''Jun'''',''''Jul'''',''''Aug'''',''''Sep'''',''''Oct'''',''''Nov'''',''''Dec'''''' WHEN max_length = 3 AND upper(min_text) = min_text THEN ''''''JAN'''',''''FEB'''',''''MAR'''',''''APR'''',''''MAY'''',''''JUN'''',''''JUL'''',''''AUG'''',''''SEP'''',''''OCT'''',''''NOV'''',''''DEC'''''' WHEN max_length = 3 AND lower(min_text) = min_text THEN ''''''jan'''',''''feb'''',''''mar'''',''''apr'''',''''may'''',''''jun'''',''''jul'''',''''aug'''',''''sep'''',''''oct'''',''''nov'''',''''dec'''''' WHEN max_length = 2 AND min_text = ''01'' THEN ''''''01'''',''''02'''',''''03'''',''''04'''',''''05'''',''''06'''',''''07'''',''''08'''',''''09'''',''''10'''',''''11'''',''''12'''''' WHEN max_length = 2 AND min_text = ''1'' THEN ''''''1'''',''''2'''',''''3'''',''''4'''',''''5'''',''''6'''',''''7'''',''''8'''',''''9'''',''''10'''',''''11'''',''''12'''''' WHEN min_value = 1 THEN ''1,2,3,4,5,6,7,8,9,10,11,12'' ELSE ''NULL'' END', 'Threshold Invalid Months,Valid Month List', 'The acceptable number of records with invalid months present.|List of valid month values for this field, in quotes if field is numeric, separated by commas.', 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of invalid months', NULL, 'N'),
- ('1043', 'Valid_Characters', 'Valid Characters', 'Column contains no invalid characters', 'Tests for the presence of non-printing characters, leading spaces, or surrounding quotes.', 'Invalid characters, such as non-printing characters, leading spaces, or surrounding quotes, were found.', 'Invalid records', 'Expected count of values with invalid characters', 'general_type = ''A''', NULL, NULL, 'threshold_value', '0', NULL, 'The acceptable number of records with invalid character values present.', 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', 'This test looks for the presence of non-printing ASCII characters that are considered non-standard in basic text processing. It also identifies leading spaces and values enclosed in quotes. Values that fail this test may be artifacts of data conversion, or just more difficult to process or analyze downstream.', 'N'),
- ('1044', 'Valid_US_Zip', 'Valid US Zip', 'Valid USA Postal Codes', 'Tests that postal codes match the 5 or 9 digit standard US format', 'Invalid US Zip Code formats found.', 'Invalid Zip Codes', 'Expected count of values with invalid Zip Codes', 'functional_data_type = ''Zip''', NULL, NULL, 'threshold_value', '0', NULL, NULL, 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', NULL, 'Y'),
- ('1045', 'Valid_US_Zip3', 'Valid US Zip-3 ', 'Valid USA Zip-3 Prefix', 'Tests that postal codes match the 3 digit format of a regional prefix.', 'Invalid 3-digit US Zip Code regional prefix formats found.', 'Invalid Zip-3 Prefix', 'Expected count of values with invalid Zip-3 Prefix Codes', 'functional_data_type = ''Zip3''', NULL, NULL, 'threshold_value', '0', NULL, NULL, 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Zip3 Count', 'This test looks for the presence of values that fail to match the three-digit numeric code expected for US Zip Code regional prefixes. These prefixes are often used to roll up Zip Code data to a regional level, and may be critical to anonymize detailed data and protect PID. Depending on your needs and regulatory requirements, longer zip codes could place PID at risk.', 'Y'),
- ('1006', 'Condition_Flag', 'Custom Condition', 'Column values match pre-defined condition', 'Tests that each record in the table matches a pre-defined, custom condition', 'Value(s) found not matching defined condition.', 'Values Failing', NULL, NULL, 'Test Focus', 'Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. Example: `Quantity Consistency` if you are testing that quantity ordered matches quantity shipped.', 'threshold_value,custom_query', NULL, 'Threshold Error Count,Custom SQL Expression (TRUE on error)', 'The number of errors that are acceptable before test fails.|Expression should evaluate to TRUE to register an error or FALSE if no error. An expression can reference only columns in the selected table.', 'Fail', 'CAT', 'custom', 'Validity', 'Schema Drift', 'Count of records that don''t meet test condition', 'Custom Condition is a business-rule test for a user-defined error condition based on the value of one or more columns. The condition is applied to each record within the table, and the count of records failing the condition is added up. If that count exceeds a threshold of errors, the test as a whole is failed. This test is ideal for error conditions that TestGen cannot automatically infer, and any condition that involves the values of more than one column in the same record. Performance of this test is fast, since it is performed together with other aggregate tests. Interpretation is based on the user-defined meaning of the test.', 'Y'),
+ (id, test_type, test_name_short, test_name_long, test_description, except_message, measure_uom, measure_uom_description, selection_criteria, dq_score_prevalence_formula, dq_score_risk_factor, column_name_prompt, column_name_help, default_parm_columns, default_parm_values, default_parm_prompts, default_parm_help, default_severity, run_type, test_scope, dq_dimension, health_dimension, threshold_description, usage_notes, active)
+VALUES ('1004', 'Alpha_Trunc', 'Alpha Truncation', 'Maximum character count consistent', 'Tests that the maximum count of characters in a column value has not dropped vs. baseline data', 'Maximum length of values has dropped from prior expected length.', 'Values over max', NULL, 'general_type =''A'' AND max_length > 0 AND ( (min_length = avg_length AND max_length = avg_length) OR (numeric_ct <> value_ct ) ) AND functional_table_type NOT LIKE ''%window%'' /* The conditions below are to eliminate overlap with : LOV_Match (excluded selection criteria for this test_type), Pattern_Match (excluded selection criteria for this test_type), Constant (excluded functional_data_type Constant and Boolean) */ AND ( (distinct_value_ct NOT BETWEEN 2 AND 10 AND functional_data_type NOT IN ( ''Constant'', ''Boolean'') ) AND NOT ( fn_charcount(top_patterns, E'' \| '' ) = 1 AND fn_charcount(top_patterns, E'' \| '' ) IS NOT NULL AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > ''''))', '{VALUE_CT}::FLOAT * (FN_NORMAL_CDF(({MAX_LENGTH}::FLOAT - {AVG_LENGTH}::FLOAT) / ({MAX_LENGTH}::FLOAT / 3)) - FN_NORMAL_CDF(({RESULT_MEASURE}::FLOAT - {AVG_LENGTH}::FLOAT) / ({MAX_LENGTH}::FLOAT / 3)) ) /{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'threshold_value', 'max_length', 'Maximum String Length at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Maximum length expected', 'Alpha Truncation tests that the longest text value in a column hasn''t become shorter than the longest value at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset.', 'Y'),
+ ('1005', 'Avg_Shift', 'Average Shift', 'Column mean is consistent with reference', 'Tests for statistically-significant shift in mean value for column from average calculated at baseline.', 'Standardized difference between averages is over the selected threshold level.', 'Difference Measure', 'Cohen''s D Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', 'LEAST({RESULT_MEASURE}, 1.0)', '0.75', NULL, NULL, 'baseline_value_ct,baseline_avg,baseline_sd,threshold_value', 'value_ct,avg_value,stdev_value,0.5::VARCHAR', 'Value Ct at Baseline,Mean at Baseline,Std Deviation at Baseline,Threshold Difference Measure ', NULL, 'Warning', 'CAT', 'column', 'Consistency', 'Data Drift', 'Standardized Difference Measure', 'Average Shift tests that the average of a numeric column has not significantly changed since baseline, when profiling was done. A significant shift may indicate errors in processing, differences in source data, or valid changes that may nevertheless impact assumptions in downstream data products. The test uses Cohen''s D, a statistical technique to identify significant shifts in a value. Cohen''s D measures the difference between the two averages, reporting results on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. Depending on your data, some difference may be expected, so it''s reasonable to adjust the threshold value that triggers test failure. This test works well for measures, or even for identifiers if you expect them to increment consistently. You may want to periodically adjust the expected threshold, or even the expected average value if you expect shifting over time. Consider this test along with Variability Increase. If variability rises too, process or measurement flaws could be at work. If variability remains consistent, the issue is more likely to be with the source data itself. ', 'Y'),
+ ('1007', 'Constant', 'Constant Match', 'All column values match constant value', 'Tests that all values in the column match the constant value identified in baseline data', 'A constant value is expected for this column.', 'Mismatched values', NULL, 'TEMPLATE', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'baseline_value,threshold_value', NULL, 'Constant Value at Baseline,Threshold Error Count', 'The single, unchanging value of the column, per baseline|The number of errors that are acceptable before test fails.', 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Count of records with unexpected values', 'Constant Match tests that a single value determined to be a constant in baseline profiling is still the only value for the column that appears in subsequent versions of the dataset. Sometimes new data or business knowledge may reveal that the value is not a constant at all, even though only one value was present at profiling. In this case, you will want to disable this test. Alternatively, you can use the Value Match test to provide a limited number of valid values for the column.', 'Y'),
+ ('1009', 'Daily_Record_Ct', 'Daily Records', 'All dates present within date range', 'Tests for presence of every calendar date within min/max date range, per baseline data', 'Not every date value between min and max dates is present, unlike at baseline.', 'Missing dates', NULL, 'general_type= ''D'' AND date_days_present > 21 AND date_days_present - (DATEDIFF(''day'', ''1800-01-05''::DATE, max_date) - DATEDIFF(''day'', ''1800-01-05''::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/{DATE_DAYS_PRESENT}::FLOAT/{RECORD_CT}::FLOAT', '0.75', NULL, NULL, 'threshold_value', '0', 'Threshold Missing Calendar Days', NULL, 'Warning', 'CAT', 'column', 'Completeness', 'Volume', 'Missing calendar days within min/max range', 'Daily Records tests that at least one record is present for every day within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each day. A failure here would suggest missing records for the number of days identified without data. You can adjust the threshold to accept a number of days that you know legitimately have no records. ', 'Y'),
+ ('1011', 'Dec_Trunc', 'Decimal Truncation', 'Sum of fractional values at or above reference', 'Tests for decimal truncation by confirming that the sum of fractional values in data is no less than the sum at baseline', 'The sum of fractional values is under baseline, which may indicate decimal truncation', 'Fractional sum', 'The sum of all decimal values from all data for this column', 'fractional_sum IS NOT NULL AND functional_table_type LIKE''%cumulative%''', '1', '1.0', NULL, NULL, 'threshold_value', 'ROUND(fractional_sum, 0)', 'Sum of Fractional Values at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Minimum expected sum of all fractional values', 'Decimal Truncation tests that the fractional (decimal) part of a numeric column has not been truncated since Baseline. This works by summing all the fractional values after the decimal point and confirming that the total is at least equal to the fractional total at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset.', 'Y'),
+ ('1012', 'Distinct_Date_Ct', 'Date Count', 'Count of distinct dates at or above reference', 'Tests that the count of distinct dates referenced in the column has not dropped vs. baseline data', 'Drop in count of unique dates recorded in column.', 'Unique dates', 'Count of unique dates in transactional date column', 'general_type=''D'' and date_days_present IS NOT NULL AND functional_table_type NOT LIKE ''%window%''', '(({RECORD_CT}-{PRO_RECORD_CT})::FLOAT*{DISTINCT_VALUE_CT}::FLOAT/{PRO_RECORD_CT}::FLOAT)/{PRO_RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'baseline_value,threshold_value', 'date_days_present,date_days_present', 'Distinct Date Count at Baseline,Min Expected Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Minimum distinct date count expected', 'Date Count tests that the count of distinct dates present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained. A failure here would indicate missing records, which could be caused by a processing error or changed upstream data sources.', 'Y'),
+ ('1013', 'Distinct_Value_Ct', 'Value Count', 'Count of distinct values has not dropped', 'Tests that the count of unique values in the column has not changed from baseline.', 'Count of unique values in column has changed from baseline.', 'Unique Values', NULL, 'distinct_value_ct between 2 and 10 AND value_ct > 0 AND NOT (coalesce(top_freq_values,'''') > '''' AND distinct_value_ct BETWEEN 2 and 10) AND NOT (lower(functional_data_type) LIKE ''%sequence%'' OR lower(functional_data_type) LIKE ''%measurement%'' OR functional_data_type LIKE ''%date%'' OR general_type = ''D'')', 'ABS({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/{DISTINCT_VALUE_CT}::FLOAT', '1.0', NULL, NULL, 'baseline_value_ct,threshold_value', 'distinct_value_ct,distinct_value_ct', 'Distinct Value Count at Baseline,Min Expected Value Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected distinct value count', 'Value Count tests that the count of unique values present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained, or for any dataset where you would expect a set number of distinct values should be present. A failure here would indicate missing records or a change in categories or value assignment.', 'Y'),
+ ('1014', 'Email_Format', 'Email Format', 'Email is correctly formatted', 'Tests that non-blank, non-empty email addresses match the standard format', 'Invalid email address formats found.', 'Invalid emails', 'Number of emails that do not match standard format', 'std_pattern_match=''EMAIL''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'threshold_value', '0', 'Maximum Invalid Email Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of invalid email addresses', NULL, 'Y'),
+ ('1015', 'Future_Date', 'Past Dates', 'Latest date is prior to test run date', 'Tests that the maximum date referenced in the column is no greater than the test date, consistent with baseline data', 'Future date found when absent in baseline data.', 'Future dates', NULL, 'general_type=''D''AND future_date_ct = 0', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'threshold_value', '0', 'Maximum Future Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected count of future dates', NULL, 'Y'),
+ ('1016', 'Future_Date_1Y', 'Future Year', 'Future dates within year of test run date', 'Tests that the maximum date referenced in the column is no greater than one year beyond the test date, consistent with baseline data', 'Future date beyond one-year found when absent in baseline.', 'Future dates post 1 year', NULL, 'general_type=''D''AND future_date_ct > 0 AND max_date <=''{AS_OF_DATE}''::DATE + INTERVAL''365 DAYS''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'threshold_value', '0', 'Maximum Post 1-Year Future Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected count of future dates beyond one year', 'Future Year looks for date values in the column that extend beyond one year after the test date. This would be appropriate for transactional dates where you would expect to find dates in the near future, but not beyond one year ahead. Errors could indicate invalid entries or possibly dummy dates representing blank values.', 'Y'),
+ ('1017', 'Incr_Avg_Shift', 'New Shift', 'New record mean is consistent with reference', 'Tests for statistically-significant shift in mean of new values for column compared to average calculated at baseline.', 'Significant shift in average of new values vs. baseline avg', 'Z-score of mean shift', 'Absolute Z-score (number of SD''s outside mean) of prior avg - incremental avg', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', '{RECORD_CT}::FLOAT*(1-FN_NORMAL_CDF({RESULT_MEASURE}::FLOAT))/{RECORD_CT}::FLOAT', '0.75', NULL, NULL, 'baseline_value_ct,baseline_sum,baseline_avg,baseline_sd,threshold_value', 'value_ct,(avg_value * value_ct)::FLOAT,avg_value,stdev_value,2', 'Value Count at Baseline,Sum at Baseline,Mean Value at Baseline,Std Deviation at Baseline,Threshold Max Z-Score', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Maximum Z-Score (number of SD''s beyond mean) expected', 'This is a more sensitive test than Average Shift, because it calculates an incremental difference in the average of new values compared to the average of values at baseline. This is appropriate for a cumulative dataset only, because it calculates the average of new entries based on the assumption that the count and average of records present at baseline are still present at the time of the test. This test compares the mean of new values with the standard deviation of the baseline average to calculate a Z-score. If the new mean falls outside the Z-score threshold, a shift is detected. Potential Z-score thresholds may range from 0 to 3, depending on the sensitivity you prefer. A failed test could indicate a quality issue or a legitimate shift in new data that should be noted and assessed by business users. Consider this test along with Variability Increase. If variability rises too, process, methodology or measurement flaws could be at issue. If variability remains consistent, the problem is more likely to be with the source data itself.', 'Y'),
+ ('1018', 'LOV_All', 'Value Match All', 'List of expected values all present in column', 'Tests that all values match a pipe-delimited list of expected values and that all expected values are present', 'Column values found don''t exactly match the expected list of values', 'Values found', NULL, NULL, '1', '1.0', NULL, NULL, 'threshold_value', NULL, 'List of Expected Values', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'List of values expected, in form (''Val1'',''Val2)', 'This is a more restrictive form of Value Match, testing that all values in the dataset match the list provided, and also that all values present in the list appear at least once in the dataset. This would be appropriate for tables where all category values in the column are represented at least once.', 'Y'),
+ ('1019', 'LOV_Match', 'Value Match', 'All column values present in expected list', 'Tests that all values in the column match the list-of-values identified in baseline data.', 'Values not matching expected List-of-Values from baseline.', 'Non-matching records', NULL, 'top_freq_values > '''' AND distinct_value_ct BETWEEN 2 and 10 AND NOT (functional_data_type LIKE ''%date%'' OR lower(datatype_suggestion) LIKE ''%date%'' OR general_type = ''D'' OR lower(column_name) IN (''file_name'', ''filename''))', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'baseline_value,threshold_value', '''('' || SUBSTRING( CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 2) > '''' THEN '','''''' || TRIM( REPLACE ( SPLIT_PART(top_freq_values, ''|'' , 2), '''''''' , '''''''''''' ) ) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 4) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 4), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 6) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 6), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 8) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 8), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 10) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 10), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 12) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 12), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 14) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 14), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 16) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 16), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 18) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 18), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 20) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 20), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END, 2, 999) || '')'',0', 'List of Expected Values,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'List of values expected, in form (''Val1'',''Val2)', 'This tests that all values in the column match the hard-coded list provided. This is relevant when the list of allowable values is small and not expected to change often. Even if new values might occasionally be added, this test is useful for downstream data products to provide warning that assumptions and logic may need to change.', 'Y'),
+ ('1020', 'Min_Date', 'Minimum Date', 'All dates on or after set minimum', 'Tests that the earliest date referenced in the column is no earlier than baseline data', 'The earliest date value found is before the earliest value at baseline.', 'Dates prior to limit', NULL, 'general_type=''D''and min_date IS NOT NULL AND distinct_value_ct > 1', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'baseline_value,threshold_value', 'min_date,0', 'Minimum Date at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of dates prior to minimum', 'This test is appropriate for a cumulative dataset only, because it assumes all prior values are still present. It''s appropriate where new records are added with more recent dates, but old dates do not change.', 'Y'),
+ ('1021', 'Min_Val', 'Minimum Value', 'All values at or above set minimum', 'Tests that the minimum value present in the column is no lower than the minimum value in baseline data', 'Minimum column value less than baseline.', 'Values under limit', NULL, 'general_type=''N''and min_value IS NOT NULL AND (distinct_value_ct >= 2 OR (distinct_value_ct=2 and min_value<>0 and max_value<>1))', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'baseline_value,threshold_value', 'min_value,0', 'Minimum Value at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of values under limit', 'This test is appropriate for a cumulative dataset only, assuming all prior values are still present. It is also appropriate for any measure that has an absolute, definable minimum value, or a heuristic that makes sense for valid data.', 'Y'),
+ ('1022', 'Missing_Pct', 'Percent Missing', 'Consistent ratio of missing values', 'Tests for statistically-significant shift in percentage of missing values in column vs. baseline data', 'Significant shift in percent of missing values vs. baseline.', 'Difference measure', 'Cohen''s H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'record_ct <> value_ct', 'LEAST({RESULT_MEASURE}, 1.0)', '0.75', NULL, NULL, 'baseline_ct,baseline_value_ct,threshold_value', 'record_ct,value_ct,2::VARCHAR(10)', 'Baseline Record Count,Baseline Value Count,Standardized Difference Measure', NULL, 'Warning', 'CAT', 'column', 'Completeness', 'Data Drift', 'Expected maximum Cohen''s H Difference', 'This test uses Cohen''s H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. An uptick in missing data may indicate a collection issue at the source. A larger change may indicate a processing failure. A drop in missing data may also be significant, if it affects assumptions built into analytic products downstream. You can refine the expected threshold value as you view legitimate results of the measure over time.', 'Y'),
+ ('1023', 'Monthly_Rec_Ct', 'Monthly Records', 'At least one date per month present within date range', 'Tests for presence of at least one date per calendar month within min/max date range, per baseline data', 'At least one date per month expected in min/max date range.', 'Missing months', 'Calendar months without date values present', 'general_type= ''D'' AND date_days_present > 1 AND date_months_present > 2 AND date_months_present - (datediff( ''MON'' , min_date, max_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/{DATE_MONTHS_PRESENT}::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Count of Months without Dates', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Volume', 'Expected maximum count of calendar months without dates present', 'Monthly Records tests that at least one record is present for every calendar month within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each month. A failure here would suggest missing records for the number of months identified without data. You can adjust the threshold to accept a number of months that you know legitimately have no records.', 'Y'),
+ ('1024', 'Outlier_Pct_Above', 'Outliers Above', 'Consistent outlier counts over 2 SD above mean', 'Tests that percent of outliers over 2 SD above Mean doesn''t exceed threshold', 'Percent of outliers exceeding 2 SD above the mean is greater than expected threshold.', 'Pct records over limit', NULL, 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', 'GREATEST(0, {RESULT_MEASURE}::FLOAT-{THRESHOLD_VALUE}::FLOAT)', '0.75', NULL, NULL, 'baseline_avg,baseline_sd,threshold_value', 'avg_value,stdev_value,0.05', 'Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct records over upper 2 SD limit', 'This test counts the number of data points that may be considered as outliers, determined by whether their value exceeds 2 standard deviations above the mean at baseline. Assuming a normal distribution, a small percentage (defaulted to 5%) of outliers is expected. The actual number may vary for different distributions. The expected threshold reflects the maximum percentage of outliers you expect to see. This test uses the baseline mean rather than the mean for the latest dataset to capture systemic shift as well as individual outliers. ', 'Y'),
+ ('1025', 'Outlier_Pct_Below', 'Outliers Below', 'Consistent outlier counts under 2 SD below mean', 'Tests that percent of outliers over 2 SD below Mean doesn''t exceed threshold', 'Percent of outliers exceeding 2 SD below the mean is greater than expected threshold.', 'Pct records under limit', NULL, 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', 'GREATEST(0, {RESULT_MEASURE}::FLOAT-{THRESHOLD_VALUE}::FLOAT)', '0.75', NULL, NULL, 'baseline_avg,baseline_sd,threshold_value', 'avg_value,stdev_value,0.05', 'Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct records over lower 2 SD limit', 'This test counts the number of data points that may be considered as outliers, determined by whether their value exceeds 2 standard deviations below the mean at baseline. Assuming a normal distribution, a small percentage (defaulted to 5%) of outliers is expected. The actual number may vary for different distributions. The expected threshold reflects the maximum percentage of outliers you expect to see. This test uses the baseline mean rather than the mean for the latest dataset to capture systemic shift as well as individual outliers. ', 'Y'),
+ ('1026', 'Pattern_Match', 'Pattern Match', 'Column values match alpha-numeric pattern', 'Tests that all values in the column match the same alpha-numeric pattern identified in baseline data', 'Alpha values do not match consistent pattern in baseline.', 'Pattern Mismatches', NULL, 'fn_charcount(top_patterns, E'' \| '' ) = 1 AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > '''' AND distinct_value_ct > 10', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'baseline_value,threshold_value', 'trim(REPLACE(REPLACE(REPLACE(SPLIT_PART(top_patterns, '' | '', 2), ''A'', ''[A-Z]''), ''N'', ''[0-9]''), ''a'', ''[a-z]'')),0', 'Pattern at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of pattern mismatches', 'This test is appropriate for character fields that are expected to appear in a consistent format. It uses pattern matching syntax as appropriate for your database: REGEX matching if available, otherwise LIKE expressions. The expected threshold is the number of records that fail to match the defined pattern.', 'Y'),
+ ('1028', 'Recency', 'Recency', 'Latest date within expected range of test date', 'Tests that the latest date in column is within a set number of days of the test date', 'Most recent date value not within expected days of test date.', 'Days before test', 'Number of days that most recent date precedes the date of test', 'general_type= ''D'' AND max_date <= run_date AND NOT column_name IN ( ''filedate'' , ''file_date'' ) AND NOT functional_data_type IN (''Future Date'', ''Schedule Date'') AND DATEDIFF( ''DAY'' , max_date, run_date) <= 62', '(ABS({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/(1.0+DATEDIFF(''DAY'', ''{MIN_DATE}'', ''{MAX_DATE}''))::FLOAT)/{RECORD_CT}::FLOAT', '0.75', NULL, NULL, 'threshold_value', 'CASE WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 3 THEN DATEDIFF(''DAY'', max_date, run_date) + 3 WHEN DATEDIFF(''DAY'', max_date, run_date) <= 7 then DATEDIFF(''DAY'', max_date, run_date) + 7 WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 7.0) * 7 WHEN DATEDIFF( ''DAY'' , max_date, run_date) > 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 30.0) * 30 END', 'Threshold Maximum Days before Test', NULL, 'Warning', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected maximum count of days preceding test date', 'This test evaluates recency based on the latest referenced dates in the column. The test is appropriate for transactional dates and timestamps. The test can be especially valuable because timely data deliveries themselves may not assure that the most recent data is present. You can adjust the expected threshold to the maximum number of days that you expect the data to age before the dataset is refreshed. ', 'Y'),
+ ('1030', 'Required', 'Required Entry', 'Required non-null value present', 'Tests that a non-null value is present in each record for the column, consistent with baseline data', 'Every record for this column is expected to be filled, but some are missing.', 'Missing values', NULL, 'record_ct = value_ct', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Missing Value Count', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Schema Drift', 'Expected count of missing values', NULL, 'Y'),
+ ('1033', 'Street_Addr_Pattern', 'Street Address', 'Enough street address entries match defined pattern', 'Tests for percent of records matching standard street address pattern.', 'Percent of values matching standard street address format is under expected threshold.', 'Percent matches', 'Percent of records that match street address pattern', '(std_pattern_match=''STREET_ADDR'') AND (avg_length <> round(avg_length)) AND (avg_embedded_spaces BETWEEN 2 AND 6) AND (avg_length < 35)', '({VALUE_CT}::FLOAT * ({RESULT_MEASURE}::FLOAT - {THRESHOLD_VALUE}::FLOAT)/100.0)/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'threshold_value', '75', 'Threshold Pct that Match Address Pattern', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected percent of records that match standard street address pattern', 'The street address pattern used in this test should match the vast majority of USA addresses. You can adjust the threshold percent of matches based on the results you are getting -- you may well want to tighten it to make the test more sensitive to invalid entries.', 'Y'),
+ ('1034', 'Unique', 'Unique Values', 'Each column value is unique', 'Tests that no values for the column are repeated in multiple records.', 'Column values should be unique per row.', 'Duplicate values', 'Count of non-unique values', 'record_ct > 500 and record_ct = distinct_value_ct and value_ct > 0', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Duplicate Value Count', NULL, 'Fail', 'CAT', 'column', 'Uniqueness', 'Schema Drift', 'Expected count of duplicate values', 'This test is ideal when the database itself does not enforce a primary key constraint on the table. It serves as an independent check on uniqueness. It''s also useful when there are a small number of exceptions to uniqueness, which can be reflected in the expected threshold count of duplicates.', 'Y'),
+ ('1035', 'Unique_Pct', 'Percent Unique', 'Consistent ratio of unique values', 'Tests for statistically-significant shift in percentage of unique values vs. baseline data.', 'Significant shift in percent of unique values vs. baseline.', 'Difference measure', 'Cohen''s H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'distinct_value_ct > 10', 'LEAST({RESULT_MEASURE}, 1.0)', '0.75', NULL, NULL, 'baseline_value_ct,baseline_unique_ct,threshold_value', 'value_ct,distinct_value_ct,0.5', 'Value Count at Baseline,Distinct Value Count at Baseline,Standardized Difference Measure (0 to 1)', NULL, 'Warning', 'CAT', 'column', 'Uniqueness', 'Data Drift', 'Expected maximum Cohen''s H Difference', 'You can think of this as a test of similarity that measures whether the percentage of unique values is consistent with the percentage at baseline. A significant change might indicate duplication or a telling shift in cardinality between entities. The test uses Cohen''s H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. You can refine the expected threshold value as you view legitimate results of the measure over time.', 'Y'),
+ ('1036', 'US_State', 'US State', 'Column value is two-letter US state code', 'Tests that the recorded column value is a valid US state.', 'Column Value is not a valid US state.', 'Not US States', 'Values that do not match 2-character US state abbreviations.', 'general_type= ''A'' AND column_name ILIKE ''%state%'' AND distinct_value_ct < 70 AND max_length = 2', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Count not Matching State Abbreviations', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of values that are not US state abbreviations', 'This test validates entries against a fixed list of two-character US state codes and related Armed Forces codes.', 'Y'),
+ ('1037', 'Weekly_Rec_Ct', 'Weekly Records', 'At least one date per week present within date range', 'Tests for presence of at least one date per calendar week within min/max date range, per baseline data', 'At least one date per week expected in min/max date range.', 'Missing weeks', 'Calendar weeks without date values present', 'general_type= ''D'' AND date_days_present > 1 AND date_weeks_present > 3 AND date_weeks_present - (DATEDIFF(''week'', ''1800-01-05''::DATE, max_date) - DATEDIFF(''week'', ''1800-01-05''::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/{DATE_WEEKS_PRESENT}::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Weeks without Dates', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Volume', 'Expected maximum count of calendar weeks without dates present', 'Weekly Records tests that at least one record is present for every calendar week within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each week. A failure here would suggest missing records for the number of weeks identified without data. You can adjust the threshold to accept a number of weeks that you know legitimately have no records.', 'Y'),
+ ('1040', 'Variability_Increase', 'Variability Increase', 'Variability has increased above threshold', 'Tests that the spread or dispersion of column values has increased significantly over baseline, indicating a drop in stability of the measure.', 'The Standard Deviation of the measure has increased beyond the defined threshold. This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', '1', '0.75', NULL, NULL, 'baseline_sd,threshold_value', 'stdev_value,120', 'Std Deviation at Baseline,Expected Maximum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. An increase in particular could mark new problems in measurement, a more heterogeneous cohort, or that significant outliers have been introduced. Consider this test along with Average Shift and New Shift. If the average shifts as well, there may be a fundamental shift in the dataset or process used to collect the data point. This might suggest a data shift that should be noted and assessed by business users. If the average does not shift, this may point to a data quality or data collection problem. ', 'Y'),
+ ('1041', 'Variability_Decrease', 'Variability Decrease', 'Variability has decreased below threshold', 'Tests that the spread or dispersion of column values has decreased significantly over baseline, indicating a shift in stability of the measure. This could signal a change in a process or a data quality issue.', 'The Standard Deviation of the measure has decreased below the defined threshold. This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', '1', '0.75', NULL, NULL, 'baseline_sd,threshold_value', 'stdev_value, 80', 'Std Deviation at Baseline,Expected Minimum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected minimum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. A decrease in particular could indicate an improved process, better precision in measurement, the elimination of outliers, or a more homogeneous cohort. ', 'Y'),
+ ('1042', 'Valid_Month', 'Valid Month', 'Valid calendar month in expected format', 'Tests for the presence of a valid representation of a calendar month consistent with the format at baseline.', 'Column values are not a valid representation of a calendar month consistent with the format at baseline.', 'Invalid months', NULL, 'functional_data_type = ''Period Month''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'threshold_value,baseline_value', '0,CASE WHEN max_length > 3 AND initcap(min_text) = min_text THEN ''''''January'''',''''February'''',''''March'''',''''April'''',''''May'''',''''June'''',''''July'''',''''August'''',''''September'''',''''October'''',''''November'''',''''December'''''' WHEN max_length > 3 AND upper(min_text) = min_text THEN ''''''JANUARY'''',''''FEBRUARY'''',''''MARCH'''',''''APRIL'''',''''MAY'''',''''JUNE'''',''''JULY'''',''''AUGUST'''',''''SEPTEMBER'''',''''OCTOBER'''',''''NOVEMBER'''',''''DECEMBER'''''' WHEN max_length > 3 AND lower(min_text) = min_text THEN ''''''january'''',''''february'''',''''march'''',''''april'''',''''may'''',''''june'''',''''july'''',''''august'''',''''september'''',''''october'''',''''november'''',''''december'''''' WHEN max_length = 3 AND initcap(min_text) = min_text THEN ''''''Jan'''',''''Feb'''',''''Mar'''',''''Apr'''',''''May'''',''''Jun'''',''''Jul'''',''''Aug'''',''''Sep'''',''''Oct'''',''''Nov'''',''''Dec'''''' WHEN max_length = 3 AND upper(min_text) = min_text THEN ''''''JAN'''',''''FEB'''',''''MAR'''',''''APR'''',''''MAY'''',''''JUN'''',''''JUL'''',''''AUG'''',''''SEP'''',''''OCT'''',''''NOV'''',''''DEC'''''' WHEN max_length = 3 AND lower(min_text) = min_text THEN ''''''jan'''',''''feb'''',''''mar'''',''''apr'''',''''may'''',''''jun'''',''''jul'''',''''aug'''',''''sep'''',''''oct'''',''''nov'''',''''dec'''''' WHEN max_length = 2 AND min_text = ''01'' THEN ''''''01'''',''''02'''',''''03'''',''''04'''',''''05'''',''''06'''',''''07'''',''''08'''',''''09'''',''''10'''',''''11'''',''''12'''''' WHEN max_length = 2 AND min_text = ''1'' THEN ''''''1'''',''''2'''',''''3'''',''''4'''',''''5'''',''''6'''',''''7'''',''''8'''',''''9'''',''''10'''',''''11'''',''''12'''''' WHEN min_value = 1 THEN ''1,2,3,4,5,6,7,8,9,10,11,12'' ELSE ''NULL'' END', 'Threshold Invalid Months,Valid Month List', 'The acceptable number of records with invalid months present.|List of valid month values for this field, in quotes if field is numeric, separated by commas.', 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of invalid months', NULL, 'N'),
+ ('1043', 'Valid_Characters', 'Valid Characters', 'Column contains no invalid characters', 'Tests for the presence of non-printing characters, leading spaces, or surrounding quotes.', 'Invalid characters, such as non-printing characters, leading spaces, or surrounding quotes, were found.', 'Invalid records', 'Expected count of values with invalid characters', 'general_type = ''A''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '0.75', NULL, NULL, 'threshold_value', '0', NULL, 'The acceptable number of records with invalid character values present.', 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', 'This test looks for the presence of non-printing ASCII characters that are considered non-standard in basic text processing. It also identifies leading spaces and values enclosed in quotes. Values that fail this test may be artifacts of data conversion, or just more difficult to process or analyze downstream.', 'N'),
+ ('1044', 'Valid_US_Zip', 'Valid US Zip', 'Valid USA Postal Codes', 'Tests that postal codes match the 5 or 9 digit standard US format', 'Invalid US Zip Code formats found.', 'Invalid Zip Codes', 'Expected count of values with invalid Zip Codes', 'functional_data_type = ''Zip''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '0.75', NULL, NULL, 'threshold_value', '0', NULL, NULL, 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', NULL, 'Y'),
+ ('1045', 'Valid_US_Zip3', 'Valid US Zip-3 ', 'Valid USA Zip-3 Prefix', 'Tests that postal codes match the 3 digit format of a regional prefix.', 'Invalid 3-digit US Zip Code regional prefix formats found.', 'Invalid Zip-3 Prefix', 'Expected count of values with invalid Zip-3 Prefix Codes', 'functional_data_type = ''Zip3''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '0.75', NULL, NULL, 'threshold_value', '0', NULL, NULL, 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Zip3 Count', 'This test looks for the presence of values that fail to match the three-digit numeric code expected for US Zip Code regional prefixes. These prefixes are often used to roll up Zip Code data to a regional level, and may be critical to anonymize detailed data and protect PID. Depending on your needs and regulatory requirements, longer zip codes could place PID at risk.', 'Y'),
+ ('1006', 'Condition_Flag', 'Custom Condition', 'Column values match pre-defined condition', 'Tests that each record in the table matches a pre-defined, custom condition', 'Value(s) found not matching defined condition.', 'Values Failing', NULL, NULL, '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', 'Test Focus', 'Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. Example: `Quantity Consistency` if you are testing that quantity ordered matches quantity shipped.', 'threshold_value,custom_query', NULL, 'Threshold Error Count,Custom SQL Expression (TRUE on error)', 'The number of errors that are acceptable before test fails.|Expression should evaluate to TRUE to register an error or FALSE if no error. An expression can reference only columns in the selected table.', 'Fail', 'CAT', 'custom', 'Validity', 'Schema Drift', 'Count of records that don''t meet test condition', 'Custom Condition is a business-rule test for a user-defined error condition based on the value of one or more columns. The condition is applied to each record within the table, and the count of records failing the condition is added up. If that count exceeds a threshold of errors, the test as a whole is failed. This test is ideal for error conditions that TestGen cannot automatically infer, and any condition that involves the values of more than one column in the same record. Performance of this test is fast, since it is performed together with other aggregate tests. Interpretation is based on the user-defined meaning of the test.', 'Y'),
- ('1031', 'Row_Ct', 'Row Count', 'Number of rows is at or above threshold', 'Tests that the count of records has not decreased from the baseline count.', 'Row count less than baseline count.', 'Row count', NULL, 'TEMPLATE', NULL, NULL, 'threshold_value', NULL, 'Threshold Minimum Record Count', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Volume', 'Expected minimum row count', 'Because this tests the row count against a constant minimum threshold, it''s appropriate for any dataset, as long as the number of rows doesn''t radically change from refresh to refresh. But it''s not responsive to change over time. You may want to adjust the threshold periodically if you are dealing with a cumulative dataset.', 'Y'),
- ('1032', 'Row_Ct_Pct', 'Row Range', 'Number of rows within percent range of threshold', 'Tests that the count of records is within a percentage above or below the baseline count.', 'Row Count is outside of threshold percent of baseline count.', 'Percent of baseline', 'Row count percent above or below baseline', 'TEMPLATE', NULL, NULL, 'baseline_ct,threshold_value', NULL, 'Baseline Record Count,Threshold Pct Above or Below Baseline', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Volume', 'Expected percent window below or above baseline', 'This test is better than Row Count for an incremental or windowed dataset where you would expect the row count to range within a percentage of baseline.', 'Y'),
+ ('1031', 'Row_Ct', 'Row Count', 'Number of rows is at or above threshold', 'Tests that the count of records has not decreased from the baseline count.', 'Row count less than baseline count.', 'Row count', NULL, 'TEMPLATE', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{THRESHOLD_VALUE}::FLOAT', '1.0', NULL, NULL, 'threshold_value', NULL, 'Threshold Minimum Record Count', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Volume', 'Expected minimum row count', 'Because this tests the row count against a constant minimum threshold, it''s appropriate for any dataset, as long as the number of rows doesn''t radically change from refresh to refresh. But it''s not responsive to change over time. You may want to adjust the threshold periodically if you are dealing with a cumulative dataset.', 'Y'),
+ ('1032', 'Row_Ct_Pct', 'Row Range', 'Number of rows within percent range of threshold', 'Tests that the count of records is within a percentage above or below the baseline count.', 'Row Count is outside of threshold percent of baseline count.', 'Percent of baseline', 'Row count percent above or below baseline', 'TEMPLATE', '(100.0 - {RESULT_MEASURE}::FLOAT)/100.0', '1.0', NULL, NULL, 'baseline_ct,threshold_value', NULL, 'Baseline Record Count,Threshold Pct Above or Below Baseline', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Volume', 'Expected percent window below or above baseline', 'This test is better than Row Count for an incremental or windowed dataset where you would expect the row count to range within a percentage of baseline.', 'Y'),
- ('1008', 'CUSTOM', 'Custom Test', 'Custom-defined business rule', 'Custom SQL Query Test', 'Errors were detected according to test definition.', 'Errors found', 'Count of errors identified by query', NULL, 'Test Focus', 'Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. Example: `Order Total Matches Detail` if you are testing that the total in one table matches the sum of lines in another.', 'custom_query', NULL, 'Custom SQL Query Returning Error Records', 'Query should return records indicating one or more errors. The test passes if no records are returned. Results of the query will be shown when you click `Review Source Data` for a failed test, so be sure to include enough data in your results to follow-up. \n\nA query can refer to any tables in the database. You must hard-code the schema or use `{DATA_SCHEMA}` to represent the schema defined for the Table Group.', 'Fail', 'QUERY', 'custom', 'Accuracy', 'Data Drift', 'Expected count of errors found by custom query', 'This business-rule test is highly flexible, covering any error state that can be expressed by a SQL query against one or more tables in the database. In operation, the user-defined query is embedded within a parent query returning the count of error rows identified. Any row returned by the query is interpreted as a single error condition in the test. Note that this query is run independently of other tests, and that performance will be slower, depending in large part on the efficiency of the query you write. Interpretation is based on the user-defined meaning of the test. Your query might be written to return errors in individual rows identified by joining tables. Or it might return an error based on a multi-column aggregate condition returning a single row if an error is found. This query is run separately when you click `Review Source Data` from Test Results, so be sure to include enough data in your results to follow-up. Interpretation is based on the user-defined meaning of the test.', 'Y'),
+ ('1008', 'CUSTOM', 'Custom Test', 'Custom-defined business rule', 'Custom SQL Query Test', 'Errors were detected according to test definition.', 'Errors found', 'Count of errors identified by query', NULL, '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', 'Test Focus', 'Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. Example: `Order Total Matches Detail` if you are testing that the total in one table matches the sum of lines in another.', 'custom_query', NULL, 'Custom SQL Query Returning Error Records', 'Query should return records indicating one or more errors. The test passes if no records are returned. Results of the query will be shown when you click `Review Source Data` for a failed test, so be sure to include enough data in your results to follow-up. \n\nA query can refer to any tables in the database. You must hard-code the schema or use `{DATA_SCHEMA}` to represent the schema defined for the Table Group.', 'Fail', 'QUERY', 'custom', 'Accuracy', 'Data Drift', 'Expected count of errors found by custom query', 'This business-rule test is highly flexible, covering any error state that can be expressed by a SQL query against one or more tables in the database. In operation, the user-defined query is embedded within a parent query returning the count of error rows identified. Any row returned by the query is interpreted as a single error condition in the test. Note that this query is run independently of other tests, and that performance will be slower, depending in large part on the efficiency of the query you write. Interpretation is based on the user-defined meaning of the test. Your query might be written to return errors in individual rows identified by joining tables. Or it might return an error based on a multi-column aggregate condition returning a single row if an error is found. This query is run separately when you click `Review Source Data` from Test Results, so be sure to include enough data in your results to follow-up. Interpretation is based on the user-defined meaning of the test.', 'Y'),
- ('1500', 'Aggregate_Balance', 'Aggregate Balance', 'Aggregate values per group match reference', 'Tests for exact match in aggregate values for each set of column values vs. reference dataset', 'Aggregate measure per set of column values does not exactly match reference dataset.', 'Mismatched measures', NULL, NULL, 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition', 'Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of matching table|Matching table name|Agregate column expression in matching table: one of `SUM([column_name])` or `COUNT([column_name])`|Condition defining a subset of records in matching table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in matching table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in matching table (e.g. HAVING clause) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of group totals not matching aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations across two different tables. Both tables must be accessible at the same time. It''s ideal for confirming that two datasets exactly match -- that the sum of a measure or count of a value hasn''t changed or shifted between categories. Use this test to compare a raw and processed version of the same dataset, or to confirm that an aggregated table exactly matches the detail table that it''s built from. An error here means that one or more value combinations fail to match. New categories or combinations will cause failure.', 'Y'),
- ('1501', 'Aggregate_Minimum', 'Aggregate Minimum', 'Aggregate values per group are at or above reference', 'Tests that aggregate values for each set of column values are at least the same as reference dataset', 'Aggregate measure per set of column values is not at least the same as reference dataset.', 'Mismatched measures', NULL, NULL, 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition', 'Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of reference table|Reference table name|Aggregate column expression in reference table (e.g. `SUM(sales)`)|Condition defining a subset of records in reference table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in reference table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in reference table (e.g. HAVING clause) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals below aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations, but requires a match or increase in the aggregate value, rather than an exact match, across two different tables. Both tables must be accessible at the same time. Use this to confirm that aggregate values have not dropped for any set of categories, even if some values may rise. This test is useful to compare an older and newer version of a cumulative dataset. An error here means that one or more values per category set fail to match or exceed the prior dataset. New categories or combinations are allowed (but can be restricted independently with a Combo_Match test). Both tables must be present to run this test.', 'Y'),
- ('1502', 'Combo_Match', 'Reference Match', 'Column values or combinations found in reference', 'Tests for the presence of one or a set of column values in a reference table', 'Column value combinations are not found in reference table values.', 'Missing values', NULL, NULL, 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. \n\nDo not use continuous mesurements here. Do not use numeric values unless they represent discrete categories.', 'subset_condition,having_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition,match_having_condition', NULL, 'Record Subset Condition,Group Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns,Matching Record Subset Condition,Matching Group Subset Condition', 'Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in source table, written like a condition within a SQL HAVING clause (e.g. `SUM(sales) < 100`) - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to validate source table values (separated by commas)|Condition defining a subset of records in reference table to match against, written like a condition within a SQL WHERE clause - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in reference table, written like a condition within a SQL HAVING clause (e.g. `SUM(sales) < 100`) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Validity', 'Schema Drift', 'Expected count of non-matching value combinations', 'This test verifies that values, or combinations of values, that are present in the main table are also found in a reference table. This is a useful test for referential integrity between fact and dimension tables. You can also use it to confirm the validity of a code or category, or of combinations of values that should only be found together within each record, such as product/size/color. An error here means that one or more category combinations in the main table are not found in the reference table. Both tables must be present to run this test.', 'Y'),
- ('1503', 'Distribution_Shift', 'Distribution Shift', 'Probability distribution consistent with reference', 'Tests the closeness of match between two distributions of aggregate measures across combinations of column values, using Jensen-Shannon Divergence test', 'Divergence between two distributions exceeds specified threshold.', 'Divergence level (0-1)', 'Jensen-Shannon Divergence, from 0 (identical distributions), to 1.0 (max divergence)', NULL, 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Do not use continuous mesurements here. Do not use numeric values unless they represent discrete categories.', 'subset_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition', NULL, 'Record Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns to Compare,Matching Record Subset Condition', 'Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to compare counts with source table values (separated by commas)|Condition defining a subset of records in reference table to match against, written like a condition within a SQL WHERE clause - OPTIONAL', 'Warning', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected maximum divergence level between 0 and 1', 'This test measures the similarity of two sets of counts per categories, by using their proportional counts as probability distributions. Using Jensen-Shannon divergence, a measure of relative entropy or difference between two distributions, the test assigns a score ranging from 0, meaning that the distributions are identical, to 1, meaning that the distributions are completely unrelated. This test can be used to compare datasets that may not match exactly, but should have similar distributions. For example, it is a useful sanity check for data from different sources that you would expect to have a consistent spread, such as shipment of building materials per state and construction projects by state. Scores can be compared over time even if the distributions are not identical -- a dataset can be expected to maintain a comparable divergence score with a reference dataset over time. Both tables must be present to run this test.', 'Y'),
- ('1508', 'Timeframe_Combo_Gain', 'Timeframe No Drops', 'Latest timeframe has at least all value combinations from prior period', 'Tests that column values in most recent time-window include at least same as prior time window', 'Column values in most recent time-window don''t include all values in prior window.', 'Mismatched values', NULL, NULL, 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Make sure not to use continuous measurements here. Do not use numeric values unless they represent discrete categories.', 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', 'The date column used to define the time windows. This must be a DATE or DATETIME type.|Length in days of the time window. The test will compare the most recent period of days to the prior period of the same duration.|Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of missing value combinations', 'This test checks a single transactional table to verify that categorical values or combinations that are present in the most recent time window you define include at least all those found in the prior time window of the same duration. Missing values in the latest time window will trigger the test to fail. New values are permitted. Use this test to confirm that codes or categories are not lost across successive time periods in a transactional table.', 'Y'),
- ('1509', 'Timeframe_Combo_Match', 'Timeframe Match', 'Column value combinations from latest timeframe same as prior period', 'Tests for presence of same column values in most recent time-window vs. prior time window', 'Column values don''t match in most recent time-windows.', 'Mismatched values', NULL, NULL, 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Do not use continuous measurements here. Do not use numeric values unless they represent discrete categories.', 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', NULL, 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of non-matching value combinations', 'This test checks a single transactional table (such as a fact table) to verify that categorical values or combinations that are present in the most recent time window you define match those found in the prior time window of the same duration. New or missing values in the latest time window will trigger the test to fail. Use this test to confirm the consistency in the occurrence of codes or categories across successive time periods in a transactional table.', 'Y'),
+ ('1500', 'Aggregate_Balance', 'Aggregate Balance', 'Aggregate values per group match reference', 'Tests for exact match in aggregate values for each set of column values vs. reference dataset', 'Aggregate measure per set of column values does not exactly match reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition', 'Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of matching table|Matching table name|Agregate column expression in matching table: one of `SUM([column_name])` or `COUNT([column_name])`|Condition defining a subset of records in matching table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in matching table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in matching table (e.g. HAVING clause) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of group totals not matching aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations across two different tables. Both tables must be accessible at the same time. It''s ideal for confirming that two datasets exactly match -- that the sum of a measure or count of a value hasn''t changed or shifted between categories. Use this test to compare a raw and processed version of the same dataset, or to confirm that an aggregated table exactly matches the detail table that it''s built from. An error here means that one or more value combinations fail to match. New categories or combinations will cause failure.', 'Y'),
+ ('1501', 'Aggregate_Minimum', 'Aggregate Minimum', 'Aggregate values per group are at or above reference', 'Tests that aggregate values for each set of column values are at least the same as reference dataset', 'Aggregate measure per set of column values is not at least the same as reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition', 'Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of reference table|Reference table name|Aggregate column expression in reference table (e.g. `SUM(sales)`)|Condition defining a subset of records in reference table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in reference table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in reference table (e.g. HAVING clause) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals below aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations, but requires a match or increase in the aggregate value, rather than an exact match, across two different tables. Both tables must be accessible at the same time. Use this to confirm that aggregate values have not dropped for any set of categories, even if some values may rise. This test is useful to compare an older and newer version of a cumulative dataset. An error here means that one or more values per category set fail to match or exceed the prior dataset. New categories or combinations are allowed (but can be restricted independently with a Combo_Match test). Both tables must be present to run this test.', 'Y'),
+ ('1502', 'Combo_Match', 'Reference Match', 'Column values or combinations found in reference', 'Tests for the presence of one or a set of column values in a reference table', 'Column value combinations are not found in reference table values.', 'Missing values', NULL, NULL, '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. \n\nDo not use continuous mesurements here. Do not use numeric values unless they represent discrete categories.', 'subset_condition,having_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition,match_having_condition', NULL, 'Record Subset Condition,Group Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns,Matching Record Subset Condition,Matching Group Subset Condition', 'Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in source table, written like a condition within a SQL HAVING clause (e.g. `SUM(sales) < 100`) - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to validate source table values (separated by commas)|Condition defining a subset of records in reference table to match against, written like a condition within a SQL WHERE clause - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in reference table, written like a condition within a SQL HAVING clause (e.g. `SUM(sales) < 100`) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Validity', 'Schema Drift', 'Expected count of non-matching value combinations', 'This test verifies that values, or combinations of values, that are present in the main table are also found in a reference table. This is a useful test for referential integrity between fact and dimension tables. You can also use it to confirm the validity of a code or category, or of combinations of values that should only be found together within each record, such as product/size/color. An error here means that one or more category combinations in the main table are not found in the reference table. Both tables must be present to run this test.', 'Y'),
+ ('1503', 'Distribution_Shift', 'Distribution Shift', 'Probability distribution consistent with reference', 'Tests the closeness of match between two distributions of aggregate measures across combinations of column values, using Jensen-Shannon Divergence test', 'Divergence between two distributions exceeds specified threshold.', 'Divergence level (0-1)', 'Jensen-Shannon Divergence, from 0 (identical distributions), to 1.0 (max divergence)', NULL, '1', '0.75', 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Do not use continuous mesurements here. Do not use numeric values unless they represent discrete categories.', 'subset_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition', NULL, 'Record Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns to Compare,Matching Record Subset Condition', 'Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to compare counts with source table values (separated by commas)|Condition defining a subset of records in reference table to match against, written like a condition within a SQL WHERE clause - OPTIONAL', 'Warning', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected maximum divergence level between 0 and 1', 'This test measures the similarity of two sets of counts per categories, by using their proportional counts as probability distributions. Using Jensen-Shannon divergence, a measure of relative entropy or difference between two distributions, the test assigns a score ranging from 0, meaning that the distributions are identical, to 1, meaning that the distributions are completely unrelated. This test can be used to compare datasets that may not match exactly, but should have similar distributions. For example, it is a useful sanity check for data from different sources that you would expect to have a consistent spread, such as shipment of building materials per state and construction projects by state. Scores can be compared over time even if the distributions are not identical -- a dataset can be expected to maintain a comparable divergence score with a reference dataset over time. Both tables must be present to run this test.', 'Y'),
+ ('1508', 'Timeframe_Combo_Gain', 'Timeframe No Drops', 'Latest timeframe has at least all value combinations from prior period', 'Tests that column values in most recent time-window include at least same as prior time window', 'Column values in most recent time-window don''t include all values in prior window.', 'Mismatched values', NULL, NULL, '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Make sure not to use continuous measurements here. Do not use numeric values unless they represent discrete categories.', 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', 'The date column used to define the time windows. This must be a DATE or DATETIME type.|Length in days of the time window. The test will compare the most recent period of days to the prior period of the same duration.|Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of missing value combinations', 'This test checks a single transactional table to verify that categorical values or combinations that are present in the most recent time window you define include at least all those found in the prior time window of the same duration. Missing values in the latest time window will trigger the test to fail. New values are permitted. Use this test to confirm that codes or categories are not lost across successive time periods in a transactional table.', 'Y'),
+ ('1509', 'Timeframe_Combo_Match', 'Timeframe Match', 'Column value combinations from latest timeframe same as prior period', 'Tests for presence of same column values in most recent time-window vs. prior time window', 'Column values don''t match in most recent time-windows.', 'Mismatched values', NULL, NULL, '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Do not use continuous measurements here. Do not use numeric values unless they represent discrete categories.', 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', NULL, 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of non-matching value combinations', 'This test checks a single transactional table (such as a fact table) to verify that categorical values or combinations that are present in the most recent time window you define match those found in the prior time window of the same duration. New or missing values in the latest time window will trigger the test to fail. Use this test to confirm the consistency in the occurrence of codes or categories across successive time periods in a transactional table.', 'Y'),
- ('1504', 'Aggregate_Pct_Above', 'Aggregate Pct Above', 'Aggregate values per group exceed reference', 'Tests that aggregate values for each set of column values exceed values for reference dataset', 'Aggregate measure per set of column values fails to exceed the reference dataset.', 'Mismatched measures', NULL, NULL, 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts match_schema_name,TODO Fill in default_parm_prompts match_table_name,TODO Fill in default_parm_prompts match_column_names,TODO Fill in default_parm_prompts match_subset_condition,TODO Fill in default_parm_prompts match_groupby_names,TODO Fill in default_parm_prompts match_having_condition,TODO Fill in default_parm_prompts subset_condition,TODO Fill in default_parm_prompts groupby_names,TODO Fill in default_parm_prompts having_condition', NULL, 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals with not exceeding aggregate measure', NULL, 'N'),
- ('1505', 'Aggregate_Pct_Within', 'Aggregate Pct Within', 'Aggregate values per group exceed reference', 'Tests that aggregate values for each set of column values exceed values for reference dataset', 'Aggregate measure per set of column values fails to exceed the reference dataset.', 'Mismatched measures', NULL, NULL, 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts match_schema_name,TODO Fill in default_parm_prompts match_table_name,TODO Fill in default_parm_prompts match_column_names,TODO Fill in default_parm_prompts match_subset_condition,TODO Fill in default_parm_prompts match_groupby_names,TODO Fill in default_parm_prompts match_having_condition,TODO Fill in default_parm_prompts subset_condition,TODO Fill in default_parm_prompts groupby_names,TODO Fill in default_parm_prompts having_condition', NULL, 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals with not exceeding aggregate measure', NULL, 'N'),
- ('1506', 'Aggregate_Increase', 'Aggregate Increase', 'Aggregate values per group exceed reference', 'Tests that aggregate values for each set of column values exceed values for reference dataset', 'Aggregate measure per set of column values fails to exceed the reference dataset.', 'Mismatched measures', NULL, NULL, 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts match_schema_name,TODO Fill in default_parm_prompts match_table_name,TODO Fill in default_parm_prompts match_column_names,TODO Fill in default_parm_prompts match_subset_condition,TODO Fill in default_parm_prompts match_groupby_names,TODO Fill in default_parm_prompts match_having_condition,TODO Fill in default_parm_prompts subset_condition,TODO Fill in default_parm_prompts groupby_names,TODO Fill in default_parm_prompts having_condition', NULL, 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals below reference value', NULL, 'N')
+ ('1504', 'Aggregate_Pct_Above', 'Aggregate Pct Above', 'Aggregate values per group exceed reference', 'Tests that aggregate values for each set of column values exceed values for reference dataset', 'Aggregate measure per set of column values fails to exceed the reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts match_schema_name,TODO Fill in default_parm_prompts match_table_name,TODO Fill in default_parm_prompts match_column_names,TODO Fill in default_parm_prompts match_subset_condition,TODO Fill in default_parm_prompts match_groupby_names,TODO Fill in default_parm_prompts match_having_condition,TODO Fill in default_parm_prompts subset_condition,TODO Fill in default_parm_prompts groupby_names,TODO Fill in default_parm_prompts having_condition', NULL, 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals with not exceeding aggregate measure', NULL, 'N'),
+ ('1505', 'Aggregate_Pct_Within', 'Aggregate Pct Within', 'Aggregate values per group exceed reference', 'Tests that aggregate values for each set of column values exceed values for reference dataset', 'Aggregate measure per set of column values fails to exceed the reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts match_schema_name,TODO Fill in default_parm_prompts match_table_name,TODO Fill in default_parm_prompts match_column_names,TODO Fill in default_parm_prompts match_subset_condition,TODO Fill in default_parm_prompts match_groupby_names,TODO Fill in default_parm_prompts match_having_condition,TODO Fill in default_parm_prompts subset_condition,TODO Fill in default_parm_prompts groupby_names,TODO Fill in default_parm_prompts having_condition', NULL, 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals with not exceeding aggregate measure', NULL, 'N'),
+ ('1506', 'Aggregate_Increase', 'Aggregate Increase', 'Aggregate values per group exceed reference', 'Tests that aggregate values for each set of column values exceed values for reference dataset', 'Aggregate measure per set of column values fails to exceed the reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts match_schema_name,TODO Fill in default_parm_prompts match_table_name,TODO Fill in default_parm_prompts match_column_names,TODO Fill in default_parm_prompts match_subset_condition,TODO Fill in default_parm_prompts match_groupby_names,TODO Fill in default_parm_prompts match_having_condition,TODO Fill in default_parm_prompts subset_condition,TODO Fill in default_parm_prompts groupby_names,TODO Fill in default_parm_prompts having_condition', NULL, 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals below reference value', NULL, 'N')
;
diff --git a/testgen/template/dbsetup/060_create_standard_views.sql b/testgen/template/dbsetup/060_create_standard_views.sql
index 9ec8331..fbbf2f1 100644
--- a/testgen/template/dbsetup/060_create_standard_views.sql
+++ b/testgen/template/dbsetup/060_create_standard_views.sql
@@ -133,7 +133,7 @@ SELECT p.project_name,
ELSE 'Passed'
END as disposition,
r.result_code as passed_ct,
- (1 - r.result_code)::INTEGER as exception_ct,
+ (1 - COALESCE(r.result_code, 0))::INTEGER as exception_ct,
CASE
WHEN result_status = 'Warning'
AND result_message NOT ILIKE 'Inactivated%' THEN 1
diff --git a/testgen/template/dbupgrade/0120_incremental_upgrade.sql b/testgen/template/dbupgrade/0120_incremental_upgrade.sql
new file mode 100644
index 0000000..0081e19
--- /dev/null
+++ b/testgen/template/dbupgrade/0120_incremental_upgrade.sql
@@ -0,0 +1,133 @@
+SET SEARCH_PATH TO {SCHEMA_NAME};
+
+ALTER TABLE test_types
+ ADD COLUMN dq_score_prevalence_formula TEXT,
+ ADD COLUMN dq_score_risk_factor TEXT;
+
+ALTER TABLE test_suites
+ ADD COLUMN last_complete_test_run_id UUID,
+ ADD COLUMN dq_score_exclude BOOLEAN default FALSE;
+
+ALTER TABLE profile_anomaly_results
+ ADD COLUMN dq_prevalence FLOAT;
+
+ALTER TABLE profiling_runs
+ ADD COLUMN dq_affected_data_points BIGINT,
+ ADD COLUMN dq_total_data_points BIGINT,
+ ADD COLUMN dq_score_profiling FLOAT;
+
+ALTER TABLE test_results
+ ADD COLUMN dq_prevalence FLOAT,
+ ADD COLUMN dq_record_ct BIGINT;
+
+ALTER TABLE test_runs
+ ADD COLUMN dq_affected_data_points BIGINT,
+ ADD COLUMN dq_total_data_points BIGINT,
+ ADD COLUMN dq_score_test_run FLOAT;
+
+ALTER TABLE table_groups
+ ADD COLUMN last_complete_profile_run_id UUID,
+ ADD COLUMN dq_score_profiling FLOAT,
+ ADD COLUMN dq_score_testing FLOAT;
+
+ALTER TABLE data_table_chars
+ ADD COLUMN last_complete_profile_run_id UUID,
+ ADD COLUMN dq_score_profiling FLOAT,
+ ADD COLUMN dq_score_testing FLOAT;
+
+ALTER TABLE data_column_chars
+ ADD COLUMN last_complete_profile_run_id UUID,
+ ADD COLUMN dq_score_profiling FLOAT,
+ ADD COLUMN dq_score_testing FLOAT;
+
+
+ALTER TABLE profile_results
+ ADD COLUMN upper_case_ct BIGINT,
+ ADD COLUMN lower_case_ct BIGINT,
+ ADD COLUMN non_alpha_ct BIGINT,
+ ADD COLUMN mixed_case_ct BIGINT GENERATED ALWAYS AS ( value_ct - upper_case_ct - lower_case_ct - non_alpha_ct ) STORED,
+ ADD COLUMN before_100yr_date_ct BIGINT,
+ ADD COLUMN distant_future_date_ct BIGINT;
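+
+-- Illustrative note (not part of the original migration): mixed_case_ct is derived
+-- from the other counters, e.g. with value_ct = 100, upper_case_ct = 40,
+-- lower_case_ct = 35 and non_alpha_ct = 5 the stored value is 100 - 40 - 35 - 5 = 20.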
+
+
+CREATE OR REPLACE FUNCTION fn_normal_cdf(z_score DOUBLE PRECISION)
+RETURNS DOUBLE PRECISION AS
+$$
+/*
+ This function calculates the cumulative distribution function (CDF)
+ for the standard normal distribution for a given Z-score using
+ the Abramowitz and Stegun approximation method. It returns the
+ probability that a standard normal variable is less than or equal
+ to the given Z-score.
+
+ The approximation formula uses a series expansion to estimate the
+ CDF, which is accurate for most practical purposes.
+
+ To estimate the count of observations that fall outside a certain Z-score
+ (both above and below), you can use the `fn_normal_cdf()` function. For a
+ total number of observations N, the proportion of values outside the Z-score
+ is given by: 2 * (1 - fn_normal_cdf(ABS(Z)))
+
+ This gives the proportion of values greater than the positive Z-score and
+ less than the negative Z-score combined. To get the estimated count of
+ observations, multiply this proportion by N: N * 2 * (1 - fn_normal_cdf(ABS(Z)))
+*/
+DECLARE
+ t DOUBLE PRECISION;
+ cdf DOUBLE PRECISION;
+BEGIN
+ t := 1.0 / (1.0 + 0.2316419 * ABS(z_score));
+
+ cdf := (1.0 / SQRT(2 * PI())) * EXP(-0.5 * z_score * z_score) *
+ (0.319381530 * t
+ - 0.356563782 * t * t
+ + 1.781477937 * t * t * t
+ - 1.821255978 * t * t * t * t
+ + 1.330274429 * t * t * t * t * t);
+
+ IF z_score >= 0 THEN
+ RETURN 1.0 - cdf;
+ ELSE
+ RETURN cdf;
+ END IF;
+END;
+$$ LANGUAGE plpgsql;
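+
+-- Illustrative usage sketch (not part of the original migration): following the
+-- comment above, the two-tailed count of observations beyond |Z| = 2.0 out of
+-- N = 10,000 rows can be estimated as
+--   SELECT 10000 * 2 * (1 - fn_normal_cdf(ABS(2.0)));   -- roughly 455 rows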
+
+
+CREATE OR REPLACE FUNCTION fn_eval(expression TEXT) RETURNS FLOAT
+AS
+$$
+DECLARE
+ result FLOAT;
+ invalid_parts TEXT;
+BEGIN
+ -- Check the modified expression for invalid characters, allowing colons
+ IF expression ~* E'[^0-9+\\-*/(),.\\sA-Z_:e\\\'"]' THEN
+ RAISE EXCEPTION 'Invalid characters detected in expression: %', expression;
+ END IF;
+
+ -- Check for dangerous PostgreSQL-specific keywords
+ IF expression ~* E'\\y(DROP|ALTER|INSERT|UPDATE|DELETE|TRUNCATE|GRANT|REVOKE|COPY|EXECUTE|CREATE|COMMENT|SECURITY|WITH|SET ROLE|SET SESSION|DO|CALL|--|/\\*|;|pg_read_file|pg_write_file|pg_terminate_backend)\\y' THEN
+ RAISE EXCEPTION 'Invalid expression: dangerous statement detected';
+ END IF;
+
+ -- Remove all allowed tokens from the validation expression, treating 'FLOAT' as a keyword
+ invalid_parts := regexp_replace(
+ expression,
+ E'(\\mGREATEST|LEAST|ABS|FN_NORMAL_CDF|DATEDIFF|DAY|FLOAT)\\M|[0-9]+(\\.[0-9]+)?([eE][+-]?[0-9]+)?|[+\\-*/(),\\\'":]+|\\s+',
+ '',
+ 'gi'
+ );
+
+ -- If anything is left in the validation expression, it's invalid
+ IF invalid_parts <> '' THEN
+ RAISE EXCEPTION 'Invalid tokens "%" in expression: %', invalid_parts, expression;
+ END IF;
+
+ -- Use the original expression (with ::FLOAT) for execution
+ EXECUTE format('SELECT (%s)::FLOAT', expression) INTO result;
+
+ RETURN result;
+END;
+$$
+LANGUAGE plpgsql;
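+
+-- Illustrative usage sketch (not part of the original migration): fn_eval is intended
+-- for arithmetic prevalence expressions built from templates such as
+-- '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT'. For example:
+--   SELECT fn_eval('(12 - 10)::FLOAT / 1000::FLOAT');   -- returns 0.002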
diff --git a/testgen/template/execution/ex_finalize_test_run_results.sql b/testgen/template/execution/ex_finalize_test_run_results.sql
index e4d1d6e..c9f187c 100644
--- a/testgen/template/execution/ex_finalize_test_run_results.sql
+++ b/testgen/template/execution/ex_finalize_test_run_results.sql
@@ -3,6 +3,7 @@ UPDATE test_results
severity = COALESCE(d.severity, s.severity, tt.default_severity),
threshold_value = COALESCE(r.threshold_value, d.threshold_value),
result_status = CASE
+ WHEN r.result_status = 'Error' THEN 'Error'
WHEN r.result_code = 1 THEN 'Passed'
WHEN r.result_code = 0
AND COALESCE(d.severity, s.severity, tt.default_severity) = 'Warning' THEN 'Warning'
@@ -31,3 +32,68 @@ INNER JOIN test_definitions d ON r.test_definition_id = d.id
INNER JOIN test_types tt ON r.test_type = tt.test_type
WHERE r.test_run_id = '{TEST_RUN_ID}'
AND test_results.id = r.id;
+
+-- ==============================================================================
+-- | Data Quality Scoring
+-- | - Prevalence % * dq_score_risk_factor = calculated prevalence %
+-- | - Save with total datapoints (record count).
+-- | - When scoring, calculate SUM(calculated prevalence * record count)
+-- | / SUM(record count)
+-- ==============================================================================
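+-- | Worked example (illustrative only): two columns with 1,000 and 9,000 records and
+-- | calculated prevalence of 0.02 and 0 respectively give
+-- | (0.02 * 1000 + 0 * 9000) / (1000 + 9000) = 20 / 10000 = 0.002 as the affected share.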
+
+-- UPDATE prevalence to zero for all passed or excluded tests
+UPDATE test_results
+ SET dq_record_ct = tc.record_ct,
+ dq_prevalence = 0
+ FROM test_results r
+INNER JOIN data_table_chars tc
+ ON (r.table_groups_id = tc.table_groups_id
+ AND r.table_name ILIKE tc.table_name)
+ WHERE r.test_run_id = '{TEST_RUN_ID}'::UUID
+ AND ( r.result_code = 1
+ OR r.disposition IN ('Dismissed', 'Inactive') )
+ AND test_results.id = r.id;
+
+-- UPDATE TO calculated prevalence for all fails/warnings - result_code = 0
+WITH result_calc
+ AS ( SELECT r.id,
+ tt.dq_score_risk_factor::FLOAT as risk_calc,
+ REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE(
+ REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE(
+ tt.dq_score_prevalence_formula,
+ '{RESULT_MEASURE}', COALESCE(r.result_measure::VARCHAR, '')),
+ '{THRESHOLD_VALUE}', COALESCE(r.threshold_value::VARCHAR, '')),
+
+ '{PRO_RECORD_CT}', COALESCE(p.record_ct::VARCHAR, '')),
+ '{DATE_DAYS_PRESENT}', COALESCE(p.date_days_present::VARCHAR, '')),
+ '{DATE_MONTHS_PRESENT}', COALESCE(p.date_months_present::VARCHAR, '')),
+ '{DATE_WEEKS_PRESENT}', COALESCE(p.date_weeks_present::VARCHAR, '')),
+ '{MIN_DATE}', COALESCE(p.min_date::VARCHAR, '')),
+ '{MAX_DATE}', COALESCE(p.max_date::VARCHAR, '')),
+ '{DISTINCT_VALUE_CT}', COALESCE(p.distinct_value_ct::VARCHAR, '')),
+ '{VALUE_CT}', COALESCE(p.value_ct::VARCHAR, '')),
+ '{MAX_LENGTH}', COALESCE(p.max_length::VARCHAR, '')),
+ '{AVG_LENGTH}', COALESCE(p.avg_length::VARCHAR, '')),
+
+ '{RECORD_CT}', COALESCE(r.dq_record_ct::VARCHAR, tc.record_ct::VARCHAR, ''))
+ as built_score_prevalence_formula,
+ COALESCE(r.dq_record_ct, tc.record_ct) as dq_record_ct
+ FROM test_results r
+ INNER JOIN test_types tt
+ ON r.test_type = tt.test_type
+ LEFT JOIN v_latest_profile_results p
+ ON (r.table_groups_id = p.table_groups_id
+ AND r.table_name = p.table_name
+ AND r.column_names = p.column_name)
+ LEFT JOIN data_table_chars tc
+ ON (r.table_groups_id = tc.table_groups_id
+ AND r.table_name ILIKE tc.table_name)
+ WHERE r.test_run_id = '{TEST_RUN_ID}'::UUID
+ AND result_code = 0
+ AND NOT COALESCE(disposition, '') IN ('Dismissed', 'Inactive') )
+UPDATE test_results
+ SET dq_record_ct = c.dq_record_ct,
+ dq_prevalence = risk_calc * fn_eval(c.built_score_prevalence_formula)
+ FROM result_calc c
+ WHERE test_results.id = c.id;
+
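+-- Illustrative note (not part of the original change): for a CUSTOM test with
+-- result_measure = 12, threshold_value = 0 and 1,000 records, the seeded formula
+-- '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT' is rewritten by
+-- the REPLACE chain above to '(12-0)::FLOAT/1000::FLOAT', which fn_eval resolves to
+-- 0.012 before the dq_score_risk_factor multiplier is applied.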
diff --git a/testgen/template/execution/ex_update_test_suite.sql b/testgen/template/execution/ex_update_test_suite.sql
new file mode 100644
index 0000000..68283f1
--- /dev/null
+++ b/testgen/template/execution/ex_update_test_suite.sql
@@ -0,0 +1,13 @@
+WITH last_run
+ AS (SELECT test_suite_id, MAX(test_starttime) as max_starttime
+ FROM test_runs
+ WHERE test_suite_id = '{TEST_SUITE_ID}'
+ AND status = 'Complete'
+ GROUP BY test_suite_id)
+UPDATE test_suites
+ SET last_complete_test_run_id = r.id
+ FROM test_runs r
+INNER JOIN last_run l
+ ON (r.test_suite_id = l.test_suite_id
+ AND r.test_starttime = l.max_starttime)
+ WHERE test_suites.id = r.test_suite_id;
\ No newline at end of file
diff --git a/testgen/template/execution/test_scoring_rollup.sql b/testgen/template/execution/test_scoring_rollup.sql
new file mode 100644
index 0000000..30c2798
--- /dev/null
+++ b/testgen/template/execution/test_scoring_rollup.sql
@@ -0,0 +1,123 @@
+-- Roll up scoring to test run
+WITH score_detail
+ AS (SELECT tr.test_run_id, tr.table_name, tr.column_names,
+ MAX(tr.dq_record_ct) as row_ct,
+ SUM(COALESCE(tr.dq_prevalence * tr.dq_record_ct, 0)) as affected_data_points
+ FROM test_results tr
+ INNER JOIN test_runs r
+ ON tr.test_run_id = r.id
+ WHERE tr.test_run_id = '{TEST_RUN_ID}'
+ AND COALESCE(tr.disposition, 'Confirmed') = 'Confirmed'
+ GROUP BY tr.test_run_id, tr.table_name, tr.column_names ),
+score_calc
+ AS ( SELECT test_run_id,
+ SUM(affected_data_points) as sum_affected_data_points,
+ SUM(row_ct) as sum_data_points
+ FROM score_detail
+ GROUP BY test_run_id )
+UPDATE test_runs
+ SET dq_affected_data_points = sum_affected_data_points,
+ dq_total_data_points = sum_data_points,
+ dq_score_test_run = 100.0 - sum_affected_data_points / sum_data_points
+ FROM score_calc
+ WHERE test_runs.id = score_calc.test_run_id;
+
+
+
+-- Roll up scores from latest Test Runs per Test Suite to Table Group
+WITH last_test_date
+ AS (SELECT r.test_suite_id, MAX(r.test_starttime) as last_test_run_date
+ FROM test_runs r
+ WHERE r.status = 'Complete'
+ GROUP BY r.test_suite_id),
+score_calc
+ AS (SELECT ts.table_groups_id,
+ SUM(run.dq_affected_data_points) as sum_affected_data_points,
+ SUM(run.dq_total_data_points) as sum_data_points
+ FROM test_runs run
+ INNER JOIN test_suites ts
+ ON (run.test_suite_id = ts.id)
+ INNER JOIN last_test_date lp
+ ON (run.test_suite_id = lp.test_suite_id
+ AND run.test_starttime = lp.last_test_run_date)
+ WHERE ts.table_groups_id = '{TABLE_GROUPS_ID}'
+ AND ts.dq_score_exclude = FALSE
+ GROUP BY ts.table_groups_id)
+UPDATE table_groups
+ SET dq_score_testing = 100.0 - s.sum_affected_data_points::FLOAT / s.sum_data_points::FLOAT
+ FROM score_calc s
+ WHERE table_groups.id = s.table_groups_id;
+
+-- Roll up latest scores to data_column_chars
+WITH last_test_date
+ AS (SELECT r.test_suite_id, MAX(r.test_starttime) as last_test_run_date
+ FROM test_runs r
+ WHERE r.status = 'Complete'
+ GROUP BY r.test_suite_id),
+score_calc
+ AS (SELECT dcc.column_id,
+ -- Use AVG instead of MAX because record counts for a column may differ by test_run
+ AVG(tr.dq_record_ct) as row_ct,
+ -- Use SUM to combine impact of all fails per column
+ SUM(COALESCE(tr.dq_prevalence * tr.dq_record_ct, 0)) as affected_data_points
+ FROM test_results tr
+ INNER JOIN test_runs r
+ ON tr.test_run_id = r.id
+ INNER JOIN last_test_date lp
+ ON (r.test_suite_id = lp.test_suite_id
+ AND r.test_starttime = lp.last_test_run_date)
+ INNER JOIN test_suites ts
+ ON (r.test_suite_id = ts.id)
+ INNER JOIN data_column_chars dcc
+ ON (ts.table_groups_id = dcc.table_groups_id
+ AND tr.table_name = dcc.table_name
+ AND tr.column_names = dcc.column_name)
+ WHERE ts.table_groups_id = '{TABLE_GROUPS_ID}'
+ AND ts.dq_score_exclude = FALSE
+ AND COALESCE(tr.disposition, 'Confirmed') = 'Confirmed'
+ GROUP BY dcc.column_id )
+UPDATE data_column_chars
+ SET dq_score_testing = 100.0 - affected_data_points / row_ct
+ FROM score_calc s
+ WHERE data_column_chars.column_id = s.column_id;
+
+
+
+-- Roll up latest scores to data_table_chars
+WITH last_test_date
+ AS (SELECT r.test_suite_id, MAX(r.test_starttime) as last_test_run_date
+ FROM test_runs r
+ WHERE r.status = 'Complete'
+ GROUP BY r.test_suite_id),
+score_detail
+ AS (SELECT dcc.table_id, dcc.column_id,
+ -- Use AVG instead of MAX because record counts for a column may differ by test_run
+ AVG(tr.dq_record_ct) as row_ct,
+ -- Use SUM to combine impact of all fails per column
+ SUM(COALESCE(tr.dq_prevalence * tr.dq_record_ct, 0)) as affected_data_points
+ FROM test_results tr
+ INNER JOIN test_runs r
+ ON tr.test_run_id = r.id
+ INNER JOIN last_test_date lp
+ ON (r.test_suite_id = lp.test_suite_id
+ AND r.test_starttime = lp.last_test_run_date)
+ INNER JOIN test_suites ts
+ ON (r.test_suite_id = ts.id)
+ INNER JOIN data_column_chars dcc
+ ON (ts.table_groups_id = dcc.table_groups_id
+ AND tr.table_name = dcc.table_name
+ AND tr.column_names = dcc.column_name)
+ WHERE ts.table_groups_id = '{TABLE_GROUPS_ID}'
+ AND ts.dq_score_exclude = FALSE
+ AND COALESCE(tr.disposition, 'Confirmed') = 'Confirmed'
+ GROUP BY table_id, dcc.column_id ),
+score_calc
+ AS (SELECT table_id,
+ SUM(affected_data_points) as sum_affected_data_points,
+ SUM(row_ct) as sum_data_points
+ FROM score_detail
+ GROUP BY table_id)
+UPDATE data_table_chars
+ SET dq_score_testing = 100.0 - sum_affected_data_points / sum_data_points
+ FROM score_calc s
+ WHERE data_table_chars.table_id = s.table_id;
\ No newline at end of file
diff --git a/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml b/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml
index 5c5e433..8ca20a1 100644
--- a/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml
+++ b/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml
@@ -57,6 +57,20 @@ strTemplate05_A: COUNT(DISTINCT UPPER(REPLACE(TRANSLATE("{COL_NAME}",' '''',.-',
END ) AS filled_value_ct,
LEFT(MIN(NULLIF("{COL_NAME}", '') COLLATE Latin1_General_BIN ), 100) AS min_text,
LEFT(MAX(NULLIF("{COL_NAME}", '') COLLATE Latin1_General_BIN ), 100) AS max_text,
+ SUM(CASE
+ WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', ' ') = "{COL_NAME}" THEN 0
+ WHEN TRANSLATE("{COL_NAME}", 'abcdefghijklmnopqrstuvwxyz', ' ') = "{COL_NAME}" THEN 1
+ ELSE 0
+ END) AS upper_case_ct,
+ SUM(CASE
+ WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', ' ') = "{COL_NAME}" THEN 0
+ WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', ' ') = "{COL_NAME}" THEN 1
+ ELSE 0
+ END) AS lower_case_ct,
+ SUM(CASE
+ WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', ' ') = "{COL_NAME}" THEN 1
+ ELSE 0
+ END) AS non_alpha_ct,
SUM({{DKFN_IS_NUM;;LEFT("{COL_NAME}", 31)}} ) AS numeric_ct,
SUM({{DKFN_IS_DATE;;LEFT("{COL_NAME}", 26)}} ) AS date_ct,
CASE
@@ -107,6 +121,9 @@ strTemplate05_else: NULL as distinct_std_value_ct,
NULL as filled_value_ct,
NULL as min_text,
NULL as max_text,
+ NULL as upper_case_ct,
+ NULL as lower_case_ct,
+ NULL as non_alpha_ct,
NULL as numeric_ct,
NULL as date_ct,
NULL as std_pattern_match,
@@ -175,6 +192,10 @@ strTemplate11_D: CASE
WHEN DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}') > 240 THEN 1
ELSE 0
END) AS before_20yr_date_ct,
+ SUM(CASE
+ WHEN DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}') > 1200 THEN 1
+ ELSE 0
+ END) AS before_100yr_date_ct,
SUM(CASE
WHEN DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 365 THEN 1
ELSE 0
@@ -186,6 +207,10 @@ strTemplate11_D: CASE
SUM(CASE
WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 ELSE 0
END) AS future_date_ct,
+ SUM(CASE
+ WHEN DATEDIFF(month, '{RUN_DATE}', "{COL_NAME}") > 240 THEN 1
+ ELSE 0
+ END) AS distant_future_date_ct,
COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}' ) ) as date_days_present,
COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}", '{RUN_DATE}' ) ) as date_weeks_present,
COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}' ) ) as date_months_present,
@@ -195,9 +220,11 @@ strTemplate11_else: NULL as min_date,
NULL as before_1yr_date_ct,
NULL as before_5yr_date_ct,
NULL as before_20yr_date_ct,
+ NULL AS before_100yr_date_ct,
NULL as within_1yr_date_ct,
NULL as within_1mo_date_ct,
NULL as future_date_ct,
+ NULL as distant_future_date_ct,
NULL as date_days_present,
NULL as date_weeks_present,
NULL as date_months_present,
diff --git a/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml b/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml
index e32c609..746c25f 100644
--- a/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml
+++ b/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml
@@ -51,6 +51,20 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a
END ) AS filled_value_ct,
LEFT(MIN(NULLIF("{COL_NAME}", '')), 100) AS min_text,
LEFT(MAX(NULLIF("{COL_NAME}", '')), 100) AS max_text,
+ SUM(CASE
+ WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 0
+ WHEN TRANSLATE("{COL_NAME}", 'abcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 1
+ ELSE 0
+ END) AS upper_case_ct,
+ SUM(CASE
+ WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 0
+ WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', '') = "{COL_NAME}" THEN 1
+ ELSE 0
+ END) AS lower_case_ct,
+ SUM(CASE
+ WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 1
+ ELSE 0
+ END) AS non_alpha_ct,
SUM({{DKFN_IS_NUM;;LEFT("{COL_NAME}", 31)}} ) AS numeric_ct,
SUM({{DKFN_IS_DATE;;LEFT("{COL_NAME}", 26)}} ) AS date_ct,
CASE
@@ -84,6 +98,9 @@ strTemplate05_else: NULL as distinct_std_value_ct,
NULL as filled_value_ct,
NULL as min_text,
NULL as max_text,
+ NULL as upper_case_ct,
+ NULL as lower_case_ct,
+ NULL as non_alpha_ct,
NULL as numeric_ct,
NULL as date_ct,
NULL as std_pattern_match,
@@ -153,6 +170,10 @@ strTemplate11_D: CASE
WHEN {{DKFN_DATEDIFF_MONTH;;"{COL_NAME}";;'{RUN_DATE}'}} > 240 THEN 1
ELSE 0
END) AS before_20yr_date_ct,
+ SUM(CASE
+ WHEN {{DKFN_DATEDIFF_MONTH;;"{COL_NAME}";;'{RUN_DATE}'}} > 1200 THEN 1
+ ELSE 0
+ END) AS before_100yr_date_ct,
SUM(CASE
WHEN {{DKFN_DATEDIFF_DAY;;"{COL_NAME}";;'{RUN_DATE}'}} BETWEEN 0 AND 365 THEN 1
ELSE 0
@@ -164,6 +185,10 @@ strTemplate11_D: CASE
SUM(CASE
WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 ELSE 0
END) AS future_date_ct,
+ SUM(CASE
+ WHEN {{DKFN_DATEDIFF_MONTH;;'{RUN_DATE}';;"{COL_NAME}"}} > 240 THEN 1
+ ELSE 0
+ END) AS distant_future_date_ct,
COUNT(DISTINCT {{DKFN_DATEDIFF_DAY;;"{COL_NAME}";;'{RUN_DATE}'}} ) as date_days_present,
COUNT(DISTINCT {{DKFN_DATEDIFF_WEEK;;"{COL_NAME}";;'{RUN_DATE}'}} ) as date_weeks_present,
COUNT(DISTINCT {{DKFN_DATEDIFF_MONTH;;"{COL_NAME}";;'{RUN_DATE}'}} ) as date_months_present,
@@ -174,9 +199,11 @@ strTemplate11_else: NULL as min_date,
NULL as before_1yr_date_ct,
NULL as before_5yr_date_ct,
NULL as before_20yr_date_ct,
+ NULL AS before_100yr_date_ct,
NULL as within_1yr_date_ct,
NULL as within_1mo_date_ct,
NULL as future_date_ct,
+ NULL as distant_future_date_ct,
NULL as date_days_present,
NULL as date_weeks_present,
NULL as date_months_present,
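
The upper_case_ct, lower_case_ct and non_alpha_ct counters added to each profiling flavor rely on the same TRANSLATE trick: strip one alphabet out of the value and check whether anything changed. A minimal standalone sketch of that classification logic, written for PostgreSQL and purely illustrative (it is not part of the templates; mixed-case values are derived separately via the generated mixed_case_ct column):

    SELECT v,
           CASE
               WHEN TRANSLATE(v, 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = v THEN 'non_alpha'
               WHEN TRANSLATE(v, 'abcdefghijklmnopqrstuvwxyz', '') = v THEN 'upper_case'
               WHEN TRANSLATE(v, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', '') = v THEN 'lower_case'
               ELSE 'mixed_case'
           END AS case_class
      FROM (VALUES ('ACME'), ('acme'), ('Acme'), ('12345')) AS t(v);
    -- '12345' classifies as non_alpha, 'ACME' as upper_case, 'acme' as lower_case, 'Acme' as mixed_case
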
diff --git a/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml b/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml
index b876a4d..e54bdf4 100644
--- a/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml
+++ b/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml
@@ -51,6 +51,20 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a
END ) AS filled_value_ct,
LEFT(MIN(NULLIF("{COL_NAME}", '')), 100) AS min_text,
LEFT(MAX(NULLIF("{COL_NAME}", '')), 100) AS max_text,
+ SUM(CASE
+ WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 0
+ WHEN TRANSLATE("{COL_NAME}", 'abcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 1
+ ELSE 0
+ END) AS upper_case_ct,
+ SUM(CASE
+ WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 0
+ WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', '') = "{COL_NAME}" THEN 1
+ ELSE 0
+ END) AS lower_case_ct,
+ SUM(CASE
+ WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 1
+ ELSE 0
+ END) AS non_alpha_ct,
SUM({{DKFN_IS_NUM;;LEFT("{COL_NAME}", 31)}} ) AS numeric_ct,
SUM({{DKFN_IS_DATE;;LEFT("{COL_NAME}", 26)}} ) AS date_ct,
CASE
@@ -84,6 +98,9 @@ strTemplate05_else: NULL as distinct_std_value_ct,
NULL as filled_value_ct,
NULL as min_text,
NULL as max_text,
+ NULL as upper_case_ct,
+ NULL as lower_case_ct,
+ NULL as non_alpha_ct,
NULL as numeric_ct,
NULL as date_ct,
NULL as std_pattern_match,
@@ -150,6 +167,10 @@ strTemplate11_D: CASE
WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 240 THEN 1
ELSE 0
END) AS before_20yr_date_ct,
+ SUM(CASE
+ WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 1200 THEN 1
+ ELSE 0
+ END) AS before_100yr_date_ct,
SUM(CASE
WHEN DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 365 THEN 1
ELSE 0
@@ -161,6 +182,10 @@ strTemplate11_D: CASE
SUM(CASE
WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 ELSE 0
END) AS future_date_ct,
+ SUM(CASE
+ WHEN DATEDIFF('MON', '{RUN_DATE}', "{COL_NAME}") > 240 THEN 1
+ ELSE 0
+ END) AS distant_future_date_ct,
COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}' ) ) as date_days_present,
COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}", '{RUN_DATE}' ) ) as date_weeks_present,
COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}' ) ) as date_months_present,
@@ -170,9 +195,11 @@ strTemplate11_else: NULL as min_date,
NULL as before_1yr_date_ct,
NULL as before_5yr_date_ct,
NULL as before_20yr_date_ct,
+ NULL AS before_100yr_date_ct,
NULL as within_1yr_date_ct,
NULL as within_1mo_date_ct,
NULL as future_date_ct,
+ NULL as distant_future_date_ct,
NULL as date_days_present,
NULL as date_weeks_present,
NULL as date_months_present,
diff --git a/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml b/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml
index 4538d10..f0a784f 100644
--- a/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml
+++ b/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml
@@ -52,6 +52,20 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a
END ) AS filled_value_ct,
LEFT(MIN(NULLIF("{COL_NAME}", '')), 100) AS min_text,
LEFT(MAX(NULLIF("{COL_NAME}", '')), 100) AS max_text,
+ SUM(CASE
+ WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 0
+ WHEN TRANSLATE("{COL_NAME}", 'abcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 1
+ ELSE 0
+ END) AS upper_case_ct,
+ SUM(CASE
+ WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 0
+ WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', '') = "{COL_NAME}" THEN 1
+ ELSE 0
+ END) AS lower_case_ct,
+ SUM(CASE
+ WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 1
+ ELSE 0
+ END) AS non_alpha_ct,
SUM({{DKFN_IS_NUM;;LEFT("{COL_NAME}", 31)}} ) AS numeric_ct,
SUM({{DKFN_IS_DATE;;LEFT("{COL_NAME}", 26)}} ) AS date_ct,
CASE
@@ -85,6 +99,9 @@ strTemplate05_else: NULL as distinct_std_value_ct,
NULL as filled_value_ct,
NULL as min_text,
NULL as max_text,
+ NULL as upper_case_ct,
+ NULL as lower_case_ct,
+ NULL as non_alpha_ct,
NULL as numeric_ct,
NULL as date_ct,
NULL as std_pattern_match,
@@ -149,6 +166,10 @@ strTemplate11_D: GREATEST(MIN("{COL_NAME}"), '0001-01-01') as min_date,
WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 240 THEN 1
ELSE 0
END) AS before_20yr_date_ct,
+ SUM(CASE
+ WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 1200 THEN 1
+ ELSE 0
+ END) AS before_100yr_date_ct,
SUM(CASE
WHEN DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 365 THEN 1
ELSE 0
@@ -160,6 +181,10 @@ strTemplate11_D: GREATEST(MIN("{COL_NAME}"), '0001-01-01') as min_date,
SUM(CASE
WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 ELSE 0
END) AS future_date_ct,
+ SUM(CASE
+ WHEN DATEDIFF('MON', '{RUN_DATE}', "{COL_NAME}") > 240 THEN 1
+ ELSE 0
+ END) AS distant_future_date_ct,
COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}' ) ) as date_days_present,
COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}", '{RUN_DATE}' ) ) as date_weeks_present,
COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}' ) ) as date_months_present,
@@ -169,9 +194,11 @@ strTemplate11_else: NULL as min_date,
NULL as before_1yr_date_ct,
NULL as before_5yr_date_ct,
NULL as before_20yr_date_ct,
+ NULL AS before_100yr_date_ct,
NULL as within_1yr_date_ct,
NULL as within_1mo_date_ct,
NULL as future_date_ct,
+ NULL as distant_future_date_ct,
NULL as date_days_present,
NULL as date_weeks_present,
NULL as date_months_present,
diff --git a/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml b/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml
index 0968a2d..87b216f 100644
--- a/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml
+++ b/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml
@@ -51,6 +51,20 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a
END ) AS filled_value_ct,
SUBSTRING(MIN(NULLIF("{COL_NAME}", '')), 1, 100) AS min_text,
SUBSTRING(MAX(NULLIF("{COL_NAME}", '')), 1, 100) AS max_text,
+ SUM(CASE
+ WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 0
+ WHEN TRANSLATE("{COL_NAME}", 'abcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 1
+ ELSE 0
+ END) AS upper_case_ct,
+ SUM(CASE
+ WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 0
+ WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', '') = "{COL_NAME}" THEN 1
+ ELSE 0
+ END) AS lower_case_ct,
+ SUM(CASE
+ WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 1
+ ELSE 0
+ END) AS non_alpha_ct,
SUM(fndk_isnum(SUBSTRING("{COL_NAME}", 1, 31))) AS numeric_ct,
SUM(fndk_isdate(SUBSTRING("{COL_NAME}", 1, 26))) AS date_ct,
CASE
@@ -84,6 +98,9 @@ strTemplate05_else: NULL as distinct_std_value_ct,
NULL as filled_value_ct,
NULL as min_text,
NULL as max_text,
+ NULL as upper_case_ct,
+ NULL as lower_case_ct,
+ NULL as non_alpha_ct,
NULL as numeric_ct,
NULL as date_ct,
NULL as std_pattern_match,
@@ -148,6 +165,10 @@ strTemplate11_D: CASE
WHEN DATE_DIFF('MONTH', TIMESTAMP "{COL_NAME}", TIMESTAMP '{RUN_DATE}') > 240 THEN 1
ELSE 0
END) AS before_20yr_date_ct,
+ SUM(CASE
+ WHEN DATE_DIFF('MONTH', TIMESTAMP "{COL_NAME}", TIMESTAMP '{RUN_DATE}') > 1200 THEN 1
+ ELSE 0
+ END) AS before_100yr_date_ct,
SUM(CASE
WHEN DATE_DIFF('DAY', TIMESTAMP "{COL_NAME}", TIMESTAMP '{RUN_DATE}') BETWEEN 0 AND 365 THEN 1
ELSE 0
@@ -159,6 +180,10 @@ strTemplate11_D: CASE
SUM(CASE
WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 ELSE 0
END) AS future_date_ct,
+ SUM(CASE
+ WHEN DATE_DIFF('MONTH', TIMESTAMP '{RUN_DATE}', TIMESTAMP "{COL_NAME}") > 240 THEN 1
+ ELSE 0
+ END) AS distant_future_date_ct,
COUNT(DISTINCT DATE_DIFF('day', TIMESTAMP "{COL_NAME}", TIMESTAMP '{RUN_DATE}' ) ) as date_days_present,
COUNT(DISTINCT DATE_DIFF('week', TIMESTAMP "{COL_NAME}", TIMESTAMP '{RUN_DATE}' ) ) as date_weeks_present,
COUNT(DISTINCT DATE_DIFF('month', TIMESTAMP "{COL_NAME}", TIMESTAMP '{RUN_DATE}' ) ) as date_months_present,
@@ -168,9 +193,11 @@ strTemplate11_else: NULL as min_date,
NULL as before_1yr_date_ct,
NULL as before_5yr_date_ct,
NULL as before_20yr_date_ct,
+ NULL AS before_100yr_date_ct,
NULL as within_1yr_date_ct,
NULL as within_1mo_date_ct,
NULL as future_date_ct,
+ NULL as distant_future_date_ct,
NULL as date_days_present,
NULL as date_weeks_present,
NULL as date_months_present,
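For context on the three new counters added across these profiling templates: they all rely on the same TRANSLATE trick, stripping a set of letters and comparing the result back to the original value to decide which letters it contains. A minimal illustration against literal values (Postgres-style SQL, not part of any template):

-- If stripping every letter leaves the value unchanged, it contains no letters (non-alpha).
-- If it does contain letters but stripping lower-case letters leaves it unchanged, it is all upper-case; vice versa for lower-case.
SELECT val,
       CASE WHEN TRANSLATE(val, 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = val THEN 0
            WHEN TRANSLATE(val, 'abcdefghijklmnopqrstuvwxyz', '') = val THEN 1
            ELSE 0 END AS upper_case_flag,
       CASE WHEN TRANSLATE(val, 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = val THEN 0
            WHEN TRANSLATE(val, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', '') = val THEN 1
            ELSE 0 END AS lower_case_flag,
       CASE WHEN TRANSLATE(val, 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = val THEN 1
            ELSE 0 END AS non_alpha_flag
  FROM (VALUES ('ACME'), ('acme'), ('Acme'), ('12345')) AS t(val);
-- 'ACME'  -> 1, 0, 0    'acme'  -> 0, 1, 0
-- 'Acme'  -> 0, 0, 0 (mixed case falls in neither bucket)
-- '12345' -> 0, 0, 1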
diff --git a/testgen/template/parms/parms_test_execution.sql b/testgen/template/parms/parms_test_execution.sql
index 204b49c..d39b644 100644
--- a/testgen/template/parms/parms_test_execution.sql
+++ b/testgen/template/parms/parms_test_execution.sql
@@ -1,6 +1,7 @@
SELECT ts.project_code,
ts.connection_id::VARCHAR,
ts.id::VARCHAR as test_suite_id,
+ ts.table_groups_id::VARCHAR,
tg.table_group_schema,
cc.sql_flavor,
cc.project_host,
diff --git a/testgen/template/profiling/functional_datatype.sql b/testgen/template/profiling/functional_datatype.sql
index a74cfb4..b7822a5 100644
--- a/testgen/template/profiling/functional_datatype.sql
+++ b/testgen/template/profiling/functional_datatype.sql
@@ -232,6 +232,7 @@ WHERE profile_run_id = '{PROFILE_RUN_ID}'
UPDATE profile_results
SET functional_data_type =
CASE WHEN (std_pattern_match = 'ZIP_USA' AND (column_name ILIKE '%zip%' OR column_name ILIKE '%postal%'))
+ OR (lower(column_name) IN ('zip_code', 'zip'))
THEN 'Zip'
WHEN std_pattern_match = 'EMAIL'
THEN 'Email'
diff --git a/testgen/template/profiling/profile_anomaly_scoring.sql b/testgen/template/profiling/profile_anomaly_scoring.sql
new file mode 100644
index 0000000..9511c12
--- /dev/null
+++ b/testgen/template/profiling/profile_anomaly_scoring.sql
@@ -0,0 +1,10 @@
+UPDATE profile_anomaly_results r
+ SET dq_prevalence = ({PREV_FORMULA}) * {RISK}
+ FROM profile_anomaly_results r2
+INNER JOIN profile_results p
+ ON (r2.profile_run_id = p.profile_run_id
+ AND r2.table_name = p.table_name
+ AND r2.column_name = p.column_name)
+ WHERE r.profile_run_id = '{PROFILE_RUN_ID}'::UUID
+ AND r2.anomaly_id = '{ANOMALY_ID}'
+ AND r.id = r2.id;
\ No newline at end of file
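The {PREV_FORMULA} and {RISK} placeholders in this new template are presumably filled per anomaly type from the dq_score_prevalence_formula and dq_score_risk_factor columns added to profile_anomaly_types later in this series. A hypothetical rendering, purely for illustration (the prevalence formula and risk factor below are invented, not the project's actual values):

-- Illustrative rendering only: PREV_FORMULA replaced by an assumed expression
-- over profile_results columns, RISK replaced by an assumed factor of 0.5.
UPDATE profile_anomaly_results r
   SET dq_prevalence = ((p.record_ct - p.distinct_value_ct)::FLOAT / NULLIF(p.record_ct, 0)) * 0.5
  FROM profile_anomaly_results r2
 INNER JOIN profile_results p
    ON (r2.profile_run_id = p.profile_run_id
   AND  r2.table_name = p.table_name
   AND  r2.column_name = p.column_name)
 WHERE r.profile_run_id = '{PROFILE_RUN_ID}'::UUID
   AND r2.anomaly_id = '{ANOMALY_ID}'
   AND r.id = r2.id;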
diff --git a/testgen/template/profiling/profile_anomaly_scoring_rollup.sql b/testgen/template/profiling/profile_anomaly_scoring_rollup.sql
new file mode 100644
index 0000000..9c7047b
--- /dev/null
+++ b/testgen/template/profiling/profile_anomaly_scoring_rollup.sql
@@ -0,0 +1,109 @@
+-- Roll up scoring to profiling run
+WITH score_detail
+ AS (SELECT pr.profile_run_id, pr.table_name, pr.column_name,
+ MAX(pr.record_ct) as row_ct,
+ SUM(COALESCE(p.dq_prevalence * pr.record_ct, 0)) as affected_data_points
+ FROM profile_results pr
+ INNER JOIN profiling_runs r
+ ON (pr.profile_run_id = r.id)
+ LEFT JOIN profile_anomaly_results p
+ ON (pr.profile_run_id = p.profile_run_id
+ AND pr.column_name = p.column_name
+ AND pr.table_name = p.table_name)
+ WHERE pr.profile_run_id = '{PROFILE_RUN_ID}'
+ AND COALESCE(p.disposition, 'Confirmed') = 'Confirmed'
+ GROUP BY 1, 2, 3 ),
+score_calc
+ AS ( SELECT profile_run_id,
+ SUM(affected_data_points) as sum_affected_data_points,
+ SUM(row_ct) as sum_data_points
+ FROM score_detail
+ GROUP BY profile_run_id )
+UPDATE profiling_runs
+ SET dq_affected_data_points = sum_affected_data_points,
+ dq_total_data_points = sum_data_points,
+ dq_score_profiling = 100.0 - sum_affected_data_points / sum_data_points
+ FROM score_calc
+ WHERE profiling_runs.id = score_calc.profile_run_id;
+
+
+-- Roll up latest scores to Table Group
+WITH last_profile_date
+ AS (SELECT table_groups_id, MAX(profiling_starttime) as last_profile_run_date
+ FROM profiling_runs
+ WHERE status = 'Complete'
+ GROUP BY table_groups_id),
+score_calc
+ AS (SELECT run.table_groups_id, run.id as profile_run_id,
+ run.dq_affected_data_points as sum_affected_data_points,
+ run.dq_total_data_points as sum_data_points
+ FROM profiling_runs run
+ INNER JOIN last_profile_date lp
+ ON (run.table_groups_id = lp.table_groups_id
+ AND run.profiling_starttime = lp.last_profile_run_date)
+ WHERE run.table_groups_id = '{TABLE_GROUPS_ID}' )
+UPDATE table_groups
+ SET dq_score_profiling = 100.0 - s.sum_affected_data_points::FLOAT / s.sum_data_points::FLOAT,
+ last_complete_profile_run_id = s.profile_run_id
+ FROM score_calc s
+ WHERE table_groups.id = s.table_groups_id;
+
+-- Roll up latest scores to data_column_chars
+WITH score_detail
+ AS (SELECT dcc.column_id, tg.last_complete_profile_run_id,
+ MAX(pr.record_ct) as row_ct,
+ SUM(COALESCE(p.dq_prevalence * pr.record_ct, 0)) as affected_data_points
+ FROM table_groups tg
+ INNER JOIN profiling_runs r
+ ON (tg.last_complete_profile_run_id = r.id)
+ INNER JOIN profile_results pr
+ ON (r.id = pr.profile_run_id)
+ INNER JOIN data_column_chars dcc
+ ON (pr.table_groups_id = dcc.table_groups_id
+ AND pr.table_name = dcc.table_name
+ AND pr.column_name = dcc.column_name)
+ LEFT JOIN profile_anomaly_results p
+ ON (pr.profile_run_id = p.profile_run_id
+ AND pr.column_name = p.column_name
+ AND pr.table_name = p.table_name)
+ WHERE tg.id = '{TABLE_GROUPS_ID}'
+ AND COALESCE(p.disposition, 'Confirmed') = 'Confirmed'
+ GROUP BY dcc.column_id, tg.last_complete_profile_run_id )
+UPDATE data_column_chars
+ SET dq_score_profiling = 100.0 - s.affected_data_points / s.row_ct,
+ last_complete_profile_run_id = s.last_complete_profile_run_id
+ FROM score_detail s
+ WHERE data_column_chars.column_id = s.column_id;
+
+-- Roll up latest scores to data_table_chars
+WITH score_detail
+ AS (SELECT dcc.column_id, dcc.table_id, tg.last_complete_profile_run_id,
+ MAX(pr.record_ct) as row_ct,
+ SUM(COALESCE(p.dq_prevalence * pr.record_ct, 0)) as affected_data_points
+ FROM table_groups tg
+ INNER JOIN profiling_runs r
+ ON (tg.last_complete_profile_run_id = r.id)
+ INNER JOIN profile_results pr
+ ON (r.id = pr.profile_run_id)
+ INNER JOIN data_column_chars dcc
+ ON (pr.table_groups_id = dcc.table_groups_id
+ AND pr.table_name = dcc.table_name
+ AND pr.column_name = dcc.column_name)
+ LEFT JOIN profile_anomaly_results p
+ ON (pr.profile_run_id = p.profile_run_id
+ AND pr.column_name = p.column_name
+ AND pr.table_name = p.table_name)
+ WHERE tg.id = '{TABLE_GROUPS_ID}'
+ AND COALESCE(p.disposition, 'Confirmed') = 'Confirmed'
+ GROUP BY dcc.column_id, dcc.table_id, tg.last_complete_profile_run_id ),
+score_calc
+ AS ( SELECT table_id, last_complete_profile_run_id,
+ SUM(affected_data_points) as sum_affected_data_points,
+ SUM(row_ct) as sum_data_points
+ FROM score_detail
+ GROUP BY table_id, last_complete_profile_run_id )
+UPDATE data_table_chars
+ SET dq_score_profiling = 100.0 - s.sum_affected_data_points / s.sum_data_points,
+ last_complete_profile_run_id = s.last_complete_profile_run_id
+ FROM score_calc s
+ WHERE data_table_chars.table_id = s.table_id;
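To make the run-level rollup concrete, here is a worked example that follows the formula exactly as written above. Assume one profiling run over a single table with two columns of 1,000 rows each, where only the first column has a confirmed anomaly with dq_prevalence = 0.2:

-- affected_data_points = 0.2 * 1000 + 0 * 1000 = 200
-- total data points    = 1000 + 1000           = 2000
-- dq_score_profiling   = 100.0 - 200 / 2000    = 99.9
SELECT 100.0 - (0.2 * 1000 + 0 * 1000)::FLOAT / (1000 + 1000) AS dq_score_profiling;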
diff --git a/testgen/template/profiling/profile_anomaly_types_get.sql b/testgen/template/profiling/profile_anomaly_types_get.sql
index f1cd576..c1f3950 100644
--- a/testgen/template/profiling/profile_anomaly_types_get.sql
+++ b/testgen/template/profiling/profile_anomaly_types_get.sql
@@ -1,3 +1,3 @@
-SELECT id, anomaly_type, data_object, anomaly_criteria, detail_expression
+SELECT id, anomaly_type, data_object, anomaly_criteria, detail_expression, dq_score_prevalence_formula, dq_score_risk_factor
FROM profile_anomaly_types t
ORDER BY id;
diff --git a/testgen/template/validate_tests/ex_get_test_column_list_tg.sql b/testgen/template/validate_tests/ex_get_test_column_list_tg.sql
index df7bdde..b0953b1 100644
--- a/testgen/template/validate_tests/ex_get_test_column_list_tg.sql
+++ b/testgen/template/validate_tests/ex_get_test_column_list_tg.sql
@@ -1,6 +1,19 @@
SELECT schema_name || '.' || table_name || '.' || column_name AS columns,
ARRAY_AGG(cat_test_id) as test_id_array
- FROM (SELECT cat_test_id,
+ FROM (
+ -- FROM: column_name - column scope (single column)
+ SELECT cat_test_id,
+ schema_name AS schema_name,
+ table_name AS table_name,
+ column_name
+ FROM test_definitions d
+ INNER JOIN test_types t
+ ON d.test_type = t.test_type
+ WHERE test_suite_id = '{TEST_SUITE_ID}'
+ AND t.test_scope = 'column'
+ UNION
+ -- FROM: column_name - referential scope (could be multiple columns)
+ SELECT cat_test_id,
schema_name AS schema_name,
table_name AS table_name,
TRIM(UNNEST(STRING_TO_ARRAY(column_name, ','))) as column_name
@@ -8,8 +21,9 @@
INNER JOIN test_types t
ON d.test_type = t.test_type
WHERE test_suite_id = '{TEST_SUITE_ID}'
- AND t.test_scope IN ('column', 'referential')
+ AND t.test_scope = 'referential'
UNION
+ -- FROM: groupby_names (should be referential)
SELECT cat_test_id,
schema_name AS schema_name,
table_name AS table_name,
@@ -20,6 +34,7 @@
WHERE test_suite_id = '{TEST_SUITE_ID}'
AND t.test_scope IN ('column', 'referential')
UNION
+ -- FROM: window_date_column (referential)
SELECT cat_test_id,
schema_name AS schema_name,
table_name AS table_name,
@@ -28,8 +43,9 @@
INNER JOIN test_types t
ON d.test_type = t.test_type
WHERE test_suite_id = '{TEST_SUITE_ID}'
- AND t.test_scope IN ('column', 'referential')
+ AND t.test_scope = 'referential'
UNION
+ -- FROM: match_column_names (referential)
SELECT cat_test_id,
match_schema_name AS schema_name,
match_table_name AS table_name,
@@ -40,6 +56,7 @@
WHERE test_suite_id = '{TEST_SUITE_ID}'
AND t.test_scope = 'referential'
UNION
+ -- FROM: match_groupby_names (referential)
SELECT cat_test_id,
match_schema_name AS schema_name,
match_table_name AS table_name,
@@ -49,5 +66,5 @@
ON d.test_type = t.test_type
WHERE test_suite_id = '{TEST_SUITE_ID}'
AND t.test_scope = 'referential' ) cols
- WHERE column_name SIMILAR TO '[A-Za-z0-9_]+'
+-- WHERE column_name SIMILAR TO '[A-Za-z0-9_]+'
GROUP BY columns;
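The reshuffle above separates the two ways a test's columns are listed: column-scope tests carry a single column_name verbatim, while referential-scope tests may carry a comma-separated list that must be split into one row per column. A small illustration of the splitting step (Postgres syntax, literal input for clarity):

-- STRING_TO_ARRAY splits on commas, UNNEST expands the array into rows, TRIM drops stray spaces.
SELECT TRIM(UNNEST(STRING_TO_ARRAY('customer_id, order_id', ','))) AS column_name;
-- yields two rows: 'customer_id' and 'order_id'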
diff --git a/testgen/template/validate_tests/ex_write_test_val_errors.sql b/testgen/template/validate_tests/ex_write_test_val_errors.sql
index b1d47d3..639cc3e 100644
--- a/testgen/template/validate_tests/ex_write_test_val_errors.sql
+++ b/testgen/template/validate_tests/ex_write_test_val_errors.sql
@@ -9,6 +9,7 @@ INSERT INTO test_results
test_run_id,
input_parameters,
result_code,
+ result_status,
result_message,
result_measure )
SELECT '{TEST_SUITE_ID}'::UUID,
@@ -20,7 +21,8 @@ INSERT INTO test_results
'{RUN_DATE}' as test_time,
'{TEST_RUN_ID}' as test_run_id,
NULL as input_parameters,
- 0 as result_code,
+ NULL as result_code,
+ 'Error' as result_status,
test_definition_status AS result_message,
NULL as result_measure
FROM test_definitions
From c54266bdb5b9503659a0714e6734897427f52180 Mon Sep 17 00:00:00 2001
From: "Chip.Bloche"
Date: Mon, 11 Nov 2024 09:26:24 -0500
Subject: [PATCH 64/91] Tweaked Functional Datatypes
---
.../profiling/functional_datatype.sql | 24 +++++++++----------
1 file changed, 11 insertions(+), 13 deletions(-)
diff --git a/testgen/template/profiling/functional_datatype.sql b/testgen/template/profiling/functional_datatype.sql
index b7822a5..af64286 100644
--- a/testgen/template/profiling/functional_datatype.sql
+++ b/testgen/template/profiling/functional_datatype.sql
@@ -460,15 +460,6 @@ UPDATE profile_results
AND p.distinct_value_ct BETWEEN 15 AND 40000 ) c
WHERE profile_results.id = c.id;
--- 7. Assign 'ID-Unique' functional data type to the columns that are identity columns
-
-UPDATE profile_results
-SET functional_data_type = 'ID-Unique'
-WHERE profile_run_id = '{PROFILE_RUN_ID}'
- AND functional_data_type IN ('ID', 'ID-Secondary')
- AND record_ct = distinct_value_ct
- AND record_ct > 50;
-
-- Update alpha ID's to ID-Secondary and ID-Grouping
UPDATE profile_results
@@ -482,7 +473,16 @@ SET functional_data_type = CASE
WHERE profile_run_id = '{PROFILE_RUN_ID}'
AND functional_data_type = 'ID';
--- 8. Assign 'ID-FK' functional data type to the columns that are foreign keys of the identity columns identified in the previous step
+-- Assign 'ID-Unique' functional data type to the columns that are identity columns
+
+UPDATE profile_results
+SET functional_data_type = 'ID-Unique'
+WHERE profile_run_id = '{PROFILE_RUN_ID}'
+ AND functional_data_type IN ('ID', 'ID-Secondary')
+ AND record_ct = distinct_value_ct
+ AND record_ct > 50;
+
+-- Assign 'ID-FK' functional data type to the columns that are foreign keys of the identity columns identified in the previous step
UPDATE profile_results
SET functional_data_type = 'ID-FK'
@@ -496,9 +496,7 @@ WHERE profile_results.profile_run_id = '{PROFILE_RUN_ID}'
and profile_results.table_name <> ui.table_name
and profile_results.functional_data_type <> 'ID-Unique';
--- Assign
-
--- 9. Functional Data Type: 'Measurement Pct'
+-- Functional Data Type: 'Measurement Pct'
UPDATE profile_results
SET functional_data_type = 'Measurement Pct'
From b62367918c57d272236e81f0bc4f6625f2a7b7da Mon Sep 17 00:00:00 2001
From: "Chip.Bloche"
Date: Mon, 11 Nov 2024 13:55:27 -0500
Subject: [PATCH 65/91] Tweaked Incremental Upgrade
---
...incremental_upgrade.sql => 0113_incremental_upgrade.sql} | 6 ++++++
1 file changed, 6 insertions(+)
rename testgen/template/dbupgrade/{0120_incremental_upgrade.sql => 0113_incremental_upgrade.sql} (94%)
diff --git a/testgen/template/dbupgrade/0120_incremental_upgrade.sql b/testgen/template/dbupgrade/0113_incremental_upgrade.sql
similarity index 94%
rename from testgen/template/dbupgrade/0120_incremental_upgrade.sql
rename to testgen/template/dbupgrade/0113_incremental_upgrade.sql
index 0081e19..dafc6f1 100644
--- a/testgen/template/dbupgrade/0120_incremental_upgrade.sql
+++ b/testgen/template/dbupgrade/0113_incremental_upgrade.sql
@@ -8,6 +8,12 @@ ALTER TABLE test_suites
ADD COLUMN last_complete_test_run_id UUID,
ADD COLUMN dq_score_exclude BOOLEAN default FALSE;
+ALTER TABLE profile_anomaly_types
+ ADD COLUMN upper_case_ct BIGINT,
+ ADD COLUMN lower_case_ct BIGINT,
+ ADD COLUMN non_alpha_ct BIGINT,
+ ADD COLUMN mixed_case_ct BIGINT GENERATED ALWAYS AS ( value_ct - upper_case_ct - lower_case_ct - non_alpha_ct ) STORED;
+
ALTER TABLE profile_anomaly_results
ADD COLUMN dq_prevalence FLOAT;
From bde97caab229558f26c5f5d1808085e884268518 Mon Sep 17 00:00:00 2001
From: "Chip.Bloche"
Date: Mon, 11 Nov 2024 14:29:01 -0500
Subject: [PATCH 66/91] Incremental upgrade fix
---
testgen/template/dbupgrade/0113_incremental_upgrade.sql | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/testgen/template/dbupgrade/0113_incremental_upgrade.sql b/testgen/template/dbupgrade/0113_incremental_upgrade.sql
index dafc6f1..8907660 100644
--- a/testgen/template/dbupgrade/0113_incremental_upgrade.sql
+++ b/testgen/template/dbupgrade/0113_incremental_upgrade.sql
@@ -9,10 +9,8 @@ ALTER TABLE test_suites
ADD COLUMN dq_score_exclude BOOLEAN default FALSE;
ALTER TABLE profile_anomaly_types
- ADD COLUMN upper_case_ct BIGINT,
- ADD COLUMN lower_case_ct BIGINT,
- ADD COLUMN non_alpha_ct BIGINT,
- ADD COLUMN mixed_case_ct BIGINT GENERATED ALWAYS AS ( value_ct - upper_case_ct - lower_case_ct - non_alpha_ct ) STORED;
+ ADD COLUMN dq_score_prevalence_formula TEXT,
+ ADD COLUMN dq_score_risk_factor TEXT;
ALTER TABLE profile_anomaly_results
ADD COLUMN dq_prevalence FLOAT;
From b9b17432901095cccfd1c4da2cf385ab79620cad Mon Sep 17 00:00:00 2001
From: Ricardo Boni
Date: Tue, 12 Nov 2024 11:40:31 -0500
Subject: [PATCH 67/91] fix(ui): Adding database icons to the python built
package
---
pyproject.toml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pyproject.toml b/pyproject.toml
index cc41773..b3fec2f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -102,7 +102,7 @@ include-package-data = true
[tool.setuptools.package-data]
"*" = ["*.toml", "*.sql", "*.yaml"]
"testgen.template" = ["*.sql", "*.yaml", "**/*.sql", "**/*.yaml"]
-"testgen.ui.assets" = ["*.svg", "*.png", "*.js", "*.css"]
+"testgen.ui.assets" = ["*.svg", "*.png", "*.js", "*.css", "flavors/*.svg"]
"testgen.ui.components.frontend" = ["*.html", "**/*.js", "**/*.css", "**/*.woff2", "**/*.svg"]
[tool.setuptools.packages.find]
From bd3235381f7b93a33a66e903a1da66c03c326f92 Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Fri, 8 Nov 2024 18:01:25 -0500
Subject: [PATCH 68/91] fix(ui): add table and column filters to hygiene issues
and test results
---
testgen/ui/bootstrap.py | 4 +-
.../frontend/js/pages/data_hierarchy.js | 12 +-
...ofiling_anomalies.py => hygiene_issues.py} | 111 +++++++++++----
testgen/ui/views/test_results.py | 131 +++++++++++++-----
4 files changed, 189 insertions(+), 69 deletions(-)
rename testgen/ui/views/{profiling_anomalies.py => hygiene_issues.py} (85%)
diff --git a/testgen/ui/bootstrap.py b/testgen/ui/bootstrap.py
index 414f7e5..3abacce 100644
--- a/testgen/ui/bootstrap.py
+++ b/testgen/ui/bootstrap.py
@@ -12,9 +12,9 @@
from testgen.ui.session import session
from testgen.ui.views.connections import ConnectionsPage
from testgen.ui.views.data_hierarchy import DataHierarchyPage
+from testgen.ui.views.hygiene_issues import HygieneIssuesPage
from testgen.ui.views.login import LoginPage
from testgen.ui.views.overview import OverviewPage
-from testgen.ui.views.profiling_anomalies import ProfilingAnomaliesPage
from testgen.ui.views.profiling_results import ProfilingResultsPage
from testgen.ui.views.profiling_runs import DataProfilingPage
from testgen.ui.views.project_settings import ProjectSettingsPage
@@ -31,7 +31,7 @@
DataHierarchyPage,
DataProfilingPage,
ProfilingResultsPage,
- ProfilingAnomaliesPage,
+ HygieneIssuesPage,
TestRunsPage,
TestResultsPage,
ConnectionsPage,
diff --git a/testgen/ui/components/frontend/js/pages/data_hierarchy.js b/testgen/ui/components/frontend/js/pages/data_hierarchy.js
index a1d09ce..2916a50 100644
--- a/testgen/ui/components/frontend/js/pages/data_hierarchy.js
+++ b/testgen/ui/components/frontend/js/pages/data_hierarchy.js
@@ -463,7 +463,11 @@ const HygieneIssuesCard = (/** @type Table | Column */ item) => {
const hygieneIssues = item.latest_anomalies.filter(({ issue_likelihood }) => issue_likelihood !== 'Potential PII');
const linkProps = {
href: 'profiling-runs:hygiene',
- params: { run_id: item.latest_profile_id },
+ params: {
+ run_id: item.latest_profile_id,
+ table_name: item.table_name,
+ column_name: item.column_name,
+ },
};
return IssuesCard('Hygiene Issues', hygieneIssues, attributes, linkProps, 'No hygiene issues detected');
@@ -496,7 +500,11 @@ const TestIssuesCard = (/** @type Table | Column */ item) => {
),
Link({
href: 'test-runs:results',
- params: { run_id: issue.test_run_id },
+ params: {
+ run_id: issue.test_run_id,
+ table_name: item.table_name,
+ column_name: item.column_name,
+ },
open_new: true,
label: formatTimestamp(issue.test_run_date),
style: 'font-size: 12px; margin-top: 2px;',
diff --git a/testgen/ui/views/profiling_anomalies.py b/testgen/ui/views/hygiene_issues.py
similarity index 85%
rename from testgen/ui/views/profiling_anomalies.py
rename to testgen/ui/views/hygiene_issues.py
index 4e70ce5..7f6aec5 100644
--- a/testgen/ui/views/profiling_anomalies.py
+++ b/testgen/ui/views/hygiene_issues.py
@@ -20,14 +20,22 @@
from testgen.ui.views.dialogs.profiling_results_dialog import view_profiling_button
-class ProfilingAnomaliesPage(Page):
+class HygieneIssuesPage(Page):
path = "profiling-runs:hygiene"
can_activate: typing.ClassVar = [
lambda: session.authentication_status,
lambda: "run_id" in session.current_page_args or "profiling-runs",
]
- def render(self, run_id: str, issue_class: str | None = None, issue_type: str | None = None, **_kwargs) -> None:
+ def render(
+ self,
+ run_id: str,
+ issue_class: str | None = None,
+ issue_type: str | None = None,
+ table_name: str | None = None,
+ column_name: str | None = None,
+ **_kwargs,
+ ) -> None:
run_parentage = profiling_queries.lookup_db_parentage_from_run(run_id)
if not run_parentage:
self.router.navigate_with_warning(
@@ -49,9 +57,9 @@ def render(self, run_id: str, issue_class: str | None = None, issue_type: str |
],
)
- others_summary_column, pii_summary_column, _ = st.columns([.3, .3, .4])
- (liklihood_filter_column, issue_type_filter_column, sort_column, actions_column, export_button_column) = (
- st.columns([.16, .34, .08, .32, .1], vertical_alignment="bottom")
+ others_summary_column, pii_summary_column, actions_column = st.columns([.25, .25, .5], vertical_alignment="bottom")
+ (liklihood_filter_column, issue_type_filter_column, table_filter_column, column_filter_column, sort_column, export_button_column) = (
+ st.columns([.15, .25, .2, .2, .1, .1], vertical_alignment="bottom")
)
testgen.flex_row_end(actions_column)
testgen.flex_row_end(export_button_column)
@@ -78,6 +86,26 @@ def render(self, run_id: str, issue_class: str | None = None, issue_type: str |
disabled=issue_class == "Potential PII",
)
+ run_columns_df = get_profiling_run_columns(run_id)
+ with table_filter_column:
+ table_name = testgen.select(
+ options=list(run_columns_df["table_name"].unique()),
+ default_value=table_name,
+ bind_to_query="table_name",
+ label="Table Name",
+ )
+
+ with column_filter_column:
+ column_options = list(run_columns_df.loc[run_columns_df["table_name"] == table_name]["column_name"])
+ column_name = testgen.select(
+ options=column_options,
+ value_column="column_name",
+ default_value=column_name,
+ bind_to_query="column_name",
+ label="Column Name",
+ disabled=not table_name,
+ )
+
with sort_column:
sortable_columns = (
("Table", "r.table_name"),
@@ -95,7 +123,7 @@ def render(self, run_id: str, issue_class: str | None = None, issue_type: str |
# Get hygiene issue list
- df_pa = get_profiling_anomalies(run_id, issue_class, issue_type_id, sorting_columns)
+ df_pa = get_profiling_anomalies(run_id, issue_class, issue_type_id, table_name, column_name, sorting_columns)
# Retrieve disposition action (cache refreshed)
df_action = get_anomaly_disposition(run_id)
@@ -110,7 +138,7 @@ def render(self, run_id: str, issue_class: str | None = None, issue_type: str |
testgen.summary_bar(
items=others_summary,
label="Hygiene Issues",
- height=40,
+ height=20,
width=400,
)
@@ -120,7 +148,7 @@ def render(self, run_id: str, issue_class: str | None = None, issue_type: str |
testgen.summary_bar(
items=anomalies_pii_summary,
label="Potential PII",
- height=40,
+ height=20,
width=400,
)
# write_frequency_graph(df_pa)
@@ -252,24 +280,48 @@ def render(self, run_id: str, issue_class: str | None = None, issue_type: str |
@st.cache_data(show_spinner=False)
-def get_db_table_group_choices(str_project_code):
- str_schema = st.session_state["dbschema"]
- return dq.run_table_groups_lookup_query(str_schema, str_project_code)
+def get_db_table_group_choices(project_code: str) -> pd.DataFrame:
+ schema: str = st.session_state["dbschema"]
+ return dq.run_table_groups_lookup_query(schema, project_code)
+
+
+@st.cache_data(show_spinner=False)
+def get_profiling_run_columns(profiling_run_id: str) -> pd.DataFrame:
+ schema: str = st.session_state["dbschema"]
+ sql = f"""
+ SELECT table_name, column_name
+ FROM {schema}.profile_anomaly_results
+ WHERE profile_run_id = '{profiling_run_id}'
+ ORDER BY table_name, column_name;
+ """
+ return db.retrieve_data(sql)
@st.cache_data(show_spinner="Retrieving Data")
-def get_profiling_anomalies(str_profile_run_id, str_likelihood, issue_type_id, sorting_columns):
- str_schema = st.session_state["dbschema"]
- if str_likelihood is None:
- str_criteria = " AND t.issue_likelihood <> 'Potential PII'"
- else:
- str_criteria = f" AND t.issue_likelihood = '{str_likelihood}'"
- if sorting_columns:
- str_order_by = "ORDER BY " + (", ".join(" ".join(col) for col in sorting_columns))
- else:
- str_order_by = ""
+def get_profiling_anomalies(
+ profile_run_id: str,
+ likelihood: str | None,
+ issue_type_id: str | None,
+ table_name: str | None,
+ column_name: str | None,
+ sorting_columns: list[str] | None,
+):
+ schema: str = st.session_state["dbschema"]
+ criteria = ""
+ order_by = ""
+
+ if likelihood:
+ criteria += f" AND t.issue_likelihood = '{likelihood}'"
if issue_type_id:
- str_criteria += f" AND t.id = '{issue_type_id}'"
+ criteria += f" AND t.id = '{issue_type_id}'"
+ if table_name:
+ criteria += f" AND r.table_name = '{table_name}'"
+ if column_name:
+ criteria += f" AND r.column_name = '{column_name}'"
+
+ if sorting_columns:
+ order_by = "ORDER BY " + (", ".join(" ".join(col) for col in sorting_columns))
+
# Define the query -- first visible column must be first, because will hold the multi-select box
str_sql = f"""
SELECT r.table_name, r.column_name, r.schema_name,
@@ -291,17 +343,16 @@ def get_profiling_anomalies(str_profile_run_id, str_likelihood, issue_type_id, s
t.anomaly_description, r.detail, t.suggested_action,
r.anomaly_id, r.table_groups_id::VARCHAR, r.id::VARCHAR, p.profiling_starttime,
tg.table_groups_name
- FROM {str_schema}.profile_anomaly_results r
- INNER JOIN {str_schema}.profile_anomaly_types t
+ FROM {schema}.profile_anomaly_results r
+ INNER JOIN {schema}.profile_anomaly_types t
ON r.anomaly_id = t.id
- INNER JOIN {str_schema}.profiling_runs p
+ INNER JOIN {schema}.profiling_runs p
ON r.profile_run_id = p.id
- INNER JOIN {str_schema}.table_groups tg
+ INNER JOIN {schema}.table_groups tg
ON r.table_groups_id = tg.id
-
- WHERE r.profile_run_id = '{str_profile_run_id}'
- {str_criteria}
- {str_order_by}
+ WHERE r.profile_run_id = '{profile_run_id}'
+ {criteria}
+ {order_by}
"""
# Retrieve data as df
df = db.retrieve_data(str_sql)
diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py
index 9cc88eb..a1c3ea8 100644
--- a/testgen/ui/views/test_results.py
+++ b/testgen/ui/views/test_results.py
@@ -6,6 +6,7 @@
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
+from streamlit.delta_generator import DeltaGenerator
import testgen.ui.services.database_service as db
import testgen.ui.services.form_service as fm
@@ -43,7 +44,15 @@ class TestResultsPage(Page):
lambda: "run_id" in session.current_page_args or "test-runs",
]
- def render(self, run_id: str, status: str | None = None, test_type: str | None = None, **_kwargs) -> None:
+ def render(
+ self,
+ run_id: str,
+ status: str | None = None,
+ test_type: str | None = None,
+ table_name: str | None = None,
+ column_name: str | None = None,
+ **_kwargs,
+ ) -> None:
run_parentage = get_drill_test_run(run_id)
if not run_parentage:
self.router.navigate_with_warning(
@@ -65,17 +74,18 @@ def render(self, run_id: str, status: str | None = None, test_type: str | None =
],
)
- # Display summary bar
- tests_summary = get_test_result_summary(run_id)
- testgen.summary_bar(items=tests_summary, height=40, width=800)
-
- # Setup Toolbar
- status_filter_column, test_type_filter_column, sort_column, actions_column, export_button_column = st.columns(
- [.2, .2, .08, .4, .12], vertical_alignment="bottom"
+ summary_column, actions_column = st.columns([.5, .5], vertical_alignment="bottom")
+ status_filter_column, test_type_filter_column, table_filter_column, column_filter_column, sort_column, export_button_column = st.columns(
+ [.2, .2, .2, .2, .1, .1], vertical_alignment="bottom"
)
+
testgen.flex_row_end(actions_column)
testgen.flex_row_end(export_button_column)
+ with summary_column:
+ tests_summary = get_test_result_summary(run_id)
+ testgen.summary_bar(items=tests_summary, height=20, width=800)
+
with status_filter_column:
status_options = [
"Failed + Warning",
@@ -102,6 +112,26 @@ def render(self, run_id: str, status: str | None = None, test_type: str | None =
label="Test Type",
)
+ run_columns_df = get_test_run_columns(run_id)
+ with table_filter_column:
+ table_name = testgen.select(
+ options=list(run_columns_df["table_name"].unique()),
+ default_value=table_name,
+ bind_to_query="table_name",
+ label="Table Name",
+ )
+
+ with column_filter_column:
+ column_options = list(run_columns_df.loc[run_columns_df["table_name"] == table_name]["column_name"])
+ column_name = testgen.select(
+ options=column_options,
+ value_column="column_name",
+ default_value=column_name,
+ bind_to_query="column_name",
+ label="Column Name",
+ disabled=not table_name,
+ )
+
with sort_column:
sortable_columns = (
("Table Name", "r.table_name"),
@@ -131,7 +161,7 @@ def render(self, run_id: str, status: str | None = None, test_type: str | None =
# Display main grid and retrieve selection
selected = show_result_detail(
- run_id, status, test_type, sorting_columns, do_multi_select, export_button_column
+ run_id, export_button_column, status, test_type, table_name, column_name, sorting_columns, do_multi_select
)
# Need to render toolbar buttons after grid, so selection status is maintained
@@ -190,25 +220,47 @@ def get_test_types():
return df
-@st.cache_data(show_spinner="Retrieving Results")
-def get_test_results(str_run_id, str_sel_test_status, test_type_id, sorting_columns):
- schema = st.session_state["dbschema"]
- return get_test_results_uncached(schema, str_run_id, str_sel_test_status, test_type_id, sorting_columns)
+@st.cache_data(show_spinner=False)
+def get_test_run_columns(test_run_id: str) -> pd.DataFrame:
+ schema: str = st.session_state["dbschema"]
+ sql = f"""
+ SELECT table_name, column_names AS column_name
+ FROM {schema}.test_results
+ WHERE test_run_id = '{test_run_id}'
+ ORDER BY table_name, column_names;
+ """
+ return db.retrieve_data(sql)
-def get_test_results_uncached(str_schema, str_run_id, str_sel_test_status, test_type_id=None, sorting_columns=None):
+@st.cache_data(show_spinner="Retrieving Results")
+def get_test_results(
+ run_id: str,
+ test_status: str | None = None,
+ test_type_id: str | None = None,
+ table_name: str | None = None,
+ column_name: str | None = None,
+ sorting_columns: list[str] | None = None,
+) -> pd.DataFrame:
+ schema: str = st.session_state["dbschema"]
# First visible row first, so multi-select checkbox will render
- str_order_by = "ORDER BY " + (", ".join(" ".join(col) for col in sorting_columns)) if sorting_columns else ""
- test_type_clause = f"AND r.test_type = '{test_type_id}'" if test_type_id else ""
- status_clause = f" AND r.result_status IN ({str_sel_test_status})" if str_sel_test_status else ""
- str_sql = f"""
+ order_by = "ORDER BY " + (", ".join(" ".join(col) for col in sorting_columns)) if sorting_columns else ""
+ filters = ""
+ if test_status:
+ filters += f" AND r.result_status IN ({test_status})"
+ if test_type_id:
+ filters += f" AND r.test_type = '{test_type_id}'"
+ if table_name:
+ filters += f" AND r.table_name = '{table_name}'"
+ if column_name:
+ filters += f" AND r.column_names = '{column_name}'"
+
+ sql = f"""
WITH run_results
AS (SELECT *
- FROM {str_schema}.test_results r
+ FROM {schema}.test_results r
WHERE
- r.test_run_id = '{str_run_id}'
- {status_clause}
- {test_type_clause}
+ r.test_run_id = '{run_id}'
+ {filters}
)
SELECT r.table_name,
p.project_name, ts.test_suite, tg.table_groups_name, cn.connection_name, cn.project_host, cn.sql_flavor,
@@ -249,31 +301,31 @@ def get_test_results_uncached(str_schema, str_run_id, str_sel_test_status, test_
tt.threshold_description, tt.usage_notes, r.test_time
FROM run_results r
- INNER JOIN {str_schema}.test_types tt
+ INNER JOIN {schema}.test_types tt
ON (r.test_type = tt.test_type)
- LEFT JOIN {str_schema}.test_definitions rd
+ LEFT JOIN {schema}.test_definitions rd
ON (r.test_definition_id = rd.id)
- LEFT JOIN {str_schema}.test_definitions d
+ LEFT JOIN {schema}.test_definitions d
ON (r.test_suite_id = d.test_suite_id
AND r.table_name = d.table_name
AND r.column_names = COALESCE(d.column_name, 'N/A')
AND r.test_type = d.test_type
AND r.auto_gen = TRUE
AND d.last_auto_gen_date IS NOT NULL)
- INNER JOIN {str_schema}.test_suites ts
+ INNER JOIN {schema}.test_suites ts
ON r.test_suite_id = ts.id
- INNER JOIN {str_schema}.projects p
+ INNER JOIN {schema}.projects p
ON (ts.project_code = p.project_code)
- INNER JOIN {str_schema}.table_groups tg
+ INNER JOIN {schema}.table_groups tg
ON (ts.table_groups_id = tg.id)
- INNER JOIN {str_schema}.connections cn
+ INNER JOIN {schema}.connections cn
ON (tg.connection_id = cn.connection_id)
- LEFT JOIN {str_schema}.cat_test_conditions c
+ LEFT JOIN {schema}.cat_test_conditions c
ON (cn.sql_flavor = c.sql_flavor
AND r.test_type = c.test_type)
- {str_order_by} ;
+ {order_by} ;
"""
- df = db.retrieve_data(str_sql)
+ df = db.retrieve_data(sql)
# Clean Up
df["test_date"] = pd.to_datetime(df["test_date"])
@@ -449,11 +501,20 @@ def show_test_def_detail(str_test_def_id):
)
-def show_result_detail(str_run_id, str_sel_test_status, test_type_id, sorting_columns, do_multi_select, export_container):
+def show_result_detail(
+ run_id: str,
+ export_container: DeltaGenerator,
+ test_status: str | None = None,
+ test_type_id: str | None = None,
+ table_name: str | None = None,
+ column_name: str | None = None,
+ sorting_columns: list[str] | None = None,
+ do_multi_select: bool = False,
+):
# Retrieve test results (always cached, action as null)
- df = get_test_results(str_run_id, str_sel_test_status, test_type_id, sorting_columns)
+ df = get_test_results(run_id, test_status, test_type_id, table_name, column_name, sorting_columns)
# Retrieve disposition action (cache refreshed)
- df_action = get_test_disposition(str_run_id)
+ df_action = get_test_disposition(run_id)
# Update action from disposition df
action_map = df_action.set_index("id")["action"].to_dict()
df["action"] = df["test_result_id"].map(action_map).fillna(df["action"])
From 6116f3b2f8a6786ff1f8b7dd9a4c406bcc176f38 Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Fri, 8 Nov 2024 18:02:09 -0500
Subject: [PATCH 69/91] fix(ui): add dk favicon to image
---
pyproject.toml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pyproject.toml b/pyproject.toml
index b3fec2f..ce2438b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -102,7 +102,7 @@ include-package-data = true
[tool.setuptools.package-data]
"*" = ["*.toml", "*.sql", "*.yaml"]
"testgen.template" = ["*.sql", "*.yaml", "**/*.sql", "**/*.yaml"]
-"testgen.ui.assets" = ["*.svg", "*.png", "*.js", "*.css", "flavors/*.svg"]
+"testgen.ui.assets" = ["*.svg", "*.png", "*.js", "*.css", "*.ico", "flavors/*.svg"]
"testgen.ui.components.frontend" = ["*.html", "**/*.js", "**/*.css", "**/*.woff2", "**/*.svg"]
[tool.setuptools.packages.find]
From b8a94b3785ceff0abd01eacac1ba5e1dcf9d210f Mon Sep 17 00:00:00 2001
From: Aarthy Adityan
Date: Mon, 11 Nov 2024 13:18:58 -0500
Subject: [PATCH 70/91] feat(ui): add community and training links to header
---
testgen/ui/assets/style.css | 43 ++++++++++++++++++++--
testgen/ui/components/widgets/__init__.py | 1 +
testgen/ui/components/widgets/page.py | 44 ++++++++++++++---------
testgen/ui/views/connections/page.py | 4 +--
testgen/ui/views/hygiene_issues.py | 2 +-
testgen/ui/views/login.py | 13 ++++---
testgen/ui/views/overview.py | 3 +-
testgen/ui/views/profiling_results.py | 2 +-
testgen/ui/views/profiling_runs.py | 2 +-
testgen/ui/views/project_settings.py | 2 +-
testgen/ui/views/table_groups/page.py | 3 +-
testgen/ui/views/test_definitions.py | 2 +-
testgen/ui/views/test_results.py | 2 +-
testgen/ui/views/test_runs.py | 2 +-
testgen/ui/views/test_suites.py | 2 +-
15 files changed, 93 insertions(+), 34 deletions(-)
diff --git a/testgen/ui/assets/style.css b/testgen/ui/assets/style.css
index 3122291..c5beb62 100644
--- a/testgen/ui/assets/style.css
+++ b/testgen/ui/assets/style.css
@@ -16,6 +16,7 @@ body {
--secondary-text-color: #0000008a;
--disabled-text-color: #00000042;
--caption-text-color: rgba(49, 51, 63, 0.6); /* Match Streamlit's caption color */
+ --border-color: rgba(0, 0, 0, .12);
--sidebar-background-color: white;
--sidebar-item-hover-color: #f5f5f5;
@@ -68,15 +69,18 @@ section[data-testid="stSidebar"] {
}
section.main > :nth-child(1 of div).block-container {
- padding: 24px;
+ padding: 12px 24px 24px;
}
div[data-testid="stVerticalBlock"] {
gap: 0.5rem;
}
-div[data-testid="stSidebarCollapsedControl"] {
+.appview-container:has(section[data-testid="stSidebar"]) div[data-testid="stSidebarCollapsedControl"] {
top: 0.5rem;
+ border-radius: 4px;
+ background-color: var(--border-color);
+ padding: 3px 0 0 8px;
}
/* */
@@ -250,6 +254,40 @@ Use as testgen.text("text", "extra_styles") */
}
/* */
+/* Page header */
+.tg-header {
+ margin: 0;
+ padding: 0;
+ font-weight: 500;
+ transition: padding 0.3s;
+}
+
+[data-testid="stSidebarCollapsedControl"] ~ section.main .tg-header {
+ padding-left: 80px;
+}
+
+.tg-header--line {
+ margin: 0;
+ border: none;
+ border-radius: 2px;
+ height: 2px;
+ background-color: var(--disabled-text-color);
+}
+
+div[data-testid="stVerticalBlockBorderWrapper"]:has(> div > div[data-testid="stVerticalBlock"] > div.element-container > div.stHtml > i.tg-header--links) [data-testid="stLinkButton"] a {
+ border: none;
+ background: none;
+ padding: 6px;
+ min-height: 24px;
+ color: var(--primary-text-color);
+}
+
+div[data-testid="stVerticalBlockBorderWrapper"]:has(> div > div[data-testid="stVerticalBlock"] > div.element-container > div.stHtml > i.tg-header--links) [data-testid="stLinkButton"] a p {
+ font-size: 20px;
+ line-height: 1;
+}
+/* */
+
/* Summary bar component */
.tg-summary-bar--label {
margin-bottom: 4px;
@@ -309,6 +347,7 @@ Use as testgen.text("text", "extra_styles") */
--secondary-text-color: rgba(255, 255, 255, .7);
--disabled-text-color: rgba(255, 255, 255, .5);
--caption-text-color: rgba(250, 250, 250, .6); /* Match Streamlit's caption color */
+ --border-color: rgba(255, 255, 255, .25);
--sidebar-background-color: #14181f;
--sidebar-item-hover-color: #10141b;
diff --git a/testgen/ui/components/widgets/__init__.py b/testgen/ui/components/widgets/__init__.py
index c2d490c..2dc7762 100644
--- a/testgen/ui/components/widgets/__init__.py
+++ b/testgen/ui/components/widgets/__init__.py
@@ -15,6 +15,7 @@
flex_row_start,
no_flex_gap,
page_header,
+ page_links,
text,
whitespace,
)
diff --git a/testgen/ui/components/widgets/page.py b/testgen/ui/components/widgets/page.py
index 2703982..55e63a9 100644
--- a/testgen/ui/components/widgets/page.py
+++ b/testgen/ui/components/widgets/page.py
@@ -4,33 +4,45 @@
from testgen.ui.components.widgets.breadcrumbs import Breadcrumb
from testgen.ui.components.widgets.breadcrumbs import breadcrumbs as tg_breadcrumbs
+BASE_HELP_URL = "https://docs.datakitchen.io/articles/#!dataops-testgen-help/"
+DEFAULT_HELP_TOPIC = "dataops-testgen-help"
+SLACK_URL = "https://data-observability-slack.datakitchen.io/join"
+TRAINING_URL = "https://info.datakitchen.io/data-quality-training-and-certifications"
def page_header(
title: str,
- help_link:str | None = None,
+ help_topic: str | None = None,
breadcrumbs: list["Breadcrumb"] | None = None,
):
- hcol1, hcol2 = st.columns([0.95, 0.05])
- hcol1.subheader(title, anchor=False)
- if help_link:
- with hcol2:
- whitespace(0.8)
- st.page_link(help_link, label=" ", icon=":material/help:")
-
- if breadcrumbs:
- tg_breadcrumbs(breadcrumbs=breadcrumbs)
-
- st.write(
- '',
- unsafe_allow_html=True,
- )
+ with st.container():
+ no_flex_gap()
+ title_column, links_column = st.columns([0.95, 0.05], vertical_alignment="bottom")
+
+ with title_column:
+ no_flex_gap()
+ st.html(f'