diff --git a/Dockerfile b/Dockerfile
index e436ca4..cdab57c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,7 +2,7 @@ FROM python:3.10-slim-bookworm AS build-image
 RUN mkdir -p /dk && \
     apt-get update && \
-    apt-get install -y gcc libpcre3 libpcre3-dev g++
+    apt-get install -y gcc libpcre3 libpcre3-dev g++ git
 
 COPY ./pyproject.toml /tmp/dk/
 RUN python3 -m pip install /tmp/dk --prefix=/dk
diff --git a/docs/configuration.md b/docs/configuration.md
index 2b844b1..d5a1035 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -159,12 +159,6 @@ Determine how many tests are grouped together in a single query. Increase for be
 default: `5000`
 
-#### `PROJECT_QC_SCHEMA`
-
-Name of the schema to be created in the project database.
-
-default: `qc`
-
 #### `PROJECT_DATABASE_NAME`
 
 Name of the database the auto generated project will run test against.
diff --git a/pyproject.toml b/pyproject.toml
index 7085769..4c70813 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "dataops-testgen"
-version = "2.15.3"
+version = "2.24.7"
 description = "DataKitchen's Data Quality DataOps TestGen"
 authors = [
     { "name" = "DataKitchen, Inc.", "email" = "info@datakitchen.io" },
@@ -32,6 +32,7 @@ requires-python = ">=3.10"
 dependencies = [
     "PyYAML==6.0.1",
     "click==8.1.3",
+    "regex==2024.9.11",
     "sqlalchemy==1.4.46",
     "snowflake-sqlalchemy==1.4.7",
     "pyodbc==5.0.0",
@@ -60,6 +61,8 @@ dependencies = [
     "concurrent_log_handler==0.9.25",
     "cryptography==42.0.8",
     "validators==0.33.0",
+    "reportlab==4.2.2",
+    "streamlit-pydantic @ git+https://github.com/LukasMasuch/streamlit-pydantic.git@9f84145b6b6e74cdff3a7815ab75b0464c4d4f24",
 ]
 
 [project.optional-dependencies]
@@ -99,7 +102,7 @@ include-package-data = true
 [tool.setuptools.package-data]
 "*" = ["*.toml", "*.sql", "*.yaml"]
 "testgen.template" = ["*.sql", "*.yaml", "**/*.sql", "**/*.yaml"]
-"testgen.ui.assets" = ["*.svg", "*.png", "*.js", "*.css"]
+"testgen.ui.assets" = ["*.svg", "*.png", "*.js", "*.css", "*.ico", "flavors/*.svg"]
 "testgen.ui.components.frontend" = ["*.html", "**/*.js", "**/*.css", "**/*.woff2", "**/*.svg"]
 
 [tool.setuptools.packages.find]
@@ -224,8 +227,9 @@ select = ["A", "F", "S", "I", "T10", "B", "UP", "ISC", "T20", "RSE", "Q", "ARG",
 # globally ignore the following error codes
 # * TRY003: Avoid specifying long messages outside the exception class
 # * S608: Hardcoded SQL
-# # F841: Unused local variable (it is instable)
-ignore = ["TRY003", "S608", "S404", "F841"]
+# * F841: Unused local variable (it is unstable)
+# * B023: Buggy: https://github.com/astral-sh/ruff/issues/7847
+ignore = ["TRY003", "S608", "S404", "F841", "B023"]
 
 # Ignore the following errors in files:
 # F403 - in __init__.py: We use __all__ in our module files so this behavior is acceptable in __init__.py
@@ -237,6 +241,7 @@ ignore = ["TRY003", "S608", "S404", "F841"]
 "tests*" = ["S101", "T201"]
 "invocations/**" = ["ARG001", "T201"]
 "testgen/common/encrypt.py" = ["S413"]
+"testgen/ui/pdf/dk_logo.py" = ["T201"]
 
 # See: https://coverage.readthedocs.io/en/latest/config.html
 [tool.coverage.run]
diff --git a/testgen/__main__.py b/testgen/__main__.py
index 285e949..fd19379 100644
--- a/testgen/__main__.py
+++ b/testgen/__main__.py
@@ -1,4 +1,3 @@
-import getpass
 import logging
 import os
 import subprocess
@@ -33,7 +32,6 @@ from testgen.commands.run_observability_exporter import run_observability_exporter
 from testgen.commands.run_profiling_bridge import run_profiling_queries
 from testgen.commands.run_quick_start import 
run_quick_start, run_quick_start_increment -from testgen.commands.run_setup_profiling_tools import run_setup_profiling_tools from testgen.commands.run_upgrade_db_config import get_schema_revision, is_db_revision_up_to_date, run_upgrade_db_config from testgen.common import ( configure_logging, @@ -450,84 +448,6 @@ def do_upgrade_system_version(): click.echo("System and services upgrade is not required.") -@cli.command( - "setup-target-db-functions", help="Use to set up the utility functions in the target database for running profiles." -) -@click.option( - "-c", - "--connection-id", - help="The identifier for the connection. Use a connection_id shown in list-connections.", - required=True, - type=click.STRING, -) -@click.option( - "-dr", - "--dry-run", - default=False, - is_flag=True, - required=False, - help="Dry run to show which schema will be modified", -) -@click.option( - "-cs", - "--create-qc-schema", - default=False, - is_flag=True, - required=False, - help="Create the QC utility schema required in the target database", -) -@click.option("--yes", "-y", default=False, is_flag=True, required=False, help="Force yes") -@click.option( - "--skip-asking-credentials", - "-s", - default=False, - is_flag=True, - required=False, - help="Skip request for special write credentials for target database, uses standard credentials instead", -) -@click.option( - "--skip-granting-privileges", - "-sgp", - default=False, - is_flag=True, - required=False, - help="Skip granting execute privileges to the user for the QC utility schema in the target database", -) -@pass_configuration -def setup_profiling_tools( - configuration: Configuration, - connection_id: str, - dry_run: bool, - create_qc_schema: bool, - yes: bool, - skip_asking_credentials: bool, - skip_granting_privileges: bool, -): - db_user = None - db_password = None - if not skip_asking_credentials: - db_user = input("Admin DB User?") - db_password = getpass.getpass("Admin DB Password?") - - if not yes and not dry_run: - confirm = input( - f"Are you sure you want to setup the utility functions to be able to run the profile for connection {connection_id}? [yes/No]" - ) - if confirm.lower() != "yes": - click.echo("Exiting without any operation performed.") - return - project_qc_schema = run_setup_profiling_tools( - connection_id, dry_run, create_qc_schema, db_user, db_password, skip_granting_privileges - ) - if not dry_run: - message = f"Project DB has been set up. Modified schema: {project_qc_schema}" - else: - message = ( - f"Project DB dry run completed, no changes applied. 
Modified schema would have been: {project_qc_schema}" - ) - click.echo(message) - - @cli.command("get-test-results", help="Fetches results for a test run.") @click.option( "-tr", diff --git a/testgen/commands/queries/execute_cat_tests_query.py b/testgen/commands/queries/execute_cat_tests_query.py index fc91e2b..ecbd6aa 100644 --- a/testgen/commands/queries/execute_cat_tests_query.py +++ b/testgen/commands/queries/execute_cat_tests_query.py @@ -2,6 +2,7 @@ from testgen.common import date_service, read_template_sql_file from testgen.common.database import database_service +from testgen.common.read_file import replace_templated_functions class CCATExecutionSQL: @@ -11,13 +12,13 @@ class CCATExecutionSQL: test_suite = "" run_date = "" test_run_id = "" + table_groups_id = "" max_query_chars = "" exception_message = "" # Test Set Parameters target_schema = "" target_table = "" - replace_qc_schema = "" dctTestParms: typing.ClassVar = {} def __init__(self, strProjectCode, strTestSuiteId, strTestSuite, strSQLFlavor, max_query_chars, minutes_offset=0): @@ -38,9 +39,8 @@ def _ReplaceParms(self, strInputString): strInputString = strInputString.replace("{PROJECT_CODE}", self.project_code) strInputString = strInputString.replace("{TEST_SUITE}", self.test_suite) strInputString = strInputString.replace("{TEST_SUITE_ID}", self.test_suite_id) - # NOTE: REPLACE_QC_SCHEMA is parm replaced to run build query: sets the actual value to replace. - # DATA_QC_SCHEMA is parm in cat_test_conditions that build query replaces via SQL. - strInputString = strInputString.replace("{REPLACE_QC_SCHEMA}", self.replace_qc_schema) + strInputString = strInputString.replace("{TABLE_GROUPS_ID}", self.table_groups_id) + strInputString = strInputString.replace("{SQL_FLAVOR}", self.flavor) strInputString = strInputString.replace("{CONCAT_OPERATOR}", self.concat_operator) @@ -60,6 +60,9 @@ def _ReplaceParms(self, strInputString): strInputString = strInputString.replace("{RUN_DATE}", self.run_date) + if "{{DKFN_" in strInputString: + strInputString = replace_templated_functions(strInputString, self.flavor) + # Adding escape character where ':' is referenced strInputString = strInputString.replace(":", "\\:") @@ -95,3 +98,12 @@ def FinalizeTestResultsSQL(self): def PushTestRunStatusUpdateSQL(self): strQ = self._ReplaceParms(read_template_sql_file("ex_update_test_record_in_testrun_table.sql", "execution")) return strQ + + def FinalizeTestSuiteUpdateSQL(self): + strQ = self._ReplaceParms(read_template_sql_file("ex_update_test_suite.sql", "execution")) + return strQ + + + def TestScoringRollupSQL(self): + strQ = self._ReplaceParms(read_template_sql_file("test_scoring_rollup.sql", "execution")) + return strQ diff --git a/testgen/commands/queries/profiling_query.py b/testgen/commands/queries/profiling_query.py index db5ff1e..4a71df3 100644 --- a/testgen/commands/queries/profiling_query.py +++ b/testgen/commands/queries/profiling_query.py @@ -1,6 +1,7 @@ import typing from testgen.common import date_service, read_template_sql_file, read_template_yaml_file +from testgen.common.read_file import replace_templated_functions class CProfilingSQL: @@ -13,7 +14,6 @@ class CProfilingSQL: table_groups_id = "" flavor = "" run_date = "" - data_qc_schema = "" data_schema = "" data_table = "" @@ -74,7 +74,6 @@ def ReplaceParms(self, strInputString): strInputString = strInputString.replace("{TABLE_GROUPS_ID}", self.table_groups_id) strInputString = strInputString.replace("{RUN_DATE}", self.run_date) strInputString = strInputString.replace("{DATA_SCHEMA}", 
self.data_schema) - strInputString = strInputString.replace("{DATA_QC_SCHEMA}", self.data_qc_schema) strInputString = strInputString.replace("{DATA_TABLE}", self.data_table) strInputString = strInputString.replace("{COL_NAME}", self.col_name) strInputString = strInputString.replace("{COL_NAME_SANITIZED}", self.col_name.replace("'", "''")) @@ -98,6 +97,8 @@ def ReplaceParms(self, strInputString): strInputString = strInputString.replace("{CONTINGENCY_COLUMNS}", self.contingency_columns) strInputString = strInputString.replace("{CONTINGENCY_MAX_VALUES}", self.contingency_max_values) strInputString = strInputString.replace("{PROCESS_ID}", str(self.process_id)) + if "{{DKFN_" in strInputString: + strInputString = replace_templated_functions(strInputString, self.flavor) return strInputString @@ -141,11 +142,16 @@ def GetPIIFlagUpdateQuery(self): strQ = self.ReplaceParms(read_template_sql_file("pii_flag.sql", sub_directory="profiling")) return strQ - def GetAnomalyRefreshQuery(self): + def GetAnomalyStatsRefreshQuery(self): # Runs on DK Postgres Server strQ = self.ReplaceParms(read_template_sql_file("refresh_anomalies.sql", sub_directory="profiling")) return strQ + def GetAnomalyScoringRollupQuery(self): + # Runs on DK Postgres Server + strQ = self.ReplaceParms(read_template_sql_file("profile_anomaly_scoring_rollup.sql", sub_directory="profiling")) + return strQ + def GetAnomalyTestTypesQuery(self): # Runs on DK Postgres Server strQ = self.ReplaceParms(read_template_sql_file("profile_anomaly_types_get.sql", sub_directory="profiling")) @@ -175,6 +181,16 @@ def GetAnomalyTestQuery(self, dct_test_type): return strQ + def GetAnomalyScoringQuery(self, dct_test_type): + # Runs on DK Postgres Server + strQ = read_template_sql_file("profile_anomaly_scoring.sql", sub_directory="profiling") + if strQ: + strQ = strQ.replace("{PROFILE_RUN_ID}", self.profile_run_id) + strQ = strQ.replace("{ANOMALY_ID}", dct_test_type["id"]) + strQ = strQ.replace("{PREV_FORMULA}", dct_test_type["dq_score_prevalence_formula"]) + strQ = strQ.replace("{RISK}", dct_test_type["dq_score_risk_factor"]) + return strQ + def GetDataCharsRefreshQuery(self): # Runs on DK Postgres Server strQ = self.ReplaceParms( @@ -227,16 +243,6 @@ def _get_mask_query(self, mask, is_include): sub_query += ")" return sub_query - def GetFunctionCreatorQuery(self): - # Runs on Project DB - strQ = self.ReplaceParms( - read_template_sql_file( - f"project_function_creator_{self.flavor}.sql", - sub_directory=f"flavors/{self.flavor}/setup_profiling_tools", - ) - ) - return strQ - def GetProfilingQuery(self): # Runs on Project DB if not self.dctSnippetTemplate: diff --git a/testgen/commands/run_execute_cat_tests.py b/testgen/commands/run_execute_cat_tests.py index 9ca8de5..d126189 100644 --- a/testgen/commands/run_execute_cat_tests.py +++ b/testgen/commands/run_execute_cat_tests.py @@ -61,7 +61,10 @@ def ParseCATResults(clsCATExecute): def FinalizeTestRun(clsCATExecute): - lstQueries = [clsCATExecute.FinalizeTestResultsSQL(), clsCATExecute.PushTestRunStatusUpdateSQL()] + lstQueries = [clsCATExecute.FinalizeTestResultsSQL(), + clsCATExecute.PushTestRunStatusUpdateSQL(), + clsCATExecute.FinalizeTestSuiteUpdateSQL(), + clsCATExecute.TestScoringRollupSQL()] RunActionQueryList(("DKTG"), lstQueries) @@ -80,6 +83,7 @@ def run_cat_test_queries( ) clsCATExecute.test_run_id = strTestRunID clsCATExecute.run_date = strTestTime + clsCATExecute.table_groups_id = dctParms["table_groups_id"] clsCATExecute.exception_message += error_msg # Set Project Connection Params in 
common.db_bridgers from retrieved params @@ -119,7 +123,6 @@ def run_cat_test_queries( for dctTable in lstTables: clsCATExecute.target_schema = dctTable["schema_name"] clsCATExecute.target_table = dctTable["table_name"] - clsCATExecute.replace_qc_schema = dctTable["replace_qc_schema"] AggregateTableTests(clsCATExecute) LOG.info("CurrentStep: Retrieving CAT Tests to Run") diff --git a/testgen/commands/run_launch_db_config.py b/testgen/commands/run_launch_db_config.py index e6ab186..fed8176 100644 --- a/testgen/commands/run_launch_db_config.py +++ b/testgen/commands/run_launch_db_config.py @@ -47,7 +47,6 @@ def _get_params_mapping() -> dict: "PROFILING_SAMPLE_MIN_COUNT": "", "PROFILING_DELAY_DAYS": "", "CONNECTION_NAME": settings.PROJECT_CONNECTION_NAME, - "PROJECT_QC_SCHEMA": settings.PROJECT_QC_SCHEMA, "TABLE_GROUPS_NAME": settings.DEFAULT_TABLE_GROUPS_NAME, "TEST_SUITE": settings.DEFAULT_TEST_SUITE_KEY, "TEST_SUITE_DESCRIPTION": settings.DEFAULT_TEST_SUITE_DESCRIPTION, diff --git a/testgen/commands/run_profiling_bridge.py b/testgen/commands/run_profiling_bridge.py index c141c76..68654b6 100644 --- a/testgen/commands/run_profiling_bridge.py +++ b/testgen/commands/run_profiling_bridge.py @@ -29,10 +29,8 @@ def InitializeProfilingSQL(strProject, strSQLFlavor): return CProfilingSQL(strProject, strSQLFlavor) -def CompileAnomalyTestQueries(clsProfiling): - str_query = clsProfiling.GetAnomalyTestTypesQuery() - lst_tests = RetrieveDBResultsToDictList("DKTG", str_query) - +def CompileAnomalyTestQueries(clsProfiling, lst_tests): + # Get queries for each test lst_queries = [] for dct_test_type in lst_tests: str_query = clsProfiling.GetAnomalyTestQuery(dct_test_type) @@ -42,6 +40,18 @@ def CompileAnomalyTestQueries(clsProfiling): return lst_queries +def CompileAnomalyScoringQueries(clsProfiling, lst_tests): + # Get queries for each test + lst_queries = [] + for dct_test_type in lst_tests: + if dct_test_type["dq_score_prevalence_formula"]: + str_query = clsProfiling.GetAnomalyScoringQuery(dct_test_type) + if str_query: + lst_queries.append(str_query) + + return lst_queries + + def save_contingency_rules(df_merged, threshold_ratio): # Prep rows to save lst_rules = [] @@ -278,7 +288,6 @@ def run_profiling_queries(strTableGroupsID, spinner=None): clsProfiling.parm_do_patterns = "Y" clsProfiling.parm_max_pattern_length = 25 clsProfiling.profile_run_id = strProfileRunID - clsProfiling.data_qc_schema = dctParms["project_qc_schema"] clsProfiling.data_schema = dctParms["table_group_schema"] clsProfiling.parm_table_set = dctParms["profiling_table_set"] clsProfiling.parm_table_include_mask = dctParms["profiling_include_mask"] @@ -434,6 +443,7 @@ def run_profiling_queries(strTableGroupsID, spinner=None): LOG.info("CurrentStep: Generating profiling update queries") lstQueries = [] + lstAnomalyTypes = [] if lstUpdates: # Run single update query, then delete from staging @@ -451,9 +461,14 @@ def run_profiling_queries(strTableGroupsID, spinner=None): lstQueries.append(strQuery) strQuery = clsProfiling.GetPIIFlagUpdateQuery() lstQueries.append(strQuery) - lstQueries.extend(CompileAnomalyTestQueries(clsProfiling)) - strQuery = clsProfiling.GetAnomalyRefreshQuery() + + strQuery = clsProfiling.GetAnomalyTestTypesQuery() + lstAnomalyTypes = RetrieveDBResultsToDictList("DKTG", strQuery) + lstQueries.extend(CompileAnomalyTestQueries(clsProfiling, lstAnomalyTypes)) + lstQueries.extend(CompileAnomalyScoringQueries(clsProfiling, lstAnomalyTypes)) + strQuery = clsProfiling.GetAnomalyStatsRefreshQuery() 
lstQueries.append(strQuery) + # Always runs last strQuery = clsProfiling.GetDataCharsRefreshQuery() lstQueries.append(strQuery) @@ -475,6 +490,7 @@ def run_profiling_queries(strTableGroupsID, spinner=None): finally: LOG.info("Updating the profiling run record") lstProfileRunQuery = [clsProfiling.GetProfileRunInfoRecordUpdateQuery()] + lstProfileRunQuery.append(clsProfiling.GetAnomalyScoringRollupQuery()) RunActionQueryList("DKTG", lstProfileRunQuery) if booErrors: str_error_status = "with errors. Check log for details." diff --git a/testgen/commands/run_quick_start.py b/testgen/commands/run_quick_start.py index 67a22b5..487c47d 100644 --- a/testgen/commands/run_quick_start.py +++ b/testgen/commands/run_quick_start.py @@ -5,7 +5,6 @@ from testgen import settings from testgen.commands.run_get_entities import run_table_group_list from testgen.commands.run_launch_db_config import run_launch_db_config -from testgen.commands.run_setup_profiling_tools import run_setup_profiling_tools from testgen.common.database.database_service import ( AssignConnectParms, CreateDatabaseIfNotExists, @@ -140,16 +139,6 @@ def run_quick_start(delete_target_db: bool) -> None: rows, _ = run_table_group_list(project_key) connection_id = str(rows[0][2]) - # run qc - command = "testgen setup-target-db-functions --connection-id --create-qc-schema --yes" - click.echo(f"Running CLI command: {command}") - create_qc_schema = True - db_user = params_mapping["TESTGEN_ADMIN_USER"] - db_password = params_mapping["TESTGEN_ADMIN_PASSWORD"] - dry_run = False - project_qc_schema = run_setup_profiling_tools(connection_id, dry_run, create_qc_schema, db_user, db_password) - click.echo(f"Schema {project_qc_schema} has been created in the target db") - def run_quick_start_increment(iteration): params_mapping = _get_params_mapping(iteration) diff --git a/testgen/commands/run_setup_profiling_tools.py b/testgen/commands/run_setup_profiling_tools.py deleted file mode 100644 index c2d42f3..0000000 --- a/testgen/commands/run_setup_profiling_tools.py +++ /dev/null @@ -1,96 +0,0 @@ -import logging - -from testgen.commands.run_get_entities import run_get_connection -from testgen.common import AssignConnectParms, RunActionQueryList -from testgen.common.database.database_service import get_queries_for_command - -LOG = logging.getLogger("testgen") - - -def _get_params_mapping(project_qc_schema: str, user: str, user_role: str | None) -> dict: - return { - "DATA_QC_SCHEMA": project_qc_schema, - "DB_USER": user, - "DB_USER_ROLE": user_role, - } - - -def get_setup_profiling_tools_queries(sql_flavor, create_qc_schema, skip_granting_privileges, project_qc_schema, user, user_role=None): - queries = [] - - params_mapping = _get_params_mapping(project_qc_schema, user, user_role) - - if create_qc_schema: - queries.extend( - get_queries_for_command( - f"flavors/{sql_flavor}/setup_profiling_tools", - params_mapping, - mask=rf"^.*create_qc_schema_{sql_flavor}.sql$", - ) - ) - - queries.extend( - get_queries_for_command( - f"flavors/{sql_flavor}/setup_profiling_tools", params_mapping, mask=rf"^.*functions_{sql_flavor}.sql$" - ) - ) - - if not skip_granting_privileges: - queries.extend( - get_queries_for_command( - f"flavors/{sql_flavor}/setup_profiling_tools", - params_mapping, - mask=rf"^.*grant_execute_privileges_{sql_flavor}.sql$", - ) - ) - - return queries - - -def run_setup_profiling_tools( - connection_id: str | int, - dry_run: bool, - create_qc_schema: bool = True, - db_user: str | None = None, - db_password: str | None = None, - skip_granting_privileges: 
bool = False, - admin_private_key_passphrase: str | None = None, - admin_private_key: str | None = None, - user_role: str | None = None, -) -> str: - connection = run_get_connection(str(connection_id)) - - # Set Project Connection Parms in common.db_bridgers from retrieved parms - LOG.info("CurrentStep: Assigning Connection Parms") - user = db_user or connection["project_user"] - connect_by_key = admin_private_key is not None or connection["connect_by_key"] - private_key_passphrase = admin_private_key_passphrase if admin_private_key is not None else connection["private_key_passphrase"] - private_key = admin_private_key if admin_private_key is not None else connection["private_key"] - - AssignConnectParms( - connection["project_key"], - connection["connection_id"], - connection["project_host"], - connection["project_port"], - connection["project_db"], - connection["project_qc_schema"], - user, - connection["sql_flavor"], - connection["url"], - connection["connect_by_url"], - connect_by_key, - private_key, - private_key_passphrase, - "PROJECT", - ) - - project_qc_schema = connection["project_qc_schema"] - sql_flavor = connection["sql_flavor"] - user = connection["project_user"] - - queries = get_setup_profiling_tools_queries(sql_flavor, create_qc_schema, skip_granting_privileges, project_qc_schema, user, user_role) - - if not dry_run: - RunActionQueryList("PROJECT", queries, user_override=db_user, pwd_override=db_password) - - return project_qc_schema diff --git a/testgen/commands/run_test_parameter_validation.py b/testgen/commands/run_test_parameter_validation.py index 8e93148..f93ac32 100644 --- a/testgen/commands/run_test_parameter_validation.py +++ b/testgen/commands/run_test_parameter_validation.py @@ -65,8 +65,8 @@ def run_parameter_validation_queries( strSchemas = ", ".join([f"'{value}'" for value in setSchemas]) LOG.debug("Test column list successfully retrieved") - # Retrieve Project Column list - LOG.info("CurrentStep: Retrieve Test Columns for Validation") + # Retrieve Current Project Column list + LOG.info("CurrentStep: Retrieve Current Columns for Validation") clsExecute.test_schemas = strSchemas strProjectColumnList = clsExecute.GetProjectTestValidationColumns() if "where table_schema in ()" in strProjectColumnList: @@ -74,9 +74,9 @@ def run_parameter_validation_queries( lstProjectTestColumns = RetrieveDBResultsToDictList("PROJECT", strProjectColumnList) if len(lstProjectTestColumns) == 0: - LOG.info("Project Test Column list is empty") + LOG.info("Current Test Column list is empty") - LOG.debug("Project column list successfully received") + LOG.debug("Current column list successfully received") LOG.info("CurrentStep: Compare column sets") # load results into sets result_set1 = {col.lower() for col, _ in test_columns} @@ -86,7 +86,7 @@ def run_parameter_validation_queries( missing_columns = result_set1.difference(result_set2) if len(missing_columns) == 0: - LOG.info("No missing column in Project Column list.") + LOG.info("No missing column in Current Column list.") if missing_columns: LOG.debug("Test Columns are missing in target database: %s", ", ".join(missing_columns)) @@ -143,7 +143,7 @@ def run_parameter_validation_queries( # when run_parameter_validation_queries() is called from execute_tests_query.py: # we disable tests and write validation errors to test_results table. 
    if booRunFromTestExec:
-        # Copy test results to DK DB, using temporary flagged -1 value to identify
+        # Copy test results to DK DB, using temporary flagged D value to identify
         LOG.info("CurrentStep: Saving error results for invalid tests")
         strReportValErrors = clsExecute.ReportTestValidationErrors()
         RunActionQueryList("DKTG", [strReportValErrors])
diff --git a/testgen/common/get_pipeline_parms.py b/testgen/common/get_pipeline_parms.py
index c3f81d6..af673ca 100644
--- a/testgen/common/get_pipeline_parms.py
+++ b/testgen/common/get_pipeline_parms.py
@@ -20,7 +20,6 @@ def RetrieveProfilingParms(strTableGroupsID):
             or lstParms[0]["profile_use_sampling"] == ""
             or lstParms[0]["profile_sample_percent"] == ""
             or lstParms[0]["profile_sample_min_count"] == ""
-            or lstParms[0]["project_qc_schema"] == ""
             or lstParms[0]["table_group_schema"] == ""
         ):
             raise ValueError("Project Connection parameters not correctly set")
diff --git a/testgen/common/read_file.py b/testgen/common/read_file.py
index dda3ff8..41b5bbb 100644
--- a/testgen/common/read_file.py
+++ b/testgen/common/read_file.py
@@ -7,6 +7,7 @@ from importlib.abc import Traversable
 from importlib.resources import as_file, files
 
+import regex
 import yaml
 
 LOG = logging.getLogger("testgen")
@@ -67,3 +68,38 @@ def read_template_yaml_file(template_file_name: str, sub_directory: str | None =
         raise ValueError(f"{template_file_name}: File is empty")
 
     return template
+
+
+@cache
+def read_template_yaml_function(function_name: str, db_flavour: str) -> str:
+    yaml_functions = read_template_yaml_file(
+        "templated_functions.yaml",
+        sub_directory=f"flavors/{db_flavour}/profiling",
+    )
+    template = yaml_functions[function_name]
+    template = re.sub(r"/\*.*?\*/", "", template, flags=re.DOTALL)
+    template = re.sub(r"\s\s*", " ", template)
+    return template
+
+
+def replace_templated_functions(query: str, db_flavour: str) -> str:
+    # see regexr.com/872jv for regex explanation
+    # Regex package is needed due to variable number of capture groups ('re' package only returns last)
+    # Use double curly braces for the function call in sql {{ }}
+    # Separate function arguments with a double semicolon ;;
+    # Arguments in the template yaml take the form {$} like {$1}
+    # Space is required after the closing braces
+    # e.g. "{{DKFN_ISNUM;;{COLUMN_NAME}}} "
+    # Function template replacement is the last step of templating, so other templated parameters cannot be used inside it.
+    # If needed, those must be arguments to the templated function.
+    # i.e. OK TO DO sql: "{{DKFN_FOO;;{COLUMN_NAME}}}" and yaml: "FOO: foo({$1})"
+    # NOT OK TO DO sql: "{{DKFN_FOO}}" and yaml: "FOO: foo({"COLUMN_NAME"})"
+    while match := regex.search(r"{{DKFN_([\w\d]+)(?:;;(.+?))*}}(\s)", query):
+        function_name = match.captures(1)[0]
+        function_arguments = match.captures(2)
+        function_template = read_template_yaml_function(function_name, db_flavour)
+        function_template = function_template + match.captures(3)[0]
+        for index, function_arg in enumerate(function_arguments, start=1):
+            function_template = function_template.replace(f"{{${index}}}", function_arg)
+        query = query.replace(match.captures(0)[0], function_template)
+    return query
diff --git a/testgen/common/version_service.py b/testgen/common/version_service.py
index c2317b1..8e03cb1 100644
--- a/testgen/common/version_service.py
+++ b/testgen/common/version_service.py
@@ -54,7 +54,7 @@ def _get_last_docker_release() -> str:
     )
 
     if response.status_code != 200:
-        LOG.warning(f"version_service: Failed to fetch docker tags. 
Status code: {response.status_code}") + LOG.debug(f"version_service: Failed to fetch docker tags. Status code: {response.status_code}") return "unknown" tags_to_return = [] diff --git a/testgen/settings.py b/testgen/settings.py index 595e402..2a708af 100644 --- a/testgen/settings.py +++ b/testgen/settings.py @@ -214,14 +214,6 @@ defaults to: `5000` """ -PROJECT_QC_SCHEMA: str = os.getenv("PROJECT_QC_SCHEMA", "qc") -""" -Name of the schema to be created in the project database. - -from env variable: `PROJECT_QC_SCHEMA` -defaults to: `qc` -""" - PROJECT_DATABASE_NAME: str = os.getenv("PROJECT_DATABASE_NAME", "demo_db") """ Name of the database the auto generated project will run test diff --git a/testgen/template/dbsetup/020_create_standard_functions_sprocs.sql b/testgen/template/dbsetup/020_create_standard_functions_sprocs.sql index f21925e..c0bad4d 100644 --- a/testgen/template/dbsetup/020_create_standard_functions_sprocs.sql +++ b/testgen/template/dbsetup/020_create_standard_functions_sprocs.sql @@ -177,3 +177,86 @@ FROM ( ) AS t WHERE trim(value) <> '' $$ LANGUAGE sql; + + +CREATE OR REPLACE FUNCTION {SCHEMA_NAME}.fn_normal_cdf(z_score DOUBLE PRECISION) +RETURNS DOUBLE PRECISION AS +$$ +/* + This function calculates the cumulative distribution function (CDF) + for the standard normal distribution for a given Z-score using + the Abramowitz and Stegun approximation method. It returns the + probability that a standard normal variable is less than or equal + to the given Z-score. + + The approximation formula uses a series expansion to estimate the + CDF, which is accurate for most practical purposes. + + To estimate the count of observations that fall outside a certain Z-score + (both above and below), you can use the `normal_cdf()` function. For a + total number of observations N, the proportion of values outside the Z-score + is given by: 2 * (1 - normal_cdf(ABS(Z))) + + This gives the proportion of values greater than the positive Z-score and + less than the negative Z-score combined. 
To get the estimated count of + observations, multiply this proportion by N: N * 2 * (1 - normal_cdf(ABS(Z))) +*/ +DECLARE + t DOUBLE PRECISION; + cdf DOUBLE PRECISION; +BEGIN + t := 1.0 / (1.0 + 0.2316419 * ABS(z_score)); + + cdf := (1.0 / SQRT(2 * PI())) * EXP(-0.5 * z_score * z_score) * + (0.319381530 * t + - 0.356563782 * t * t + + 1.781477937 * t * t * t + - 1.821255978 * t * t * t * t + + 1.330274429 * t * t * t * t * t); + + IF z_score >= 0 THEN + RETURN 1.0 - cdf; + ELSE + RETURN cdf; + END IF; +END; +$$ LANGUAGE plpgsql; + + +CREATE OR REPLACE FUNCTION {SCHEMA_NAME}.fn_eval(expression TEXT) RETURNS FLOAT +AS +$$ +DECLARE + result FLOAT; + invalid_parts TEXT; +BEGIN + -- Check the modified expression for invalid characters, allowing colons + IF expression ~* E'[^0-9+\\-*/(),.\\sA-Z_:e\\\'"]' THEN + RAISE EXCEPTION 'Invalid characters detected in expression: %', expression; + END IF; + + -- Check for dangerous PostgreSQL-specific keywords + IF expression ~* E'\b(DROP|ALTER|INSERT|UPDATE|DELETE|TRUNCATE|GRANT|REVOKE|COPY|EXECUTE|CREATE|COMMENT|SECURITY|WITH|SET ROLE|SET SESSION|DO|CALL|--|/\\*|;|pg_read_file|pg_write_file|pg_terminate_backend)\b' THEN + RAISE EXCEPTION 'Invalid expression: dangerous statement detected'; + END IF; + + -- Remove all allowed tokens from the validation expression, treating 'FLOAT' as a keyword + invalid_parts := regexp_replace( + expression, + E'(\\mGREATEST|LEAST|ABS|FN_NORMAL_CDF|DATEDIFF|DAY|FLOAT)\\M|[0-9]+(\\.[0-9]+)?([eE][+-]?[0-9]+)?|[+\\-*/(),\\\'":]+|\\s+', + '', + 'gi' + ); + + -- If anything is left in the validation expression, it's invalid + IF invalid_parts <> '' THEN + RAISE EXCEPTION 'Invalid expression contains invalid tokens "%" in expression: %', invalid_parts, expression; + END IF; + + -- Use the original expression (with ::FLOAT) for execution + EXECUTE format('SELECT (%s)::FLOAT', expression) INTO result; + + RETURN result; +END; +$$ +LANGUAGE plpgsql; diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql index 8c14348..7c4b08f 100644 --- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql +++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql @@ -30,13 +30,13 @@ CREATE TABLE stg_functional_table_updates ( ); CREATE TABLE projects ( - id UUID DEFAULT gen_random_uuid(), - project_code VARCHAR(30) NOT NULL + id UUID DEFAULT gen_random_uuid(), + project_code VARCHAR(30) NOT NULL CONSTRAINT projects_project_code_pk PRIMARY KEY, - project_name VARCHAR(50), - effective_from_date DATE, - effective_thru_date DATE, + project_name VARCHAR(50), + effective_from_date DATE, + effective_thru_date DATE, observability_api_key TEXT, observability_api_url TEXT DEFAULT '' ); @@ -55,7 +55,6 @@ CREATE TABLE connections ( project_user VARCHAR(50), project_db VARCHAR(100), connection_name VARCHAR(40), - project_qc_schema VARCHAR(200), project_pw_encrypted BYTEA, max_threads INTEGER DEFAULT 4, max_query_chars INTEGER, @@ -94,26 +93,32 @@ CREATE TABLE table_groups source_process VARCHAR(40), business_domain VARCHAR(40), stakeholder_group VARCHAR(40), - transform_level VARCHAR(40) + transform_level VARCHAR(40), + last_complete_profile_run_id UUID, + dq_score_profiling FLOAT, + dq_score_testing FLOAT ); CREATE TABLE profiling_runs ( - id UUID + id UUID CONSTRAINT pk_prun_id PRIMARY KEY, - project_code VARCHAR(30) NOT NULL, - connection_id BIGINT NOT NULL, - table_groups_id UUID NOT NULL, - profiling_starttime TIMESTAMP, - profiling_endtime 
TIMESTAMP, - status VARCHAR(100) DEFAULT 'Running', - log_message VARCHAR, - table_ct BIGINT, - column_ct BIGINT, - anomaly_ct BIGINT, - anomaly_table_ct BIGINT, - anomaly_column_ct BIGINT, - process_id INTEGER + project_code VARCHAR(30) NOT NULL, + connection_id BIGINT NOT NULL, + table_groups_id UUID NOT NULL, + profiling_starttime TIMESTAMP, + profiling_endtime TIMESTAMP, + status VARCHAR(100) DEFAULT 'Running', + log_message VARCHAR, + table_ct BIGINT, + column_ct BIGINT, + anomaly_ct BIGINT, + anomaly_table_ct BIGINT, + anomaly_column_ct BIGINT, + dq_affected_data_points BIGINT, + dq_total_data_points BIGINT, + dq_score_profiling FLOAT, + process_id INTEGER ); CREATE TABLE test_suites ( @@ -128,16 +133,12 @@ CREATE TABLE test_suites ( test_action VARCHAR(100), severity VARCHAR(10), export_to_observability VARCHAR(5) DEFAULT 'Y', --- email_list VARCHAR(200), --- email_slack VARCHAR(100), --- wiki_link VARCHAR(200), --- variation_link VARCHAR(200), --- wiki_page_id BIGINT, --- confluence_space VARCHAR(10), test_suite_schema VARCHAR(100), component_key VARCHAR(100), component_type VARCHAR(100), component_name VARCHAR(100), + last_complete_test_run_id UUID, + dq_score_exclude BOOLEAN default FALSE, CONSTRAINT test_suites_id_pk PRIMARY KEY (id) ); @@ -230,6 +231,10 @@ CREATE TABLE profile_results ( filled_value_ct BIGINT, min_text VARCHAR(1000), max_text VARCHAR(1000), + upper_case_ct BIGINT, + lower_case_ct BIGINT, + non_alpha_ct BIGINT, + mixed_case_ct BIGINT GENERATED ALWAYS AS ( value_ct - upper_case_ct - lower_case_ct - non_alpha_ct ) STORED, numeric_ct BIGINT, date_ct BIGINT, top_patterns VARCHAR(1000), @@ -249,9 +254,11 @@ CREATE TABLE profile_results ( before_1yr_date_ct BIGINT, before_5yr_date_ct BIGINT, before_20yr_date_ct BIGINT, + before_100yr_date_ct BIGINT, within_1yr_date_ct BIGINT, within_1mo_date_ct BIGINT, future_date_ct BIGINT, + distant_future_date_ct BIGINT, date_days_present BIGINT, date_weeks_present BIGINT, date_months_present BIGINT, @@ -275,13 +282,15 @@ CREATE TABLE profile_anomaly_types ( CONSTRAINT pk_anomaly_types_id PRIMARY KEY, anomaly_type VARCHAR(200) NOT NULL, - data_object VARCHAR(10), -- Table, Dates, Column + data_object VARCHAR(10), -- Column, Multi-Col, Dates, Variant anomaly_name VARCHAR(100), anomaly_description VARCHAR(500), anomaly_criteria VARCHAR(2000), detail_expression VARCHAR(2000), issue_likelihood VARCHAR(50), -- Potential, Likely, Certain - suggested_action VARCHAR(1000) -- Consider, Investigate, Correct + suggested_action VARCHAR(1000), + dq_score_prevalence_formula TEXT, + dq_score_risk_factor TEXT ); CREATE TABLE profile_anomaly_results ( @@ -298,7 +307,8 @@ CREATE TABLE profile_anomaly_results ( column_type VARCHAR(50), anomaly_id VARCHAR(10), detail VARCHAR, - disposition VARCHAR(20) -- Confirmed, Dismissed, Inactive + disposition VARCHAR(20), -- Confirmed, Dismissed, Inactive + dq_prevalence FLOAT ); @@ -350,7 +360,10 @@ CREATE TABLE data_table_chars ( drop_date TIMESTAMP, record_ct BIGINT, column_ct BIGINT, - data_point_ct BIGINT + data_point_ct BIGINT, + last_complete_profile_run_id UUID, + dq_score_profiling FLOAT, + dq_score_testing FLOAT ); CREATE TABLE data_column_chars ( @@ -384,7 +397,10 @@ CREATE TABLE data_column_chars ( fails_30_days_prior INTEGER, warnings_last_run INTEGER, warnings_7_days_prior INTEGER, - warnings_30_days_prior INTEGER + warnings_30_days_prior INTEGER, + last_complete_profile_run_id UUID, + dq_score_profiling FLOAT, + dq_score_testing FLOAT ); CREATE TABLE test_types ( @@ -399,6 +415,8 @@ CREATE TABLE 
test_types ( measure_uom VARCHAR(100), measure_uom_description VARCHAR(200), selection_criteria TEXT, + dq_score_prevalence_formula TEXT, + dq_score_risk_factor TEXT, column_name_prompt TEXT, column_name_help TEXT, default_parm_columns TEXT, @@ -434,25 +452,28 @@ CREATE TABLE generation_sets ( ); CREATE TABLE test_runs ( - id UUID NOT NULL + id UUID NOT NULL CONSTRAINT test_runs_id_pk PRIMARY KEY, - test_suite_id UUID NOT NULL, - test_starttime TIMESTAMP, - test_endtime TIMESTAMP, - status VARCHAR(100) DEFAULT 'Running', - log_message TEXT, - duration VARCHAR(50), - test_ct INTEGER, - passed_ct INTEGER, - failed_ct INTEGER, - warning_ct INTEGER, - error_ct INTEGER, - table_ct INTEGER, - column_ct INTEGER, - column_failed_ct INTEGER, - column_warning_ct INTEGER, - process_id INTEGER, + test_suite_id UUID NOT NULL, + test_starttime TIMESTAMP, + test_endtime TIMESTAMP, + status VARCHAR(100) DEFAULT 'Running', + log_message TEXT, + duration VARCHAR(50), + test_ct INTEGER, + passed_ct INTEGER, + failed_ct INTEGER, + warning_ct INTEGER, + error_ct INTEGER, + table_ct INTEGER, + column_ct INTEGER, + column_failed_ct INTEGER, + column_warning_ct INTEGER, + dq_affected_data_points BIGINT, + dq_total_data_points BIGINT, + dq_score_test_run FLOAT, + process_id INTEGER, CONSTRAINT test_runs_test_suites_fk FOREIGN KEY (test_suite_id) REFERENCES test_suites ); @@ -488,6 +509,8 @@ CREATE TABLE test_results ( test_description VARCHAR(1000), test_run_id UUID NOT NULL, table_groups_id UUID, + dq_prevalence FLOAT, + dq_record_ct BIGINT, observability_status VARCHAR(10), CONSTRAINT test_results_test_suites_project_code_test_suite_fk FOREIGN KEY (test_suite_id) REFERENCES test_suites diff --git a/testgen/template/dbsetup/040_populate_new_schema_project.sql b/testgen/template/dbsetup/040_populate_new_schema_project.sql index d71e944..8ac7fdc 100644 --- a/testgen/template/dbsetup/040_populate_new_schema_project.sql +++ b/testgen/template/dbsetup/040_populate_new_schema_project.sql @@ -10,7 +10,7 @@ SELECT '{PROJECT_CODE}' as project_code, INSERT INTO connections (project_code, sql_flavor, - project_host, project_port, project_user, project_db, project_qc_schema, + project_host, project_port, project_user, project_db, connection_name, project_pw_encrypted, max_threads, max_query_chars) SELECT '{PROJECT_CODE}' as project_code, '{SQL_FLAVOR}' as sql_flavor, @@ -18,7 +18,6 @@ SELECT '{PROJECT_CODE}' as project_code, '{PROJECT_PORT}' as project_port, '{PROJECT_USER}' as project_user, '{PROJECT_DB}' as project_db, - '{PROJECT_QC_SCHEMA}' as project_qc_schema, '{CONNECTION_NAME}' as connection_name, '{PROJECT_PW_ENCRYPTED}' as project_pw_encrypted, '{MAX_THREADS}'::INTEGER as max_threads, diff --git a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql index a9643cf..3032a8e 100644 --- a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql +++ b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql @@ -12,15 +12,16 @@ ALTER TABLE cat_test_conditions DROP CONSTRAINT cat_test_conditions_cat_tests_te TRUNCATE TABLE profile_anomaly_types; -INSERT INTO profile_anomaly_types (id, anomaly_type, data_object, anomaly_name, anomaly_description, anomaly_criteria, detail_expression, issue_likelihood, suggested_action) +INSERT INTO profile_anomaly_types + (id, anomaly_type, data_object, anomaly_name, anomaly_description, anomaly_criteria, detail_expression, issue_likelihood, suggested_action, dq_score_prevalence_formula, dq_score_risk_factor) 
VALUES ('1001', 'Suggested_Type', 'Column', 'Suggested Data Type', 'Data stored as text all meets criteria for a more suitable type. ', '(functional_data_type NOT IN (''Boolean'', ''Flag'') ) AND (column_type ILIKE ''%ch ar%'' OR column_type ILIKE ''text'') AND NOT (datatype_suggestion ILIKE ''%char%'' OR datatype_suggestion ILIKE ''text'')', 'p.datatype_suggestion::VARCHAR(200)', 'Likely', 'Consider changing the column data type to tighte -n controls over data ingested and to make values more efficient, consistent and suitable for downstream analysis.'), - ('1002', 'Non_Standard_Blanks', 'Column', 'Non-Standard Blank Values', 'Values representing missing data may be unexpected or inconsistent. Non-standard values may include empty strings as opposed to nulls, dummy entries such as "MISSING" or repeated characters that may have been used to bypass entry requirements, processing artifacts such as "NULL", or spreadsheet artifacts such as "NA", "ERROR".', 'p.filled_value_ct > 0 OR p.zero_length_ct > 0', '''Filled Values: '' || p.filled_value_ct::VARCHAR || '', Empty String: '' || p.zero_length_ct::VARCHAR || '', Null: '' || p.null_value_ct::VARCHAR || '', Records: '' || p.record_ct::VARCHAR', 'Definite', 'Consider cleansing the column upon ingestion to replace all variants of missing data with a standard designation, like Null.'), - ('1003', 'Invalid_Zip_USA', 'Column', 'Invalid USA Zip Code Format', 'Some values present do not conform with the expected format of USA Zip Codes.', 'p.std_pattern_match = ''ZIP_USA'' AND (p.general_type <> ''A'' OR p.filled_value_ct > 0 OR p.min_length >= 1 AND p.min_length <= 4 OR p.max_length > 10)', 'CASE WHEN p.general_type = ''N'' THEN ''Type: '' || p.column_type || '', '' ELSE '''' END || ''Min Length: '' || p.min_length::VARCHAR || '', Max Length: '' || p.max_length::VARCHAR || '', Filled Values: '' || p.filled_value_ct::VARCHAR', 'Definite', 'Consider correcting invalid column values or changing them to indicate a missing value if corrections cannot be made.'), - ('1004', 'Multiple_Types_Minor', 'Multi-Col', 'Multiple Data Types per Column Name - Minor', 'Columns with the same name have the same general type across tables, but the types do not exactly match. Truncation issues may result if columns are commingled and assumed to be the same format.', 'm.general_type_ct = 1 AND m.type_ct > 1', '''Found '' || m.column_ct::VARCHAR || '' columns, '' || m.type_ct::VARCHAR(10) || '' types, '' || m.min_type || '' to '' || m.max_type', 'Possible', 'Consider changing the column data types to be fully consistent. This will tighten your standards at ingestion and assure that data is consistent between tables.'), - ('1005', 'Multiple_Types_Major', 'Multi-Col', 'Multiple Data Types per Column Name - Major', 'Columns with the same name have broadly different types across tables. Differences could be significant enough to cause errors in downstream analysis, extra steps resulting in divergent business logic and inconsistencies in results.', 'm.general_type_ct > 1', '''Found '' || m.column_ct::VARCHAR || '' columns, '' || m.type_ct::VARCHAR(10) || '' types, '' || m.min_type || '' to '' || m.max_type', 'Likely', 'Ideally, you should change the column data types to be fully consistent. If the data is meant to be different, you should change column names so downstream users aren''t led astray.'), - ('1006', 'No_Values', 'Column', 'No Column Values Present', 'This column is present in the table, but no values have been ingested or assigned in any records. 
This could indicate missing data or a processing error. Note that this considers dummy values and zero-length values as missing data. ', '(p.null_value_ct + p.filled_value_ct + p.zero_length_ct) = p.record_ct', '''Null: '' || p.null_value_ct::VARCHAR(10) || '', Filled: '' || p.filled_value_ct::VARCHAR(10) || '', Zero Len: '' || p.zero_length_ct::VARCHAR(10)', 'Possible', 'Review your source data, ingestion process, and any processing steps that update this column.'), +n controls over data ingested and to make values more efficient, consistent and suitable for downstream analysis.', NULL, NULL), + ('1002', 'Non_Standard_Blanks', 'Column', 'Non-Standard Blank Values', 'Values representing missing data may be unexpected or inconsistent. Non-standard values may include empty strings as opposed to nulls, dummy entries such as "MISSING" or repeated characters that may have been used to bypass entry requirements, processing artifacts such as "NULL", or spreadsheet artifacts such as "NA", "ERROR".', '(p.filled_value_ct > 0 OR p.zero_length_ct > 0)', '''Filled Values: '' || p.filled_value_ct::VARCHAR || '', Empty String: '' || p.zero_length_ct::VARCHAR || '', Null: '' || p.null_value_ct::VARCHAR || '', Records: '' || p.record_ct::VARCHAR', 'Definite', 'Consider cleansing the column upon ingestion to replace all variants of missing data with a standard designation, like Null.', 'p.filled_value_ct::FLOAT/p.record_ct::FLOAT', '1.0'), + ('1003', 'Invalid_Zip_USA', 'Column', 'Invalid USA Zip Code Format', 'Some values present do not conform with the expected format of USA Zip Codes.', 'p.std_pattern_match = ''ZIP_USA'' AND (p.general_type <> ''A'' OR p.filled_value_ct > 0 OR p.min_length >= 1 AND p.min_length <= 4 OR p.max_length > 10)', 'CASE WHEN p.general_type = ''N'' THEN ''Type: '' || p.column_type || '', '' ELSE '''' END || ''Min Length: '' || p.min_length::VARCHAR || '', Max Length: '' || p.max_length::VARCHAR || '', Filled Values: '' || p.filled_value_ct::VARCHAR', 'Definite', 'Consider correcting invalid column values or changing them to indicate a missing value if corrections cannot be made.', NULL, '1.0'), + ('1004', 'Multiple_Types_Minor', 'Multi-Col', 'Multiple Data Types per Column Name - Minor', 'Columns with the same name have the same general type across tables, but the types do not exactly match. Truncation issues may result if columns are commingled and assumed to be the same format.', 'm.general_type_ct = 1 AND m.type_ct > 1', '''Found '' || m.column_ct::VARCHAR || '' columns, '' || m.type_ct::VARCHAR(10) || '' types, '' || m.min_type || '' to '' || m.max_type', 'Possible', 'Consider changing the column data types to be fully consistent. This will tighten your standards at ingestion and assure that data is consistent between tables.', NULL, NULL), + ('1005', 'Multiple_Types_Major', 'Multi-Col', 'Multiple Data Types per Column Name - Major', 'Columns with the same name have broadly different types across tables. Differences could be significant enough to cause errors in downstream analysis, extra steps resulting in divergent business logic and inconsistencies in results.', 'm.general_type_ct > 1', '''Found '' || m.column_ct::VARCHAR || '' columns, '' || m.type_ct::VARCHAR(10) || '' types, '' || m.min_type || '' to '' || m.max_type', 'Likely', 'Ideally, you should change the column data types to be fully consistent. 
If the data is meant to be different, you should change column names so downstream users aren''t led astray.', NULL, NULL), + ('1006', 'No_Values', 'Column', 'No Column Values Present', 'This column is present in the table, but no values have been ingested or assigned in any records. This could indicate missing data or a processing error. Note that this considers dummy values and zero-length values as missing data. ', '(p.null_value_ct + p.filled_value_ct + p.zero_length_ct) = p.record_ct', '''Null: '' || p.null_value_ct::VARCHAR(10) || '', Filled: '' || p.filled_value_ct::VARCHAR(10) || '', Zero Len: '' || p.zero_length_ct::VARCHAR(10)', 'Possible', 'Review your source data, ingestion process, and any processing steps that update this column.', '1.0', '0.33'), ('1007', 'Column_Pattern_Mismatch', 'Column', 'Pattern Inconsistency Within Column', 'Alpha-numeric string data within this column conforms to 2-4 different patterns, with 95% matching the first pattern. This could indicate data errors in the remaining values. ', 'p.general_type = ''A'' AND p.max_length > 3 AND p.value_ct > (p.numeric_ct + p.filled_value_ct) @@ -31,127 +32,121 @@ n controls over data ingested and to make values more efficient, consistent and AND SPLIT_PART(p.top_patterns, ''|'', 3)::NUMERIC / SPLIT_PART(p.top_patterns, ''|'', 1)::NUMERIC < 0.05) OR SPLIT_PART(p.top_patterns, ''|'', 3)::NUMERIC / SPLIT_PART(p.top_patterns, ''|'', 1)::NUMERIC < 0.1 - )', '''Patterns: '' || p.top_patterns', 'Likely', 'Review the values for any data that doesn''t conform to the most common pattern and correct any data errors.'), + )', '''Patterns: '' || p.top_patterns', 'Likely', 'Review the values for any data that doesn''t conform to the most common pattern and correct any data errors.', '(p.record_ct - SPLIT_PART(p.top_patterns, ''|'', 1)::INT)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66'), ('1008', 'Table_Pattern_Mismatch', 'Multi-Col', 'Pattern Inconsistency Across Tables', 'Alpha-numeric string data within this column matches a single pattern, but other columns with the same name have data that matches a different single pattern. Inconsistent formatting may contradict user assumptions and cause downstream errors, extra steps and inconsistent business logic.', 'p.general_type = ''A'' AND p.max_length > 3 AND p.value_ct > (p.numeric_ct + p.filled_value_ct) AND m.max_pattern_ct = 1 AND m.column_ct > 1 AND SPLIT_PART(p.top_patterns, ''|'', 2) <> SPLIT_PART(m.very_top_pattern, ''|'', 2) - AND SPLIT_PART(p.top_patterns, ''|'', 1)::NUMERIC / SPLIT_PART(m.very_top_pattern, ''|'', 1)::NUMERIC < 0.1', '''Patterns: '' || SPLIT_PART(p.top_patterns, ''|'', 2) || '', '' || SPLIT_PART(ltrim(m.very_top_pattern, ''0''), ''|'', 2)', 'Likely', 'Review the profiled patterns for the same column in other tables. You may want to add a hygiene step to your processing to make patterns consistent.'), - ('1009', 'Leading_Spaces', 'Column', 'Leading Spaces Found in Column Values', 'Spaces were found before data at the front of column string values. This likely contradicts user expectations and could be a sign of broader ingestion or processing errors.', 'p.lead_space_ct > 0', '''Cases Found: '' || p.lead_space_ct::VARCHAR(10)', 'Likely', 'Review your source data, ingestion process, and any processing steps that update this column.'), - ('1010', 'Quoted_Values', 'Column', 'Quoted Values Found in Column Values', 'Column values were found within quotes. 
This likely contradicts user expectations and could be a sign of broader ingestion or processing errors.', 'p.quoted_value_ct > 0', '''Cases Found: '' || p.quoted_value_ct::VARCHAR(10)', 'Likely', 'Review your source data, ingestion process, and any processing steps that update this column.'), + AND SPLIT_PART(p.top_patterns, ''|'', 1)::NUMERIC / SPLIT_PART(m.very_top_pattern, ''|'', 1)::NUMERIC < 0.1', '''Patterns: '' || SPLIT_PART(p.top_patterns, ''|'', 2) || '', '' || SPLIT_PART(ltrim(m.very_top_pattern, ''0''), ''|'', 2)', 'Likely', 'Review the profiled patterns for the same column in other tables. You may want to add a hygiene step to your processing to make patterns consistent.', NULL, NULL), + ('1009', 'Leading_Spaces', 'Column', 'Leading Spaces Found in Column Values', 'Spaces were found before data at the front of column string values. This likely contradicts user expectations and could be a sign of broader ingestion or processing errors.', 'p.lead_space_ct > 0', '''Cases Found: '' || p.lead_space_ct::VARCHAR(10)', 'Likely', 'Review your source data, ingestion process, and any processing steps that update this column.', 'p.lead_space_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66'), + ('1010', 'Quoted_Values', 'Column', 'Quoted Values Found in Column Values', 'Column values were found within quotes. This likely contradicts user expectations and could be a sign of broader ingestion or processing errors.', 'p.quoted_value_ct > 0', '''Cases Found: '' || p.quoted_value_ct::VARCHAR(10)', 'Likely', 'Review your source data, ingestion process, and any processing steps that update this column.', 'p.quoted_value_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66'), ('1011', 'Char_Column_Number_Values', 'Column', 'Character Column with Mostly Numeric Values', 'This column is defined as alpha, but more than 95% of its values are numeric. Numbers in alpha columns won''t sort correctly, and might contradict user expectations downstream. It''s also possible that more than one type of information is stored in the column, making it harder to retrieve.', 'p.general_type = ''A'' AND p.column_name NOT ILIKE ''%zip%'' AND p.functional_data_type NOT ILIKE ''id%'' AND p.value_ct > p.numeric_ct - AND p.numeric_ct::NUMERIC > (0.95 * p.value_ct::NUMERIC)', '''Numeric Ct: '' || p.numeric_ct || '' of '' || p.value_ct || '' (Numeric Percent: '' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5) / p.value_ct::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)', 'Likely', 'Review your source data and ingestion process. Consider whether it might be better to store the numeric data in a numeric column. If the alpha data is significant, you could store it in a different column.'), + AND p.numeric_ct::NUMERIC > (0.95 * p.value_ct::NUMERIC)', '''Numeric Ct: '' || p.numeric_ct || '' of '' || p.value_ct || '' (Numeric Percent: '' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5) / p.value_ct::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)', 'Likely', 'Review your source data and ingestion process. Consider whether it might be better to store the numeric data in a numeric column. If the alpha data is significant, you could store it in a different column.', 'p.numeric_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66'), ('1012', 'Char_Column_Date_Values', 'Column', 'Character Column with Mostly Date Values', 'This column is defined as alpha, but more than 95% of its values are dates. Dates in alpha columns might not sort correctly, and might contradict user expectations downstream. 
It''s also possible that more than one type of information is stored in the column, making it harder to retrieve. ', 'p.general_type = ''A'' AND p.value_ct > p.date_ct - AND p.date_ct::NUMERIC > (0.95 * p.value_ct::NUMERIC)', ''' Date Ct: '' || p.date_ct || '' of '' || p.value_ct || '' (Date Percent: '' || ROUND(100.0 * p.date_ct::NUMERIC(18, 5) / p.value_ct::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)' , 'Likely', 'Review your source data and ingestion process. Consider whether it might be better to store the date values as a date or datetime column. If the alpha data is also significant, you could store it in a different column.'), + AND p.date_ct::NUMERIC > (0.95 * p.value_ct::NUMERIC)', ''' Date Ct: '' || p.date_ct || '' of '' || p.value_ct || '' (Date Percent: '' || ROUND(100.0 * p.date_ct::NUMERIC(18, 5) / p.value_ct::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)', 'Likely', 'Review your source data and ingestion process. Consider whether it might be better to store the date values as a date or datetime column. If the alpha data is also significant, you could store it in a different column.', 'p.date_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66'), ('1013', 'Small Missing Value Ct', 'Column', 'Small Percentage of Missing Values Found', 'Under 3% of values in this column were found to be null, zero-length or dummy values, but values are not universally present. This could indicate unexpected missing values in a required column.', '(p.value_ct - p.zero_length_ct - p.filled_value_ct)::FLOAT / p.record_ct::FLOAT > 0.97 AND (p.value_ct - p.zero_length_ct - p.filled_value_ct) < p.record_ct', '(p.record_ct - (p.value_ct - p.zero_length_ct - p.filled_value_ct))::VARCHAR(20) || '' of '' || p.record_ct::VARCHAR(20) || '' blank values: '' || ROUND(100.0 * (p.record_ct - (p.value_ct - p.zero_length_ct - p.filled_value_ct))::NUMERIC(18, 5) - / NULLIF(p.value_ct, 0)::NUMERIC(18, 5), 2)::VARCHAR(40) || ''%''', 'Possible', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected, supplemented or excluded.'), + / NULLIF(p.value_ct, 0)::NUMERIC(18, 5), 2)::VARCHAR(40) || ''%''', 'Possible', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected, supplemented or excluded.', '(p.null_value_ct + filled_value_ct + zero_length_ct)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.33'), ('1014', 'Small Divergent Value Ct', 'Column', 'Small Percentage of Divergent Values Found', 'Under 3% of values in this column were found to be different from the most common value. 
This could indicate a data error.', '(100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT / p.value_ct::FLOAT) > 97::FLOAT AND (100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT / NULLIF(p.value_ct, 0)::FLOAT) < 100::FLOAT', '''Single Value Pct: '' || ROUND(100.0 * fn_parsefreq(p.top_freq_values, 1, 2)::FLOAT / NULLIF(p.value_ct, 0)::FLOAT)::VARCHAR(40) - || '', Value | Freq: '' || top_freq_values', 'Possible', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected.'), + || '', Value | Freq: '' || top_freq_values', 'Possible', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected.', '(p.record_ct - fn_parsefreq(p.top_freq_values, 1, 2)::INT)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.33'), ('1015', 'Boolean_Value_Mismatch', 'Column', 'Unexpected Boolean Values Found', 'This column appears to contain boolean (True/False) data, but unexpected values were found. This could indicate inconsistent coding for the same intended values, potentially leading to downstream errors or inconsistent business logic. ', '(distinct_value_ct > 1 AND ((lower(top_freq_values) ILIKE ''| true |%'' OR lower(top_freq_values) ILIKE ''| false |%'') AND NOT (lower(top_freq_values) ILIKE ''%| true |%'' AND lower(top_freq_values) ILIKE ''%| false |%'')) OR ((lower(top_freq_values) ILIKE ''| yes |%'' OR lower(top_freq_values) ILIKE ''| no |%'' ) AND NOT (lower(top_freq_values) ILIKE ''%| yes |%'' AND lower(top_freq_values) ILIKE ''%| no |%'')) )', 'CASE WHEN p.top_freq_values IS NULL THEN ''Min: '' || p.min_text || '', Max: '' || p.max_text - ELSE ''Top Freq: '' || p.top_freq_values END', 'Likely', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected. '), + ELSE ''Top Freq: '' || p.top_freq_values END', 'Likely', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected. ', NULL, '0.66'), ('1016', 'Potential_Duplicates', 'Column', 'Potential Duplicate Values Found', 'This column is largely unique, but some duplicate values are present. This pattern is uncommon and could indicate inadvertant duplication. ', 'p.distinct_value_ct > 1000 - AND fn_parsefreq(p.top_freq_values, 1, 2)::BIGINT BETWEEN 2 AND 4', '''Top Freq: '' || p.top_freq_values', 'Possible', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected. '), + AND fn_parsefreq(p.top_freq_values, 1, 2)::BIGINT BETWEEN 2 AND 4', '''Top Freq: '' || p.top_freq_values', 'Possible', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected. ', '(p.value_ct - p.distinct_value_ct)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.33'), ('1017', 'Standardized_Value_Matches', 'Column', 'Similar Values Match When Standardized', 'When column values are standardized (removing spaces, single-quotes, periods and dashes), matching values are found in other records. This may indicate that formats should be further standardized to allow consistent comparisons for merges, joins and roll-ups. It could also indicate the presence of unintended duplicates.', 'p.general_type = ''A'' AND p.distinct_std_value_ct <> p.distinct_value_ct', '''Distinct Values: '' || p.distinct_value_ct::VARCHAR - || '', Standardized: '' || p.distinct_std_value_ct::VARCHAR', 'Likely', 'Review standardized vs. raw data values for all matches. 
Correct data if values should be consistent.'), + || '', Standardized: '' || p.distinct_std_value_ct::VARCHAR', 'Likely', 'Review standardized vs. raw data values for all matches. Correct data if values should be consistent.', '(p.distinct_value_ct - p.distinct_std_value_ct)::FLOAT/NULLIF(p.value_ct, 0)', '0.66'), ('1018', 'Unlikely_Date_Values', 'Column', 'Unlikely Dates out of Typical Range', 'Some date values in this column are earlier than 1900-01-01 or later than 30 years after Profiling date.', 'p.general_type = ''D'' AND (p.min_date BETWEEN ''0001-01-02''::DATE AND ''1900-01-01''::DATE - OR p.max_date > CURRENT_DATE + INTERVAL ''30 year'')', '''Date Range: '' || p.min_date::VARCHAR || '' thru '' || p.max_date::VARCHAR', 'Likely', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected or removed.'), - ('1019', 'Recency_One_Year', 'Dates', 'Recency - No Table Dates within 1 Year', 'Among all date columns present in the table, none fall inside of one year from Profile date.', 'MAX(p.max_date) < CURRENT_DATE - INTERVAL ''1 year''', '''Most Recent Date: '' || MAX(p.max_date)::VARCHAR', 'Possible', 'Review your source data and follow-up with data owners to determine whether dates in table should be more recent.'), - ('1020', 'Recency_Six_Months', 'Dates', 'Recency - No Table Dates within 6 Months', 'Among all date columns present in the table, the most recent date falls 6 months to 1 year back from Profile date. ', 'MAX(p.max_date) >= CURRENT_DATE - INTERVAL ''1 year'' AND MAX(p.max_date) < CURRENT_DATE - INTERVAL ''6 months''', '''Most Recent Date: '' || MAX(p.max_date)::VARCHAR', 'Possible', 'Review your source data and follow-up with data owners to determine whether dates in table should be more recent.'), + OR p.max_date > CURRENT_DATE + INTERVAL ''30 year'')', '''Date Range: '' || p.min_date::VARCHAR || '' thru '' || p.max_date::VARCHAR', 'Likely', 'Review your source data and follow-up with data owners to determine whether this data needs to be corrected or removed.', '(COALESCE(p.before_100yr_date_ct,0)+COALESCE(p.distant_future_date_ct, 0))::FLOAT/NULLIF(p.record_ct, 0)', '0.66'), + ('1019', 'Recency_One_Year', 'Dates', 'Recency - No Table Dates within 1 Year', 'Among all date columns present in the table, none fall inside of one year from Profile date.', 'MAX(p.max_date) < CURRENT_DATE - INTERVAL ''1 year''', '''Most Recent Date: '' || MAX(p.max_date)::VARCHAR', 'Possible', 'Review your source data and follow-up with data owners to determine whether dates in table should be more recent.', NULL, NULL), + ('1020', 'Recency_Six_Months', 'Dates', 'Recency - No Table Dates within 6 Months', 'Among all date columns present in the table, the most recent date falls 6 months to 1 year back from Profile date. ', 'MAX(p.max_date) >= CURRENT_DATE - INTERVAL ''1 year'' AND MAX(p.max_date) < CURRENT_DATE - INTERVAL ''6 months''', '''Most Recent Date: '' || MAX(p.max_date)::VARCHAR', 'Possible', 'Review your source data and follow-up with data owners to determine whether dates in table should be more recent.', NULL, NULL), ('1021', 'Unexpected US States', 'Column', 'Unexpected Column Contains US States', 'This column is not labeled as a state, but contains mostly US State abbreviations. 
This could indicate shifted or switched source data columns.', 'p.std_pattern_match = ''STATE_USA'' AND p.distinct_value_ct > 5 - AND NOT (p.column_name ILIKE ''%state%'' OR p.column_name ILIKE ''%_st'')', '''Value Range: '' || p.min_text || '' thru '' || max_text || CASE WHEN p.top_freq_values > '''' THEN ''Top Freq Values: '' || REPLACE(p.top_freq_values, CHR(10), '' ; '') ELSE '''' END ', 'Possible', 'Review your source data and follow-up with data owners to determine whether column should be populated with US states.'), + AND NOT (p.column_name ILIKE ''%state%'' OR p.column_name ILIKE ''%_st'')', '''Value Range: '' || p.min_text || '' thru '' || max_text || CASE WHEN p.top_freq_values > '''' THEN ''Top Freq Values: '' || REPLACE(p.top_freq_values, CHR(10), '' ; '') ELSE '''' END ', 'Possible', 'Review your source data and follow-up with data owners to determine whether column should be populated with US states.', NULL, '0.33'), ('1022', 'Unexpected Emails', 'Column', 'Unexpected Column Contains Emails', 'This column is not labeled as email, but contains mostly email addresses. This could indicate shifted or switched source data columns.', 'p.std_pattern_match = ''EMAIL'' - AND NOT (p.column_name ILIKE ''%email%'' OR p.column_name ILIKE ''%addr%'')', '''Value Range: '' || p.min_text || '' thru '' || max_text', 'Possible', 'Review your source data and follow-up with data owners to determine whether column should be populated with email addresses.'), - ('1023', 'Small_Numeric_Value_Ct', 'Column', 'Unexpected Numeric Values Found', - 'Under 3% of values in this column were found to be numeric. This could indicate a data error.', e'p.general_type = \'A\' + AND NOT (p.column_name ILIKE ''%email%'' OR p.column_name ILIKE ''%addr%'')', '''Value Range: '' || p.min_text || '' thru '' || max_text', 'Possible', 'Review your source data and follow-up with data owners to determine whether column should be populated with email addresses.', NULL, '0.33'), + ('1023', 'Small_Numeric_Value_Ct', 'Column', 'Unexpected Numeric Values Found', 'Under 3% of values in this column were found to be numeric. This could indicate a data error.', 'p.general_type = ''A'' AND p.numeric_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT < 0.03 - AND p.numeric_ct > 0', - '''Numeric Ct: '' || p.numeric_ct || '' of '' || p.value_ct || '' (Numeric Percent: '' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5)/NULLIF(p.value_ct, 0)::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)', - 'Likely', 'Review your source data and follow-up with data owners to determine whether numeric values are invalid entries here.'), - ('1024', 'Invalid_Zip3_USA', 'Column', 'Invalid USA ZIP-3 Format', 'The majority of values in this column are 3-digit zips, but divergent patterns were found. This could indicate an incorrect roll-up category or a PII concern.', 'p.distinct_pattern_ct > 1 + AND p.numeric_ct > 0', '''Numeric Ct: '' || p.numeric_ct || '' of '' || p.value_ct || '' (Numeric Percent: '' || ROUND(100.0 * p.numeric_ct::NUMERIC(18, 5)/NULLIF(p.value_ct, 0)::NUMERIC(18, 5), 2) || '' )''::VARCHAR(200)', 'Likely', 'Review your source data and follow-up with data owners to determine whether numeric values are invalid entries here.', 'p.numeric_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66'), + ('1024', 'Invalid_Zip3_USA', 'Column', 'Invalid USA ZIP-3 Format', 'The majority of values in this column are 3-digit zips, but divergent patterns were found. 
This could indicate an incorrect roll-up category or a PII concern.', 'p.distinct_pattern_ct > 1 AND (p.column_name ilike ''%zip%'' OR p.column_name ILIKE ''%postal%'') AND SPLIT_PART(p.top_patterns, '' | '', 2) = ''NNN'' - AND SPLIT_PART(p.top_patterns, '' | '', 1)::FLOAT/NULLIF(value_ct, 0)::FLOAT > 0.50', '''Pattern: '' || p.top_patterns', 'Definite', 'Review your source data, ingestion process, and any processing steps that update this column.'), - ('1025', 'Delimited_Data_Embedded', 'Column', 'Delimited Data Embedded in Column', 'Delimited data, separated by a common delimiter (comma, tab, pipe or caret) is present in over 80% of column values. This could indicate data that was incorrectly ingested, or data that would be better represented in parsed form.', 'p.std_pattern_match = ''DELIMITED_DATA''', 'CASE WHEN p.top_freq_values IS NULL THEN ''Min: '' || p.min_text || '', Max: '' || p.max_text ELSE ''Top Freq: '' || p.top_freq_values END', 'Likely', 'Review your source data and follow-up with data consumers to determine the most useful representation of this data.'), - ('1026', 'Char_Column_Number_Units', 'Column', 'Character Column with Numbers and Units', - 'This column is defined as alpha, but values include numbers with percents or common units. Embedded measures in alpha columns are harder to access, won''t sort correctly, and might contradict user expectations downstream. Consider parsing into numeric and UOM columns to improve usability.', - 'p.includes_digit_ct::FLOAT/NULLIF(p.value_ct, 0)::FLOAT > 0.5 AND TRIM(fn_parsefreq(p.top_freq_values, 1, 1)) ~ ''(?i)^[0-9]+(\.[0-9]+)? ?(%|lb|oz|kg|g|mg|km|m|cm|mm|mi|ft|in)$''', - '''Top Freq: '' || p.top_freq_values', 'Possible', - 'Review your source data and ingestion process. Consider whether it might be better to parse the numeric and unit data and store in separate columns.'), - ('1027', 'Variant_Coded_Values', 'Variant', 'Variant Codings for Same Values', 'This column contains more than one common variants that represent a single value or state. This can occur when data is integrated from multiple sources with different standards, or when free entry is permitted without validation. The variations can cause confusion and error for downstream data users and multiple versions of the truth. ', 'p.distinct_value_ct <= 20', '''Variants Found: '' || intersect_list', 'Definite', 'Review your source data and ingestion process. Consider cleansing this data to standardize on a single set of definitive codes.'), - ('1100', 'Potential_PII', 'Column', 'Personally Identifiable Information', 'This column contains data that could be Personally Identifiable Information (PII)', 'p.pii_flag > ''''', '''Risk: '' || CASE LEFT(p.pii_flag, 1) WHEN ''A'' THEN ''HIGH'' WHEN ''B'' THEN ''MODERATE'' WHEN ''C'' THEN ''LOW'' END || '', PII Type: '' || SUBSTRING(p.pii_flag, 3)', 'Potential PII', 'PII may require steps to ensure data security and compliance with relevant privacy regulations and legal requirements. You may have to classify and inventory PII, implement appropriate access controls, encrypt data, and monitor for unauthorized access. Your organization might be required to update privacy policies and train staff on data protection practices. 
Note that PII that is lower-risk in isolation might be high-risk in conjunction with other data.'); + AND SPLIT_PART(p.top_patterns, '' | '', 1)::FLOAT/NULLIF(value_ct, 0)::FLOAT > 0.50', '''Pattern: '' || p.top_patterns', 'Definite', 'Review your source data, ingestion process, and any processing steps that update this column.', '(NULLIF(p.record_ct, 0)::INT - SPLIT_PART(p.top_patterns, '' | '', 1)::INT)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '1'), + ('1025', 'Delimited_Data_Embedded', 'Column', 'Delimited Data Embedded in Column', 'Delimited data, separated by a common delimiter (comma, tab, pipe or caret) is present in over 80% of column values. This could indicate data that was incorrectly ingested, or data that would be better represented in parsed form.', 'p.std_pattern_match = ''DELIMITED_DATA''', 'CASE WHEN p.top_freq_values IS NULL THEN ''Min: '' || p.min_text || '', Max: '' || p.max_text ELSE ''Top Freq: '' || p.top_freq_values END', 'Likely', 'Review your source data and follow-up with data consumers to determine the most useful representation of this data.', NULL, '0.66'), + ('1026', 'Char_Column_Number_Units', 'Column', 'Character Column with Numbers and Units', 'This column is defined as alpha, but values include numbers with percents or common units. Embedded measures in alpha columns are harder to access, won''t sort correctly, and might contradict user expectations downstream. Consider parsing into numeric and UOM columns to improve usability.', 'p.includes_digit_ct::FLOAT/NULLIF(p.value_ct, 0)::FLOAT > 0.5 AND TRIM(fn_parsefreq(p.top_freq_values, 1, 1)) ~ ''(?i)^[0-9]+(\.[0-9]+)? ?(%|lb|oz|kg|g|mg|km|m|cm|mm|mi|ft|in)$''', '''Top Freq: '' || p.top_freq_values', 'Possible', 'Review your source data and ingestion process. Consider whether it might be better to parse the numeric and unit data and store in separate columns.', NULL, '0.33'), + ('1027', 'Variant_Coded_Values', 'Variant', 'Variant Codings for Same Values', 'This column contains more than one common variants that represent a single value or state. This can occur when data is integrated from multiple sources with different standards, or when free entry is permitted without validation. The variations can cause confusion and error for downstream data users and multiple versions of the truth. ', 'p.distinct_value_ct <= 20', '''Variants Found: '' || intersect_list', 'Definite', 'Review your source data and ingestion process. Consider cleansing this data to standardize on a single set of definitive codes.', NULL, NULL), + ('1100', 'Potential_PII', 'Column', 'Personally Identifiable Information', 'This column contains data that could be Personally Identifiable Information (PII)', 'p.pii_flag > ''''', '''Risk: '' || CASE LEFT(p.pii_flag, 1) WHEN ''A'' THEN ''HIGH'' WHEN ''B'' THEN ''MODERATE'' WHEN ''C'' THEN ''LOW'' END || '', PII Type: '' || SUBSTRING(p.pii_flag, 3)', 'Potential PII', 'PII may require steps to ensure data security and compliance with relevant privacy regulations and legal requirements. You may have to classify and inventory PII, implement appropriate access controls, encrypt data, and monitor for unauthorized access. Your organization might be required to update privacy policies and train staff on data protection practices. 
Note that PII that is lower-risk in isolation might be high-risk in conjunction with other data.', NULL, 'CASE LEFT(p.pii_flag, 1) WHEN ''A'' THEN 1 WHEN ''B'' THEN 0.66 WHEN ''C'' THEN 0.33 END') +; TRUNCATE TABLE test_types; INSERT INTO test_types - (id, test_type, test_name_short, test_name_long, test_description, except_message, measure_uom, measure_uom_description, selection_criteria, column_name_prompt, column_name_help, default_parm_columns, default_parm_values, default_parm_prompts, default_parm_help, default_severity, run_type, test_scope, dq_dimension, health_dimension, threshold_description, usage_notes, active) -VALUES ('1004', 'Alpha_Trunc', 'Alpha Truncation', 'Maximum character count consistent', 'Tests that the maximum count of characters in a column value has not dropped vs. baseline data', 'Maximum length of values has dropped from prior expected length.', 'Values over max', NULL, 'general_type =''A'' AND max_length > 0 AND ( (min_length = avg_length AND max_length = avg_length) OR (numeric_ct <> value_ct ) ) AND functional_table_type NOT LIKE ''%window%'' /* The conditions below are to eliminate overlap with : LOV_Match (excluded selection criteria for this test_type), Pattern_Match (excluded selection criteria for this test_type), Constant (excluded functional_data_type Constant and Boolean) */ AND ( (distinct_value_ct NOT BETWEEN 2 AND 10 AND functional_data_type NOT IN ( ''Constant'', ''Boolean'') ) AND NOT ( fn_charcount(top_patterns, E'' \| '' ) = 1 AND fn_charcount(top_patterns, E'' \| '' ) IS NOT NULL AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > ''''))', NULL, NULL, 'threshold_value', 'max_length', 'Maximum String Length at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Maximum length expected', 'Alpha Truncation tests that the longest text value in a column hasn''t become shorter than the longest value at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset.', 'Y'), - ('1005', 'Avg_Shift', 'Average Shift', 'Column mean is consistent with reference', 'Tests for statistically-significant shift in mean value for column from average calculated at baseline.', 'Standardized difference between averages is over the selected threshold level.', 'Difference Measure', 'Cohen''s D Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', NULL, NULL, 'baseline_value_ct,baseline_avg,baseline_sd,threshold_value', 'value_ct,avg_value,stdev_value,0.5::VARCHAR', 'Value Ct at Baseline,Mean at Baseline,Std Deviation at Baseline,Threshold Difference Measure ', NULL, 'Warning', 'CAT', 'column', 'Consistency', 'Data Drift', 'Standardized Difference Measure', 'Average Shift tests that the average of a numeric column has not significantly changed since baseline, when profiling was done. A significant shift may indicate errors in processing, differences in source data, or valid changes that may nevertheless impact assumptions in downstream data products. The test uses Cohen''s D, a statistical technique to identify significant shifts in a value. 
Cohen''s D measures the difference between the two averages, reporting results on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. Depending on your data, some difference may be expected, so it''s reasonable to adjust the threshold value that triggers test failure. This test works well for measures, or even for identifiers if you expect them to increment consistently. You may want to periodically adjust the expected threshold, or even the expected average value if you expect shifting over time. Consider this test along with Variability Increase. If variability rises too, process or measurement flaws could be at work. If variability remains consistent, the issue is more likely to be with the source data itself. ', 'Y'), - ('1007', 'Constant', 'Constant Match', 'All column values match constant value', 'Tests that all values in the column match the constant value identified in baseline data', 'A constant value is expected for this column.', 'Mismatched values', NULL, 'TEMPLATE', NULL, NULL, 'baseline_value,threshold_value', NULL, 'Constant Value at Baseline,Threshold Error Count', 'The single, unchanging value of the column, per baseline|The number of errors that are acceptable before test fails.', 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Count of records with unexpected values', 'Constant Match tests that a single value determined to be a constant in baseline profiling is still the only value for the column that appears in subsequent versions of the dataset. Sometimes new data or business knowledge may reveal that the value is not a constant at all, even though only one value was present at profiling. In this case, you will want to disable this test. Alternatively, you can use the Value Match test to provide a limited number of valid values for the column.', 'Y'), - ('1009', 'Daily_Record_Ct', 'Daily Records', 'All dates present within date range', 'Tests for presence of every calendar date within min/max date range, per baseline data', 'Not every date value between min and max dates is present, unlike at baseline.', 'Missing dates', NULL, 'general_type= ''D'' AND date_days_present > 21 AND date_days_present - (DATEDIFF(''day'', ''1800-01-05''::DATE, max_date) - DATEDIFF(''day'', ''1800-01-05''::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', NULL, NULL, 'threshold_value', '0', 'Threshold Missing Calendar Days', NULL, 'Warning', 'CAT', 'column', 'Completeness', 'Volume', 'Missing calendar days within min/max range', 'Daily Records tests that at least one record is present for every day within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each day. A failure here would suggest missing records for the number of days identified without data. You can adjust the threshold to accept a number of days that you know legitimately have no records. 
', 'Y'), - ('1011', 'Dec_Trunc', 'Decimal Truncation', 'Sum of fractional values at or above reference', 'Tests for decimal truncation by confirming that the sum of fractional values in data is no less than the sum at baseline', 'The sum of fractional values is under baseline, which may indicate decimal truncation', 'Fractional sum', 'The sum of all decimal values from all data for this column', 'fractional_sum IS NOT NULL AND functional_table_type LIKE''%cumulative%''', NULL, NULL, 'threshold_value', 'ROUND(fractional_sum, 0)', 'Sum of Fractional Values at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Minimum expected sum of all fractional values', 'Decimal Truncation tests that the fractional (decimal) part of a numeric column has not been truncated since Baseline. This works by summing all the fractional values after the decimal point and confirming that the total is at least equal to the fractional total at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset.', 'Y'), - ('1012', 'Distinct_Date_Ct', 'Date Count', 'Count of distinct dates at or above reference', 'Tests that the count of distinct dates referenced in the column has not dropped vs. baseline data', 'Drop in count of unique dates recorded in column.', 'Unique dates', 'Count of unique dates in transactional date column', 'general_type=''D'' and date_days_present IS NOT NULL AND functional_table_type NOT LIKE ''%window%''', NULL, NULL, 'baseline_value,threshold_value', 'date_days_present,date_days_present', 'Distinct Date Count at Baseline,Min Expected Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Minimum distinct date count expected', 'Date Count tests that the count of distinct dates present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained. A failure here would indicate missing records, which could be caused by a processing error or changed upstream data sources.', 'Y'), - ('1013', 'Distinct_Value_Ct', 'Value Count', 'Count of distinct values has not dropped', 'Tests that the count of unique values in the column has not changed from baseline.', 'Count of unique values in column has changed from baseline.', 'Unique Values', NULL, 'distinct_value_ct between 2 and 10 AND value_ct > 0 AND NOT (coalesce(top_freq_values,'''') > '''' AND distinct_value_ct BETWEEN 2 and 10) AND NOT (lower(functional_data_type) LIKE ''%sequence%'' OR lower(functional_data_type) LIKE ''%measurement%'' OR functional_data_type LIKE ''%date%'' OR general_type = ''D'')', NULL, NULL, 'baseline_value_ct,threshold_value', 'distinct_value_ct,distinct_value_ct', 'Distinct Value Count at Baseline,Min Expected Value Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected distinct value count', 'Value Count tests that the count of unique values present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained, or for any dataset where you would expect a set number of distinct values should be present. 
A failure here would indicate missing records or a change in categories or value assignment.', 'Y'), - ('1014', 'Email_Format', 'Email Format', 'Email is correctly formatted', 'Tests that non-blank, non-empty email addresses match the standard format', 'Invalid email address formats found.', 'Invalid emails', 'Number of emails that do not match standard format', 'std_pattern_match=''EMAIL''', NULL, NULL, 'threshold_value', '0', 'Maximum Invalid Email Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of invalid email addresses', NULL, 'Y'), - ('1015', 'Future_Date', 'Past Dates', 'Latest date is prior to test run date', 'Tests that the maximum date referenced in the column is no greater than the test date, consistent with baseline data', 'Future date found when absent in baseline data.', 'Future dates', NULL, 'general_type=''D''AND future_date_ct = 0', NULL, NULL, 'threshold_value', '0', 'Maximum Future Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected count of future dates', NULL, 'Y'), - ('1016', 'Future_Date_1Y', 'Future Year', 'Future dates within year of test run date', 'Tests that the maximum date referenced in the column is no greater than one year beyond the test date, consistent with baseline data', 'Future date beyond one-year found when absent in baseline.', 'Future dates post 1 year', NULL, 'general_type=''D''AND future_date_ct > 0 AND max_date <=''{AS_OF_DATE}''::DATE + INTERVAL''365 DAYS''', NULL, NULL, 'threshold_value', '0', 'Maximum Post 1-Year Future Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected count of future dates beyond one year', 'Future Year looks for date values in the column that extend beyond one year after the test date. This would be appropriate for transactional dates where you would expect to find dates in the near future, but not beyond one year ahead. Errors could indicate invalid entries or possibly dummy dates representing blank values.', 'Y'), - ('1017', 'Incr_Avg_Shift', 'New Shift', 'New record mean is consistent with reference', 'Tests for statistically-significant shift in mean of new values for column compared to average calculated at baseline.', 'Significant shift in average of new values vs. baseline avg', 'Z-score of mean shift', 'Absolute Z-score (number of SD''s outside mean) of prior avg - incremental avg', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', NULL, NULL, 'baseline_value_ct,baseline_sum,baseline_avg,baseline_sd,threshold_value', 'value_ct,(avg_value * value_ct)::FLOAT,avg_value,stdev_value,2', 'Value Count at Baseline,Sum at Baseline,Mean Value at Baseline,Std Deviation at Baseline,Threshold Max Z-Score', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Maximum Z-Score (number of SD''s beyond mean) expected', 'This is a more sensitive test than Average Shift, because it calculates an incremental difference in the average of new values compared to the average of values at baseline. This is appropriate for a cumulative dataset only, because it calculates the average of new entries based on the assumption that the count and average of records present at baseline are still present at the time of the test. This test compares the mean of new values with the standard deviation of the baseline average to calculate a Z-score. If the new mean falls outside the Z-score threshold, a shift is detected. 
Potential Z-score thresholds may range from 0 to 3, depending on the sensitivity you prefer. A failed test could indicate a quality issue or a legitimate shift in new data that should be noted and assessed by business users. Consider this test along with Variability Increase. If variability rises too, process, methodology or measurement flaws could be at issue. If variability remains consistent, the problem is more likely to be with the source data itself.', 'Y'), - ('1018', 'LOV_All', 'Value Match All', 'List of expected values all present in column', 'Tests that all values match a pipe-delimited list of expected values and that all expected values are present', 'Column values found don''t exactly match the expected list of values', 'Values found', NULL, NULL, NULL, NULL, 'threshold_value', NULL, 'List of Expected Values', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'List of values expected, in form (''Val1'',''Val2)', 'This is a more restrictive form of Value Match, testing that all values in the dataset match the list provided, and also that all values present in the list appear at least once in the dataset. This would be appropriate for tables where all category values in the column are represented at least once.', 'Y'), - ('1019', 'LOV_Match', 'Value Match', 'All column values present in expected list', 'Tests that all values in the column match the list-of-values identified in baseline data.', 'Values not matching expected List-of-Values from baseline.', 'Non-matching records', NULL, 'top_freq_values > '''' AND distinct_value_ct BETWEEN 2 and 10 AND NOT (functional_data_type LIKE ''%date%'' OR lower(datatype_suggestion) LIKE ''%date%'' OR general_type = ''D'' OR lower(column_name) IN (''file_name'', ''filename''))', NULL, NULL, 'baseline_value,threshold_value', '''('' || SUBSTRING( CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 2) > '''' THEN '','''''' || TRIM( REPLACE ( SPLIT_PART(top_freq_values, ''|'' , 2), '''''''' , '''''''''''' ) ) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 4) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 4), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 6) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 6), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 8) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 8), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 10) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 10), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 12) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 12), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 14) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 14), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 16) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 16), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 18) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 18), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 20) > '''' 
THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 20), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END, 2, 999) || '')'',0', 'List of Expected Values,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'List of values expected, in form (''Val1'',''Val2)', 'This tests that all values in the column match the hard-coded list provided. This is relevant when the list of allowable values is small and not expected to change often. Even if new values might occasionally be added, this test is useful for downstream data products to provide warning that assumptions and logic may need to change.', 'Y'), - ('1020', 'Min_Date', 'Minimum Date', 'All dates on or after set minimum', 'Tests that the earliest date referenced in the column is no earlier than baseline data', 'The earliest date value found is before the earliest value at baseline.', 'Dates prior to limit', NULL, 'general_type=''D''and min_date IS NOT NULL AND distinct_value_ct > 1', NULL, NULL, 'baseline_value,threshold_value', 'min_date,0', 'Minimum Date at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of dates prior to minimum', 'This test is appropriate for a cumulative dataset only, because it assumes all prior values are still present. It''s appropriate where new records are added with more recent dates, but old dates dates do not change.', 'Y'), - ('1021', 'Min_Val', 'Minimum Value', 'All values at or above set minimum', 'Tests that the minimum value present in the column is no lower than the minimum value in baseline data', 'Minimum column value less than baseline.', 'Values under limit', NULL, 'general_type=''N''and min_value IS NOT NULL AND (distinct_value_ct >= 2 OR (distinct_value_ct=2 and min_value<>0 and max_value<>1))', NULL, NULL, 'baseline_value,threshold_value', 'min_value,0', 'Minimum Value at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of values under limit', 'This test is appropriate for a cumulative dataset only, assuming all prior values are still present. It is also appropriate for any measure that has an absolute, definable minimum value, or a heuristic that makes senes for valid data.', 'Y'), - ('1022', 'Missing_Pct', 'Percent Missing', 'Consistent ratio of missing values', 'Tests for statistically-significant shift in percentage of missing values in column vs. baseline data', 'Significant shift in percent of missing values vs. baseline.', 'Difference measure', 'Cohen''s H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'record_ct <> value_ct', NULL, NULL, 'baseline_ct,baseline_value_ct,threshold_value', 'record_ct,value_ct,2::VARCHAR(10)', 'Baseline Record Count,Baseline Value Count,Standardized Difference Measure', NULL, 'Warning', 'CAT', 'column', 'Completeness', 'Data Drift', 'Expected maximum Cohen''s H Difference', 'This test uses Cohen''s H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. An uptick in missing data may indicate a collection issue at the source. A larger change may indicate a processing failure. A drop in missing data may also be significant, if it affects assumptions built into analytic products downstream. 
You can refine the expected threshold value as you view legitimate results of the measure over time.', 'Y'), - ('1023', 'Monthly_Rec_Ct', 'Monthly Records', 'At least one date per month present within date range', 'Tests for presence of at least one date per calendar month within min/max date range, per baseline data', 'At least one date per month expected in min/max date range.', 'Missing months', 'Calendar months without date values present', 'general_type= ''D'' AND date_days_present > 1 AND date_months_present > 2 AND date_months_present - (datediff( ''MON'' , min_date, max_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', NULL, NULL, 'threshold_value', '0', 'Threshold Count of Months without Dates', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Volume', 'Expected maximum count of calendar months without dates present', 'Monthly Records tests that at least one record is present for every calendar month within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each month. A failure here would suggest missing records for the number of months identified without data. You can adjust the threshold to accept a number of month that you know legitimately have no records.', 'Y'), - ('1024', 'Outlier_Pct_Above', 'Outliers Above', 'Consistent outlier counts over 2 SD above mean', 'Tests that percent of outliers over 2 SD above Mean doesn''t exceed threshold', 'Percent of outliers exceeding 2 SD above the mean is greater than expected threshold.', 'Pct records over limit', NULL, 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', NULL, NULL, 'baseline_avg,baseline_sd,threshold_value', 'avg_value,stdev_value,0.05', 'Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct records over upper 2 SD limit', 'This test counts the number of data points that may be considered as outliers, determined by whether their value exceeds 2 standard deviations above the mean at baseline. Assuming a normal distribution, a small percentage (defaulted to 5%) of outliers is expected. The actual number may vary for different distributions. The expected threshold reflects the maximum percentage of outliers you expect to see. This test uses the baseline mean rather than the mean for the latest dataset to capture systemic shift as well as individual outliers. 
', 'Y'), - ('1025', 'Outlier_Pct_Below', 'Outliers Below', 'Consistent outlier counts under 2 SD below mean', 'Tests that percent of outliers over 2 SD below Mean doesn''t exceed threshold', 'Percent of outliers exceeding 2 SD below the mean is greater than expected threshold.', 'Pct records under limit', NULL, 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', NULL, NULL, 'baseline_avg,baseline_sd,threshold_value', 'avg_value,stdev_value,0.05', 'Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct records over lower 2 SD limit', 'This test counts the number of data points that may be considered as outliers, determined by whether their value exceeds 2 standard deviations below the mean at baseline. Assuming a normal distribution, a small percentage (defaulted to 5%) of outliers is expected. The actual number may vary for different distributions. The expected threshold reflects the maximum percentage of outliers you expect to see. This test uses the baseline mean rather than the mean for the latest dataset to capture systemic shift as well as individual outliers. ', 'Y'), - ('1026', 'Pattern_Match', 'Pattern Match', 'Column values match alpha-numeric pattern', 'Tests that all values in the column match the same alpha-numeric pattern identified in baseline data', 'Alpha values do not match consistent pattern in baseline.', 'Pattern Mismatches', NULL, 'fn_charcount(top_patterns, E'' \| '' ) = 1 AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > '''' AND distinct_value_ct > 10', NULL, NULL, 'baseline_value,threshold_value', 'trim(REPLACE(REPLACE(REPLACE(SPLIT_PART(top_patterns, '' | '', 2), ''A'', ''[A-Z]''), ''N'', ''[0-9]''), ''a'', ''[a-z]'')),0', 'Pattern at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of pattern mismatches', 'This test is appropriate for character fields that are expected to appear in a consistent format. It uses pattern matching syntax as appropriate for your database: REGEX matching if available, otherwise LIKE expressions. 
The expected threshold is the number of records that fail to match the defined pattern.', 'Y'), - ('1028', 'Recency', 'Recency', 'Latest date within expected range of test date', 'Tests that the latest date in column is within a set number of days of the test date', 'Most recent date value not within expected days of test date.', 'Days before test', 'Number of days that most recent date precedes the date of test', 'general_type= ''D'' AND max_date <= run_date AND NOT column_name IN ( ''filedate'' , ''file_date'' ) AND NOT functional_data_type IN (''Future Date'', ''Schedule Date'') AND DATEDIFF( ''DAY'' , max_date, run_date) <= 62', NULL, NULL, 'threshold_value', 'CASE WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 3 THEN DATEDIFF(''DAY'', max_date, run_date) + 3 WHEN DATEDIFF(''DAY'', max_date, run_date) <= 7 then DATEDIFF(''DAY'', max_date, run_date) + 7 WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 7.0) * 7 WHEN DATEDIFF( ''DAY'' , max_date, run_date) > 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 30.0) * 30 END', 'Threshold Maximum Days before Test', NULL, 'Warning', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected maximum count of days preceding test date', 'This test evaluates recency based on the latest referenced dates in the column. The test is appropriate for transactional dates and timestamps. The test can be especially valuable because timely data deliveries themselves may not assure that the most recent data is present. You can adjust the expected threshold to the maximum number of days that you expect the data to age before the dataset is refreshed. ', 'Y'), - ('1030', 'Required', 'Required Entry', 'Required non-null value present', 'Tests that a non-null value is present in each record for the column, consistent with baseline data', 'Every record for this column is expected to be filled, but some are missing.', 'Missing values', NULL, 'record_ct = value_ct', NULL, NULL, 'threshold_value', '0', 'Threshold Missing Value Count', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Schema Drift', 'Expected count of missing values', NULL, 'Y'), - ('1033', 'Street_Addr_Pattern', 'Street Address', 'Enough street address entries match defined pattern', 'Tests for percent of records matching standard street address pattern.', 'Percent of values matching standard street address format is under expected threshold.', 'Percent matches', 'Percent of records that match street address pattern', '(std_pattern_match=''STREET_ADDR'') AND (avg_length <> round(avg_length)) AND (avg_embedded_spaces BETWEEN 2 AND 6) AND (avg_length < 35)', NULL, NULL, 'threshold_value', '75', 'Threshold Pct that Match Address Pattern', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected percent of records that match standard street address pattern', 'The street address pattern used in this test should match the vast majority of USA addresses. 
You can adjust the threshold percent of matches based on the results you are getting -- you may well want to tighten it to make the test more sensitive to invalid entries.', 'Y'), - ('1034', 'Unique', 'Unique Values', 'Each column value is unique', 'Tests that no values for the column are repeated in multiple records.', 'Column values should be unique per row.', 'Duplicate values', 'Count of non-unique values', 'record_ct > 500 and record_ct = distinct_value_ct and value_ct > 0', NULL, NULL, 'threshold_value', '0', 'Threshold Duplicate Value Count', NULL, 'Fail', 'CAT', 'column', 'Uniqueness', 'Schema Drift', 'Expected count of duplicate values', 'This test is ideal when the database itself does not enforce a primary key constraint on the table. It serves as an independent check on uniqueness. If''s also useful when there are a small number of exceptions to uniqueness, which can be reflected in the expected threshold count of duplicates.', 'Y'), - ('1035', 'Unique_Pct', 'Percent Unique', 'Consistent ratio of unique values', 'Tests for statistically-significant shift in percentage of unique values vs. baseline data.', 'Significant shift in percent of unique values vs. baseline.', 'Difference measure', 'Cohen''s H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'distinct_value_ct > 10', NULL, NULL, 'baseline_value_ct,baseline_unique_ct,threshold_value', 'value_ct,distinct_value_ct,0.5', 'Value Count at Baseline,Distinct Value Count at Baseline,Standardized Difference Measure (0 to 1)', NULL, 'Warning', 'CAT', 'column', 'Uniqueness', 'Data Drift', 'Expected maximum Cohen''s H Difference', 'You can think of this as a test of similarity that measures whether the percentage of unique values is consistent with the percentage at baseline. A significant change might indicate duplication or a telling shift in cardinality between entities. The test uses Cohen''s H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. 
You can refine the expected threshold value as you view legitimate results of the measure over time.', 'Y'), - ('1036', 'US_State', 'US State', 'Column value is two-letter US state code', 'Tests that the recorded column value is a valid US state.', 'Column Value is not a valid US state.', 'Not US States', 'Values that doo not match 2-character US state abbreviations.', 'general_type= ''A'' AND column_name ILIKE ''%state%'' AND distinct_value_ct < 70 AND max_length = 2', NULL, NULL, 'threshold_value', '0', 'Threshold Count not Matching State Abbreviations', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of values that are not US state abbreviations', 'This test validates entries against a fixed list of two-character US state codes and related Armed Forces codes.', 'Y'), - ('1037', 'Weekly_Rec_Ct', 'Weekly Records', 'At least one date per week present within date range', 'Tests for presence of at least one date per calendar week within min/max date range, per baseline data', 'At least one date per week expected in min/max date range.', 'Missing weeks', 'Calendar weeks without date values present', 'general_type= ''D'' AND date_days_present > 1 AND date_weeks_present > 3 AND date_weeks_present - (DATEDIFF(''week'', ''1800-01-05''::DATE, max_date) - DATEDIFF(''week'', ''1800-01-05''::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', NULL, NULL, 'threshold_value', '0', 'Threshold Weeks without Dates', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Volume', 'Expected maximum count of calendar weeks without dates present', 'Weekly Records tests that at least one record is present for every calendar week within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each week. A failure here would suggest missing records for the number of weeks identified without data. You can adjust the threshold to accept a number of weeks that you know legitimately have no records.', 'Y'), - ('1040', 'Variability_Increase', 'Variability Increase', 'Variability has increased above threshold', 'Tests that the spread or dispersion of column values has increased significantly over baseline, indicating a drop in stability of the measure.', 'The Standard Deviation of the measure has increased beyond the defined threshold. This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', NULL, NULL, 'baseline_sd,threshold_value', 'stdev_value,120', 'Std Deviation at Baseline,Expected Maximum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. An increase in particular could mark new problems in measurement, a more heterogeneous cohort, or that significant outliers have been introduced. 
Consider this test along with Average Shift and New Shift. If the average shifts as well, there may be a fundamental shift in the dataset or process used to collect the data point. This might suggest a data shift that should be noted and assessed by business users. If the average does not shift, this may point to a data quality or data collection problem. ', 'Y'), - ('1041', 'Variability_Decrease', 'Variability Decrease', 'Variability has decreased below threshold', 'Tests that the spread or dispersion of column values has decreased significantly over baseline, indicating a shift in stability of the measure. This could signal a change in a process or a data quality issue.', 'The Standard Deviation of the measure has decreased below the defined threshold. This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', NULL, NULL, 'baseline_sd,threshold_value', 'stdev_value, 80', 'Std Deviation at Baseline,Expected Minimum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected minimum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. A decrease in particular could indicate an improved process, better precision in measurement, the elimination of outliers, or a more homogeneous cohort. 
', 'Y'), - ('1042', 'Valid_Month', 'Valid Month', 'Valid calendar month in expected format', 'Tests for the presence of a valid representation of a calendar month consistent with the format at baseline.', 'Column values are not a valid representation of a calendar month consistent with the format at baseline.', 'Invalid months', NULL, 'functional_data_type = ''Period Month''', NULL, NULL, 'threshold_value,baseline_value', '0,CASE WHEN max_length > 3 AND initcap(min_text) = min_text THEN ''''''January'''',''''February'''',''''March'''',''''April'''',''''May'''',''''June'''',''''July'''',''''August'''',''''September'''',''''October'''',''''November'''',''''December'''''' WHEN max_length > 3 AND upper(min_text) = min_text THEN ''''''JANUARY'''',''''FEBRUARY'''',''''MARCH'''',''''APRIL'''',''''MAY'''',''''JUNE'''',''''JULY'''',''''AUGUST'''',''''SEPTEMBER'''',''''OCTOBER'''',''''NOVEMBER'''',''''DECEMBER'''''' WHEN max_length > 3 AND lower(min_text) = min_text THEN ''''''january'''',''''february'''',''''march'''',''''april'''',''''may'''',''''june'''',''''july'''',''''august'''',''''september'''',''''october'''',''''november'''',''''december'''''' WHEN max_length = 3 AND initcap(min_text) = min_text THEN ''''''Jan'''',''''Feb'''',''''Mar'''',''''Apr'''',''''May'''',''''Jun'''',''''Jul'''',''''Aug'''',''''Sep'''',''''Oct'''',''''Nov'''',''''Dec'''''' WHEN max_length = 3 AND upper(min_text) = min_text THEN ''''''JAN'''',''''FEB'''',''''MAR'''',''''APR'''',''''MAY'''',''''JUN'''',''''JUL'''',''''AUG'''',''''SEP'''',''''OCT'''',''''NOV'''',''''DEC'''''' WHEN max_length = 3 AND lower(min_text) = min_text THEN ''''''jan'''',''''feb'''',''''mar'''',''''apr'''',''''may'''',''''jun'''',''''jul'''',''''aug'''',''''sep'''',''''oct'''',''''nov'''',''''dec'''''' WHEN max_length = 2 AND min_text = ''01'' THEN ''''''01'''',''''02'''',''''03'''',''''04'''',''''05'''',''''06'''',''''07'''',''''08'''',''''09'''',''''10'''',''''11'''',''''12'''''' WHEN max_length = 2 AND min_text = ''1'' THEN ''''''1'''',''''2'''',''''3'''',''''4'''',''''5'''',''''6'''',''''7'''',''''8'''',''''9'''',''''10'''',''''11'''',''''12'''''' WHEN min_value = 1 THEN ''1,2,3,4,5,6,7,8,9,10,11,12'' ELSE ''NULL'' END', 'Threshold Invalid Months,Valid Month List', 'The acceptable number of records with invalid months present.|List of valid month values for this field, in quotes if field is numeric, separated by commas.', 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of invalid months', NULL, 'N'), - ('1043', 'Valid_Characters', 'Valid Characters', 'Column contains no invalid characters', 'Tests for the presence of non-printing characters, leading spaces, or surrounding quotes.', 'Invalid characters, such as non-printing characters, leading spaces, or surrounding quotes, were found.', 'Invalid records', 'Expected count of values with invalid characters', 'general_type = ''A''', NULL, NULL, 'threshold_value', '0', NULL, 'The acceptable number of records with invalid character values present.', 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', 'This test looks for the presence of non-printing ASCII characters that are considered non-standard in basic text processing. It also identifies leading spaces and values enclosed in quotes. 
Values that fail this test may be artifacts of data conversion, or just more difficult to process or analyze downstream.', 'N'), - ('1044', 'Valid_US_Zip', 'Valid US Zip', 'Valid USA Postal Codes', 'Tests that postal codes match the 5 or 9 digit standard US format', 'Invalid US Zip Code formats found.', 'Invalid Zip Codes', 'Expected count of values with invalid Zip Codes', 'functional_data_type = ''Zip''', NULL, NULL, 'threshold_value', '0', NULL, NULL, 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', NULL, 'Y'), - ('1045', 'Valid_US_Zip3', 'Valid US Zip-3 ', 'Valid USA Zip-3 Prefix', 'Tests that postal codes match the 3 digit format of a regional prefix.', 'Invalid 3-digit US Zip Code regional prefix formats found.', 'Invalid Zip-3 Prefix', 'Expected count of values with invalid Zip-3 Prefix Codes', 'functional_data_type = ''Zip3''', NULL, NULL, 'threshold_value', '0', NULL, NULL, 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Zip3 Count', 'This test looks for the presence of values that fail to match the three-digit numeric code expected for US Zip Code regional prefixes. These prefixes are often used to roll up Zip Code data to a regional level, and may be critical to anonymize detailed data and protect PID. Depending on your needs and regulatory requirements, longer zip codes could place PID at risk.', 'Y'), - ('1006', 'Condition_Flag', 'Custom Condition', 'Column values match pre-defined condition', 'Tests that each record in the table matches a pre-defined, custom condition', 'Value(s) found not matching defined condition.', 'Values Failing', NULL, NULL, 'Test Focus', 'Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. Example: `Quantity Consistency` if you are testing that quantity ordered matches quantity shipped.', 'threshold_value,custom_query', NULL, 'Threshold Error Count,Custom SQL Expression (TRUE on error)', 'The number of errors that are acceptable before test fails.|Expression should evaluate to TRUE to register an error or FALSE if no error. An expression can reference only columns in the selected table.', 'Fail', 'CAT', 'custom', 'Validity', 'Schema Drift', 'Count of records that don''t meet test condition', 'Custom Condition is a business-rule test for a user-defined error condition based on the value of one or more columns. The condition is applied to each record within the table, and the count of records failing the condition is added up. If that count exceeds a threshold of errors, the test as a whole is failed. This test is ideal for error conditions that TestGen cannot automatically infer, and any condition that involves the values of more than one column in the same record. Performance of this test is fast, since it is performed together with other aggregate tests. 
Interpretation is based on the user-defined meaning of the test.', 'Y'), + (id, test_type, test_name_short, test_name_long, test_description, except_message, measure_uom, measure_uom_description, selection_criteria, dq_score_prevalence_formula, dq_score_risk_factor, column_name_prompt, column_name_help, default_parm_columns, default_parm_values, default_parm_prompts, default_parm_help, default_severity, run_type, test_scope, dq_dimension, health_dimension, threshold_description, usage_notes, active) +VALUES ('1004', 'Alpha_Trunc', 'Alpha Truncation', 'Maximum character count consistent', 'Tests that the maximum count of characters in a column value has not dropped vs. baseline data', 'Maximum length of values has dropped from prior expected length.', 'Values over max', NULL, 'general_type =''A'' AND max_length > 0 AND ( (min_length = avg_length AND max_length = avg_length) OR (numeric_ct <> value_ct ) ) AND functional_table_type NOT LIKE ''%window%'' /* The conditions below are to eliminate overlap with : LOV_Match (excluded selection criteria for this test_type), Pattern_Match (excluded selection criteria for this test_type), Constant (excluded functional_data_type Constant and Boolean) */ AND ( (distinct_value_ct NOT BETWEEN 2 AND 10 AND functional_data_type NOT IN ( ''Constant'', ''Boolean'') ) AND NOT ( fn_charcount(top_patterns, E'' \| '' ) = 1 AND fn_charcount(top_patterns, E'' \| '' ) IS NOT NULL AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > ''''))', '{VALUE_CT}::FLOAT * (FN_NORMAL_CDF(({MAX_LENGTH}::FLOAT - {AVG_LENGTH}::FLOAT) / ({MAX_LENGTH}::FLOAT / 3)) - FN_NORMAL_CDF(({RESULT_MEASURE}::FLOAT - {AVG_LENGTH}::FLOAT) / ({MAX_LENGTH}::FLOAT / 3)) ) /{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'threshold_value', 'max_length', 'Maximum String Length at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Maximum length expected', 'Alpha Truncation tests that the longest text value in a column hasn''t become shorter than the longest value at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset.', 'Y'), + ('1005', 'Avg_Shift', 'Average Shift', 'Column mean is consistent with reference', 'Tests for statistically-significant shift in mean value for column from average calculated at baseline.', 'Standardized difference between averages is over the selected threshold level.', 'Difference Measure', 'Cohen''s D Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', 'LEAST({RESULT_MEASURE}, 1.0)', '0.75', NULL, NULL, 'baseline_value_ct,baseline_avg,baseline_sd,threshold_value', 'value_ct,avg_value,stdev_value,0.5::VARCHAR', 'Value Ct at Baseline,Mean at Baseline,Std Deviation at Baseline,Threshold Difference Measure ', NULL, 'Warning', 'CAT', 'column', 'Consistency', 'Data Drift', 'Standardized Difference Measure', 'Average Shift tests that the average of a numeric column has not significantly changed since baseline, when profiling was done. 
A significant shift may indicate errors in processing, differences in source data, or valid changes that may nevertheless impact assumptions in downstream data products. The test uses Cohen''s D, a statistical technique to identify significant shifts in a value. Cohen''s D measures the difference between the two averages, reporting results on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. Depending on your data, some difference may be expected, so it''s reasonable to adjust the threshold value that triggers test failure. This test works well for measures, or even for identifiers if you expect them to increment consistently. You may want to periodically adjust the expected threshold, or even the expected average value if you expect shifting over time. Consider this test along with Variability Increase. If variability rises too, process or measurement flaws could be at work. If variability remains consistent, the issue is more likely to be with the source data itself. ', 'Y'), + ('1007', 'Constant', 'Constant Match', 'All column values match constant value', 'Tests that all values in the column match the constant value identified in baseline data', 'A constant value is expected for this column.', 'Mismatched values', NULL, 'TEMPLATE', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'baseline_value,threshold_value', NULL, 'Constant Value at Baseline,Threshold Error Count', 'The single, unchanging value of the column, per baseline|The number of errors that are acceptable before test fails.', 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Count of records with unexpected values', 'Constant Match tests that a single value determined to be a constant in baseline profiling is still the only value for the column that appears in subsequent versions of the dataset. Sometimes new data or business knowledge may reveal that the value is not a constant at all, even though only one value was present at profiling. In this case, you will want to disable this test. Alternatively, you can use the Value Match test to provide a limited number of valid values for the column.', 'Y'), + ('1009', 'Daily_Record_Ct', 'Daily Records', 'All dates present within date range', 'Tests for presence of every calendar date within min/max date range, per baseline data', 'Not every date value between min and max dates is present, unlike at baseline.', 'Missing dates', NULL, 'general_type= ''D'' AND date_days_present > 21 AND date_days_present - (DATEDIFF(''day'', ''1800-01-05''::DATE, max_date) - DATEDIFF(''day'', ''1800-01-05''::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/{DATE_DAYS_PRESENT}::FLOAT/{RECORD_CT}::FLOAT', '0.75', NULL, NULL, 'threshold_value', '0', 'Threshold Missing Calendar Days', NULL, 'Warning', 'CAT', 'column', 'Completeness', 'Volume', 'Missing calendar days within min/max range', 'Daily Records tests that at least one record is present for every day within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each day. A failure here would suggest missing records for the number of days identified without data. You can adjust the threshold to accept a number of days that you know legitimately have no records. 
', 'Y'), + ('1011', 'Dec_Trunc', 'Decimal Truncation', 'Sum of fractional values at or above reference', 'Tests for decimal truncation by confirming that the sum of fractional values in data is no less than the sum at baseline', 'The sum of fractional values is under baseline, which may indicate decimal truncation', 'Fractional sum', 'The sum of all decimal values from all data for this column', 'fractional_sum IS NOT NULL AND functional_table_type LIKE''%cumulative%''', '1', '1.0', NULL, NULL, 'threshold_value', 'ROUND(fractional_sum, 0)', 'Sum of Fractional Values at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Minimum expected sum of all fractional values', 'Decimal Truncation tests that the fractional (decimal) part of a numeric column has not been truncated since Baseline. This works by summing all the fractional values after the decimal point and confirming that the total is at least equal to the fractional total at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset.', 'Y'), + ('1012', 'Distinct_Date_Ct', 'Date Count', 'Count of distinct dates at or above reference', 'Tests that the count of distinct dates referenced in the column has not dropped vs. baseline data', 'Drop in count of unique dates recorded in column.', 'Unique dates', 'Count of unique dates in transactional date column', 'general_type=''D'' and date_days_present IS NOT NULL AND functional_table_type NOT LIKE ''%window%''', '(({RECORD_CT}-{PRO_RECORD_CT})::FLOAT*{DISTINCT_VALUE_CT}::FLOAT/{PRO_RECORD_CT}::FLOAT)/{PRO_RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'baseline_value,threshold_value', 'date_days_present,date_days_present', 'Distinct Date Count at Baseline,Min Expected Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Minimum distinct date count expected', 'Date Count tests that the count of distinct dates present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained. A failure here would indicate missing records, which could be caused by a processing error or changed upstream data sources.', 'Y'), + ('1013', 'Distinct_Value_Ct', 'Value Count', 'Count of distinct values has not dropped', 'Tests that the count of unique values in the column has not changed from baseline.', 'Count of unique values in column has changed from baseline.', 'Unique Values', NULL, 'distinct_value_ct between 2 and 10 AND value_ct > 0 AND NOT (coalesce(top_freq_values,'''') > '''' AND distinct_value_ct BETWEEN 2 and 10) AND NOT (lower(functional_data_type) LIKE ''%sequence%'' OR lower(functional_data_type) LIKE ''%measurement%'' OR functional_data_type LIKE ''%date%'' OR general_type = ''D'')', 'ABS({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/{DISTINCT_VALUE_CT}::FLOAT', '1.0', NULL, NULL, 'baseline_value_ct,threshold_value', 'distinct_value_ct,distinct_value_ct', 'Distinct Value Count at Baseline,Min Expected Value Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected distinct value count', 'Value Count tests that the count of unique values present in the column has not dropped since baseline. 
The test is relevant for cumulative datasets, where old records are retained, or for any dataset where you would expect a set number of distinct values should be present. A failure here would indicate missing records or a change in categories or value assignment.', 'Y'), + ('1014', 'Email_Format', 'Email Format', 'Email is correctly formatted', 'Tests that non-blank, non-empty email addresses match the standard format', 'Invalid email address formats found.', 'Invalid emails', 'Number of emails that do not match standard format', 'std_pattern_match=''EMAIL''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'threshold_value', '0', 'Maximum Invalid Email Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of invalid email addresses', NULL, 'Y'), + ('1015', 'Future_Date', 'Past Dates', 'Latest date is prior to test run date', 'Tests that the maximum date referenced in the column is no greater than the test date, consistent with baseline data', 'Future date found when absent in baseline data.', 'Future dates', NULL, 'general_type=''D''AND future_date_ct = 0', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'threshold_value', '0', 'Maximum Future Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected count of future dates', NULL, 'Y'), + ('1016', 'Future_Date_1Y', 'Future Year', 'Future dates within year of test run date', 'Tests that the maximum date referenced in the column is no greater than one year beyond the test date, consistent with baseline data', 'Future date beyond one-year found when absent in baseline.', 'Future dates post 1 year', NULL, 'general_type=''D''AND future_date_ct > 0 AND max_date <=''{AS_OF_DATE}''::DATE + INTERVAL''365 DAYS''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'threshold_value', '0', 'Maximum Post 1-Year Future Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected count of future dates beyond one year', 'Future Year looks for date values in the column that extend beyond one year after the test date. This would be appropriate for transactional dates where you would expect to find dates in the near future, but not beyond one year ahead. Errors could indicate invalid entries or possibly dummy dates representing blank values.', 'Y'), + ('1017', 'Incr_Avg_Shift', 'New Shift', 'New record mean is consistent with reference', 'Tests for statistically-significant shift in mean of new values for column compared to average calculated at baseline.', 'Significant shift in average of new values vs. 
baseline avg', 'Z-score of mean shift', 'Absolute Z-score (number of SD''s outside mean) of prior avg - incremental avg', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', '{RECORD_CT}::FLOAT*(1-FN_NORMAL_CDF({RESULT_MEASURE}::FLOAT))/{RECORD_CT}::FLOAT', '0.75', NULL, NULL, 'baseline_value_ct,baseline_sum,baseline_avg,baseline_sd,threshold_value', 'value_ct,(avg_value * value_ct)::FLOAT,avg_value,stdev_value,2', 'Value Count at Baseline,Sum at Baseline,Mean Value at Baseline,Std Deviation at Baseline,Threshold Max Z-Score', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Maximum Z-Score (number of SD''s beyond mean) expected', 'This is a more sensitive test than Average Shift, because it calculates an incremental difference in the average of new values compared to the average of values at baseline. This is appropriate for a cumulative dataset only, because it calculates the average of new entries based on the assumption that the count and average of records present at baseline are still present at the time of the test. This test compares the mean of new values with the standard deviation of the baseline average to calculate a Z-score. If the new mean falls outside the Z-score threshold, a shift is detected. Potential Z-score thresholds may range from 0 to 3, depending on the sensitivity you prefer. A failed test could indicate a quality issue or a legitimate shift in new data that should be noted and assessed by business users. Consider this test along with Variability Increase. If variability rises too, process, methodology or measurement flaws could be at issue. If variability remains consistent, the problem is more likely to be with the source data itself.', 'Y'), + ('1018', 'LOV_All', 'Value Match All', 'List of expected values all present in column', 'Tests that all values match a pipe-delimited list of expected values and that all expected values are present', 'Column values found don''t exactly match the expected list of values', 'Values found', NULL, NULL, '1', '1.0', NULL, NULL, 'threshold_value', NULL, 'List of Expected Values', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'List of values expected, in form (''Val1'',''Val2)', 'This is a more restrictive form of Value Match, testing that all values in the dataset match the list provided, and also that all values present in the list appear at least once in the dataset. 
This would be appropriate for tables where all category values in the column are represented at least once.', 'Y'), + ('1019', 'LOV_Match', 'Value Match', 'All column values present in expected list', 'Tests that all values in the column match the list-of-values identified in baseline data.', 'Values not matching expected List-of-Values from baseline.', 'Non-matching records', NULL, 'top_freq_values > '''' AND distinct_value_ct BETWEEN 2 and 10 AND NOT (functional_data_type LIKE ''%date%'' OR lower(datatype_suggestion) LIKE ''%date%'' OR general_type = ''D'' OR lower(column_name) IN (''file_name'', ''filename''))', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'baseline_value,threshold_value', '''('' || SUBSTRING( CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 2) > '''' THEN '','''''' || TRIM( REPLACE ( SPLIT_PART(top_freq_values, ''|'' , 2), '''''''' , '''''''''''' ) ) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 4) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 4), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 6) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 6), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 8) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 8), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 10) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 10), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 12) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 12), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 14) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 14), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 16) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 16), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 18) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 18), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 20) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 20), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END, 2, 999) || '')'',0', 'List of Expected Values,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'List of values expected, in form (''Val1'',''Val2)', 'This tests that all values in the column match the hard-coded list provided. This is relevant when the list of allowable values is small and not expected to change often. 
Even if new values might occasionally be added, this test is useful for downstream data products to provide warning that assumptions and logic may need to change.', 'Y'), + ('1020', 'Min_Date', 'Minimum Date', 'All dates on or after set minimum', 'Tests that the earliest date referenced in the column is no earlier than baseline data', 'The earliest date value found is before the earliest value at baseline.', 'Dates prior to limit', NULL, 'general_type=''D''and min_date IS NOT NULL AND distinct_value_ct > 1', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'baseline_value,threshold_value', 'min_date,0', 'Minimum Date at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of dates prior to minimum', 'This test is appropriate for a cumulative dataset only, because it assumes all prior values are still present. It''s appropriate where new records are added with more recent dates, but old dates do not change.', 'Y'), + ('1021', 'Min_Val', 'Minimum Value', 'All values at or above set minimum', 'Tests that the minimum value present in the column is no lower than the minimum value in baseline data', 'Minimum column value less than baseline.', 'Values under limit', NULL, 'general_type=''N''and min_value IS NOT NULL AND (distinct_value_ct >= 2 OR (distinct_value_ct=2 and min_value<>0 and max_value<>1))', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'baseline_value,threshold_value', 'min_value,0', 'Minimum Value at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of values under limit', 'This test is appropriate for a cumulative dataset only, assuming all prior values are still present. It is also appropriate for any measure that has an absolute, definable minimum value, or a heuristic that makes sense for valid data.', 'Y'), + ('1022', 'Missing_Pct', 'Percent Missing', 'Consistent ratio of missing values', 'Tests for statistically-significant shift in percentage of missing values in column vs. baseline data', 'Significant shift in percent of missing values vs. baseline.', 'Difference measure', 'Cohen''s H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'record_ct <> value_ct', 'LEAST({RESULT_MEASURE}, 1.0)', '0.75', NULL, NULL, 'baseline_ct,baseline_value_ct,threshold_value', 'record_ct,value_ct,2::VARCHAR(10)', 'Baseline Record Count,Baseline Value Count,Standardized Difference Measure', NULL, 'Warning', 'CAT', 'column', 'Completeness', 'Data Drift', 'Expected maximum Cohen''s H Difference', 'This test uses Cohen''s H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. An uptick in missing data may indicate a collection issue at the source. A larger change may indicate a processing failure. A drop in missing data may also be significant, if it affects assumptions built into analytic products downstream. 
You can refine the expected threshold value as you view legitimate results of the measure over time.', 'Y'), + ('1023', 'Monthly_Rec_Ct', 'Monthly Records', 'At least one date per month present within date range', 'Tests for presence of at least one date per calendar month within min/max date range, per baseline data', 'At least one date per month expected in min/max date range.', 'Missing months', 'Calendar months without date values present', 'general_type= ''D'' AND date_days_present > 1 AND date_months_present > 2 AND date_months_present - (datediff( ''MON'' , min_date, max_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/{DATE_MONTHS_PRESENT}::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Count of Months without Dates', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Volume', 'Expected maximum count of calendar months without dates present', 'Monthly Records tests that at least one record is present for every calendar month within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each month. A failure here would suggest missing records for the number of months identified without data. You can adjust the threshold to accept a number of months that you know legitimately have no records.', 'Y'), + ('1024', 'Outlier_Pct_Above', 'Outliers Above', 'Consistent outlier counts over 2 SD above mean', 'Tests that percent of outliers over 2 SD above Mean doesn''t exceed threshold', 'Percent of outliers exceeding 2 SD above the mean is greater than expected threshold.', 'Pct records over limit', NULL, 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', 'GREATEST(0, {RESULT_MEASURE}::FLOAT-{THRESHOLD_VALUE}::FLOAT)', '0.75', NULL, NULL, 'baseline_avg,baseline_sd,threshold_value', 'avg_value,stdev_value,0.05', 'Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct records over upper 2 SD limit', 'This test counts the number of data points that may be considered as outliers, determined by whether their value exceeds 2 standard deviations above the mean at baseline. Assuming a normal distribution, a small percentage (defaulted to 5%) of outliers is expected. The actual number may vary for different distributions. The expected threshold reflects the maximum percentage of outliers you expect to see. This test uses the baseline mean rather than the mean for the latest dataset to capture systemic shift as well as individual outliers. 
', 'Y'), + ('1025', 'Outlier_Pct_Below', 'Outliers Below', 'Consistent outlier counts under 2 SD below mean', 'Tests that percent of outliers over 2 SD below Mean doesn''t exceed threshold', 'Percent of outliers exceeding 2 SD below the mean is greater than expected threshold.', 'Pct records under limit', NULL, 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', 'GREATEST(0, {RESULT_MEASURE}::FLOAT-{THRESHOLD_VALUE}::FLOAT)', '0.75', NULL, NULL, 'baseline_avg,baseline_sd,threshold_value', 'avg_value,stdev_value,0.05', 'Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct records over lower 2 SD limit', 'This test counts the number of data points that may be considered as outliers, determined by whether their value exceeds 2 standard deviations below the mean at baseline. Assuming a normal distribution, a small percentage (defaulted to 5%) of outliers is expected. The actual number may vary for different distributions. The expected threshold reflects the maximum percentage of outliers you expect to see. This test uses the baseline mean rather than the mean for the latest dataset to capture systemic shift as well as individual outliers. ', 'Y'), + ('1026', 'Pattern_Match', 'Pattern Match', 'Column values match alpha-numeric pattern', 'Tests that all values in the column match the same alpha-numeric pattern identified in baseline data', 'Alpha values do not match consistent pattern in baseline.', 'Pattern Mismatches', NULL, 'fn_charcount(top_patterns, E'' \| '' ) = 1 AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > '''' AND distinct_value_ct > 10', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'baseline_value,threshold_value', 'TRIM(REPLACE(REPLACE(REPLACE(REGEXP_REPLACE(SPLIT_PART(top_patterns, '' | '', 2), ''([*+\-%_])'', ''[\1]'', ''g''), ''A'', ''[A-Z]''), ''N'', ''[0-9]''), ''a'', ''[a-z]'')),0', 'Pattern at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of pattern mismatches', 'This test is appropriate for character fields that are expected to appear in a consistent format. It uses pattern matching syntax as appropriate for your database: REGEX matching if available, otherwise LIKE expressions. 
The expected threshold is the number of records that fail to match the defined pattern.', 'Y'), + ('1028', 'Recency', 'Recency', 'Latest date within expected range of test date', 'Tests that the latest date in column is within a set number of days of the test date', 'Most recent date value not within expected days of test date.', 'Days before test', 'Number of days that most recent date precedes the date of test', 'general_type= ''D'' AND max_date <= run_date AND NOT column_name IN ( ''filedate'' , ''file_date'' ) AND NOT functional_data_type IN (''Future Date'', ''Schedule Date'') AND DATEDIFF( ''DAY'' , max_date, run_date) <= 62', '(ABS({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/(1.0+DATEDIFF(''DAY'', ''{MIN_DATE}'', ''{MAX_DATE}''))::FLOAT)/{RECORD_CT}::FLOAT', '0.75', NULL, NULL, 'threshold_value', 'CASE WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 3 THEN DATEDIFF(''DAY'', max_date, run_date) + 3 WHEN DATEDIFF(''DAY'', max_date, run_date) <= 7 then DATEDIFF(''DAY'', max_date, run_date) + 7 WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 7.0) * 7 WHEN DATEDIFF( ''DAY'' , max_date, run_date) > 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 30.0) * 30 END', 'Threshold Maximum Days before Test', NULL, 'Warning', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected maximum count of days preceding test date', 'This test evaluates recency based on the latest referenced dates in the column. The test is appropriate for transactional dates and timestamps. The test can be especially valuable because timely data deliveries themselves may not assure that the most recent data is present. You can adjust the expected threshold to the maximum number of days that you expect the data to age before the dataset is refreshed. ', 'Y'), + ('1030', 'Required', 'Required Entry', 'Required non-null value present', 'Tests that a non-null value is present in each record for the column, consistent with baseline data', 'Every record for this column is expected to be filled, but some are missing.', 'Missing values', NULL, 'record_ct = value_ct', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Missing Value Count', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Schema Drift', 'Expected count of missing values', NULL, 'Y'), + ('1033', 'Street_Addr_Pattern', 'Street Address', 'Enough street address entries match defined pattern', 'Tests for percent of records matching standard street address pattern.', 'Percent of values matching standard street address format is under expected threshold.', 'Percent matches', 'Percent of records that match street address pattern', '(std_pattern_match=''STREET_ADDR'') AND (avg_length <> round(avg_length)) AND (avg_embedded_spaces BETWEEN 2 AND 6) AND (avg_length < 35)', '({VALUE_CT}::FLOAT * ({RESULT_MEASURE}::FLOAT - {THRESHOLD_VALUE}::FLOAT)/100.0)/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'threshold_value', '75', 'Threshold Pct that Match Address Pattern', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected percent of records that match standard street address pattern', 'The street address pattern used in this test should match the vast majority of USA addresses. 
You can adjust the threshold percent of matches based on the results you are getting -- you may well want to tighten it to make the test more sensitive to invalid entries.', 'Y'), + ('1034', 'Unique', 'Unique Values', 'Each column value is unique', 'Tests that no values for the column are repeated in multiple records.', 'Column values should be unique per row.', 'Duplicate values', 'Count of non-unique values', 'record_ct > 500 and record_ct = distinct_value_ct and value_ct > 0', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Duplicate Value Count', NULL, 'Fail', 'CAT', 'column', 'Uniqueness', 'Schema Drift', 'Expected count of duplicate values', 'This test is ideal when the database itself does not enforce a primary key constraint on the table. It serves as an independent check on uniqueness. It''s also useful when there are a small number of exceptions to uniqueness, which can be reflected in the expected threshold count of duplicates.', 'Y'), + ('1035', 'Unique_Pct', 'Percent Unique', 'Consistent ratio of unique values', 'Tests for statistically-significant shift in percentage of unique values vs. baseline data.', 'Significant shift in percent of unique values vs. baseline.', 'Difference measure', 'Cohen''s H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'distinct_value_ct > 10', 'LEAST({RESULT_MEASURE}, 1.0)', '0.75', NULL, NULL, 'baseline_value_ct,baseline_unique_ct,threshold_value', 'value_ct,distinct_value_ct,0.5', 'Value Count at Baseline,Distinct Value Count at Baseline,Standardized Difference Measure (0 to 1)', NULL, 'Warning', 'CAT', 'column', 'Uniqueness', 'Data Drift', 'Expected maximum Cohen''s H Difference', 'You can think of this as a test of similarity that measures whether the percentage of unique values is consistent with the percentage at baseline. A significant change might indicate duplication or a telling shift in cardinality between entities. The test uses Cohen''s H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. 
You can refine the expected threshold value as you view legitimate results of the measure over time.', 'Y'), + ('1036', 'US_State', 'US State', 'Column value is two-letter US state code', 'Tests that the recorded column value is a valid US state.', 'Column Value is not a valid US state.', 'Not US States', 'Values that do not match 2-character US state abbreviations.', 'general_type= ''A'' AND column_name ILIKE ''%state%'' AND distinct_value_ct < 70 AND max_length = 2', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Count not Matching State Abbreviations', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of values that are not US state abbreviations', 'This test validates entries against a fixed list of two-character US state codes and related Armed Forces codes.', 'Y'), + ('1037', 'Weekly_Rec_Ct', 'Weekly Records', 'At least one date per week present within date range', 'Tests for presence of at least one date per calendar week within min/max date range, per baseline data', 'At least one date per week expected in min/max date range.', 'Missing weeks', 'Calendar weeks without date values present', 'general_type= ''D'' AND date_days_present > 1 AND date_weeks_present > 3 AND date_weeks_present - (DATEDIFF(''week'', ''1800-01-05''::DATE, max_date) - DATEDIFF(''week'', ''1800-01-05''::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/{DATE_WEEKS_PRESENT}::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Weeks without Dates', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Volume', 'Expected maximum count of calendar weeks without dates present', 'Weekly Records tests that at least one record is present for every calendar week within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each week. A failure here would suggest missing records for the number of weeks identified without data. You can adjust the threshold to accept a number of weeks that you know legitimately have no records.', 'Y'), + ('1040', 'Variability_Increase', 'Variability Increase', 'Variability has increased above threshold', 'Tests that the spread or dispersion of column values has increased significantly over baseline, indicating a drop in stability of the measure.', 'The Standard Deviation of the measure has increased beyond the defined threshold. This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', '1', '0.75', NULL, NULL, 'baseline_sd,threshold_value', 'stdev_value,120', 'Std Deviation at Baseline,Expected Maximum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. 
A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. An increase in particular could mark new problems in measurement, a more heterogeneous cohort, or that significant outliers have been introduced. Consider this test along with Average Shift and New Shift. If the average shifts as well, there may be a fundamental shift in the dataset or process used to collect the data point. This might suggest a data shift that should be noted and assessed by business users. If the average does not shift, this may point to a data quality or data collection problem. ', 'Y'), + ('1041', 'Variability_Decrease', 'Variability Decrease', 'Variability has decreased below threshold', 'Tests that the spread or dispersion of column values has decreased significantly over baseline, indicating a shift in stability of the measure. This could signal a change in a process or a data quality issue.', 'The Standard Deviation of the measure has decreased below the defined threshold. This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', '1', '0.75', NULL, NULL, 'baseline_sd,threshold_value', 'stdev_value, 80', 'Std Deviation at Baseline,Expected Minimum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected minimum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. A decrease in particular could indicate an improved process, better precision in measurement, the elimination of outliers, or a more homogeneous cohort. 
', 'Y'), + ('1042', 'Valid_Month', 'Valid Month', 'Valid calendar month in expected format', 'Tests for the presence of a valid representation of a calendar month consistent with the format at baseline.', 'Column values are not a valid representation of a calendar month consistent with the format at baseline.', 'Invalid months', NULL, 'functional_data_type = ''Period Month''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', NULL, NULL, 'threshold_value,baseline_value', '0,CASE WHEN max_length > 3 AND initcap(min_text) = min_text THEN ''''''January'''',''''February'''',''''March'''',''''April'''',''''May'''',''''June'''',''''July'''',''''August'''',''''September'''',''''October'''',''''November'''',''''December'''''' WHEN max_length > 3 AND upper(min_text) = min_text THEN ''''''JANUARY'''',''''FEBRUARY'''',''''MARCH'''',''''APRIL'''',''''MAY'''',''''JUNE'''',''''JULY'''',''''AUGUST'''',''''SEPTEMBER'''',''''OCTOBER'''',''''NOVEMBER'''',''''DECEMBER'''''' WHEN max_length > 3 AND lower(min_text) = min_text THEN ''''''january'''',''''february'''',''''march'''',''''april'''',''''may'''',''''june'''',''''july'''',''''august'''',''''september'''',''''october'''',''''november'''',''''december'''''' WHEN max_length = 3 AND initcap(min_text) = min_text THEN ''''''Jan'''',''''Feb'''',''''Mar'''',''''Apr'''',''''May'''',''''Jun'''',''''Jul'''',''''Aug'''',''''Sep'''',''''Oct'''',''''Nov'''',''''Dec'''''' WHEN max_length = 3 AND upper(min_text) = min_text THEN ''''''JAN'''',''''FEB'''',''''MAR'''',''''APR'''',''''MAY'''',''''JUN'''',''''JUL'''',''''AUG'''',''''SEP'''',''''OCT'''',''''NOV'''',''''DEC'''''' WHEN max_length = 3 AND lower(min_text) = min_text THEN ''''''jan'''',''''feb'''',''''mar'''',''''apr'''',''''may'''',''''jun'''',''''jul'''',''''aug'''',''''sep'''',''''oct'''',''''nov'''',''''dec'''''' WHEN max_length = 2 AND min_text = ''01'' THEN ''''''01'''',''''02'''',''''03'''',''''04'''',''''05'''',''''06'''',''''07'''',''''08'''',''''09'''',''''10'''',''''11'''',''''12'''''' WHEN max_length = 2 AND min_text = ''1'' THEN ''''''1'''',''''2'''',''''3'''',''''4'''',''''5'''',''''6'''',''''7'''',''''8'''',''''9'''',''''10'''',''''11'''',''''12'''''' WHEN min_value = 1 THEN ''1,2,3,4,5,6,7,8,9,10,11,12'' ELSE ''NULL'' END', 'Threshold Invalid Months,Valid Month List', 'The acceptable number of records with invalid months present.|List of valid month values for this field, in quotes if field is numeric, separated by commas.', 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of invalid months', NULL, 'N'), + ('1043', 'Valid_Characters', 'Valid Characters', 'Column contains no invalid characters', 'Tests for the presence of non-printing characters, leading spaces, or surrounding quotes.', 'Invalid characters, such as non-printing characters, leading spaces, or surrounding quotes, were found.', 'Invalid records', 'Expected count of values with invalid characters', 'general_type = ''A''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '0.75', NULL, NULL, 'threshold_value', '0', NULL, 'The acceptable number of records with invalid character values present.', 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', 'This test looks for the presence of non-printing ASCII characters that are considered non-standard in basic text processing. It also identifies leading spaces and values enclosed in quotes. 
Values that fail this test may be artifacts of data conversion, or just more difficult to process or analyze downstream.', 'N'), + ('1044', 'Valid_US_Zip', 'Valid US Zip', 'Valid USA Postal Codes', 'Tests that postal codes match the 5 or 9 digit standard US format', 'Invalid US Zip Code formats found.', 'Invalid Zip Codes', 'Expected count of values with invalid Zip Codes', 'functional_data_type = ''Zip''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '0.75', NULL, NULL, 'threshold_value', '0', NULL, NULL, 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', NULL, 'Y'), + ('1045', 'Valid_US_Zip3', 'Valid US Zip-3 ', 'Valid USA Zip-3 Prefix', 'Tests that postal codes match the 3 digit format of a regional prefix.', 'Invalid 3-digit US Zip Code regional prefix formats found.', 'Invalid Zip-3 Prefix', 'Expected count of values with invalid Zip-3 Prefix Codes', 'functional_data_type = ''Zip3''', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '0.75', NULL, NULL, 'threshold_value', '0', NULL, NULL, 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Zip3 Count', 'This test looks for the presence of values that fail to match the three-digit numeric code expected for US Zip Code regional prefixes. These prefixes are often used to roll up Zip Code data to a regional level, and may be critical to anonymize detailed data and protect PID. Depending on your needs and regulatory requirements, longer zip codes could place PID at risk.', 'Y'), + ('1006', 'Condition_Flag', 'Custom Condition', 'Column values match pre-defined condition', 'Tests that each record in the table matches a pre-defined, custom condition', 'Value(s) found not matching defined condition.', 'Values Failing', NULL, NULL, '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', 'Test Focus', 'Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. Example: `Quantity Consistency` if you are testing that quantity ordered matches quantity shipped.', 'threshold_value,custom_query', NULL, 'Threshold Error Count,Custom SQL Expression (TRUE on error)', 'The number of errors that are acceptable before test fails.|Expression should evaluate to TRUE to register an error or FALSE if no error. An expression can reference only columns in the selected table.', 'Fail', 'CAT', 'custom', 'Validity', 'Schema Drift', 'Count of records that don''t meet test condition', 'Custom Condition is a business-rule test for a user-defined error condition based on the value of one or more columns. The condition is applied to each record within the table, and the count of records failing the condition is added up. If that count exceeds a threshold of errors, the test as a whole is failed. This test is ideal for error conditions that TestGen cannot automatically infer, and any condition that involves the values of more than one column in the same record. Performance of this test is fast, since it is performed together with other aggregate tests. 
Interpretation is based on the user-defined meaning of the test.', 'Y'), - ('1031', 'Row_Ct', 'Row Count', 'Number of rows is at or above threshold', 'Tests that the count of records has not decreased from the baseline count.', 'Row count less than baseline count.', 'Row count', NULL, 'TEMPLATE', NULL, NULL, 'threshold_value', NULL, 'Threshold Minimum Record Count', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Volume', 'Expected minimum row count', 'Because this tests the row count against a constant minimum threshold, it''s appropriate for any dataset, as long as the number of rows doesn''t radically change from refresh to refresh. But it''s not responsive to change over time. You may want to adjust the threshold periodically if you are dealing with a cumulative dataset.', 'Y'), - ('1032', 'Row_Ct_Pct', 'Row Range', 'Number of rows within percent range of threshold', 'Tests that the count of records is within a percentage above or below the baseline count.', 'Row Count is outside of threshold percent of baseline count.', 'Percent of baseline', 'Row count percent above or below baseline', 'TEMPLATE', NULL, NULL, 'baseline_ct,threshold_value', NULL, 'Baseline Record Count,Threshold Pct Above or Below Baseline', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Volume', 'Expected percent window below or above baseline', 'This test is better than Row Count for an incremental or windowed dataset where you would expect the row count to range within a percentage of baseline.', 'Y'), + ('1031', 'Row_Ct', 'Row Count', 'Number of rows is at or above threshold', 'Tests that the count of records has not decreased from the baseline count.', 'Row count less than baseline count.', 'Row count', NULL, 'TEMPLATE', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{THRESHOLD_VALUE}::FLOAT', '1.0', NULL, NULL, 'threshold_value', NULL, 'Threshold Minimum Record Count', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Volume', 'Expected minimum row count', 'Because this tests the row count against a constant minimum threshold, it''s appropriate for any dataset, as long as the number of rows doesn''t radically change from refresh to refresh. But it''s not responsive to change over time. You may want to adjust the threshold periodically if you are dealing with a cumulative dataset.', 'Y'), + ('1032', 'Row_Ct_Pct', 'Row Range', 'Number of rows within percent range of threshold', 'Tests that the count of records is within a percentage above or below the baseline count.', 'Row Count is outside of threshold percent of baseline count.', 'Percent of baseline', 'Row count percent above or below baseline', 'TEMPLATE', '(100.0 - {RESULT_MEASURE}::FLOAT)/100.0', '1.0', NULL, NULL, 'baseline_ct,threshold_value', NULL, 'Baseline Record Count,Threshold Pct Above or Below Baseline', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Volume', 'Expected percent window below or above baseline', 'This test is better than Row Count for an incremental or windowed dataset where you would expect the row count to range within a percentage of baseline.', 'Y'), - ('1008', 'CUSTOM', 'Custom Test', 'Custom-defined business rule', 'Custom SQL Query Test', 'Errors were detected according to test definition.', 'Errors found', 'Count of errors identified by query', NULL, 'Test Focus', 'Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. 
Example: `Order Total Matches Detail` if you are testing that the total in one table matches the sum of lines in another.', 'custom_query', NULL, 'Custom SQL Query Returning Error Records', 'Query should return records indicating one or more errors. The test passes if no records are returned. Results of the query will be shown when you click `Review Source Data` for a failed test, so be sure to include enough data in your results to follow-up. \n\nA query can refer to any tables in the database. You must hard-code the schema or use `{DATA_SCHEMA}` to represent the schema defined for the Table Group.', 'Fail', 'QUERY', 'custom', 'Accuracy', 'Data Drift', 'Expected count of errors found by custom query', 'This business-rule test is highly flexible, covering any error state that can be expressed by a SQL query against one or more tables in the database. In operation, the user-defined query is embedded within a parent query returning the count of error rows identified. Any row returned by the query is interpreted as a single error condition in the test. Note that this query is run independently of other tests, and that performance will be slower, depending in large part on the efficiency of the query you write. Interpretation is based on the user-defined meaning of the test. Your query might be written to return errors in individual rows identified by joining tables. Or it might return an error based on a multi-column aggregate condition returning a single row if an error is found. This query is run separately when you click `Review Source Data` from Test Results, so be sure to include enough data in your results to follow-up. Interpretation is based on the user-defined meaning of the test.', 'Y'), + ('1008', 'CUSTOM', 'Custom Test', 'Custom-defined business rule', 'Custom SQL Query Test', 'Errors were detected according to test definition.', 'Errors found', 'Count of errors identified by query', NULL, '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', 'Test Focus', 'Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. Example: `Order Total Matches Detail` if you are testing that the total in one table matches the sum of lines in another.', 'custom_query', NULL, 'Custom SQL Query Returning Error Records', 'Query should return records indicating one or more errors. The test passes if no records are returned. Results of the query will be shown when you click `Review Source Data` for a failed test, so be sure to include enough data in your results to follow-up. \n\nA query can refer to any tables in the database. You must hard-code the schema or use `{DATA_SCHEMA}` to represent the schema defined for the Table Group.', 'Fail', 'QUERY', 'custom', 'Accuracy', 'Data Drift', 'Expected count of errors found by custom query', 'This business-rule test is highly flexible, covering any error state that can be expressed by a SQL query against one or more tables in the database. In operation, the user-defined query is embedded within a parent query returning the count of error rows identified. Any row returned by the query is interpreted as a single error condition in the test. Note that this query is run independently of other tests, and that performance will be slower, depending in large part on the efficiency of the query you write. Interpretation is based on the user-defined meaning of the test. 
Your query might be written to return errors in individual rows identified by joining tables. Or it might return an error based on a multi-column aggregate condition returning a single row if an error is found. This query is run separately when you click `Review Source Data` from Test Results, so be sure to include enough data in your results to follow-up. Interpretation is based on the user-defined meaning of the test.', 'Y'), - ('1500', 'Aggregate_Balance', 'Aggregate Balance', 'Aggregate values per group match reference', 'Tests for exact match in aggregate values for each set of column values vs. reference dataset', 'Aggregate measure per set of column values does not exactly match reference dataset.', 'Mismatched measures', NULL, NULL, 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition', 'Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of matching table|Matching table name|Agregate column expression in matching table: one of `SUM([column_name])` or `COUNT([column_name])`|Condition defining a subset of records in matching table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in matching table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in matching table (e.g. HAVING clause) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of group totals not matching aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations across two different tables. Both tables must be accessible at the same time. It''s ideal for confirming that two datasets exactly match -- that the sum of a measure or count of a value hasn''t changed or shifted between categories. Use this test to compare a raw and processed version of the same dataset, or to confirm that an aggregated table exactly matches the detail table that it''s built from. An error here means that one or more value combinations fail to match. 
New categories or combinations will cause failure.', 'Y'), - ('1501', 'Aggregate_Minimum', 'Aggregate Minimum', 'Aggregate values per group are at or above reference', 'Tests that aggregate values for each set of column values are at least the same as reference dataset', 'Aggregate measure per set of column values is not at least the same as reference dataset.', 'Mismatched measures', NULL, NULL, 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition', 'Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of reference table|Reference table name|Aggregate column expression in reference table (e.g. `SUM(sales)`)|Condition defining a subset of records in reference table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in reference table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in reference table (e.g. HAVING clause) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals below aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations, but requires a match or increase in the aggregate value, rather than an exact match, across two different tables. Both tables must be accessible at the same time. Use this to confirm that aggregate values have not dropped for any set of categories, even if some values may rise. This test is useful to compare an older and newer version of a cumulative dataset. An error here means that one or more values per category set fail to match or exceed the prior dataset. New categories or combinations are allowed (but can be restricted independently with a Combo_Match test). Both tables must be present to run this test.', 'Y'), - ('1502', 'Combo_Match', 'Reference Match', 'Column values or combinations found in reference', 'Tests for the presence of one or a set of column values in a reference table', 'Column value combinations are not found in reference table values.', 'Missing values', NULL, NULL, 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. \n\nDo not use continuous mesurements here. Do not use numeric values unless they represent discrete categories.', 'subset_condition,having_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition,match_having_condition', NULL, 'Record Subset Condition,Group Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns,Matching Record Subset Condition,Matching Group Subset Condition', 'Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in source table, written like a condition within a SQL HAVING clause (e.g. 
`SUM(sales) < 100`) - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to validate source table values (separated by commas)|Condition defining a subset of records in reference table to match against, written like a condition within a SQL WHERE clause - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in reference table, written like a condition within a SQL HAVING clause (e.g. `SUM(sales) < 100`) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Validity', 'Schema Drift', 'Expected count of non-matching value combinations', 'This test verifies that values, or combinations of values, that are present in the main table are also found in a reference table. This is a useful test for referential integrity between fact and dimension tables. You can also use it to confirm the validity of a code or category, or of combinations of values that should only be found together within each record, such as product/size/color. An error here means that one or more category combinations in the main table are not found in the reference table. Both tables must be present to run this test.', 'Y'), - ('1503', 'Distribution_Shift', 'Distribution Shift', 'Probability distribution consistent with reference', 'Tests the closeness of match between two distributions of aggregate measures across combinations of column values, using Jensen-Shannon Divergence test', 'Divergence between two distributions exceeds specified threshold.', 'Divergence level (0-1)', 'Jensen-Shannon Divergence, from 0 (identical distributions), to 1.0 (max divergence)', NULL, 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Do not use continuous mesurements here. Do not use numeric values unless they represent discrete categories.', 'subset_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition', NULL, 'Record Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns to Compare,Matching Record Subset Condition', 'Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to compare counts with source table values (separated by commas)|Condition defining a subset of records in reference table to match against, written like a condition within a SQL WHERE clause - OPTIONAL', 'Warning', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected maximum divergence level between 0 and 1', 'This test measures the similarity of two sets of counts per categories, by using their proportional counts as probability distributions. Using Jensen-Shannon divergence, a measure of relative entropy or difference between two distributions, the test assigns a score ranging from 0, meaning that the distributions are identical, to 1, meaning that the distributions are completely unrelated. This test can be used to compare datasets that may not match exactly, but should have similar distributions. For example, it is a useful sanity check for data from different sources that you would expect to have a consistent spread, such as shipment of building materials per state and construction projects by state. Scores can be compared over time even if the distributions are not identical -- a dataset can be expected to maintain a comparable divergence score with a reference dataset over time. 
Both tables must be present to run this test.', 'Y'), - ('1508', 'Timeframe_Combo_Gain', 'Timeframe No Drops', 'Latest timeframe has at least all value combinations from prior period', 'Tests that column values in most recent time-window include at least same as prior time window', 'Column values in most recent time-window don''t include all values in prior window.', 'Mismatched values', NULL, NULL, 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Make sure not to use continuous measurements here. Do not use numeric values unless they represent discrete categories.', 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', 'The date column used to define the time windows. This must be a DATE or DATETIME type.|Length in days of the time window. The test will compare the most recent period of days to the prior period of the same duration.|Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of missing value combinations', 'This test checks a single transactional table to verify that categorical values or combinations that are present in the most recent time window you define include at least all those found in the prior time window of the same duration. Missing values in the latest time window will trigger the test to fail. New values are permitted. Use this test to confirm that codes or categories are not lost across successive time periods in a transactional table.', 'Y'), - ('1509', 'Timeframe_Combo_Match', 'Timeframe Match', 'Column value combinations from latest timeframe same as prior period', 'Tests for presence of same column values in most recent time-window vs. prior time window', 'Column values don''t match in most recent time-windows.', 'Mismatched values', NULL, NULL, 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Do not use continuous measurements here. Do not use numeric values unless they represent discrete categories.', 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', NULL, 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of non-matching value combinations', 'This test checks a single transactional table (such as a fact table) to verify that categorical values or combinations that are present in the most recent time window you define match those found in the prior time window of the same duration. New or missing values in the latest time window will trigger the test to fail. Use this test to confirm the consistency in the occurrence of codes or categories across successive time periods in a transactional table.', 'Y'), + ('1500', 'Aggregate_Balance', 'Aggregate Balance', 'Aggregate values per group match reference', 'Tests for exact match in aggregate values for each set of column values vs. 
reference dataset', 'Aggregate measure per set of column values does not exactly match reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition', 'Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of matching table|Matching table name|Agregate column expression in matching table: one of `SUM([column_name])` or `COUNT([column_name])`|Condition defining a subset of records in matching table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in matching table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in matching table (e.g. HAVING clause) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of group totals not matching aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations across two different tables. Both tables must be accessible at the same time. It''s ideal for confirming that two datasets exactly match -- that the sum of a measure or count of a value hasn''t changed or shifted between categories. Use this test to compare a raw and processed version of the same dataset, or to confirm that an aggregated table exactly matches the detail table that it''s built from. An error here means that one or more value combinations fail to match. New categories or combinations will cause failure.', 'Y'), + ('1501', 'Aggregate_Minimum', 'Aggregate Minimum', 'Aggregate values per group are at or above reference', 'Tests that aggregate values for each set of column values are at least the same as reference dataset', 'Aggregate measure per set of column values is not at least the same as reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition', 'Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of reference table|Reference table name|Aggregate column expression in reference table (e.g. 
`SUM(sales)`)|Condition defining a subset of records in reference table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in reference table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in reference table (e.g. HAVING clause) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals below aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations, but requires a match or increase in the aggregate value, rather than an exact match, across two different tables. Both tables must be accessible at the same time. Use this to confirm that aggregate values have not dropped for any set of categories, even if some values may rise. This test is useful to compare an older and newer version of a cumulative dataset. An error here means that one or more values per category set fail to match or exceed the prior dataset. New categories or combinations are allowed (but can be restricted independently with a Combo_Match test). Both tables must be present to run this test.', 'Y'), + ('1502', 'Combo_Match', 'Reference Match', 'Column values or combinations found in reference', 'Tests for the presence of one or a set of column values in a reference table', 'Column value combinations are not found in reference table values.', 'Missing values', NULL, NULL, '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. \n\nDo not use continuous mesurements here. Do not use numeric values unless they represent discrete categories.', 'subset_condition,having_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition,match_having_condition', NULL, 'Record Subset Condition,Group Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns,Matching Record Subset Condition,Matching Group Subset Condition', 'Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in source table, written like a condition within a SQL HAVING clause (e.g. `SUM(sales) < 100`) - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to validate source table values (separated by commas)|Condition defining a subset of records in reference table to match against, written like a condition within a SQL WHERE clause - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in reference table, written like a condition within a SQL HAVING clause (e.g. `SUM(sales) < 100`) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Validity', 'Schema Drift', 'Expected count of non-matching value combinations', 'This test verifies that values, or combinations of values, that are present in the main table are also found in a reference table. This is a useful test for referential integrity between fact and dimension tables. You can also use it to confirm the validity of a code or category, or of combinations of values that should only be found together within each record, such as product/size/color. An error here means that one or more category combinations in the main table are not found in the reference table. 
Both tables must be present to run this test.', 'Y'), + ('1503', 'Distribution_Shift', 'Distribution Shift', 'Probability distribution consistent with reference', 'Tests the closeness of match between two distributions of aggregate measures across combinations of column values, using Jensen-Shannon Divergence test', 'Divergence between two distributions exceeds specified threshold.', 'Divergence level (0-1)', 'Jensen-Shannon Divergence, from 0 (identical distributions), to 1.0 (max divergence)', NULL, '1', '0.75', 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Do not use continuous mesurements here. Do not use numeric values unless they represent discrete categories.', 'subset_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition', NULL, 'Record Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns to Compare,Matching Record Subset Condition', 'Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to compare counts with source table values (separated by commas)|Condition defining a subset of records in reference table to match against, written like a condition within a SQL WHERE clause - OPTIONAL', 'Warning', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected maximum divergence level between 0 and 1', 'This test measures the similarity of two sets of counts per categories, by using their proportional counts as probability distributions. Using Jensen-Shannon divergence, a measure of relative entropy or difference between two distributions, the test assigns a score ranging from 0, meaning that the distributions are identical, to 1, meaning that the distributions are completely unrelated. This test can be used to compare datasets that may not match exactly, but should have similar distributions. For example, it is a useful sanity check for data from different sources that you would expect to have a consistent spread, such as shipment of building materials per state and construction projects by state. Scores can be compared over time even if the distributions are not identical -- a dataset can be expected to maintain a comparable divergence score with a reference dataset over time. Both tables must be present to run this test.', 'Y'), + ('1508', 'Timeframe_Combo_Gain', 'Timeframe No Drops', 'Latest timeframe has at least all value combinations from prior period', 'Tests that column values in most recent time-window include at least same as prior time window', 'Column values in most recent time-window don''t include all values in prior window.', 'Mismatched values', NULL, NULL, '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Make sure not to use continuous measurements here. Do not use numeric values unless they represent discrete categories.', 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', 'The date column used to define the time windows. This must be a DATE or DATETIME type.|Length in days of the time window. 
The test will compare the most recent period of days to the prior period of the same duration.|Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of missing value combinations', 'This test checks a single transactional table to verify that categorical values or combinations that are present in the most recent time window you define include at least all those found in the prior time window of the same duration. Missing values in the latest time window will trigger the test to fail. New values are permitted. Use this test to confirm that codes or categories are not lost across successive time periods in a transactional table.', 'Y'), + ('1509', 'Timeframe_Combo_Match', 'Timeframe Match', 'Column value combinations from latest timeframe same as prior period', 'Tests for presence of same column values in most recent time-window vs. prior time window', 'Column values don''t match in most recent time-windows.', 'Mismatched values', NULL, NULL, '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/{RECORD_CT}::FLOAT', '1.0', 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Do not use continuous measurements here. Do not use numeric values unless they represent discrete categories.', 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', NULL, 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of non-matching value combinations', 'This test checks a single transactional table (such as a fact table) to verify that categorical values or combinations that are present in the most recent time window you define match those found in the prior time window of the same duration. New or missing values in the latest time window will trigger the test to fail. 
Use this test to confirm the consistency in the occurrence of codes or categories across successive time periods in a transactional table.', 'Y'), - ('1504', 'Aggregate_Pct_Above', 'Aggregate Pct Above', 'Aggregate values per group exceed reference', 'Tests that aggregate values for each set of column values exceed values for reference dataset', 'Aggregate measure per set of column values fails to exceed the reference dataset.', 'Mismatched measures', NULL, NULL, 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts match_schema_name,TODO Fill in default_parm_prompts match_table_name,TODO Fill in default_parm_prompts match_column_names,TODO Fill in default_parm_prompts match_subset_condition,TODO Fill in default_parm_prompts match_groupby_names,TODO Fill in default_parm_prompts match_having_condition,TODO Fill in default_parm_prompts subset_condition,TODO Fill in default_parm_prompts groupby_names,TODO Fill in default_parm_prompts having_condition', NULL, 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals with not exceeding aggregate measure', NULL, 'N'), - ('1505', 'Aggregate_Pct_Within', 'Aggregate Pct Within', 'Aggregate values per group exceed reference', 'Tests that aggregate values for each set of column values exceed values for reference dataset', 'Aggregate measure per set of column values fails to exceed the reference dataset.', 'Mismatched measures', NULL, NULL, 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts match_schema_name,TODO Fill in default_parm_prompts match_table_name,TODO Fill in default_parm_prompts match_column_names,TODO Fill in default_parm_prompts match_subset_condition,TODO Fill in default_parm_prompts match_groupby_names,TODO Fill in default_parm_prompts match_having_condition,TODO Fill in default_parm_prompts subset_condition,TODO Fill in default_parm_prompts groupby_names,TODO Fill in default_parm_prompts having_condition', NULL, 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals with not exceeding aggregate measure', NULL, 'N'), - ('1506', 'Aggregate_Increase', 'Aggregate Increase', 'Aggregate values per group exceed reference', 'Tests that aggregate values for each set of column values exceed values for reference dataset', 'Aggregate measure per set of column values fails to exceed the reference dataset.', 'Mismatched measures', NULL, NULL, 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts match_schema_name,TODO Fill in default_parm_prompts match_table_name,TODO Fill in default_parm_prompts match_column_names,TODO Fill in default_parm_prompts match_subset_condition,TODO Fill in default_parm_prompts match_groupby_names,TODO Fill in default_parm_prompts 
match_having_condition,TODO Fill in default_parm_prompts subset_condition,TODO Fill in default_parm_prompts groupby_names,TODO Fill in default_parm_prompts having_condition', NULL, 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals below reference value', NULL, 'N') + ('1504', 'Aggregate_Pct_Above', 'Aggregate Pct Above', 'Aggregate values per group exceed reference', 'Tests that aggregate values for each set of column values exceed values for reference dataset', 'Aggregate measure per set of column values fails to exceed the reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts match_schema_name,TODO Fill in default_parm_prompts match_table_name,TODO Fill in default_parm_prompts match_column_names,TODO Fill in default_parm_prompts match_subset_condition,TODO Fill in default_parm_prompts match_groupby_names,TODO Fill in default_parm_prompts match_having_condition,TODO Fill in default_parm_prompts subset_condition,TODO Fill in default_parm_prompts groupby_names,TODO Fill in default_parm_prompts having_condition', NULL, 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals with not exceeding aggregate measure', NULL, 'N'), + ('1505', 'Aggregate_Pct_Within', 'Aggregate Pct Within', 'Aggregate values per group exceed reference', 'Tests that aggregate values for each set of column values exceed values for reference dataset', 'Aggregate measure per set of column values fails to exceed the reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts match_schema_name,TODO Fill in default_parm_prompts match_table_name,TODO Fill in default_parm_prompts match_column_names,TODO Fill in default_parm_prompts match_subset_condition,TODO Fill in default_parm_prompts match_groupby_names,TODO Fill in default_parm_prompts match_having_condition,TODO Fill in default_parm_prompts subset_condition,TODO Fill in default_parm_prompts groupby_names,TODO Fill in default_parm_prompts having_condition', NULL, 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals with not exceeding aggregate measure', NULL, 'N'), + ('1506', 'Aggregate_Increase', 'Aggregate Increase', 'Aggregate values per group exceed reference', 'Tests that aggregate values for each set of column values exceed values for reference dataset', 'Aggregate measure per set of column values fails to exceed the reference dataset.', 'Mismatched measures', NULL, NULL, '1', '1.0', 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts match_schema_name,TODO Fill in default_parm_prompts 
match_table_name,TODO Fill in default_parm_prompts match_column_names,TODO Fill in default_parm_prompts match_subset_condition,TODO Fill in default_parm_prompts match_groupby_names,TODO Fill in default_parm_prompts match_having_condition,TODO Fill in default_parm_prompts subset_condition,TODO Fill in default_parm_prompts groupby_names,TODO Fill in default_parm_prompts having_condition', NULL, 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals below reference value', NULL, 'N') ; @@ -302,7 +297,7 @@ VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', ' ('4002', 'Avg_Shift', 'postgresql', 'ABS( (AVG({COLUMN_NAME}::FLOAT) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})::FLOAT-1)*STDDEV({COLUMN_NAME})^2 + ({BASELINE_VALUE_CT}::FLOAT-1) * {BASELINE_SD}::FLOAT^2) /NULLIF(COUNT({COLUMN_NAME})::FLOAT + {BASELINE_VALUE_CT}::FLOAT, 0) ))', '>=', '{THRESHOLD_VALUE}'), ('4003', 'Condition_Flag', 'postgresql', 'SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('4004', 'Constant', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('4005', 'Daily_Record_Ct', 'postgresql', '{DATA_QC_SCHEMA}.DATEDIFF(''DAY'', MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'), + ('4005', 'Daily_Record_Ct', 'postgresql', '{{DKFN_DATEDIFF_DAY;;MIN({COLUMN_NAME});;MAX({COLUMN_NAME})}} +1-COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'), ('4006', 'Dec_Trunc', 'postgresql', 'ROUND(SUM(ABS({COLUMN_NAME})::DECIMAL(18,4) % 1), 0)', '<', '{THRESHOLD_VALUE}'), ('4007', 'Distinct_Date_Ct', 'postgresql', 'COUNT(DISTINCT {COLUMN_NAME})', '<', '{THRESHOLD_VALUE}'), ('4008', 'Distinct_Value_Ct', 'postgresql', 'COUNT(DISTINCT {COLUMN_NAME})', '<>', '{THRESHOLD_VALUE}'), @@ -315,11 +310,11 @@ VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', ' ('4015', 'Min_Date', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME} < ''{BASELINE_VALUE}'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('4016', 'Min_Val', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('4017', 'Missing_Pct', 'postgresql', 'ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'), - ('4018', 'Monthly_Rec_Ct', 'postgresql', '(MAX({DATA_QC_SCHEMA}.DATEDIFF(''MON'', {COLUMN_NAME}, ''{RUN_DATE}''::DATE)) - MIN({DATA_QC_SCHEMA}.DATEDIFF(''MON'', {COLUMN_NAME}, ''{RUN_DATE}''::DATE)) + 1) - COUNT(DISTINCT {DATA_QC_SCHEMA}.DATEDIFF(''MON'', {COLUMN_NAME}, ''{RUN_DATE}''::DATE))', '>', '{THRESHOLD_VALUE}'), + ('4018', 'Monthly_Rec_Ct', 'postgresql', '(MAX({{DKFN_DATEDIFF_MONTH;;{COLUMN_NAME};;''{RUN_DATE}''::DATE}} ) - MIN({{DKFN_DATEDIFF_MONTH;;{COLUMN_NAME};;''{RUN_DATE}''::DATE}} ) + 1) - COUNT(DISTINCT {{DKFN_DATEDIFF_MONTH;;{COLUMN_NAME};;''{RUN_DATE}''::DATE}} )', '>', '{THRESHOLD_VALUE}'), ('4019', 'Outlier_Pct_Above', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'), ('4020', 'Outlier_Pct_Below', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'), ('4021', 'Pattern_Match', 'postgresql', 
'COUNT(NULLIF({COLUMN_NAME}, '''')) - SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') ~ ''{BASELINE_VALUE}'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), - ('4022', 'Recency', 'postgresql', '{DATA_QC_SCHEMA}.DATEDIFF(''DAY'', MAX({COLUMN_NAME}), ''{RUN_DATE}''::DATE)', '>', '{THRESHOLD_VALUE}'), + ('4022', 'Recency', 'postgresql', '{{DKFN_DATEDIFF_DAY;;MAX({COLUMN_NAME});;''{RUN_DATE}''::DATE}} ', '>', '{THRESHOLD_VALUE}'), ('4023', 'Required', 'postgresql', 'COUNT(*) - COUNT({COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'), ('4024', 'Row_Ct', 'postgresql', 'COUNT(*)', '<', '{THRESHOLD_VALUE}'), ('4025', 'Row_Ct_Pct', 'postgresql', 'ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::DECIMAL(18,4) / {BASELINE_CT}::DECIMAL(18,4), 2))', '>', '{THRESHOLD_VALUE}'), @@ -327,7 +322,7 @@ VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', ' ('4027', 'US_State', 'postgresql', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN (''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('4028', 'Unique', 'postgresql', 'COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'), ('4029', 'Unique_Pct', 'postgresql', 'ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'), - ('4030', 'Weekly_Rec_Ct', 'postgresql', 'MAX({DATA_QC_SCHEMA}.DATEDIFF(''WEEK'', ''1800-01-01''::DATE, {COLUMN_NAME})) - MIN({DATA_QC_SCHEMA}.DATEDIFF(''WEEK'', ''1800-01-01''::DATE, {COLUMN_NAME}))+1 - COUNT(DISTINCT {DATA_QC_SCHEMA}.DATEDIFF(''WEEK'', ''1800-01-01''::DATE, {COLUMN_NAME}))', '>', '{THRESHOLD_VALUE}'), + ('4030', 'Weekly_Rec_Ct', 'postgresql', 'MAX({{DKFN_DATEDIFF_WEEK;;''1800-01-01''::DATE;;{COLUMN_NAME}}} ) - MIN({{DKFN_DATEDIFF_WEEK;;''1800-01-01''::DATE;;{COLUMN_NAME}}} )+1 - COUNT(DISTINCT {{DKFN_DATEDIFF_WEEK;;''1800-01-01''::DATE;;{COLUMN_NAME}}} )', '>', '{THRESHOLD_VALUE}'), ('1031', 'Variability_Increase', 'redshift', '100.0*STDDEV(CAST("{COLUMN_NAME}" AS FLOAT))/{BASELINE_SD}', '>', '{THRESHOLD_VALUE}'), ('1032', 'Variability_Decrease', 'redshift', '100.0*STDDEV(CAST("{COLUMN_NAME}" AS FLOAT))/{BASELINE_SD}', '<', '{THRESHOLD_VALUE}'), ('2031', 'Variability_Increase', 'snowflake', '100.0*STDDEV(CAST("{COLUMN_NAME}" AS FLOAT))/{BASELINE_SD}', '>', '{THRESHOLD_VALUE}'), @@ -443,8 +438,8 @@ VALUES ('1040', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'redshift', NULL, 'SELECT column_name, table_name, data_type FROM information_schema.columns WHERE table_schema = ''{TARGET_SCHEMA}'' AND column_name = ''{COLUMN_NAME}'' ORDER BY data_type;' ), ('1041', '1009', 'Profile Anomaly' , 'Leading_Spaces', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN '' !'' AND ''!'' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), ('1042', '1010', 'Profile Anomaly' , 'Quoted_Values', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN 
"{COLUMN_NAME}" ILIKE ''"%"'' OR "{COLUMN_NAME}" ILIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), - ('1043', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'redshift', NULL, 'SELECT A.* FROM ( SELECT TOP 10 DISTINCT ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), - ('1044', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'redshift', NULL, 'SELECT A.* FROM ( SELECT TOP 10 DISTINCT ''Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isdate("{COLUMN_NAME}") = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT ''Non-Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isdate("{COLUMN_NAME}") != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), + ('1043', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'redshift', NULL, 'SELECT A.* FROM ( SELECT TOP 10 DISTINCT ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_NUM;;"{COLUMN_NAME}"}} = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_NUM;;"{COLUMN_NAME}"}} != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), + ('1044', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'redshift', NULL, 'SELECT A.* FROM ( SELECT TOP 10 DISTINCT ''Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_DATE;;"{COLUMN_NAME}"}} = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT ''Non-Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_DATE;;"{COLUMN_NAME}"}} != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), ('1045', '1013', 'Profile Anomaly' , 'Small Missing Value Ct', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO ''(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), ('1046', '1014', 'Profile Anomaly' , 'Small Divergent Value Ct', 'redshift', NULL, 'SELECT 
DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), ('1047', '1015', 'Profile Anomaly' , 'Boolean_Value_Mismatch', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), @@ -455,7 +450,7 @@ VALUES ('1052', '1020', 'Profile Anomaly' , 'Recency_Six_Months', 'redshift', NULL, 'created_in_ui' ), ('1053', '1021', 'Profile Anomaly' , 'Unexpected US States', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ), ('1054', '1022', 'Profile Anomaly' , 'Unexpected Emails', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ), - ('1055', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'redshift', NULL, 'SELECT A.* FROM ( SELECT TOP 10 DISTINCT ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), + ('1055', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'redshift', NULL, 'SELECT A.* FROM ( SELECT TOP 10 DISTINCT ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_NUM;;"{COLUMN_NAME}"}} = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT TOP 10 DISTINCT ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_NUM;;"{COLUMN_NAME}"}} != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), ('1056', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'redshift', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') <> ''999'' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500;'), ('1057', '1025', 'Profile Anomaly' , 'Delimited_Data_Embedded', 'redshift', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" ~ ''^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$'' AND "{COLUMN_NAME}" !~ ''\\s(and|but|or|yet)\\s'' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;' ), @@ -469,8 +464,8 @@ VALUES ('1065', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'postgresql', NULL, 'SELECT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY columns.table_name;' ), ('1066', '1009', 'Profile Anomaly' , 'Leading_Spaces', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN '' !'' AND ''!'' THEN 1 ELSE 0 END) = 1 
GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), ('1067', '1010', 'Profile Anomaly' , 'Quoted_Values', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE ''"%"'' OR "{COLUMN_NAME}" ILIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), - ('1068', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'postgresql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS B ORDER BY data_type, count DESC;' ), - ('1069', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'postgresql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT ''Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isdate("{COLUMN_NAME}") = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT ''Non-Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isdate("{COLUMN_NAME}") != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10) AS B ORDER BY data_type, count DESC;' ), + ('1068', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'postgresql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_NUM;;"{COLUMN_NAME}"}} = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_NUM;;"{COLUMN_NAME}"}} != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS B ORDER BY data_type, count DESC;' ), + ('1069', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'postgresql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT ''Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_DATE;;"{COLUMN_NAME}"}} = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT ''Non-Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_DATE;;"{COLUMN_NAME}"}} != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10) AS B ORDER BY data_type, count DESC;' ), ('1070', '1013', 'Profile Anomaly' , 'Small Missing Value Ct', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO ''(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', 
''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), ('1071', '1014', 'Profile Anomaly' , 'Small Divergent Value Ct', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), ('1072', '1015', 'Profile Anomaly' , 'Boolean_Value_Mismatch', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), @@ -481,7 +476,7 @@ VALUES ('1077', '1020', 'Profile Anomaly' , 'Recency_Six_Months', 'postgresql', NULL, 'created_in_ui' ), ('1078', '1021', 'Profile Anomaly' , 'Unexpected US States', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ), ('1079', '1022', 'Profile Anomaly' , 'Unexpected Emails', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ), - ('1080', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'postgresql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS B ORDER BY data_type, count DESC;' ), + ('1080', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'postgresql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_NUM;;"{COLUMN_NAME}"}} = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_NUM;;"{COLUMN_NAME}"}} != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 10 ) AS B ORDER BY data_type, count DESC;' ), ('1081', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') <> ''999'' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500;'), ('1082', '1025', 'Profile Anomaly' , 'Delimited_Data_Embedded', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" ~ ''^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$'' AND "{COLUMN_NAME}" !~ ''\s(and|but|or|yet)\s'' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;' ), @@ -507,7 +502,7 @@ VALUES ('1101', '1024', 'Test Results', 'Outlier_Pct_Above', 'postgresql', NULL, 'SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'), ('1102', '1025', 'Test 
Results', 'Outlier_Pct_Below', 'postgresql', NULL, 'SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;'), ('1103', '1026', 'Test Results', 'Pattern_Match', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF("{COLUMN_NAME}", '''') NOT SIMILAR TO ''{BASELINE_VALUE}'' GROUP BY "{COLUMN_NAME}";'), - ('1104', '1028', 'Test Results', 'Recency', 'postgresql', NULL, 'SELECT DISTINCT col AS latest_date_available, ''{TEST_DATE}'' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE {DATA_QC_SCHEMA}.DATEDIFF(''day'', col, ''{TEST_DATE}''::DATE) > {THRESHOLD_VALUE};'), + ('1104', '1028', 'Test Results', 'Recency', 'postgresql', NULL, 'SELECT DISTINCT col AS latest_date_available, ''{TEST_DATE}'' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE {{DKFN_DATEDIFF_DAY;;col;;''{TEST_DATE}''::DATE}} > {THRESHOLD_VALUE};'), ('1105', '1030', 'Test Results', 'Required', 'postgresql', NULL, 'SELECT * FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE "{COLUMN_NAME}" IS NULL LIMIT 500;'), ('1106', '1031', 'Test Results', 'Row_Ct', 'postgresql', NULL, 'WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, ABS(ROUND(100 * (current_count - {THRESHOLD_VALUE}) :: NUMERIC / {THRESHOLD_VALUE} :: NUMERIC,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE};'), ('1107', '1032', 'Test Results', 'Row_Ct_Pct', 'postgresql', NULL, 'WITH CTE AS (SELECT COUNT(*) AS current_count FROM {TARGET_SCHEMA}.{TABLE_NAME}) SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: NUMERIC / {BASELINE_CT} :: NUMERIC,2)) AS row_count_pct_difference FROM cte;'), @@ -529,8 +524,8 @@ VALUES ('1122', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'mssql', NULL, 'SELECT TOP 500 column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY table_name;' ), ('1123', '1009', 'Profile Anomaly' , 'Leading_Spaces', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN '' !'' AND ''!'' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), ('1124', '1010', 'Profile Anomaly' , 'Quoted_Values', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" LIKE ''"%"'' OR "{COLUMN_NAME}" LIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), - ('1125', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'mssql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT TOP 10 ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS 
count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), - ('1126', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'mssql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT TOP 10 ''Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isdate("{COLUMN_NAME}") = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 ''Non-Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isdate("{COLUMN_NAME}") != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), + ('1125', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'mssql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT TOP 10 ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_NUM;;"{COLUMN_NAME}"}} = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_NUM;;"{COLUMN_NAME}"}} != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), + ('1126', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'mssql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT TOP 10 ''Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_DATE;;"{COLUMN_NAME}"}} = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 ''Non-Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_DATE;;"{COLUMN_NAME}"}} != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), ('1127', '1013', 'Profile Anomaly' , 'Small Missing Value Ct', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LEN("{COLUMN_NAME}") > 1 AND ( LOWER("{COLUMN_NAME}") LIKE ''%..%'' OR LOWER("{COLUMN_NAME}") LIKE ''%--%'' OR (LEN(REPLACE("{COLUMN_NAME}", ''0'', ''''))= 0 ) OR (LEN(REPLACE("{COLUMN_NAME}", ''9'', ''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''x'', ''''))= 0 ) OR (LEN(REPLACE(LOWER("{COLUMN_NAME}"), ''z'', ''''))= 0 ) ) THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), ('1128', '1014', 'Profile Anomaly' , 'Small Divergent Value Ct', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), ('1129', '1015', 'Profile Anomaly' , 'Boolean_Value_Mismatch', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), @@ 
-541,7 +536,7 @@ VALUES ('1134', '1020', 'Profile Anomaly' , 'Recency_Six_Months', 'mssql', NULL, 'created_in_ui' ), ('1135', '1021', 'Profile Anomaly' , 'Unexpected US States', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;' ), ('1136', '1022', 'Profile Anomaly' , 'Unexpected Emails', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC;' ), - ('1137', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'mssql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT TOP 10 ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), + ('1137', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'mssql', NULL, 'SELECT A.* FROM ( SELECT DISTINCT TOP 10 ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_NUM;;"{COLUMN_NAME}"}} = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS A UNION ALL SELECT B.* FROM ( SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_NUM;;"{COLUMN_NAME}"}} != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC ) AS B ORDER BY data_type, count DESC;' ), ('1138', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') <> ''999'' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}";'), ('1139', '1025', 'Profile Anomaly' , 'Delimited_Data_Embedded', 'mssql', NULL, 'SELECT TOP 500 "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE ( "{COLUMN_NAME}" LIKE ''%,%,%,%'' OR "{COLUMN_NAME}" LIKE ''%|%|%|%'' OR "{COLUMN_NAME}" LIKE ''%^%^%^%'' OR "{COLUMN_NAME}" LIKE ''%'' + CHAR(9) + ''%'' + CHAR(9) + ''%'' + CHAR(9) + ''%'' ) AND NOT ( "{COLUMN_NAME}" LIKE ''% and %'' OR "{COLUMN_NAME}" LIKE ''% but %'' OR "{COLUMN_NAME}" LIKE ''% or %'' OR "{COLUMN_NAME}" LIKE ''% yet %'' ) AND ISNULL(CAST(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", '','', '''')) as FLOAT) / CAST(NULLIF(LEN("{COLUMN_NAME}") - LEN(REPLACE("{COLUMN_NAME}", '' '', '''')), 0) as FLOAT), 1) > 0.6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), @@ -708,8 +703,8 @@ ORDER BY check_period DESC;'), ('1179', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'snowflake', NULL, 'SELECT DISTINCT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY table_name; ' ), ('1180', '1009', 'Profile Anomaly' , 'Leading_Spaces', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN '' !'' AND ''!'' THEN 1 
ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), ('1181', '1010', 'Profile Anomaly' , 'Quoted_Values', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE ''"%"'' OR "{COLUMN_NAME}" ILIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), - ('1182', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'snowflake', NULL, 'SELECT A.* FROM (SELECT DISTINCT TOP 10 ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC;' ), - ('1183', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'snowflake', NULL, 'SELECT A.* FROM (SELECT DISTINCT TOP 10 ''Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isdate("{COLUMN_NAME}") = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 ''Non-Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isdate("{COLUMN_NAME}") != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC;' ), + ('1182', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'snowflake', NULL, 'SELECT A.* FROM (SELECT DISTINCT TOP 10 ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_NUM;;"{COLUMN_NAME}"}} = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_NUM;;"{COLUMN_NAME}"}} != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC;' ), + ('1183', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'snowflake', NULL, 'SELECT A.* FROM (SELECT DISTINCT TOP 10 ''Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_DATE;;"{COLUMN_NAME}"}} = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 ''Non-Date'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_DATE;;"{COLUMN_NAME}"}} != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC;' ), ('1184', '1013', 'Profile Anomaly' , 'Small Missing Value Ct', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''-{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''0{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''9{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''x{2,}'' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP ''z{2,}'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', 
''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), ('1185', '1014', 'Profile Anomaly' , 'Small Divergent Value Ct', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), ('1186', '1015', 'Profile Anomaly' , 'Boolean_Value_Mismatch', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC;' ), @@ -720,7 +715,7 @@ ORDER BY check_period DESC;'), ('1191', '1020', 'Profile Anomaly' , 'Recency_Six_Months', 'snowflake', NULL, 'created_in_ui' ), ('1192', '1021', 'Profile Anomaly' , 'Unexpected US States', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ), ('1193', '1022', 'Profile Anomaly' , 'Unexpected Emails', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;' ), - ('1194', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'snowflake', NULL, 'SELECT A.* FROM (SELECT DISTINCT TOP 10 ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {DATA_QC_SCHEMA}.fndk_isnum("{COLUMN_NAME}") != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC;' ), + ('1194', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'snowflake', NULL, 'SELECT A.* FROM (SELECT DISTINCT TOP 10 ''Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_NUM;;"{COLUMN_NAME}"}} = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {{DKFN_IS_NUM;;"{COLUMN_NAME}"}} != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC) AS B ORDER BY data_type, count DESC;' ), ('1195', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'snowflake', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') <> ''999'' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT 500;'), ('1196', '1025', 'Profile Anomaly' , 'Delimited_Data_Embedded', 'snowflake', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, ''^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$'') AND NOT REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, ''.*\\s(and|but|or|yet)\\s.*'') GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT 500;' ), diff --git a/testgen/template/dbsetup/060_create_standard_views.sql b/testgen/template/dbsetup/060_create_standard_views.sql index 9ec8331..fbbf2f1 100644 --- 
a/testgen/template/dbsetup/060_create_standard_views.sql +++ b/testgen/template/dbsetup/060_create_standard_views.sql @@ -133,7 +133,7 @@ SELECT p.project_name, ELSE 'Passed' END as disposition, r.result_code as passed_ct, - (1 - r.result_code)::INTEGER as exception_ct, + (1 - COALESCE(r.result_code, 0))::INTEGER as exception_ct, CASE WHEN result_status = 'Warning' AND result_message NOT ILIKE 'Inactivated%' THEN 1 diff --git a/testgen/template/dbupgrade/0112_incremental_upgrade.sql b/testgen/template/dbupgrade/0112_incremental_upgrade.sql new file mode 100644 index 0000000..c81cccb --- /dev/null +++ b/testgen/template/dbupgrade/0112_incremental_upgrade.sql @@ -0,0 +1,3 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +UPDATE profile_anomaly_types SET anomaly_criteria = '(p.filled_value_ct > 0 OR p.zero_length_ct > 0)' WHERE id = '1002'; diff --git a/testgen/template/dbupgrade/0113_incremental_upgrade.sql b/testgen/template/dbupgrade/0113_incremental_upgrade.sql new file mode 100644 index 0000000..8907660 --- /dev/null +++ b/testgen/template/dbupgrade/0113_incremental_upgrade.sql @@ -0,0 +1,137 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +ALTER TABLE test_types + ADD COLUMN dq_score_prevalence_formula TEXT, + ADD COLUMN dq_score_risk_factor TEXT; + +ALTER TABLE test_suites + ADD COLUMN last_complete_test_run_id UUID, + ADD COLUMN dq_score_exclude BOOLEAN default FALSE; + +ALTER TABLE profile_anomaly_types + ADD COLUMN dq_score_prevalence_formula TEXT, + ADD COLUMN dq_score_risk_factor TEXT; + +ALTER TABLE profile_anomaly_results + ADD COLUMN dq_prevalence FLOAT; + +ALTER TABLE profiling_runs + ADD COLUMN dq_affected_data_points BIGINT, + ADD COLUMN dq_total_data_points BIGINT, + ADD COLUMN dq_score_profiling FLOAT; + +ALTER TABLE test_results + ADD COLUMN dq_prevalence FLOAT, + ADD COLUMN dq_record_ct BIGINT; + +ALTER TABLE test_runs + ADD COLUMN dq_affected_data_points BIGINT, + ADD COLUMN dq_total_data_points BIGINT, + ADD COLUMN dq_score_test_run FLOAT; + +ALTER TABLE table_groups + ADD COLUMN last_complete_profile_run_id UUID, + ADD COLUMN dq_score_profiling FLOAT, + ADD COLUMN dq_score_testing FLOAT; + +ALTER TABLE data_table_chars + ADD COLUMN last_complete_profile_run_id UUID, + ADD COLUMN dq_score_profiling FLOAT, + ADD COLUMN dq_score_testing FLOAT; + +ALTER TABLE data_column_chars + ADD COLUMN last_complete_profile_run_id UUID, + ADD COLUMN dq_score_profiling FLOAT, + ADD COLUMN dq_score_testing FLOAT; + + +ALTER TABLE profile_results + ADD COLUMN upper_case_ct BIGINT, + ADD COLUMN lower_case_ct BIGINT, + ADD COLUMN non_alpha_ct BIGINT, + ADD COLUMN mixed_case_ct BIGINT GENERATED ALWAYS AS ( value_ct - upper_case_ct - lower_case_ct - non_alpha_ct ) STORED, + ADD COLUMN before_100yr_date_ct BIGINT, + ADD COLUMN distant_future_date_ct BIGINT; + + +CREATE OR REPLACE FUNCTION fn_normal_cdf(z_score DOUBLE PRECISION) +RETURNS DOUBLE PRECISION AS +$$ +/* + This function calculates the cumulative distribution function (CDF) + for the standard normal distribution for a given Z-score using + the Abramowitz and Stegun approximation method. It returns the + probability that a standard normal variable is less than or equal + to the given Z-score. + + The approximation formula uses a series expansion to estimate the + CDF, which is accurate for most practical purposes. + + To estimate the count of observations that fall outside a certain Z-score + (both above and below), you can use the `normal_cdf()` function. 
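    (A quick numeric check, with assumed values: for N = 10,000 rows and
    Z = 2.0, normal_cdf(2.0) is roughly 0.97725, so the estimate described
    next, N * 2 * (1 - normal_cdf(ABS(Z))), comes out to about 455 rows
    outside +/- 2 standard deviations.)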
For a + total number of observations N, the proportion of values outside the Z-score + is given by: 2 * (1 - normal_cdf(ABS(Z))) + + This gives the proportion of values greater than the positive Z-score and + less than the negative Z-score combined. To get the estimated count of + observations, multiply this proportion by N: N * 2 * (1 - normal_cdf(ABS(Z))) +*/ +DECLARE + t DOUBLE PRECISION; + cdf DOUBLE PRECISION; +BEGIN + t := 1.0 / (1.0 + 0.2316419 * ABS(z_score)); + + cdf := (1.0 / SQRT(2 * PI())) * EXP(-0.5 * z_score * z_score) * + (0.319381530 * t + - 0.356563782 * t * t + + 1.781477937 * t * t * t + - 1.821255978 * t * t * t * t + + 1.330274429 * t * t * t * t * t); + + IF z_score >= 0 THEN + RETURN 1.0 - cdf; + ELSE + RETURN cdf; + END IF; +END; +$$ LANGUAGE plpgsql; + + +CREATE OR REPLACE FUNCTION fn_eval(expression TEXT) RETURNS FLOAT +AS +$$ +DECLARE + result FLOAT; + invalid_parts TEXT; +BEGIN + -- Check the modified expression for invalid characters, allowing colons + IF expression ~* E'[^0-9+\\-*/(),.\\sA-Z_:e\\\'"]' THEN + RAISE EXCEPTION 'Invalid characters detected in expression: %', expression; + END IF; + + -- Check for dangerous PostgreSQL-specific keywords + IF expression ~* E'\b(DROP|ALTER|INSERT|UPDATE|DELETE|TRUNCATE|GRANT|REVOKE|COPY|EXECUTE|CREATE|COMMENT|SECURITY|WITH|SET ROLE|SET SESSION|DO|CALL|--|/\\*|;|pg_read_file|pg_write_file|pg_terminate_backend)\b' THEN + RAISE EXCEPTION 'Invalid expression: dangerous statement detected'; + END IF; + + -- Remove all allowed tokens from the validation expression, treating 'FLOAT' as a keyword + invalid_parts := regexp_replace( + expression, + E'(\\mGREATEST|LEAST|ABS|FN_NORMAL_CDF|DATEDIFF|DAY|FLOAT)\\M|[0-9]+(\\.[0-9]+)?([eE][+-]?[0-9]+)?|[+\\-*/(),\\\'":]+|\\s+', + '', + 'gi' + ); + + -- If anything is left in the validation expression, it's invalid + IF invalid_parts <> '' THEN + RAISE EXCEPTION 'Invalid tokens "%" in expression: %', invalid_parts, expression; + END IF; + + -- Use the original expression (with ::FLOAT) for execution + EXECUTE format('SELECT (%s)::FLOAT', expression) INTO result; + + RETURN result; +END; +$$ +LANGUAGE plpgsql; diff --git a/testgen/template/dbupgrade/0114_incremental_upgrade.sql b/testgen/template/dbupgrade/0114_incremental_upgrade.sql new file mode 100644 index 0000000..86bbcf1 --- /dev/null +++ b/testgen/template/dbupgrade/0114_incremental_upgrade.sql @@ -0,0 +1,84 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + + +WITH last_test_run_dates AS ( + SELECT test_suite_id, + MAX(test_starttime) AS test_starttime + FROM test_runs + WHERE status = 'Complete' + GROUP BY test_suite_id +) +UPDATE test_suites +SET last_complete_test_run_id = tr.id +FROM last_test_run_dates ltd + LEFT JOIN test_runs tr ON ( + ltd.test_suite_id = tr.test_suite_id + AND ltd.test_starttime = tr.test_starttime + ) +WHERE test_suites.id = ltd.test_suite_id; + + +WITH last_profile_dates AS ( + SELECT table_groups_id, + MAX(profiling_starttime) AS profiling_starttime + FROM profiling_runs + WHERE status = 'Complete' + GROUP BY table_groups_id +) +UPDATE table_groups +SET last_complete_profile_run_id = pr.id +FROM last_profile_dates lpd + LEFT JOIN profiling_runs pr ON ( + lpd.table_groups_id = pr.table_groups_id + AND lpd.profiling_starttime = pr.profiling_starttime + ) +WHERE table_groups.id = lpd.table_groups_id; + + +WITH last_profile_dates AS ( + SELECT profiling_runs.table_groups_id, + table_name, + MAX(profiling_starttime) AS profiling_starttime + FROM profile_results + LEFT JOIN profiling_runs ON ( + 
profile_results.profile_run_id = profiling_runs.id + ) + WHERE status = 'Complete' + GROUP BY profiling_runs.table_groups_id, + table_name +) +UPDATE data_table_chars +SET last_complete_profile_run_id = pr.id +FROM last_profile_dates lpd + LEFT JOIN profiling_runs pr ON ( + lpd.table_groups_id = pr.table_groups_id + AND lpd.profiling_starttime = pr.profiling_starttime + ) +WHERE data_table_chars.table_groups_id = lpd.table_groups_id + AND data_table_chars.table_name = lpd.table_name; + + +WITH last_profile_dates AS ( + SELECT profiling_runs.table_groups_id, + table_name, + column_name, + MAX(profiling_starttime) AS profiling_starttime + FROM profile_results + LEFT JOIN profiling_runs ON ( + profile_results.profile_run_id = profiling_runs.id + ) + WHERE status = 'Complete' + GROUP BY profiling_runs.table_groups_id, + table_name, + column_name +) +UPDATE data_column_chars +SET last_complete_profile_run_id = pr.id +FROM last_profile_dates lpd + LEFT JOIN profiling_runs pr ON ( + lpd.table_groups_id = pr.table_groups_id + AND lpd.profiling_starttime = pr.profiling_starttime + ) +WHERE data_column_chars.table_groups_id = lpd.table_groups_id + AND data_column_chars.table_name = lpd.table_name + AND data_column_chars.column_name = lpd.column_name; diff --git a/testgen/template/dbupgrade/0115_incremental_upgrade.sql b/testgen/template/dbupgrade/0115_incremental_upgrade.sql new file mode 100644 index 0000000..82a3058 --- /dev/null +++ b/testgen/template/dbupgrade/0115_incremental_upgrade.sql @@ -0,0 +1,3 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +ALTER TABLE connections DROP COLUMN project_qc_schema; diff --git a/testgen/template/exec_cat_tests/ex_cat_build_agg_table_tests.sql b/testgen/template/exec_cat_tests/ex_cat_build_agg_table_tests.sql index bee3588..c5e5fb6 100644 --- a/testgen/template/exec_cat_tests/ex_cat_build_agg_table_tests.sql +++ b/testgen/template/exec_cat_tests/ex_cat_build_agg_table_tests.sql @@ -29,10 +29,9 @@ WITH test_detail -- Standard Measure start 'CAST(' || -- Nested parm replacements - part of query, not Python parms - REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE( + REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE( c.measure, '{COLUMN_NAME}', COALESCE(fn_PrepColumnName(t.column_name), '')), - '{DATA_QC_SCHEMA}', '{REPLACE_QC_SCHEMA}'), '{BASELINE_CT}', COALESCE(t.baseline_ct, '')), '{BASELINE_UNIQUE_CT}', COALESCE(t.baseline_unique_ct, '')), '{BASELINE_VALUE}', COALESCE(t.baseline_value, '') ), @@ -48,10 +47,9 @@ WITH test_detail -- Standard CASE for condition starts 'CASE WHEN ' || -- Nested parm replacements - standard - REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE( + REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE( c.measure || c.test_operator || c.test_condition, '{COLUMN_NAME}', COALESCE(fn_PrepColumnName(t.column_name), '')), - '{DATA_QC_SCHEMA}', '{REPLACE_QC_SCHEMA}'), '{BASELINE_CT}', COALESCE(t.baseline_ct, '')), '{BASELINE_UNIQUE_CT}', COALESCE(t.baseline_unique_ct, '')), '{BASELINE_VALUE}', COALESCE(t.baseline_value, '') ), diff --git a/testgen/template/exec_cat_tests/ex_cat_get_distinct_tables.sql b/testgen/template/exec_cat_tests/ex_cat_get_distinct_tables.sql index ff2878b..e8e85d0 100644 --- a/testgen/template/exec_cat_tests/ex_cat_get_distinct_tables.sql +++ b/testgen/template/exec_cat_tests/ex_cat_get_distinct_tables.sql @@ -1,6 +1,5 @@ SELECT DISTINCT schema_name, - table_name, - project_qc_schema as 
replace_qc_schema + table_name FROM test_definitions td INNER JOIN test_types tt ON td.test_type = tt.test_type diff --git a/testgen/template/execution/ex_finalize_test_run_results.sql b/testgen/template/execution/ex_finalize_test_run_results.sql index e4d1d6e..c9f187c 100644 --- a/testgen/template/execution/ex_finalize_test_run_results.sql +++ b/testgen/template/execution/ex_finalize_test_run_results.sql @@ -3,6 +3,7 @@ UPDATE test_results severity = COALESCE(d.severity, s.severity, tt.default_severity), threshold_value = COALESCE(r.threshold_value, d.threshold_value), result_status = CASE + WHEN r.result_status = 'Error' THEN 'Error' WHEN r.result_code = 1 THEN 'Passed' WHEN r.result_code = 0 AND COALESCE(d.severity, s.severity, tt.default_severity) = 'Warning' THEN 'Warning' @@ -31,3 +32,68 @@ INNER JOIN test_definitions d ON r.test_definition_id = d.id INNER JOIN test_types tt ON r.test_type = tt.test_type WHERE r.test_run_id = '{TEST_RUN_ID}' AND test_results.id = r.id; + +-- ============================================================================== +-- | Data Quality Scoring +-- | - Prevalence % * dq_score_risk_factor = calculated prevalence % +-- | - Save with total datapoints (record count). +-- | - When scoring, calculate SUM(calculated prevalence * record count) +-- | / SUM(record count) +-- ============================================================================== + +-- UPDATE prevalence to zero for all passed or excluded tests +UPDATE test_results + SET dq_record_ct = tc.record_ct, + dq_prevalence = 0 + FROM test_results r +INNER JOIN data_table_chars tc + ON (r.table_groups_id = tc.table_groups_id + AND r.table_name ILIKE tc.table_name) + WHERE r.test_run_id = '{TEST_RUN_ID}'::UUID + AND ( r.result_code = 1 + OR r.disposition IN ('Dismissed', 'Inactive') ) + AND test_results.id = r.id; + +-- UPDATE TO calculated prevalence for all fails/warnings - result_code = 0 +WITH result_calc + AS ( SELECT r.id, + tt.dq_score_risk_factor::FLOAT as risk_calc, + REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( + REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( REPLACE( + tt.dq_score_prevalence_formula, + '{RESULT_MEASURE}', COALESCE(r.result_measure::VARCHAR, '')), + '{THRESHOLD_VALUE}', COALESCE(r.threshold_value::VARCHAR, '')), + + '{PRO_RECORD_CT}', COALESCE(p.record_ct::VARCHAR, '')), + '{DATE_DAYS_PRESENT}', COALESCE(p.date_days_present::VARCHAR, '')), + '{DATE_MONTHS_PRESENT}', COALESCE(p.date_months_present::VARCHAR, '')), + '{DATE_WEEKS_PRESENT}', COALESCE(p.date_weeks_present::VARCHAR, '')), + '{MIN_DATE}', COALESCE(p.min_date::VARCHAR, '')), + '{MAX_DATE}', COALESCE(p.max_date::VARCHAR, '')), + '{DISTINCT_VALUE_CT}', COALESCE(p.distinct_value_ct::VARCHAR, '')), + '{VALUE_CT}', COALESCE(p.value_ct::VARCHAR, '')), + '{MAX_LENGTH}', COALESCE(p.max_length::VARCHAR, '')), + '{AVG_LENGTH}', COALESCE(p.avg_length::VARCHAR, '')), + + '{RECORD_CT}', COALESCE(r.dq_record_ct::VARCHAR, tc.record_ct::VARCHAR, '')) + as built_score_prevalance_formula, + COALESCE(r.dq_record_ct, tc.record_ct) as dq_record_ct + FROM test_results r + INNER JOIN test_types tt + ON r.test_type = tt.test_type + LEFT JOIN v_latest_profile_results p + ON (r.table_groups_id = p.table_groups_id + AND r.table_name = p.table_name + AND r.column_names = p.column_name) + LEFT JOIN data_table_chars tc + ON (r.table_groups_id = tc.table_groups_id + AND r.table_name ILIKE tc.table_name) + WHERE r.test_run_id = '{TEST_RUN_ID}'::UUID + AND result_code = 0 + AND NOT COALESCE(disposition, '') IN ('Dismissed', 
'Inactive') ) +UPDATE test_results + SET dq_record_ct = c.dq_record_ct, + dq_prevalence = risk_calc * fn_eval(c.built_score_prevalance_formula) + FROM result_calc c + WHERE test_results.id = c.id; + diff --git a/testgen/template/execution/ex_update_test_suite.sql b/testgen/template/execution/ex_update_test_suite.sql new file mode 100644 index 0000000..68283f1 --- /dev/null +++ b/testgen/template/execution/ex_update_test_suite.sql @@ -0,0 +1,13 @@ +WITH last_run + AS (SELECT test_suite_id, MAX(test_starttime) as max_starttime + FROM test_runs + WHERE test_suite_id = '{TEST_SUITE_ID}' + AND status = 'Complete' + GROUP BY test_suite_id) +UPDATE test_suites + SET last_complete_test_run_id = r.id + FROM test_runs r +INNER JOIN last_run l + ON (r.test_suite_id = l.test_suite_id + AND r.test_starttime = l.max_starttime) + WHERE test_suites.id = r.test_suite_id; \ No newline at end of file diff --git a/testgen/template/execution/test_scoring_rollup.sql b/testgen/template/execution/test_scoring_rollup.sql new file mode 100644 index 0000000..30c2798 --- /dev/null +++ b/testgen/template/execution/test_scoring_rollup.sql @@ -0,0 +1,123 @@ +-- Roll up scoring to test run +WITH score_detail + AS (SELECT tr.test_run_id, tr.table_name, tr.column_names, + MAX(tr.dq_record_ct) as row_ct, + SUM(COALESCE(tr.dq_prevalence * tr.dq_record_ct, 0)) as affected_data_points + FROM test_results tr + INNER JOIN test_runs r + ON tr.test_run_id = r.id + WHERE tr.test_run_id = '{TEST_RUN_ID}' + AND COALESCE(tr.disposition, 'Confirmed') = 'Confirmed' + GROUP BY tr.test_run_id, tr.table_name, tr.column_names ), +score_calc + AS ( SELECT test_run_id, + SUM(affected_data_points) as sum_affected_data_points, + SUM(row_ct) as sum_data_points + FROM score_detail + GROUP BY test_run_id ) +UPDATE test_runs + SET dq_affected_data_points = sum_affected_data_points, + dq_total_data_points = sum_data_points, + dq_score_test_run = 100.0 - sum_affected_data_points / sum_data_points + FROM score_calc + WHERE test_runs.id = score_calc.test_run_id; + + + +-- Roll up scores from latest Test Runs per Test Suite to Table Group +WITH last_test_date + AS (SELECT r.test_suite_id, MAX(r.test_starttime) as last_test_run_date + FROM test_runs r + WHERE r.status = 'Complete' + GROUP BY r.test_suite_id), +score_calc + AS (SELECT ts.table_groups_id, + SUM(run.dq_affected_data_points) as sum_affected_data_points, + SUM(run.dq_total_data_points) as sum_data_points + FROM test_runs run + INNER JOIN test_suites ts + ON (run.test_suite_id = ts.id) + INNER JOIN last_test_date lp + ON (run.test_suite_id = lp.test_suite_id + AND run.test_starttime = lp.last_test_run_date) + WHERE ts.table_groups_id = '{TABLE_GROUPS_ID}' + AND ts.dq_score_exclude = FALSE + GROUP BY ts.table_groups_id) +UPDATE table_groups + SET dq_score_testing = 100.0 - s.sum_affected_data_points::FLOAT / s.sum_data_points::FLOAT + FROM score_calc s + WHERE table_groups.id = s.table_groups_id; + +-- Roll up latest scores to data_column_chars +WITH last_test_date + AS (SELECT r.test_suite_id, MAX(r.test_starttime) as last_test_run_date + FROM test_runs r + WHERE r.status = 'Complete' + GROUP BY r.test_suite_id), +score_calc + AS (SELECT dcc.column_id, + -- Use AVG instead of MAX because column counts may differ by test_run + AVG(tr.dq_record_ct) as row_ct, + -- Use SUM to combine impact of all fails per column + SUM(COALESCE(tr.dq_prevalence * tr.dq_record_ct, 0)) as affected_data_points + FROM test_results tr + INNER JOIN test_runs r + ON tr.test_run_id = r.id + INNER JOIN last_test_date lp 
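-- A worked example of the test-run rollup above, using made-up numbers:
-- suppose a run produced results for two table/column combinations, one with
-- 1,000 rows and a single confirmed failure at dq_prevalence = 0.05
-- (50 affected data points), the other with 9,000 rows and no failures.
-- Then sum_affected_data_points = 50, sum_data_points = 10,000, and the
-- UPDATE above sets dq_score_test_run = 100.0 - 50 / 10000 = 99.995.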
+ ON (r.test_suite_id = lp.test_suite_id + AND r.test_starttime = lp.last_test_run_date) + INNER JOIN test_suites ts + ON (r.test_suite_id = ts.id) + INNER JOIN data_column_chars dcc + ON (ts.table_groups_id = dcc.table_groups_id + AND tr.table_name = dcc.table_name + AND tr.column_names = dcc.column_name) + WHERE ts.table_groups_id = '{TABLE_GROUPS_ID}' + AND ts.dq_score_exclude = FALSE + AND COALESCE(tr.disposition, 'Confirmed') = 'Confirmed' + GROUP BY dcc.column_id ) +UPDATE data_column_chars + SET dq_score_testing = 100.0 - affected_data_points / row_ct + FROM score_calc s + WHERE data_column_chars.column_id = s.column_id; + + + +-- Roll up latest scores to data_table_chars +WITH last_test_date + AS (SELECT r.test_suite_id, MAX(r.test_starttime) as last_test_run_date + FROM test_runs r + WHERE r.status = 'Complete' + GROUP BY r.test_suite_id), +score_detail + AS (SELECT dcc.table_id, dcc.column_id, + -- Use AVG instead of MAX because column counts may differ by test_run + AVG(tr.dq_record_ct) as row_ct, + -- Use SUM to combine impact of all fails per column + SUM(COALESCE(tr.dq_prevalence * tr.dq_record_ct, 0)) as affected_data_points + FROM test_results tr + INNER JOIN test_runs r + ON tr.test_run_id = r.id + INNER JOIN last_test_date lp + ON (r.test_suite_id = lp.test_suite_id + AND r.test_starttime = lp.last_test_run_date) + INNER JOIN test_suites ts + ON (r.test_suite_id = ts.id) + INNER JOIN data_column_chars dcc + ON (ts.table_groups_id = dcc.table_groups_id + AND tr.table_name = dcc.table_name + AND tr.column_names = dcc.column_name) + WHERE ts.table_groups_id = '{TABLE_GROUPS_ID}' + AND ts.dq_score_exclude = FALSE + AND COALESCE(tr.disposition, 'Confirmed') = 'Confirmed' + GROUP BY table_id, dcc.column_id ), +score_calc + AS (SELECT table_id, + SUM(affected_data_points) as sum_affected_data_points, + SUM(row_ct) as sum_data_points + FROM score_detail + GROUP BY table_id) +UPDATE data_table_chars + SET dq_score_testing = 100.0 - sum_affected_data_points / sum_data_points + FROM score_calc s + WHERE data_table_chars.table_id = s.table_id; \ No newline at end of file diff --git a/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml b/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml index 5ebda4a..8ca20a1 100644 --- a/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml +++ b/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml @@ -57,8 +57,22 @@ strTemplate05_A: COUNT(DISTINCT UPPER(REPLACE(TRANSLATE("{COL_NAME}",' '''',.-', END ) AS filled_value_ct, LEFT(MIN(NULLIF("{COL_NAME}", '') COLLATE Latin1_General_BIN ), 100) AS min_text, LEFT(MAX(NULLIF("{COL_NAME}", '') COLLATE Latin1_General_BIN ), 100) AS max_text, - SUM({DATA_QC_SCHEMA}.fndk_isnum(LEFT("{COL_NAME}", 31))) AS numeric_ct, - SUM({DATA_QC_SCHEMA}.fndk_isdate(LEFT("{COL_NAME}", 26))) AS date_ct, + SUM(CASE + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', ' ') = "{COL_NAME}" THEN 0 + WHEN TRANSLATE("{COL_NAME}", 'abcdefghijklmnopqrstuvwxyz', ' ') = "{COL_NAME}" THEN 1 + ELSE 0 + END) AS upper_case_ct, + SUM(CASE + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', ' ') = "{COL_NAME}" THEN 0 + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', ' ') = "{COL_NAME}" THEN 1 + ELSE 0 + END) AS lower_case_ct, + SUM(CASE + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', ' ') = "{COL_NAME}" THEN 1 + ELSE 0 + END) AS 
non_alpha_ct, + SUM({{DKFN_IS_NUM;;LEFT("{COL_NAME}", 31)}} ) AS numeric_ct, + SUM({{DKFN_IS_DATE;;LEFT("{COL_NAME}", 26)}} ) AS date_ct, CASE WHEN CAST(SUM( CASE WHEN UPPER("{COL_NAME}") LIKE '[1-9]% [A-Z]% %' AND CHARINDEX(' ', "{COL_NAME}") BETWEEN 2 and 6 THEN 1 @@ -107,6 +121,9 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as filled_value_ct, NULL as min_text, NULL as max_text, + NULL as upper_case_ct, + NULL as lower_case_ct, + NULL as non_alpha_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, @@ -175,6 +192,10 @@ strTemplate11_D: CASE WHEN DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}') > 240 THEN 1 ELSE 0 END) AS before_20yr_date_ct, + SUM(CASE + WHEN DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}') > 1200 THEN 1 + ELSE 0 + END) AS before_100yr_date_ct, SUM(CASE WHEN DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 365 THEN 1 ELSE 0 @@ -186,6 +207,10 @@ strTemplate11_D: CASE SUM(CASE WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 ELSE 0 END) AS future_date_ct, + SUM(CASE + WHEN DATEDIFF(month, '{RUN_DATE}', "{COL_NAME}") > 240 THEN 1 + ELSE 0 + END) AS distant_future_date_ct, COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}' ) ) as date_days_present, COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}", '{RUN_DATE}' ) ) as date_weeks_present, COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}' ) ) as date_months_present, @@ -195,9 +220,11 @@ strTemplate11_else: NULL as min_date, NULL as before_1yr_date_ct, NULL as before_5yr_date_ct, NULL as before_20yr_date_ct, + NULL AS before_100yr_date_ct, NULL as within_1yr_date_ct, NULL as within_1mo_date_ct, NULL as future_date_ct, + NULL as distant_future_date_ct, NULL as date_days_present, NULL as date_weeks_present, NULL as date_months_present, diff --git a/testgen/template/flavors/mssql/profiling/templated_functions.yaml b/testgen/template/flavors/mssql/profiling/templated_functions.yaml new file mode 100644 index 0000000..86d064b --- /dev/null +++ b/testgen/template/flavors/mssql/profiling/templated_functions.yaml @@ -0,0 +1,46 @@ +IS_NUM: CASE + WHEN TRY_CAST(NULLIF({$1}, '') AS float) IS NOT NULL THEN 1 + ELSE 0 + END + +IS_DATE: CASE WHEN TRY_CAST(NULLIF({$1}, '') AS float) IS NOT NULL + AND LEFT(NULLIF({$1}, ''),4) BETWEEN 1800 AND 2200 THEN + CASE + WHEN LEN((NULLIF({$1}, ''))) > 11 THEN 0 + /* YYYYMMDD */ + WHEN TRY_CONVERT(DATE, NULLIF({$1}, ''), 112) IS NOT NULL THEN 1 + + /* YYYY-MM-DD */ + WHEN TRY_CONVERT(DATE, NULLIF({$1}, ''), 23) IS NOT NULL THEN 1 + + /* MM/DD/YYYY */ + WHEN TRY_CONVERT(DATE, NULLIF({$1}, ''), 101) IS NOT NULL THEN 1 + + /* MM/DD/YY */ + WHEN TRY_CONVERT(DATE, NULLIF({$1}, ''), 1) IS NOT NULL THEN 1 + + /*MM-DD-YYYY */ + WHEN TRY_CONVERT(DATE, NULLIF({$1}, ''), 110) IS NOT NULL THEN 1 + + /*MM-DD-YY */ + WHEN TRY_CONVERT(DATE, NULLIF({$1}, ''), 10) IS NOT NULL THEN 1 + + + ELSE 0 END + /*DD MMM YYYY */ + WHEN (TRY_CONVERT(DATE, NULLIF({$1}, ''), 106) IS NOT NULL + AND LEFT(NULLIF({$1}, ''), 4) BETWEEN 1800 AND 2200) + THEN 1 + + /* YYYY-MM-DD HH:MM:SS SSSSSS */ + WHEN (TRY_CONVERT(DATETIME2, NULLIF({$1}, ''), 121) IS NOT NULL + AND LEFT(NULLIF({$1}, ''), 4) BETWEEN 1800 AND 2200) + THEN 1 + + /* YYYY-MM-DD HH:MM:SS */ + WHEN (TRY_CONVERT(DATETIME2, NULLIF({$1}, ''), 120) IS NOT NULL + AND LEFT(NULLIF({$1}, ''), 4) BETWEEN 1800 AND 2200) + THEN 1 + ELSE 0 + END + diff --git a/testgen/template/flavors/mssql/setup_profiling_tools/00_drop_existing_functions_mssql.sql b/testgen/template/flavors/mssql/setup_profiling_tools/00_drop_existing_functions_mssql.sql deleted file mode 100644 
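-- A minimal sketch of how the new {{DKFN_...}} tokens appear to work (an
-- assumption; the substitution code itself is not part of this diff): each
-- token seems to be expanded at query-generation time from the flavor's
-- templated_functions.yaml, with the ;;-delimited arguments filling {$1},
-- {$2}, and so on. For the mssql IS_NUM template above, the profiling
-- expression
--     SUM({{DKFN_IS_NUM;;LEFT("{COL_NAME}", 31)}} ) AS numeric_ct
-- would then render as inline SQL, replacing the dropped
-- {DATA_QC_SCHEMA}.fndk_isnum() call:
--     SUM(CASE
--           WHEN TRY_CAST(NULLIF(LEFT("{COL_NAME}", 31), '') AS float) IS NOT NULL THEN 1
--           ELSE 0
--         END) AS numeric_ct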
index ff358ce..0000000 --- a/testgen/template/flavors/mssql/setup_profiling_tools/00_drop_existing_functions_mssql.sql +++ /dev/null @@ -1,8 +0,0 @@ --- Step 1: Drop both functions if they exist -BEGIN - IF OBJECT_ID('{DATA_QC_SCHEMA}.fndk_isnum', 'FN') IS NOT NULL - DROP FUNCTION {DATA_QC_SCHEMA}.fndk_isnum; - - IF OBJECT_ID('{DATA_QC_SCHEMA}.fndk_isdate', 'FN') IS NOT NULL - DROP FUNCTION {DATA_QC_SCHEMA}.fndk_isdate; -END diff --git a/testgen/template/flavors/mssql/setup_profiling_tools/01_create_functions_mssql.sql b/testgen/template/flavors/mssql/setup_profiling_tools/01_create_functions_mssql.sql deleted file mode 100644 index 1547fe5..0000000 --- a/testgen/template/flavors/mssql/setup_profiling_tools/01_create_functions_mssql.sql +++ /dev/null @@ -1,12 +0,0 @@ --- Step 2: Create isnum function -CREATE FUNCTION {DATA_QC_SCHEMA}.fndk_isnum (@strparm VARCHAR(500)) -RETURNS INT -AS -BEGIN - IF TRY_CAST(NULLIF(@strparm, '') AS float) IS NOT NULL - BEGIN - RETURN(1) - END - - RETURN(0) -END; diff --git a/testgen/template/flavors/mssql/setup_profiling_tools/02_create_functions_mssql.sql b/testgen/template/flavors/mssql/setup_profiling_tools/02_create_functions_mssql.sql deleted file mode 100644 index 874938f..0000000 --- a/testgen/template/flavors/mssql/setup_profiling_tools/02_create_functions_mssql.sql +++ /dev/null @@ -1,54 +0,0 @@ --- Step 3: Create isdate function - -CREATE FUNCTION {DATA_QC_SCHEMA}.fndk_isdate(@strparm VARCHAR(500)) - RETURNS INT -AS -BEGIN - DECLARE @ret INT - - SET @ret = - - CASE WHEN TRY_CAST(NULLIF(@strparm, '') AS float) IS NOT NULL - AND LEFT(NULLIF(@strparm, ''),4) BETWEEN 1800 AND 2200 THEN - CASE - WHEN LEN((NULLIF(@strparm, ''))) > 11 THEN 0 - -- YYYYMMDD - WHEN TRY_CONVERT(DATE, NULLIF(@strparm, ''), 112) IS NOT NULL THEN 1 - - -- YYYY-MM-DD - WHEN TRY_CONVERT(DATE, NULLIF(@strparm, ''), 23) IS NOT NULL THEN 1 - - -- MM/DD/YYYY - WHEN TRY_CONVERT(DATE, NULLIF(@strparm, ''), 101) IS NOT NULL THEN 1 - - -- MM/DD/YY - WHEN TRY_CONVERT(DATE, NULLIF(@strparm, ''), 1) IS NOT NULL THEN 1 - - --MM-DD-YYYY - WHEN TRY_CONVERT(DATE, NULLIF(@strparm, ''), 110) IS NOT NULL THEN 1 - - --MM-DD-YY - WHEN TRY_CONVERT(DATE, NULLIF(@strparm, ''), 10) IS NOT NULL THEN 1 - - - ELSE 0 END - --DD MMM YYYY - WHEN (TRY_CONVERT(DATE, NULLIF(@strparm, ''), 106) IS NOT NULL - AND LEFT(NULLIF(@strparm, ''), 4) BETWEEN 1800 AND 2200) - THEN 1 - - -- YYYY-MM-DD HH:MM:SS SSSSSS - WHEN (TRY_CONVERT(DATETIME2, NULLIF(@strparm, ''), 121) IS NOT NULL - AND LEFT(NULLIF(@strparm, ''), 4) BETWEEN 1800 AND 2200) - THEN 1 - - -- YYYY-MM-DD HH:MM:SS - WHEN (TRY_CONVERT(DATETIME2, NULLIF(@strparm, ''), 120) IS NOT NULL - AND LEFT(NULLIF(@strparm, ''), 4) BETWEEN 1800 AND 2200) - THEN 1 - ELSE 0 - END - RETURN @ret - -END -; diff --git a/testgen/template/flavors/mssql/setup_profiling_tools/create_qc_schema_mssql.sql b/testgen/template/flavors/mssql/setup_profiling_tools/create_qc_schema_mssql.sql deleted file mode 100644 index 5bd4d06..0000000 --- a/testgen/template/flavors/mssql/setup_profiling_tools/create_qc_schema_mssql.sql +++ /dev/null @@ -1,4 +0,0 @@ -IF NOT EXISTS (SELECT * FROM sys.schemas WHERE name = '{DATA_QC_SCHEMA}') -BEGIN - EXEC('CREATE SCHEMA {DATA_QC_SCHEMA}') -END diff --git a/testgen/template/flavors/mssql/setup_profiling_tools/grant_execute_privileges_mssql.sql b/testgen/template/flavors/mssql/setup_profiling_tools/grant_execute_privileges_mssql.sql deleted file mode 100644 index 22b4576..0000000 --- 
a/testgen/template/flavors/mssql/setup_profiling_tools/grant_execute_privileges_mssql.sql +++ /dev/null @@ -1 +0,0 @@ -GRANT EXECUTE ON SCHEMA::{DATA_QC_SCHEMA} TO {DB_USER}; diff --git a/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml b/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml index db02274..746c25f 100644 --- a/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml +++ b/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml @@ -51,8 +51,22 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a END ) AS filled_value_ct, LEFT(MIN(NULLIF("{COL_NAME}", '')), 100) AS min_text, LEFT(MAX(NULLIF("{COL_NAME}", '')), 100) AS max_text, - SUM({DATA_QC_SCHEMA}.fndk_isnum(LEFT("{COL_NAME}", 31))) AS numeric_ct, - SUM({DATA_QC_SCHEMA}.fndk_isdate(LEFT("{COL_NAME}", 26))) AS date_ct, + SUM(CASE + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 0 + WHEN TRANSLATE("{COL_NAME}", 'abcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 1 + ELSE 0 + END) AS upper_case_ct, + SUM(CASE + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 0 + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', '') = "{COL_NAME}" THEN 1 + ELSE 0 + END) AS lower_case_ct, + SUM(CASE + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 1 + ELSE 0 + END) AS non_alpha_ct, + SUM({{DKFN_IS_NUM;;LEFT("{COL_NAME}", 31)}} ) AS numeric_ct, + SUM({{DKFN_IS_DATE;;LEFT("{COL_NAME}", 26)}} ) AS date_ct, CASE WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'STREET_ADDR' @@ -84,6 +98,9 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as filled_value_ct, NULL as min_text, NULL as max_text, + NULL as upper_case_ct, + NULL as lower_case_ct, + NULL as non_alpha_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, @@ -142,31 +159,39 @@ strTemplate11_D: CASE END as min_date, MAX("{COL_NAME}") as max_date, SUM(CASE - WHEN {DATA_QC_SCHEMA}.DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 12 THEN 1 + WHEN {{DKFN_DATEDIFF_MONTH;;"{COL_NAME}";;'{RUN_DATE}'}} > 12 THEN 1 ELSE 0 END) AS before_1yr_date_ct, SUM(CASE - WHEN {DATA_QC_SCHEMA}.DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 60 THEN 1 + WHEN {{DKFN_DATEDIFF_MONTH;;"{COL_NAME}";;'{RUN_DATE}'}} > 60 THEN 1 ELSE 0 END) AS before_5yr_date_ct, SUM(CASE - WHEN {DATA_QC_SCHEMA}.DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 240 THEN 1 + WHEN {{DKFN_DATEDIFF_MONTH;;"{COL_NAME}";;'{RUN_DATE}'}} > 240 THEN 1 ELSE 0 END) AS before_20yr_date_ct, SUM(CASE - WHEN {DATA_QC_SCHEMA}.DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 365 THEN 1 + WHEN {{DKFN_DATEDIFF_MONTH;;"{COL_NAME}";;'{RUN_DATE}'}} > 1200 THEN 1 + ELSE 0 + END) AS before_100yr_date_ct, + SUM(CASE + WHEN {{DKFN_DATEDIFF_DAY;;"{COL_NAME}";;'{RUN_DATE}'}} BETWEEN 0 AND 365 THEN 1 ELSE 0 END) AS within_1yr_date_ct, SUM(CASE - WHEN {DATA_QC_SCHEMA}.DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 30 THEN 1 + WHEN {{DKFN_DATEDIFF_DAY;;"{COL_NAME}";;'{RUN_DATE}'}} BETWEEN 0 AND 30 THEN 1 ELSE 0 END) AS within_1mo_date_ct, SUM(CASE WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 ELSE 0 END) AS future_date_ct, - COUNT(DISTINCT 
{DATA_QC_SCHEMA}.DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}' ) ) as date_days_present, - COUNT(DISTINCT {DATA_QC_SCHEMA}.DATEDIFF('WEEK', "{COL_NAME}", '{RUN_DATE}' ) ) as date_weeks_present, - COUNT(DISTINCT {DATA_QC_SCHEMA}.DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}' ) ) as date_months_present, + SUM(CASE + WHEN {{DKFN_DATEDIFF_MONTH;;'{RUN_DATE}';;"{COL_NAME}"}} > 240 THEN 1 + ELSE 0 + END) AS distant_future_date_ct, + COUNT(DISTINCT {{DKFN_DATEDIFF_DAY;;"{COL_NAME}";;'{RUN_DATE}'}} ) as date_days_present, + COUNT(DISTINCT {{DKFN_DATEDIFF_WEEK;;"{COL_NAME}";;'{RUN_DATE}'}} ) as date_weeks_present, + COUNT(DISTINCT {{DKFN_DATEDIFF_MONTH;;"{COL_NAME}";;'{RUN_DATE}'}} ) as date_months_present, strTemplate11_else: NULL as min_date, @@ -174,9 +199,11 @@ strTemplate11_else: NULL as min_date, NULL as before_1yr_date_ct, NULL as before_5yr_date_ct, NULL as before_20yr_date_ct, + NULL AS before_100yr_date_ct, NULL as within_1yr_date_ct, NULL as within_1mo_date_ct, NULL as future_date_ct, + NULL as distant_future_date_ct, NULL as date_days_present, NULL as date_weeks_present, NULL as date_months_present, diff --git a/testgen/template/flavors/postgresql/profiling/templated_functions.yaml b/testgen/template/flavors/postgresql/profiling/templated_functions.yaml new file mode 100644 index 0000000..cf9d854 --- /dev/null +++ b/testgen/template/flavors/postgresql/profiling/templated_functions.yaml @@ -0,0 +1,109 @@ +DATEDIFF_DAY: DATE({$2}) - DATE({$1}) + +DATEDIFF_WEEK: (DATE({$2}) - DATE({$1})) / 7 + +DATEDIFF_MONTH: (DATE_PART('year', {$2}::TIMESTAMP) - DATE_PART('year', {$1}::TIMESTAMP)) * 12 + (DATE_PART('month', {$2}::TIMESTAMP) - DATE_PART('month', {$1}::TIMESTAMP)) + +DATEDIFF_QUARTER: ((DATE_PART('year', {$2}::TIMESTAMP) - DATE_PART('year', {$1}::TIMESTAMP)) * 4) + (DATE_PART('quarter', {$2}::TIMESTAMP) - DATE_PART('quarter', {$1}::TIMESTAMP)) + +DATEDIFF_YEAR: DATE_PART('year', {$2}::TIMESTAMP) - DATE_PART('year', {$1}::TIMESTAMP) + +IS_NUM: CASE + WHEN {$1} ~ E'^\\s*[+-]?\\$?\\s*[0-9]+(,[0-9]{3})*(\\.[0-9]*)?[\\%]?\\s*$' THEN 1 + ELSE 0 + END + +IS_DATE: CASE + /* YYYY-MM-DD HH:MM:SS SSSSSS or YYYY-MM-DD HH:MM:SS */ + WHEN {$1} ~ '^(\\d{4})-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])\\s(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\\s[0-9]{6})?$' + THEN CASE + WHEN LEFT({$1}, 4)::INT BETWEEN 1800 AND 2200 + AND ( + ( SUBSTRING ({$1}, 6, 2) IN ('01', '03', '05', '07', '08', + '10', '12') + AND SUBSTRING ({$1}, 9, 2)::INT BETWEEN 1 AND 31 ) + OR ( SUBSTRING ({$1}, 6, 2) IN ('04', '06', '09') + AND SUBSTRING ({$1}, 9, 2)::INT BETWEEN 1 AND 30 ) + OR ( SUBSTRING ({$1}, 6, 2) = '02' + AND SUBSTRING ({$1}, 9, 2)::INT ::INT BETWEEN 1 AND 29) + ) + THEN 1 + ELSE 0 + END + /* YYYYMMDDHHMMSSSSSS or YYYYMMDD */ + WHEN {$1} ~ '^(\\d{4})(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])(2[0-3]|[01][0-9])([0-5][0-9])([0-5][0-9])([0-9]{6})$' + OR {$1} ~ '^(\\d{4})(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])(2[0-3]|[01][0-9])$' + THEN CASE + WHEN LEFT({$1}, 4)::INT BETWEEN 1800 AND 2200 + AND ( + ( SUBSTRING({$1}, 5, 2) IN ('01', '03', '05', '07', '08', + '10', '12') + AND SUBSTRING({$1}, 7, 2)::INT BETWEEN 1 AND 31 ) + OR ( SUBSTRING({$1}, 5, 2) IN ('04', '06', '09') + AND SUBSTRING({$1}, 7, 2)::INT BETWEEN 1 AND 30 ) + OR ( SUBSTRING({$1}, 5, 2) = '02' + AND SUBSTRING({$1}, 7, 2)::INT::INT BETWEEN 1 AND 29) + ) + THEN 1 + ELSE 0 + END + /* Exclude anything else long */ + WHEN LENGTH({$1}) > 11 THEN 0 + /* YYYY-MMM/MM-DD */ + WHEN REGEXP_REPLACE(UPPER({$1}), '(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)', '12', 'g') + ~ 
'[12][09][0-9][0-9]-[0-1]?[0-9]-[0-3]?[0-9]' + THEN CASE + WHEN SPLIT_PART({$1}, '-', 1)::INT BETWEEN 1800 AND 2200 + AND ( + ( UPPER(SPLIT_PART({$1}, '-', 2)) IN ('01', '03', '05', '07', '08', + '1', '3', '5', '7', '8', '10', '12', + 'JAN', 'MAR', 'MAY', 'JUL', 'AUG', + 'OCT', 'DEC') + AND SPLIT_PART({$1}, '-', 3)::INT BETWEEN 1 AND 31 ) + OR ( UPPER(SPLIT_PART({$1}, '-', 2)) IN ('04', '06', '09', '4', '6', '9', '11', + 'APR', 'JUN', 'SEP', 'NOV') + AND SPLIT_PART({$1}, '-', 3)::INT BETWEEN 1 AND 30 ) + OR ( UPPER(SPLIT_PART({$1}, '-', 2)) IN ('02', '2', 'FEB') + AND SPLIT_PART({$1}, '-', 3)::INT BETWEEN 1 AND 29) + ) + THEN 1 + ELSE 0 + END + /* MM/-DD/-YY/YYYY */ + WHEN REPLACE({$1}, '-', '/') ~ '^[0-1]?[0-9]/[0-3]?[0-9]/[12][09][0-9][0-9]$' + OR REPLACE({$1}, '-', '/') ~ '^[0-1]?[0-9]/[0-3]?[0-9]/[0-9][0-9]$' + THEN + CASE + WHEN SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 1)::INT BETWEEN 1 AND 12 + AND ( + ( SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 1)::INT IN (1, 3, 5, 7, 8, 10, 12) + AND SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 2)::INT BETWEEN 1 AND 31 ) + OR ( SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 1)::INT IN (4, 6, 9, 11) + AND SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 2)::INT BETWEEN 1 AND 30 ) + OR ( SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 1)::INT = 2 + AND SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 2)::INT BETWEEN 1 AND 29) + ) + AND + ('20' || RIGHT(SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 3), 2))::INT BETWEEN 1800 AND 2200 + THEN 1 + ELSE 0 + END + /* DD-MMM-YYYY */ + WHEN UPPER({$1}) ~ '[0-3]?[0-9]-(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-[12][09][0-9][0-9]' + THEN + CASE + WHEN SPLIT_PART({$1}, '-', 3)::INT BETWEEN 1800 AND 2200 + AND ( + ( UPPER(SPLIT_PART({$1}, '-', 2)) IN ('JAN', 'MAR', 'MAY', 'JUL', 'AUG', 'OCT', 'DEC') + AND SPLIT_PART({$1}, '-', 1)::INT BETWEEN 1 AND 31 ) + OR ( UPPER(SPLIT_PART({$1}, '-', 2)) IN ('APR', 'JUN', 'SEP', 'NOV') + AND SPLIT_PART({$1}, '-', 1)::INT BETWEEN 1 AND 30 ) + OR ( UPPER(SPLIT_PART({$1}, '-', 2)) = 'FEB' + AND SPLIT_PART({$1}, '-', 1)::INT BETWEEN 1 AND 29) + ) + THEN 1 + ELSE 0 + END + ELSE 0 + END + diff --git a/testgen/template/flavors/postgresql/setup_profiling_tools/create_functions_postgresql.sql b/testgen/template/flavors/postgresql/setup_profiling_tools/create_functions_postgresql.sql deleted file mode 100644 index cff460f..0000000 --- a/testgen/template/flavors/postgresql/setup_profiling_tools/create_functions_postgresql.sql +++ /dev/null @@ -1,157 +0,0 @@ -CREATE OR REPLACE FUNCTION {DATA_QC_SCHEMA}.DATEDIFF(difftype character varying, firstdate timestamp without time zone, seconddate timestamp without time zone) -RETURNS BIGINT AS $$ - SELECT - CASE - WHEN UPPER(difftype) IN ('DAY', 'DD', 'D') THEN - DATE(seconddate) - DATE(firstdate) - WHEN UPPER(difftype) IN ('WEEK','WK', 'W') THEN - (DATE(seconddate) - DATE(firstdate)) / 7 - WHEN UPPER(difftype) IN ('MON', 'MONTH', 'MM') THEN - (DATE_PART('year', seconddate) - DATE_PART('year', firstdate)) * 12 + (DATE_PART('month', seconddate) - DATE_PART('month', firstdate)) - WHEN UPPER(difftype) IN ('QUARTER', 'QTR', 'Q') THEN - ((DATE_PART('year', seconddate) - DATE_PART('year', firstdate)) * 4) + (DATE_PART('quarter', seconddate) - DATE_PART('quarter', firstdate)) - WHEN UPPER(difftype) IN ('YEAR', 'YY', 'Y') THEN - DATE_PART('year', seconddate) - DATE_PART('year', firstdate) - ELSE - NULL::BIGINT - END; -$$ LANGUAGE sql IMMUTABLE STRICT; - -CREATE OR REPLACE FUNCTION {DATA_QC_SCHEMA}.fn_charcount(instring character varying, searchstring character varying) returns bigint - 
language plpgsql -as -$$ - BEGIN - RETURN (CHAR_LENGTH(instring) - CHAR_LENGTH(REPLACE(instring, searchstring, ''))) / CHAR_LENGTH(searchstring); - END; -$$; - - -CREATE OR REPLACE FUNCTION {DATA_QC_SCHEMA}.fn_parsefreq(top_freq_values VARCHAR(1000), rowno INTEGER, colno INTEGER) returns VARCHAR(1000) - language plpgsql -as -$$ - BEGIN - RETURN SPLIT_PART(SPLIT_PART(top_freq_values, CHR(10), rowno), '|', colno+1); - END; -$$; - - -CREATE -OR REPLACE FUNCTION {DATA_QC_SCHEMA}.fndk_isnum(VARCHAR) - RETURNS INTEGER - IMMUTABLE - AS - $$ -SELECT CASE - WHEN $1 ~ E'^\\s*[+-]?\\$?\\s*[0-9]+(,[0-9]{3})*(\\.[0-9]*)?[\\%]?\\s*$' THEN 1 - ELSE 0 - END; -$$ -LANGUAGE sql; - - - - - -CREATE -OR REPLACE FUNCTION {DATA_QC_SCHEMA}.fndk_isdate(VARCHAR) - RETURNS INTEGER - IMMUTABLE - AS $$ -SELECT CASE - -- YYYY-MM-DD HH:MM:SS SSSSSS or YYYY-MM-DD HH:MM:SS - WHEN $1 ~ '^(\\d{4})-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])\\s(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\\s[0-9]{6})?$' - THEN CASE - WHEN LEFT($1, 4):: INT BETWEEN 1800 AND 2200 - AND ( - ( SUBSTRING ($1, 6, 2) IN ('01', '03', '05', '07', '08', - '10', '12') - AND SUBSTRING ($1, 9, 2):: INT BETWEEN 1 AND 31 ) - OR ( SUBSTRING ($1, 6, 2) IN ('04', '06', '09') - AND SUBSTRING ($1, 9, 2):: INT BETWEEN 1 AND 30 ) - OR ( SUBSTRING ($1, 6, 2) = '02' - AND SUBSTRING ($1, 9, 2):: INT :: INT BETWEEN 1 AND 29) - ) - THEN 1 - ELSE 0 -END - -- YYYYMMDDHHMMSSSSSS or YYYYMMDD -WHEN $1 ~ '^(\\d{4})(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])(2[0-3]|[01][0-9])([0-5][0-9])([0-5][0-9])([0-9]{6})$' - OR $1 ~ '^(\\d{4})(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])(2[0-3]|[01][0-9])$' - THEN CASE - WHEN LEFT($1, 4)::INT BETWEEN 1800 AND 2200 - AND ( - ( SUBSTRING($1, 5, 2) IN ('01', '03', '05', '07', '08', - '10', '12') - AND SUBSTRING($1, 7, 2)::INT BETWEEN 1 AND 31 ) - OR ( SUBSTRING($1, 5, 2) IN ('04', '06', '09') - AND SUBSTRING($1, 7, 2)::INT BETWEEN 1 AND 30 ) - OR ( SUBSTRING($1, 5, 2) = '02' - AND SUBSTRING($1, 7, 2)::INT::INT BETWEEN 1 AND 29) - ) - THEN 1 - ELSE 0 -END - -- Exclude anything else long -WHEN LENGTH($1) > 11 THEN 0 - -- YYYY-MMM/MM-DD - WHEN REGEXP_REPLACE(UPPER($1), '(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)', '12', 'g') - ~ '[12][09][0-9][0-9]-[0-1]?[0-9]-[0-3]?[0-9]' - THEN CASE - WHEN SPLIT_PART($1, '-', 1)::INT BETWEEN 1800 AND 2200 - AND ( - ( UPPER(SPLIT_PART($1, '-', 2)) IN ('01', '03', '05', '07', '08', - '1', '3', '5', '7', '8', '10', '12', - 'JAN', 'MAR', 'MAY', 'JUL', 'AUG', - 'OCT', 'DEC') - AND SPLIT_PART($1, '-', 3)::INT BETWEEN 1 AND 31 ) - OR ( UPPER(SPLIT_PART($1, '-', 2)) IN ('04', '06', '09', '4', '6', '9', '11', - 'APR', 'JUN', 'SEP', 'NOV') - AND SPLIT_PART($1, '-', 3)::INT BETWEEN 1 AND 30 ) - OR ( UPPER(SPLIT_PART($1, '-', 2)) IN ('02', '2', 'FEB') - AND SPLIT_PART($1, '-', 3)::INT BETWEEN 1 AND 29) - ) - THEN 1 - ELSE 0 -END - -- MM/-DD/-YY/YYYY -WHEN REPLACE($1, '-', '/') ~ '^[0-1]?[0-9]/[0-3]?[0-9]/[12][09][0-9][0-9]$' - OR REPLACE($1, '-', '/') ~ '^[0-1]?[0-9]/[0-3]?[0-9]/[0-9][0-9]$' - THEN - CASE - WHEN SPLIT_PART(REPLACE($1, '-', '/'), '/', 1)::INT BETWEEN 1 AND 12 - AND ( - ( SPLIT_PART(REPLACE($1, '-', '/'), '/', 1)::INT IN (1, 3, 5, 7, 8, 10, 12) - AND SPLIT_PART(REPLACE($1, '-', '/'), '/', 2)::INT BETWEEN 1 AND 31 ) - OR ( SPLIT_PART(REPLACE($1, '-', '/'), '/', 1)::INT IN (4, 6, 9, 11) - AND SPLIT_PART(REPLACE($1, '-', '/'), '/', 2)::INT BETWEEN 1 AND 30 ) - OR ( SPLIT_PART(REPLACE($1, '-', '/'), '/', 1)::INT = 2 - AND SPLIT_PART(REPLACE($1, '-', '/'), '/', 2)::INT BETWEEN 1 AND 29) - ) - AND - ('20' || 
RIGHT(SPLIT_PART(REPLACE($1, '-', '/'), '/', 3), 2))::INT BETWEEN 1800 AND 2200 - THEN 1 - ELSE 0 -END - -- DD-MMM-YYYY -WHEN UPPER($1) ~ '[0-3]?[0-9]-(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-[12][09][0-9][0-9]' - THEN - CASE - WHEN SPLIT_PART($1, '-', 3)::INT BETWEEN 1800 AND 2200 - AND ( - ( UPPER(SPLIT_PART($1, '-', 2)) IN ('JAN', 'MAR', 'MAY', 'JUL', 'AUG', 'OCT', 'DEC') - AND SPLIT_PART($1, '-', 1)::INT BETWEEN 1 AND 31 ) - OR ( UPPER(SPLIT_PART($1, '-', 2)) IN ('APR', 'JUN', 'SEP', 'NOV') - AND SPLIT_PART($1, '-', 1)::INT BETWEEN 1 AND 30 ) - OR ( UPPER(SPLIT_PART($1, '-', 2)) = 'FEB' - AND SPLIT_PART($1, '-', 1)::INT BETWEEN 1 AND 29) - ) - THEN 1 - ELSE 0 -END -ELSE 0 -END -as isdate - $$ - LANGUAGE sql; diff --git a/testgen/template/flavors/postgresql/setup_profiling_tools/create_qc_schema_postgresql.sql b/testgen/template/flavors/postgresql/setup_profiling_tools/create_qc_schema_postgresql.sql deleted file mode 100644 index 4cd79fe..0000000 --- a/testgen/template/flavors/postgresql/setup_profiling_tools/create_qc_schema_postgresql.sql +++ /dev/null @@ -1 +0,0 @@ -CREATE SCHEMA IF NOT exists {DATA_QC_SCHEMA}; diff --git a/testgen/template/flavors/postgresql/setup_profiling_tools/grant_execute_privileges_postgresql.sql b/testgen/template/flavors/postgresql/setup_profiling_tools/grant_execute_privileges_postgresql.sql deleted file mode 100644 index ac6d077..0000000 --- a/testgen/template/flavors/postgresql/setup_profiling_tools/grant_execute_privileges_postgresql.sql +++ /dev/null @@ -1,2 +0,0 @@ -GRANT ALL PRIVILEGES ON SCHEMA {DATA_QC_SCHEMA} TO {DB_USER}; -GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA {DATA_QC_SCHEMA} TO {DB_USER}; diff --git a/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml b/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml index 8856fb2..e54bdf4 100644 --- a/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml +++ b/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml @@ -51,8 +51,22 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a END ) AS filled_value_ct, LEFT(MIN(NULLIF("{COL_NAME}", '')), 100) AS min_text, LEFT(MAX(NULLIF("{COL_NAME}", '')), 100) AS max_text, - SUM({DATA_QC_SCHEMA}.fndk_isnum(LEFT("{COL_NAME}", 31))) AS numeric_ct, - SUM({DATA_QC_SCHEMA}.fndk_isdate(LEFT("{COL_NAME}", 26))) AS date_ct, + SUM(CASE + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 0 + WHEN TRANSLATE("{COL_NAME}", 'abcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 1 + ELSE 0 + END) AS upper_case_ct, + SUM(CASE + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 0 + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', '') = "{COL_NAME}" THEN 1 + ELSE 0 + END) AS lower_case_ct, + SUM(CASE + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 1 + ELSE 0 + END) AS non_alpha_ct, + SUM({{DKFN_IS_NUM;;LEFT("{COL_NAME}", 31)}} ) AS numeric_ct, + SUM({{DKFN_IS_DATE;;LEFT("{COL_NAME}", 26)}} ) AS date_ct, CASE WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'STREET_ADDR' @@ -84,6 +98,9 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as filled_value_ct, NULL as min_text, NULL as max_text, + 
NULL as upper_case_ct, + NULL as lower_case_ct, + NULL as non_alpha_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, @@ -150,6 +167,10 @@ strTemplate11_D: CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 240 THEN 1 ELSE 0 END) AS before_20yr_date_ct, + SUM(CASE + WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 1200 THEN 1 + ELSE 0 + END) AS before_100yr_date_ct, SUM(CASE WHEN DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 365 THEN 1 ELSE 0 @@ -161,6 +182,10 @@ strTemplate11_D: CASE SUM(CASE WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 ELSE 0 END) AS future_date_ct, + SUM(CASE + WHEN DATEDIFF('MON', '{RUN_DATE}', "{COL_NAME}") > 240 THEN 1 + ELSE 0 + END) AS distant_future_date_ct, COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}' ) ) as date_days_present, COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}", '{RUN_DATE}' ) ) as date_weeks_present, COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}' ) ) as date_months_present, @@ -170,9 +195,11 @@ strTemplate11_else: NULL as min_date, NULL as before_1yr_date_ct, NULL as before_5yr_date_ct, NULL as before_20yr_date_ct, + NULL AS before_100yr_date_ct, NULL as within_1yr_date_ct, NULL as within_1mo_date_ct, NULL as future_date_ct, + NULL as distant_future_date_ct, NULL as date_days_present, NULL as date_weeks_present, NULL as date_months_present, diff --git a/testgen/template/flavors/redshift/profiling/templated_functions.yaml b/testgen/template/flavors/redshift/profiling/templated_functions.yaml new file mode 100644 index 0000000..4953e25 --- /dev/null +++ b/testgen/template/flavors/redshift/profiling/templated_functions.yaml @@ -0,0 +1,101 @@ +IS_NUM: CASE + WHEN {$1} ~ '^\\s*[+-]?\\$?\\s*[0-9]+(,[0-9]{3})*(\\.[0-9]*)?[\\%]?\\s*$' THEN 1 + ELSE 0 + END + +IS_DATE: CASE + /* YYYY-MM-DD HH:MM:SS SSSSSS or YYYY-MM-DD HH:MM:SS */ + WHEN {$1} ~ + '^(\\d{4})-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])\\s(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\\s[0-9]{6})?$' + THEN CASE + WHEN LEFT({$1}, 4)::INT BETWEEN 1800 AND 2200 + AND ( + (SUBSTRING({$1}, 6, 2) IN ('01', '03', '05', '07', '08', + '10', '12') + AND SUBSTRING({$1}, 9, 2)::INT BETWEEN 1 AND 31) + OR (SUBSTRING({$1}, 6, 2) IN ('04', '06', '09') + AND SUBSTRING({$1}, 9, 2)::INT BETWEEN 1 AND 30) + OR (SUBSTRING({$1}, 6, 2) = '02' + AND SUBSTRING({$1}, 9, 2)::INT ::INT BETWEEN 1 AND 29) + ) + THEN 1 + ELSE 0 + END + /* YYYYMMDDHHMMSSSSSS or YYYYMMDD */ + WHEN {$1} ~ + '^(\\d{4})(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])(2[0-3]|[01][0-9])([0-5][0-9])([0-5][0-9])([0-9]{6})$' + OR {$1} ~ '^(\\d{4})(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])(2[0-3]|[01][0-9])$' + THEN CASE + WHEN LEFT({$1}, 4)::INT BETWEEN 1800 AND 2200 + AND ( + (SUBSTRING({$1}, 5, 2) IN ('01', '03', '05', '07', '08', + '10', '12') + AND SUBSTRING({$1}, 7, 2)::INT BETWEEN 1 AND 31) + OR (SUBSTRING({$1}, 5, 2) IN ('04', '06', '09') + AND SUBSTRING({$1}, 7, 2)::INT BETWEEN 1 AND 30) + OR (SUBSTRING({$1}, 5, 2) = '02' + AND SUBSTRING({$1}, 7, 2)::INT::INT BETWEEN 1 AND 29) + ) + THEN 1 + ELSE 0 + END + /* Exclude anything else long */ + WHEN LENGTH({$1}) > 11 THEN 0 + /* YYYY-MMM/MM-DD */ + WHEN REGEXP_REPLACE(UPPER({$1}), '(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)', '12') + ~ '[12][09][0-9][0-9]-[0-1]?[0-9]-[0-3]?[0-9]' + THEN CASE + WHEN SPLIT_PART({$1}, '-', 1)::INT BETWEEN 1800 AND 2200 + AND ( + (UPPER(SPLIT_PART({$1}, '-', 2)) IN ('01', '03', '05', '07', '08', + '1', '3', '5', '7', '8', '10', '12', + 'JAN', 'MAR', 'MAY', 'JUL', 'AUG', + 'OCT', 'DEC') + AND SPLIT_PART({$1}, '-', 3)::INT 
BETWEEN 1 AND 31) + OR (UPPER(SPLIT_PART({$1}, '-', 2)) IN ('04', '06', '09', '4', '6', '9', '11', + 'APR', 'JUN', 'SEP', 'NOV') + AND SPLIT_PART({$1}, '-', 3)::INT BETWEEN 1 AND 30) + OR (UPPER(SPLIT_PART({$1}, '-', 2)) IN ('02', '2', 'FEB') + AND SPLIT_PART({$1}, '-', 3)::INT BETWEEN 1 AND 29) + ) + THEN 1 + ELSE 0 + END + /* MM/-DD/-YY/YYYY */ + WHEN REPLACE({$1}, '-', '/') ~ '^[0-1]?[0-9]/[0-3]?[0-9]/[12][09][0-9][0-9]$' + OR REPLACE({$1}, '-', '/') ~ '^[0-1]?[0-9]/[0-3]?[0-9]/[0-9][0-9]$' + THEN + CASE + WHEN SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 1)::INT BETWEEN 1 AND 12 + AND ( + (SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 1)::INT IN (1, 3, 5, 7, 8, 10, 12) + AND SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 2)::INT BETWEEN 1 AND 31) + OR (SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 1)::INT IN (4, 6, 9, 11) + AND SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 2)::INT BETWEEN 1 AND 30) + OR (SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 1)::INT = 2 + AND SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 2)::INT BETWEEN 1 AND 29) + ) + AND + ('20' + RIGHT(SPLIT_PART(REPLACE({$1}, '-', '/'), '/', 3), 2))::INT BETWEEN 1800 AND 2200 + THEN 1 + ELSE 0 + END + /* DD-MMM-YYYY */ + WHEN UPPER({$1}) ~ '[0-3]?[0-9]-(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-[12][09][0-9][0-9]' + THEN + CASE + WHEN SPLIT_PART({$1}, '-', 3)::INT BETWEEN 1800 AND 2200 + AND ( + (UPPER(SPLIT_PART({$1}, '-', 2)) IN ('JAN', 'MAR', 'MAY', 'JUL', 'AUG', 'OCT', 'DEC') + AND SPLIT_PART({$1}, '-', 1)::INT BETWEEN 1 AND 31) + OR (UPPER(SPLIT_PART({$1}, '-', 2)) IN ('APR', 'JUN', 'SEP', 'NOV') + AND SPLIT_PART({$1}, '-', 1)::INT BETWEEN 1 AND 30) + OR (UPPER(SPLIT_PART({$1}, '-', 2)) = 'FEB' + AND SPLIT_PART({$1}, '-', 1)::INT BETWEEN 1 AND 29) + ) + THEN 1 + ELSE 0 + END + ELSE 0 + END + diff --git a/testgen/template/flavors/redshift/setup_profiling_tools/create_functions_redshift.sql b/testgen/template/flavors/redshift/setup_profiling_tools/create_functions_redshift.sql deleted file mode 100644 index 0270a38..0000000 --- a/testgen/template/flavors/redshift/setup_profiling_tools/create_functions_redshift.sql +++ /dev/null @@ -1,115 +0,0 @@ -CREATE OR REPLACE FUNCTION {DATA_QC_SCHEMA}.fndk_isnum(VARCHAR) - RETURNS INTEGER - IMMUTABLE - AS - $$ -SELECT CASE - WHEN $1 ~ '^\\s*[+-]?\\$?\\s*[0-9]+(,[0-9]{3})*(\\.[0-9]*)?[\\%]?\\s*$' THEN 1 - ELSE 0 - END; -$$ -LANGUAGE sql; - - -CREATE OR REPLACE FUNCTION {DATA_QC_SCHEMA}.fndk_isdate(VARCHAR) - RETURNS INTEGER - IMMUTABLE - AS $$ -SELECT CASE - -- YYYY-MM-DD HH:MM:SS SSSSSS or YYYY-MM-DD HH:MM:SS - WHEN $1 ~ - '^(\\d{4})-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])\\s(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\\s[0-9]{6})?$' - THEN CASE - WHEN LEFT($1, 4):: INT BETWEEN 1800 AND 2200 - AND ( - (SUBSTRING($1, 6, 2) IN ('01', '03', '05', '07', '08', - '10', '12') - AND SUBSTRING($1, 9, 2):: INT BETWEEN 1 AND 31) - OR (SUBSTRING($1, 6, 2) IN ('04', '06', '09') - AND SUBSTRING($1, 9, 2):: INT BETWEEN 1 AND 30) - OR (SUBSTRING($1, 6, 2) = '02' - AND SUBSTRING($1, 9, 2):: INT :: INT BETWEEN 1 AND 29) - ) - THEN 1 - ELSE 0 - END - -- YYYYMMDDHHMMSSSSSS or YYYYMMDD - WHEN $1 ~ - '^(\\d{4})(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])(2[0-3]|[01][0-9])([0-5][0-9])([0-5][0-9])([0-9]{6})$' - OR $1 ~ '^(\\d{4})(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])(2[0-3]|[01][0-9])$' - THEN CASE - WHEN LEFT($1, 4)::INT BETWEEN 1800 AND 2200 - AND ( - (SUBSTRING($1, 5, 2) IN ('01', '03', '05', '07', '08', - '10', '12') - AND SUBSTRING($1, 7, 2)::INT BETWEEN 1 AND 31) - OR (SUBSTRING($1, 5, 2) IN ('04', '06', '09') - AND SUBSTRING($1, 
7, 2)::INT BETWEEN 1 AND 30) - OR (SUBSTRING($1, 5, 2) = '02' - AND SUBSTRING($1, 7, 2)::INT::INT BETWEEN 1 AND 29) - ) - THEN 1 - ELSE 0 - END - -- Exclude anything else long - WHEN LENGTH($1) > 11 THEN 0 - -- YYYY-MMM/MM-DD - WHEN REGEXP_REPLACE(UPPER($1), '(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)', '12') - ~ '[12][09][0-9][0-9]-[0-1]?[0-9]-[0-3]?[0-9]' - THEN CASE - WHEN SPLIT_PART($1, '-', 1)::INT BETWEEN 1800 AND 2200 - AND ( - (UPPER(SPLIT_PART($1, '-', 2)) IN ('01', '03', '05', '07', '08', - '1', '3', '5', '7', '8', '10', '12', - 'JAN', 'MAR', 'MAY', 'JUL', 'AUG', - 'OCT', 'DEC') - AND SPLIT_PART($1, '-', 3)::INT BETWEEN 1 AND 31) - OR (UPPER(SPLIT_PART($1, '-', 2)) IN ('04', '06', '09', '4', '6', '9', '11', - 'APR', 'JUN', 'SEP', 'NOV') - AND SPLIT_PART($1, '-', 3)::INT BETWEEN 1 AND 30) - OR (UPPER(SPLIT_PART($1, '-', 2)) IN ('02', '2', 'FEB') - AND SPLIT_PART($1, '-', 3)::INT BETWEEN 1 AND 29) - ) - THEN 1 - ELSE 0 - END - -- MM/-DD/-YY/YYYY - WHEN REPLACE($1, '-', '/') ~ '^[0-1]?[0-9]/[0-3]?[0-9]/[12][09][0-9][0-9]$' - OR REPLACE($1, '-', '/') ~ '^[0-1]?[0-9]/[0-3]?[0-9]/[0-9][0-9]$' - THEN - CASE - WHEN SPLIT_PART(REPLACE($1, '-', '/'), '/', 1)::INT BETWEEN 1 AND 12 - AND ( - (SPLIT_PART(REPLACE($1, '-', '/'), '/', 1)::INT IN (1, 3, 5, 7, 8, 10, 12) - AND SPLIT_PART(REPLACE($1, '-', '/'), '/', 2)::INT BETWEEN 1 AND 31) - OR (SPLIT_PART(REPLACE($1, '-', '/'), '/', 1)::INT IN (4, 6, 9, 11) - AND SPLIT_PART(REPLACE($1, '-', '/'), '/', 2)::INT BETWEEN 1 AND 30) - OR (SPLIT_PART(REPLACE($1, '-', '/'), '/', 1)::INT = 2 - AND SPLIT_PART(REPLACE($1, '-', '/'), '/', 2)::INT BETWEEN 1 AND 29) - ) - AND - ('20' + RIGHT(SPLIT_PART(REPLACE($1, '-', '/'), '/', 3), 2))::INT BETWEEN 1800 AND 2200 - THEN 1 - ELSE 0 - END - -- DD-MMM-YYYY - WHEN UPPER($1) ~ '[0-3]?[0-9]-(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-[12][09][0-9][0-9]' - THEN - CASE - WHEN SPLIT_PART($1, '-', 3)::INT BETWEEN 1800 AND 2200 - AND ( - (UPPER(SPLIT_PART($1, '-', 2)) IN ('JAN', 'MAR', 'MAY', 'JUL', 'AUG', 'OCT', 'DEC') - AND SPLIT_PART($1, '-', 1)::INT BETWEEN 1 AND 31) - OR (UPPER(SPLIT_PART($1, '-', 2)) IN ('APR', 'JUN', 'SEP', 'NOV') - AND SPLIT_PART($1, '-', 1)::INT BETWEEN 1 AND 30) - OR (UPPER(SPLIT_PART($1, '-', 2)) = 'FEB' - AND SPLIT_PART($1, '-', 1)::INT BETWEEN 1 AND 29) - ) - THEN 1 - ELSE 0 - END - ELSE 0 - END - AS isdate; - $$ - LANGUAGE sql; diff --git a/testgen/template/flavors/redshift/setup_profiling_tools/create_qc_schema_redshift.sql b/testgen/template/flavors/redshift/setup_profiling_tools/create_qc_schema_redshift.sql deleted file mode 100644 index 4cd79fe..0000000 --- a/testgen/template/flavors/redshift/setup_profiling_tools/create_qc_schema_redshift.sql +++ /dev/null @@ -1 +0,0 @@ -CREATE SCHEMA IF NOT exists {DATA_QC_SCHEMA}; diff --git a/testgen/template/flavors/redshift/setup_profiling_tools/grant_execute_privileges_redshift.sql b/testgen/template/flavors/redshift/setup_profiling_tools/grant_execute_privileges_redshift.sql deleted file mode 100644 index ac6d077..0000000 --- a/testgen/template/flavors/redshift/setup_profiling_tools/grant_execute_privileges_redshift.sql +++ /dev/null @@ -1,2 +0,0 @@ -GRANT ALL PRIVILEGES ON SCHEMA {DATA_QC_SCHEMA} TO {DB_USER}; -GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA {DATA_QC_SCHEMA} TO {DB_USER}; diff --git a/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml b/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml index 5b3ab3e..f0a784f 100644 --- 
a/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml +++ b/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml @@ -52,8 +52,22 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a END ) AS filled_value_ct, LEFT(MIN(NULLIF("{COL_NAME}", '')), 100) AS min_text, LEFT(MAX(NULLIF("{COL_NAME}", '')), 100) AS max_text, - SUM({DATA_QC_SCHEMA}.fndk_isnum(LEFT("{COL_NAME}", 31))) AS numeric_ct, - SUM({DATA_QC_SCHEMA}.fndk_isdate(LEFT("{COL_NAME}", 26))) AS date_ct, + SUM(CASE + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 0 + WHEN TRANSLATE("{COL_NAME}", 'abcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 1 + ELSE 0 + END) AS upper_case_ct, + SUM(CASE + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 0 + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', '') = "{COL_NAME}" THEN 1 + ELSE 0 + END) AS lower_case_ct, + SUM(CASE + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 1 + ELSE 0 + END) AS non_alpha_ct, + SUM({{DKFN_IS_NUM;;LEFT("{COL_NAME}", 31)}} ) AS numeric_ct, + SUM({{DKFN_IS_DATE;;LEFT("{COL_NAME}", 26)}} ) AS date_ct, CASE WHEN CAST(SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$') THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.8 THEN 'STREET_ADDR' @@ -85,6 +99,9 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as filled_value_ct, NULL as min_text, NULL as max_text, + NULL as upper_case_ct, + NULL as lower_case_ct, + NULL as non_alpha_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, @@ -149,6 +166,10 @@ strTemplate11_D: GREATEST(MIN("{COL_NAME}"), '0001-01-01') as min_date, WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 240 THEN 1 ELSE 0 END) AS before_20yr_date_ct, + SUM(CASE + WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 1200 THEN 1 + ELSE 0 + END) AS before_100yr_date_ct, SUM(CASE WHEN DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 365 THEN 1 ELSE 0 @@ -160,6 +181,10 @@ strTemplate11_D: GREATEST(MIN("{COL_NAME}"), '0001-01-01') as min_date, SUM(CASE WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 ELSE 0 END) AS future_date_ct, + SUM(CASE + WHEN DATEDIFF('MON', '{RUN_DATE}', "{COL_NAME}") > 240 THEN 1 + ELSE 0 + END) AS distant_future_date_ct, COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}' ) ) as date_days_present, COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}", '{RUN_DATE}' ) ) as date_weeks_present, COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}' ) ) as date_months_present, @@ -169,9 +194,11 @@ strTemplate11_else: NULL as min_date, NULL as before_1yr_date_ct, NULL as before_5yr_date_ct, NULL as before_20yr_date_ct, + NULL AS before_100yr_date_ct, NULL as within_1yr_date_ct, NULL as within_1mo_date_ct, NULL as future_date_ct, + NULL as distant_future_date_ct, NULL as date_days_present, NULL as date_weeks_present, NULL as date_months_present, diff --git a/testgen/template/flavors/snowflake/profiling/templated_functions.yaml b/testgen/template/flavors/snowflake/profiling/templated_functions.yaml new file mode 100644 index 0000000..1afbdea --- /dev/null +++ b/testgen/template/flavors/snowflake/profiling/templated_functions.yaml @@ -0,0 +1,55 @@ +IS_NUM: CASE + WHEN REGEXP_LIKE({$1}::VARCHAR, 
'^\\s*[+-]?\\$?\\s*[0-9]+(,[0-9]{3})*(\\.[0-9]*)?[\\%]?\\s*$') THEN 1 + ELSE 0 + END + +IS_DATE: CASE + /* YYYY-MM-DD HH:MM:SS SSSSSS */ + WHEN TRY_TO_DATE({$1}, 'YYYY-MM-DD HH:MI:SS SSSSSS') IS NOT NULL THEN 1 + + /* YYYY-MM-DD HH:MM:SS */ + WHEN TRY_TO_DATE({$1}, 'YYYY-MM-DD HH:MI:SS') IS NOT NULL THEN 1 + + /* YYYYMMDDHHMMSSSSSS */ + WHEN TRY_TO_DATE({$1}, 'YYYYMMDDHHMISSSSSS') IS NOT NULL THEN 1 + + /* YYYYMMDDHHMMSS */ + WHEN TRY_TO_DATE({$1}, 'YYYYMMDDHHMISS') IS NOT NULL THEN 1 + + /* YYYYMMDD */ + WHEN LENGTH({$1}) = 8 AND TRY_TO_DATE({$1}, 'YYYYMMDD') IS NOT NULL THEN 1 + + /* YYYY-MON-DD HH:MM:SS SSSSSS */ + /* WHEN TRY_TO_DATE({$1}, 'YYYY-MON-DD HH:MI:SS SSSSSS') IS NOT NULL THEN 1 */ + + /* YYYY-MON-DD HH:MM:SS */ + /* WHEN TRY_TO_DATE({$1}, 'YYYY-MON-DD HH:MI:SS') IS NOT NULL THEN 1 */ + + /* Exclude anything else long */ + WHEN LENGTH({$1}) > 11 THEN 0 + + /* YYYY-MON-DD */ + WHEN TRY_TO_DATE({$1}, 'YYYY-MON-DD') IS NOT NULL THEN 1 + + /* YYYY-MM-DD */ + WHEN TRY_TO_DATE({$1}, 'YYYY-MM-DD') IS NOT NULL THEN 1 + + /* MM/DD/YYYY */ + WHEN TRY_TO_DATE({$1}, 'MM/DD/YYYY') IS NOT NULL THEN 1 + + /* MM/DD/YY */ + WHEN TRY_TO_DATE({$1}, 'MM/DD/YY') IS NOT NULL THEN 1 + + /* MM-DD-YYYY */ + WHEN TRY_TO_DATE({$1}, 'MM-DD-YYYY') IS NOT NULL THEN 1 + + /* MM-DD-YY */ + WHEN TRY_TO_DATE({$1}, 'MM-DD-YY') IS NOT NULL THEN 1 + + /* DD-MMM-YYYY */ + WHEN TRY_TO_DATE({$1}, 'DD-MON-YYYY') IS NOT NULL THEN 1 + + + ELSE 0 + END + diff --git a/testgen/template/flavors/snowflake/setup_profiling_tools/create_functions_snowflake.sql b/testgen/template/flavors/snowflake/setup_profiling_tools/create_functions_snowflake.sql deleted file mode 100644 index f271a24..0000000 --- a/testgen/template/flavors/snowflake/setup_profiling_tools/create_functions_snowflake.sql +++ /dev/null @@ -1,69 +0,0 @@ -CREATE OR REPLACE FUNCTION {DATA_QC_SCHEMA}.fndk_isnum(strparm VARCHAR) -RETURNS INTEGER -LANGUAGE SQL -IMMUTABLE -AS -$$ -SELECT CASE - WHEN REGEXP_LIKE(strparm::VARCHAR, '^\\s*[+-]?\\$?\\s*[0-9]+(,[0-9]{3})*(\\.[0-9]*)?[\\%]?\\s*$') THEN 1 - ELSE 0 - END -$$; - - -CREATE OR REPLACE FUNCTION {DATA_QC_SCHEMA}.fndk_isdate(strparm VARCHAR) -RETURNS INTEGER -LANGUAGE SQL -IMMUTABLE -AS -$$ -SELECT CASE - -- YYYY-MM-DD HH:MM:SS SSSSSS - WHEN TRY_TO_DATE(strparm, 'YYYY-MM-DD HH:MI:SS SSSSSS') IS NOT NULL THEN 1 - - -- YYYY-MM-DD HH:MM:SS - WHEN TRY_TO_DATE(strparm, 'YYYY-MM-DD HH:MI:SS') IS NOT NULL THEN 1 - - -- YYYYMMDDHHMMSSSSSS - WHEN TRY_TO_DATE(strparm, 'YYYYMMDDHHMISSSSSS') IS NOT NULL THEN 1 - - -- YYYYMMDDHHMMSS - WHEN TRY_TO_DATE(strparm, 'YYYYMMDDHHMISS') IS NOT NULL THEN 1 - - -- YYYYMMDD - WHEN LENGTH(strparm) = 8 AND TRY_TO_DATE(strparm, 'YYYYMMDD') IS NOT NULL THEN 1 - - -- YYYY-MON-DD HH:MM:SS SSSSSS - --WHEN TRY_TO_DATE(strparm, 'YYYY-MON-DD HH:MI:SS SSSSSS') IS NOT NULL THEN 1 - - -- YYYY-MON-DD HH:MM:SS - --WHEN TRY_TO_DATE(strparm, 'YYYY-MON-DD HH:MI:SS') IS NOT NULL THEN 1 - - -- Exclude anything else long - WHEN LENGTH(strparm) > 11 THEN 0 - - -- YYYY-MON-DD - WHEN TRY_TO_DATE(strparm, 'YYYY-MON-DD') IS NOT NULL THEN 1 - - -- YYYY-MM-DD - WHEN TRY_TO_DATE(strparm, 'YYYY-MM-DD') IS NOT NULL THEN 1 - - -- MM/DD/YYYY - WHEN TRY_TO_DATE(strparm, 'MM/DD/YYYY') IS NOT NULL THEN 1 - - -- MM/DD/YY - WHEN TRY_TO_DATE(strparm, 'MM/DD/YY') IS NOT NULL THEN 1 - - --MM-DD-YYYY - WHEN TRY_TO_DATE(strparm, 'MM-DD-YYYY') IS NOT NULL THEN 1 - - --MM-DD-YY - WHEN TRY_TO_DATE(strparm, 'MM-DD-YY') IS NOT NULL THEN 1 - - --DD-MMM-YYYY - WHEN TRY_TO_DATE(strparm, 'DD-MON-YYYY') IS NOT NULL THEN 1 - - - ELSE 0 - END 
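The per-flavor templated_functions.yaml files added in this diff (the IS_NUM and IS_DATE entries above) take over the role of the fndk_isnum / fndk_isdate UDFs being deleted here: instead of installing functions into a {DATA_QC_SCHEMA} on the target database, the profiling query templates now embed the check inline through placeholders such as {{DKFN_IS_NUM;;LEFT("{COL_NAME}", 31)}}, with {$1} marking where the argument is spliced into the YAML-defined CASE expression. The substitution code itself is not part of this diff, so the following is only a rough sketch of how such a placeholder could be expanded at query-build time; the helper name, the regex, and the single-argument handling are assumptions for illustration.

import re
import yaml

def expand_templated_functions(sql: str, functions: dict[str, str]) -> str:
    # Replace each {{DKFN_<NAME>;;<arg>}} placeholder with the corresponding
    # expression from templated_functions.yaml, splicing <arg> in for {$1}.
    pattern = re.compile(r"\{\{DKFN_(\w+);;(.*?)\}\}")

    def substitute(match: re.Match) -> str:
        body = functions[match.group(1)]  # e.g. the IS_NUM or IS_DATE CASE expression
        return body.replace("{$1}", match.group(2))

    return pattern.sub(substitute, sql)

# Usage against the snowflake flavor's templated_functions.yaml added above:
with open("testgen/template/flavors/snowflake/profiling/templated_functions.yaml") as f:
    templated = yaml.safe_load(f)

expanded = expand_templated_functions(
    'SUM({{DKFN_IS_NUM;;LEFT("{COL_NAME}", 31)}} ) AS numeric_ct', templated
)

One practical consequence, visible in the rest of the diff, is that profiling no longer needs a QC utility schema or execute grants on the target database: the create_qc_schema_*.sql and grant_execute_privileges_*.sql templates are removed, and project_qc_schema drops out of the connection and parameter queries.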
-$$; diff --git a/testgen/template/flavors/snowflake/setup_profiling_tools/create_qc_schema_snowflake.sql b/testgen/template/flavors/snowflake/setup_profiling_tools/create_qc_schema_snowflake.sql deleted file mode 100644 index 4cd79fe..0000000 --- a/testgen/template/flavors/snowflake/setup_profiling_tools/create_qc_schema_snowflake.sql +++ /dev/null @@ -1 +0,0 @@ -CREATE SCHEMA IF NOT exists {DATA_QC_SCHEMA}; diff --git a/testgen/template/flavors/snowflake/setup_profiling_tools/grant_execute_privileges_snowflake.sql b/testgen/template/flavors/snowflake/setup_profiling_tools/grant_execute_privileges_snowflake.sql deleted file mode 100644 index 2a60aa7..0000000 --- a/testgen/template/flavors/snowflake/setup_profiling_tools/grant_execute_privileges_snowflake.sql +++ /dev/null @@ -1,6 +0,0 @@ - -CREATE ROLE IF NOT EXISTS dk_qc_role; -GRANT ALL PRIVILEGES ON SCHEMA {DATA_QC_SCHEMA} TO ROLE dk_qc_role; -GRANT USAGE ON FUNCTION {DATA_QC_SCHEMA}.fndk_isnum(VARCHAR) TO dk_qc_role; -GRANT USAGE ON FUNCTION {DATA_QC_SCHEMA}.fndk_isdate(VARCHAR) TO dk_qc_role; -GRANT ROLE dk_qc_role TO USER {DB_USER}; \ No newline at end of file diff --git a/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml b/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml index 0968a2d..87b216f 100644 --- a/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml +++ b/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml @@ -51,6 +51,20 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a END ) AS filled_value_ct, SUBSTRING(MIN(NULLIF("{COL_NAME}", '')), 1, 100) AS min_text, SUBSTRING(MAX(NULLIF("{COL_NAME}", '')), 1, 100) AS max_text, + SUM(CASE + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 0 + WHEN TRANSLATE("{COL_NAME}", 'abcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 1 + ELSE 0 + END) AS upper_case_ct, + SUM(CASE + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 0 + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', '') = "{COL_NAME}" THEN 1 + ELSE 0 + END) AS lower_case_ct, + SUM(CASE + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 1 + ELSE 0 + END) AS non_alpha_ct, SUM(fndk_isnum(SUBSTRING("{COL_NAME}", 1, 31))) AS numeric_ct, SUM(fndk_isdate(SUBSTRING("{COL_NAME}", 1, 26))) AS date_ct, CASE @@ -84,6 +98,9 @@ strTemplate05_else: NULL as distinct_std_value_ct, NULL as filled_value_ct, NULL as min_text, NULL as max_text, + NULL as upper_case_ct, + NULL as lower_case_ct, + NULL as non_alpha_ct, NULL as numeric_ct, NULL as date_ct, NULL as std_pattern_match, @@ -148,6 +165,10 @@ strTemplate11_D: CASE WHEN DATE_DIFF('MONTH', TIMESTAMP "{COL_NAME}", TIMESTAMP '{RUN_DATE}') > 240 THEN 1 ELSE 0 END) AS before_20yr_date_ct, + SUM(CASE + WHEN DATE_DIFF('MONTH', TIMESTAMP "{COL_NAME}", TIMESTAMP '{RUN_DATE}') > 1200 THEN 1 + ELSE 0 + END) AS before_100yr_date_ct, SUM(CASE WHEN DATE_DIFF('DAY', TIMESTAMP "{COL_NAME}", TIMESTAMP '{RUN_DATE}') BETWEEN 0 AND 365 THEN 1 ELSE 0 @@ -159,6 +180,10 @@ strTemplate11_D: CASE SUM(CASE WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 ELSE 0 END) AS future_date_ct, + SUM(CASE + WHEN DATE_DIFF('MONTH', TIMESTAMP '{RUN_DATE}', TIMESTAMP "{COL_NAME}") > 240 THEN 1 + ELSE 0 + END) AS distant_future_date_ct, COUNT(DISTINCT DATE_DIFF('day', TIMESTAMP "{COL_NAME}", TIMESTAMP '{RUN_DATE}' ) ) as 
date_days_present, COUNT(DISTINCT DATE_DIFF('week', TIMESTAMP "{COL_NAME}", TIMESTAMP '{RUN_DATE}' ) ) as date_weeks_present, COUNT(DISTINCT DATE_DIFF('month', TIMESTAMP "{COL_NAME}", TIMESTAMP '{RUN_DATE}' ) ) as date_months_present, @@ -168,9 +193,11 @@ strTemplate11_else: NULL as min_date, NULL as before_1yr_date_ct, NULL as before_5yr_date_ct, NULL as before_20yr_date_ct, + NULL AS before_100yr_date_ct, NULL as within_1yr_date_ct, NULL as within_1mo_date_ct, NULL as future_date_ct, + NULL as distant_future_date_ct, NULL as date_days_present, NULL as date_weeks_present, NULL as date_months_present, diff --git a/testgen/template/flavors/trino/setup_profiling_tools/create_functions_trino.sql b/testgen/template/flavors/trino/setup_profiling_tools/create_functions_trino.sql deleted file mode 100644 index f4b1adc..0000000 --- a/testgen/template/flavors/trino/setup_profiling_tools/create_functions_trino.sql +++ /dev/null @@ -1,92 +0,0 @@ - --- The following functions are inline functions --- INLINE FUNCTION TO CHECK FOR A NUMBER - -WITH FUNCTION num_check(a varchar) - RETURNS integer - RETURN - CASE WHEN regexp_like(a, '^[0-9]+(\.[0-9]+)?$') = TRUE THEN 1 - WHEN regexp_like(a, '\$[0-9]+(\.[0-9]+)?$') = TRUE THEN 1 - WHEN regexp_like(a, '^[0-9]+(\.[0-9]+)?\$') = TRUE THEN 1 - ELSE 0 -END -SELECT num_check('1234567'), num_check('$45.945843'), num_check('0.123$'); - - --- INLINE FUNCTION TO CHECK FOR A DATE - -WITH FUNCTION date_check(a varchar) - RETURNS integer - RETURN - CASE WHEN REGEXP_LIKE(a, '^(\d{4})-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])\s(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\s[0-9]{6})?$') - THEN CASE WHEN CAST(SUBSTRING(a, 1, 4) AS INT) BETWEEN 1800 AND 2200 - AND( ( SUBSTRING(a, 6, 2) IN ('01', '03', '05', '07', '08', '10', '12') - AND CAST(SUBSTRING(a, 9, 2) AS INT) BETWEEN 1 AND 31) - OR (SUBSTRING(a, 6, 2) IN ('04', '06', '09') AND CAST(SUBSTRING(a, 9, 2) AS INT) BETWEEN 1 AND 30) - OR (SUBSTRING(a, 6, 2) = '02' AND CAST(SUBSTRING(a, 9, 2) AS INT) BETWEEN 1 AND 29) - ) - THEN 1 - ELSE 0 -END -WHEN REGEXP_LIKE(a, '^(\d{4})(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])(2[0-3]|[01][0-9])([0-5][0-9])([0-5][0-9])([0-9]{6})$') - OR REGEXP_LIKE(a, '^(\d{4})(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])$') - THEN CASE WHEN CAST(SUBSTRING(a, 1, 4) AS INT) BETWEEN 1800 AND 2200 - AND ( (SUBSTRING(a, 5, 2) IN ('01', '03', '05', '07', '08', '10', '12') - AND CAST(SUBSTRING(a, 7, 2) AS INT) BETWEEN 1 AND 31) - OR (SUBSTRING(a, 5, 2) IN ('04', '06', '09') AND CAST(SUBSTRING(a, 7, 2) AS INT) BETWEEN 1 AND 30) - OR (SUBSTRING(a, 5, 2) = '02' AND CAST(SUBSTRING(a, 7, 2) AS INT) BETWEEN 1 AND 29) - ) - THEN 1 - ELSE 0 -END -WHEN LENGTH(a) > 11 THEN 0 - WHEN REGEXP_LIKE(REGEXP_REPLACE(UPPER(a), '(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)', '12'), '[12][09][0-9][0-9]-[0-1]?[0-9]-[0-3]?[0-9]') - THEN CASE WHEN CAST(SPLIT_PART(a, '-', 1) AS INT) BETWEEN 1800 AND 2200 - AND ( (UPPER(SPLIT_PART(a, '-', 2)) IN ('01', '03', '05', '07', '08', - '1', '3', '5', '7', '8', '10', '12', - 'JAN', 'MAR', 'MAY', 'JUL', 'AUG', - 'OCT', 'DEC') - AND CAST(SPLIT_PART(a, '-', 3) AS INT) BETWEEN 1 AND 31) - OR (UPPER(SPLIT_PART(a, '-', 2)) IN ('04', '06', '09', '4', '6', '9', '11', 'APR', 'JUN', 'SEP', 'NOV') - AND CAST(SPLIT_PART(a, '-', 3) AS INT) BETWEEN 1 AND 30) - OR (UPPER(SPLIT_PART(a, '-', 2)) IN ('02', '2', 'FEB') AND CAST(SPLIT_PART(a, '-', 3) AS INT) BETWEEN 1 AND 29) - ) - THEN 1 - ELSE 0 -END -WHEN REGEXP_LIKE(REPLACE(a, '-', '/') , '^[0-1]?[0-9]/[0-3]?[0-9]/[12][09][0-9][0-9]$') - OR 
REGEXP_LIKE(REPLACE(a, '-', '/') , '^[0-1]?[0-9]/[0-3]?[0-9]/[0-9][0-9]$') - THEN CASE WHEN CAST(SPLIT_PART(REPLACE(a, '-', '/'), '/', 1) AS INT) BETWEEN 1 AND 12 - AND ( (CAST(SPLIT_PART(REPLACE(a, '-', '/'), '/', 1) AS INT) IN (1, 3, 5, 7, 8, 10, 12) - AND CAST(SPLIT_PART(REPLACE(a, '-', '/'), '/', 2) AS INT) BETWEEN 1 AND 31) - OR (CAST(SPLIT_PART(REPLACE(a, '-', '/'), '/', 1) AS INT) IN (4, 6, 9, 11) - AND CAST(SPLIT_PART(REPLACE(a, '-', '/'), '/', 2) AS INT) BETWEEN 1 AND 30) - OR (CAST(SPLIT_PART(REPLACE(a, '-', '/'), '/', 1) AS INT) = 2 - AND CAST(SPLIT_PART(REPLACE(a, '-', '/'), '/', 2) AS INT) BETWEEN 1 AND 29) - ) - AND CAST(('20' || SUBSTRING(SPLIT_PART(REPLACE(a, '-', '/'), '/', 3), -2 )) AS INT) BETWEEN 1800 AND 2200 - THEN 1 - ELSE 0 -END -WHEN REGEXP_LIKE(UPPER(a) , '[0-3]?[0-9]-(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-[12][09][0-9][0-9]') - THEN CASE WHEN CAST(SPLIT_PART(a, '-', 3) AS INT) BETWEEN 1800 AND 2200 - AND ( (UPPER(SPLIT_PART(a, '-', 2)) IN ('JAN', 'MAR', 'MAY', 'JUL', 'AUG', 'OCT', 'DEC') - AND CAST(SPLIT_PART(a, '-', 1) AS INT) BETWEEN 1 AND 31) - OR (UPPER(SPLIT_PART(a, '-', 2)) IN ('APR', 'JUN', 'SEP', 'NOV') - AND CAST(SPLIT_PART(a, '-', 1) AS INT) BETWEEN 1 AND 30) - OR (UPPER(SPLIT_PART(a, '-', 2)) = 'FEB' - AND CAST(SPLIT_PART(a, '-', 1) AS INT) BETWEEN 1 AND 29) - ) - THEN 1 - ELSE 0 -END -ELSE 0 -END -SELECT date_check('2002-02-30 12:01:35'), - date_check('2002-02-21 12:01:35 121324'), - date_check('20100314224518304596'), - date_check('20100230'), - date_check('201002301234'), - date_check('2010-03-30'), date_check('2010-MAR-30'), - date_check('05-21-22'), date_check('10/23/2023'), - date_check('10-SEP-2024'); \ No newline at end of file diff --git a/testgen/template/flavors/trino/setup_profiling_tools/create_qc_schema_trino.sql b/testgen/template/flavors/trino/setup_profiling_tools/create_qc_schema_trino.sql deleted file mode 100644 index 4cd79fe..0000000 --- a/testgen/template/flavors/trino/setup_profiling_tools/create_qc_schema_trino.sql +++ /dev/null @@ -1 +0,0 @@ -CREATE SCHEMA IF NOT exists {DATA_QC_SCHEMA}; diff --git a/testgen/template/get_entities/get_connection.sql b/testgen/template/get_entities/get_connection.sql index b24c7ba..30621ea 100644 --- a/testgen/template/get_entities/get_connection.sql +++ b/testgen/template/get_entities/get_connection.sql @@ -11,7 +11,6 @@ SELECT project_pw_encrypted, max_threads, max_query_chars, - project_qc_schema, url, connect_by_url, connect_by_key, diff --git a/testgen/template/parms/parms_profiling.sql b/testgen/template/parms/parms_profiling.sql index eabb737..80c93c4 100644 --- a/testgen/template/parms/parms_profiling.sql +++ b/testgen/template/parms/parms_profiling.sql @@ -23,7 +23,6 @@ SELECT cc.project_code, tg.profile_use_sampling, tg.profile_sample_percent, tg.profile_sample_min_count, - cc.project_qc_schema, tg.profile_do_pair_rules, tg.profile_pair_rule_pct, cc.max_threads diff --git a/testgen/template/parms/parms_test_execution.sql b/testgen/template/parms/parms_test_execution.sql index 204b49c..15aba61 100644 --- a/testgen/template/parms/parms_test_execution.sql +++ b/testgen/template/parms/parms_test_execution.sql @@ -1,13 +1,13 @@ SELECT ts.project_code, ts.connection_id::VARCHAR, ts.id::VARCHAR as test_suite_id, + ts.table_groups_id::VARCHAR, tg.table_group_schema, cc.sql_flavor, cc.project_host, cc.project_port, cc.project_user, cc.project_db, - cc.project_qc_schema, cc.connect_by_key, cc.private_key, cc.private_key_passphrase, diff --git 
a/testgen/template/profiling/functional_datatype.sql b/testgen/template/profiling/functional_datatype.sql index a74cfb4..af64286 100644 --- a/testgen/template/profiling/functional_datatype.sql +++ b/testgen/template/profiling/functional_datatype.sql @@ -232,6 +232,7 @@ WHERE profile_run_id = '{PROFILE_RUN_ID}' UPDATE profile_results SET functional_data_type = CASE WHEN (std_pattern_match = 'ZIP_USA' AND (column_name ILIKE '%zip%' OR column_name ILIKE '%postal%')) + OR (lower(column_name) IN ('zip_code', 'zip')) THEN 'Zip' WHEN std_pattern_match = 'EMAIL' THEN 'Email' @@ -459,15 +460,6 @@ UPDATE profile_results AND p.distinct_value_ct BETWEEN 15 AND 40000 ) c WHERE profile_results.id = c.id; --- 7. Assign 'ID-Unique' functional data type to the columns that are identity columns - -UPDATE profile_results -SET functional_data_type = 'ID-Unique' -WHERE profile_run_id = '{PROFILE_RUN_ID}' - AND functional_data_type IN ('ID', 'ID-Secondary') - AND record_ct = distinct_value_ct - AND record_ct > 50; - -- Update alpha ID's to ID-Secondary and ID-Grouping UPDATE profile_results @@ -481,7 +473,16 @@ SET functional_data_type = CASE WHERE profile_run_id = '{PROFILE_RUN_ID}' AND functional_data_type = 'ID'; --- 8. Assign 'ID-FK' functional data type to the columns that are foreign keys of the identity columns identified in the previous step +-- Assign 'ID-Unique' functional data type to the columns that are identity columns + +UPDATE profile_results +SET functional_data_type = 'ID-Unique' +WHERE profile_run_id = '{PROFILE_RUN_ID}' + AND functional_data_type IN ('ID', 'ID-Secondary') + AND record_ct = distinct_value_ct + AND record_ct > 50; + +-- Assign 'ID-FK' functional data type to the columns that are foreign keys of the identity columns identified in the previous step UPDATE profile_results SET functional_data_type = 'ID-FK' @@ -495,9 +496,7 @@ WHERE profile_results.profile_run_id = '{PROFILE_RUN_ID}' and profile_results.table_name <> ui.table_name and profile_results.functional_data_type <> 'ID-Unique'; --- Assign - --- 9.
Functional Data Type: 'Measurement Pct' +-- Functional Data Type: 'Measurement Pct' UPDATE profile_results SET functional_data_type = 'Measurement Pct' diff --git a/testgen/template/profiling/profile_anomalies_screen_column.sql b/testgen/template/profiling/profile_anomalies_screen_column.sql index e0d9e34..cb9c4c1 100644 --- a/testgen/template/profiling/profile_anomalies_screen_column.sql +++ b/testgen/template/profiling/profile_anomalies_screen_column.sql @@ -19,4 +19,4 @@ LEFT JOIN v_inactive_anomalies i AND '{ANOMALY_ID}' = i.anomaly_id) WHERE p.profile_run_id = '{PROFILE_RUN_ID}'::UUID AND i.anomaly_id IS NULL - AND {ANOMALY_CRITERIA}; + AND ({ANOMALY_CRITERIA}); diff --git a/testgen/template/profiling/profile_anomalies_screen_multi_column.sql b/testgen/template/profiling/profile_anomalies_screen_multi_column.sql index 7a61561..6451eaf 100644 --- a/testgen/template/profiling/profile_anomalies_screen_multi_column.sql +++ b/testgen/template/profiling/profile_anomalies_screen_multi_column.sql @@ -44,7 +44,7 @@ WITH mults AS ( SELECT p.project_code, AND '{ANOMALY_ID}' = i.anomaly_id) WHERE p.profile_run_id = '{PROFILE_RUN_ID}'::UUID AND i.anomaly_id IS NULL - AND {ANOMALY_CRITERIA} + AND ({ANOMALY_CRITERIA}) ) INSERT INTO profile_anomaly_results (project_code, table_groups_id, profile_run_id, anomaly_id, diff --git a/testgen/template/profiling/profile_anomalies_screen_variants.sql b/testgen/template/profiling/profile_anomalies_screen_variants.sql index cec9bdb..266e73e 100644 --- a/testgen/template/profiling/profile_anomalies_screen_variants.sql +++ b/testgen/template/profiling/profile_anomalies_screen_variants.sql @@ -22,7 +22,7 @@ WITH all_matches AND p.column_name = i.column_name AND '{ANOMALY_ID}' = i.anomaly_id) WHERE p.profile_run_id = '{PROFILE_RUN_ID}'::UUID - AND {ANOMALY_CRITERIA} + AND ({ANOMALY_CRITERIA}) AND p.top_freq_values > '' AND i.anomaly_id IS NULL AND fn_count_intersecting_items(LOWER(fn_extract_top_values(p.top_freq_values)), v.check_values, '|') > 1 diff --git a/testgen/template/profiling/profile_anomaly_scoring.sql b/testgen/template/profiling/profile_anomaly_scoring.sql new file mode 100644 index 0000000..9511c12 --- /dev/null +++ b/testgen/template/profiling/profile_anomaly_scoring.sql @@ -0,0 +1,10 @@ +UPDATE profile_anomaly_results r + SET dq_prevalence = ({PREV_FORMULA}) * {RISK} + FROM profile_anomaly_results r2 +INNER JOIN profile_results p + ON (r2.profile_run_id = p.profile_run_id + AND r2.table_name = p.table_name + AND r2.column_name = p.column_name) + WHERE r.profile_run_id = '{PROFILE_RUN_ID}'::UUID + AND r2.anomaly_id = '{ANOMALY_ID}' + AND r.id = r2.id; \ No newline at end of file diff --git a/testgen/template/profiling/profile_anomaly_scoring_rollup.sql b/testgen/template/profiling/profile_anomaly_scoring_rollup.sql new file mode 100644 index 0000000..9c7047b --- /dev/null +++ b/testgen/template/profiling/profile_anomaly_scoring_rollup.sql @@ -0,0 +1,109 @@ +-- Roll up scoring to profiling run +WITH score_detail + AS (SELECT pr.profile_run_id, pr.table_name, pr.column_name, + MAX(pr.record_ct) as row_ct, + SUM(COALESCE(p.dq_prevalence * pr.record_ct, 0)) as affected_data_points + FROM profile_results pr + INNER JOIN profiling_runs r + ON (pr.profile_run_id = r.id) + LEFT JOIN profile_anomaly_results p + ON (pr.profile_run_id = p.profile_run_id + AND pr.column_name = p.column_name + AND pr.table_name = p.table_name) + WHERE pr.profile_run_id = '{PROFILE_RUN_ID}' + AND COALESCE(p.disposition, 'Confirmed') = 'Confirmed' + GROUP BY 1, 2, 3 ), +score_calc 
+ AS ( SELECT profile_run_id, + SUM(affected_data_points) as sum_affected_data_points, + SUM(row_ct) as sum_data_points + FROM score_detail + GROUP BY profile_run_id ) +UPDATE profiling_runs + SET dq_affected_data_points = sum_affected_data_points, + dq_total_data_points = sum_data_points, + dq_score_profiling = 100.0 - sum_affected_data_points / sum_data_points + FROM score_calc + WHERE profiling_runs.id = score_calc.profile_run_id; + + +-- Roll up latest scores to Table Group +WITH last_profile_date + AS (SELECT table_groups_id, MAX(profiling_starttime) as last_profile_run_date + FROM profiling_runs + WHERE status = 'Complete' + GROUP BY table_groups_id), +score_calc + AS (SELECT run.table_groups_id, run.id as profile_run_id, + run.dq_affected_data_points as sum_affected_data_points, + run.dq_total_data_points as sum_data_points + FROM profiling_runs run + INNER JOIN last_profile_date lp + ON (run.table_groups_id = lp.table_groups_id + AND run.profiling_starttime = lp.last_profile_run_date) + WHERE run.table_groups_id = '{TABLE_GROUPS_ID}' ) +UPDATE table_groups + SET dq_score_profiling = 100.0 - s.sum_affected_data_points::FLOAT / s.sum_data_points::FLOAT, + last_complete_profile_run_id = s.profile_run_id + FROM score_calc s + WHERE table_groups.id = s.table_groups_id; + +-- Roll up latest scores to data_column_chars +WITH score_detail + AS (SELECT dcc.column_id, tg.last_complete_profile_run_id, + MAX(pr.record_ct) as row_ct, + SUM(COALESCE(p.dq_prevalence * pr.record_ct, 0)) as affected_data_points + FROM table_groups tg + INNER JOIN profiling_runs r + ON (tg.last_complete_profile_run_id = r.id) + INNER JOIN profile_results pr + ON (r.id = pr.profile_run_id) + INNER JOIN data_column_chars dcc + ON (pr.table_groups_id = dcc.table_groups_id + AND pr.table_name = dcc.table_name + AND pr.column_name = dcc.column_name) + LEFT JOIN profile_anomaly_results p + ON (pr.profile_run_id = p.profile_run_id + AND pr.column_name = p.column_name + AND pr.table_name = p.table_name) + WHERE tg.id = '{TABLE_GROUPS_ID}' + AND COALESCE(p.disposition, 'Confirmed') = 'Confirmed' + GROUP BY dcc.column_id, tg.last_complete_profile_run_id ) +UPDATE data_column_chars + SET dq_score_profiling = 100.0 - s.affected_data_points / s.row_ct, + last_complete_profile_run_id = s.last_complete_profile_run_id + FROM score_detail s + WHERE data_column_chars.column_id = s.column_id; + +-- Roll up latest scores to data_table_chars +WITH score_detail + AS (SELECT dcc.column_id, dcc.table_id, tg.last_complete_profile_run_id, + MAX(pr.record_ct) as row_ct, + SUM(COALESCE(p.dq_prevalence * pr.record_ct, 0)) as affected_data_points + FROM table_groups tg + INNER JOIN profiling_runs r + ON (tg.last_complete_profile_run_id = r.id) + INNER JOIN profile_results pr + ON (r.id = pr.profile_run_id) + INNER JOIN data_column_chars dcc + ON (pr.table_groups_id = dcc.table_groups_id + AND pr.table_name = dcc.table_name + AND pr.column_name = dcc.column_name) + LEFT JOIN profile_anomaly_results p + ON (pr.profile_run_id = p.profile_run_id + AND pr.column_name = p.column_name + AND pr.table_name = p.table_name) + WHERE tg.id = '{TABLE_GROUPS_ID}' + AND COALESCE(p.disposition, 'Confirmed') = 'Confirmed' + GROUP BY dcc.column_id, dcc.table_id, tg.last_complete_profile_run_id ), +score_calc + AS ( SELECT table_id, last_complete_profile_run_id, + SUM(affected_data_points) as sum_affected_data_points, + SUM(row_ct) as sum_data_points + FROM score_detail + GROUP BY table_id, last_complete_profile_run_id ) +UPDATE data_table_chars + SET 
dq_score_profiling = 100.0 - s.sum_affected_data_points / s.sum_data_points, + last_complete_profile_run_id = s.last_complete_profile_run_id + FROM score_calc s + WHERE data_table_chars.table_id = s.table_id; diff --git a/testgen/template/profiling/profile_anomaly_types_get.sql b/testgen/template/profiling/profile_anomaly_types_get.sql index f1cd576..c1f3950 100644 --- a/testgen/template/profiling/profile_anomaly_types_get.sql +++ b/testgen/template/profiling/profile_anomaly_types_get.sql @@ -1,3 +1,3 @@ -SELECT id, anomaly_type, data_object, anomaly_criteria, detail_expression +SELECT id, anomaly_type, data_object, anomaly_criteria, detail_expression, dq_score_prevalence_formula, dq_score_risk_factor FROM profile_anomaly_types t ORDER BY id; diff --git a/testgen/template/validate_tests/ex_get_test_column_list_tg.sql b/testgen/template/validate_tests/ex_get_test_column_list_tg.sql index df7bdde..b0953b1 100644 --- a/testgen/template/validate_tests/ex_get_test_column_list_tg.sql +++ b/testgen/template/validate_tests/ex_get_test_column_list_tg.sql @@ -1,6 +1,19 @@ SELECT schema_name || '.' || table_name || '.' || column_name AS columns, ARRAY_AGG(cat_test_id) as test_id_array - FROM (SELECT cat_test_id, + FROM ( + -- FROM: column_name - column scope (single column) + SELECT cat_test_id, + schema_name AS schema_name, + table_name AS table_name, + column_name + FROM test_definitions d + INNER JOIN test_types t + ON d.test_type = t.test_type + WHERE test_suite_id = '{TEST_SUITE_ID}' + AND t.test_scope = 'column' + UNION + -- FROM: column_name - referential scope (could be multiple columns) + SELECT cat_test_id, schema_name AS schema_name, table_name AS table_name, TRIM(UNNEST(STRING_TO_ARRAY(column_name, ','))) as column_name @@ -8,8 +21,9 @@ INNER JOIN test_types t ON d.test_type = t.test_type WHERE test_suite_id = '{TEST_SUITE_ID}' - AND t.test_scope IN ('column', 'referential') + AND t.test_scope = 'referential' UNION + -- FROM: groupby_names (should be referential) SELECT cat_test_id, schema_name AS schema_name, table_name AS table_name, @@ -20,6 +34,7 @@ WHERE test_suite_id = '{TEST_SUITE_ID}' AND t.test_scope IN ('column', 'referential') UNION + -- FROM: window_date_column (referential) SELECT cat_test_id, schema_name AS schema_name, table_name AS table_name, @@ -28,8 +43,9 @@ INNER JOIN test_types t ON d.test_type = t.test_type WHERE test_suite_id = '{TEST_SUITE_ID}' - AND t.test_scope IN ('column', 'referential') + AND t.test_scope = 'referential' UNION + -- FROM: match_column_names (referential) SELECT cat_test_id, match_schema_name AS schema_name, match_table_name AS table_name, @@ -40,6 +56,7 @@ WHERE test_suite_id = '{TEST_SUITE_ID}' AND t.test_scope = 'referential' UNION + -- FROM: match_groupby_names (referential) SELECT cat_test_id, match_schema_name AS schema_name, match_table_name AS table_name, @@ -49,5 +66,5 @@ ON d.test_type = t.test_type WHERE test_suite_id = '{TEST_SUITE_ID}' AND t.test_scope = 'referential' ) cols - WHERE column_name SIMILAR TO '[A-Za-z0-9_]+' +-- WHERE column_name SIMILAR TO '[A-Za-z0-9_]+' GROUP BY columns; diff --git a/testgen/template/validate_tests/ex_write_test_val_errors.sql b/testgen/template/validate_tests/ex_write_test_val_errors.sql index b1d47d3..639cc3e 100644 --- a/testgen/template/validate_tests/ex_write_test_val_errors.sql +++ b/testgen/template/validate_tests/ex_write_test_val_errors.sql @@ -9,6 +9,7 @@ INSERT INTO test_results test_run_id, input_parameters, result_code, + result_status, result_message, result_measure ) SELECT 
'{TEST_SUITE_ID}'::UUID, @@ -20,7 +21,8 @@ INSERT INTO test_results '{RUN_DATE}' as test_time, '{TEST_RUN_ID}' as test_run_id, NULL as input_parameters, - 0 as result_code, + NULL as result_code, + 'Error' as result_status, test_definition_status AS result_message, NULL as result_measure FROM test_definitions diff --git a/testgen/ui/assets.py b/testgen/ui/assets.py new file mode 100644 index 0000000..9ea10f1 --- /dev/null +++ b/testgen/ui/assets.py @@ -0,0 +1,19 @@ +import pathlib + +from streamlit.elements.image import WidthBehaviour, image_to_url + + +def get_asset_path(path: str) -> str: + return (pathlib.Path(__file__).parent / "assets" / path).as_posix() + + +def get_asset_data_url(path: str) -> str: + absolute_path = get_asset_path(path) + return image_to_url( + absolute_path, + int(WidthBehaviour.ORIGINAL), + clamp=False, + channels="RGB", + output_format="auto", + image_id=path, + ) diff --git a/testgen/ui/assets/flavors/azure_sql.svg b/testgen/ui/assets/flavors/azure_sql.svg new file mode 100644 index 0000000..7329ae2 --- /dev/null +++ b/testgen/ui/assets/flavors/azure_sql.svg @@ -0,0 +1,135 @@ + + + + + + image/svg+xml + + Icon-databases-130 + + + + + + + + + + + + + + + + + + + + + Icon-databases-130 + + + + + + + + diff --git a/testgen/ui/assets/flavors/azure_synapse_table.svg b/testgen/ui/assets/flavors/azure_synapse_table.svg new file mode 100644 index 0000000..9d908fa --- /dev/null +++ b/testgen/ui/assets/flavors/azure_synapse_table.svg @@ -0,0 +1,145 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/testgen/ui/assets/flavors/mssql.svg b/testgen/ui/assets/flavors/mssql.svg new file mode 100644 index 0000000..c6333d9 --- /dev/null +++ b/testgen/ui/assets/flavors/mssql.svg @@ -0,0 +1,123 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/testgen/ui/assets/flavors/postgresql.svg b/testgen/ui/assets/flavors/postgresql.svg new file mode 100644 index 0000000..7db671a --- /dev/null +++ b/testgen/ui/assets/flavors/postgresql.svg @@ -0,0 +1,100 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + diff --git a/testgen/ui/assets/flavors/redshift.svg b/testgen/ui/assets/flavors/redshift.svg new file mode 100644 index 0000000..26bcc27 --- /dev/null +++ b/testgen/ui/assets/flavors/redshift.svg @@ -0,0 +1,75 @@ + + + + + + image/svg+xml + + Icon-Architecture/32/Arch_Amazon-Redshift_32 + + + + + + Icon-Architecture/32/Arch_Amazon-Redshift_32 + + + + + + + diff --git a/testgen/ui/assets/flavors/snowflake.svg b/testgen/ui/assets/flavors/snowflake.svg new file mode 100644 index 0000000..955c3d2 --- /dev/null +++ b/testgen/ui/assets/flavors/snowflake.svg @@ -0,0 +1,97 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/testgen/ui/assets/style.css b/testgen/ui/assets/style.css index 1a1d86f..c5beb62 100644 --- a/testgen/ui/assets/style.css +++ b/testgen/ui/assets/style.css @@ -16,6 +16,7 @@ body { --secondary-text-color: #0000008a; --disabled-text-color: #00000042; --caption-text-color: rgba(49, 51, 63, 0.6); /* Match Streamlit's caption color */ + --border-color: rgba(0, 0, 0, .12); --sidebar-background-color: white; --sidebar-item-hover-color: #f5f5f5; @@ -51,6 +52,7 @@ footer { /* Sidebar */ section[data-testid="stSidebar"] { + width: 250px; z-index: 999; background-color: var(--sidebar-background-color); } @@ -67,15 +69,18 @@ section[data-testid="stSidebar"] { } section.main > :nth-child(1 of div).block-container { - padding: 
24px; + padding: 12px 24px 24px; } div[data-testid="stVerticalBlock"] { gap: 0.5rem; } -div[data-testid="collapsedControl"] { +.appview-container:has(section[data-testid="stSidebar"]) div[data-testid="stSidebarCollapsedControl"] { top: 0.5rem; + border-radius: 4px; + background-color: var(--border-color); + padding: 3px 0 0 8px; } /* */ @@ -86,6 +91,14 @@ div[data-testid="stDialog"] div[role="dialog"] { } /* */ +div[data-testid="stSpinner"] { + background: transparent; +} + +div[data-testid="stSpinner"] > div > i { + border-color: var(--primary-color) rgba(49, 51, 63, 0.2) rgba(49, 51, 63, 0.2); +} + /* Theming for buttons, tabs and form inputs */ button[data-testid="stBaseButton-secondary"]:hover, button[data-testid="stBaseButton-secondary"]:focus:not(:active), @@ -166,22 +179,22 @@ button[title="Show password text"] { background-color: var(--dk-card-background); } -[data-testid="column"]:has(> div[data-testid="stVerticalBlockBorderWrapper"] > div > div[data-testid="stVerticalBlock"] > div.element-container > div.stHtml > i.flex-row) [data-testid="stVerticalBlock"] { +div[data-testid="stVerticalBlockBorderWrapper"]:has(> div > div[data-testid="stVerticalBlock"] > div.element-container > div.stHtml > i.flex-row) [data-testid="stVerticalBlock"] { width: 100%; flex-direction: row; } -[data-testid="column"]:has(> div[data-testid="stVerticalBlockBorderWrapper"] > div > div[data-testid="stVerticalBlock"] > div.element-container > div.stHtml > i.flex-row) [data-testid="stVerticalBlock"] > div[data-testid="element-container"], -[data-testid="column"]:has(> div[data-testid="stVerticalBlockBorderWrapper"] > div > div[data-testid="stVerticalBlock"] > div.element-container > div.stHtml > i.flex-row) [data-testid="stVerticalBlock"] > div[data-testid="element-container"] > div[data-testid] { +div[data-testid="stVerticalBlockBorderWrapper"]:has( > div > div[data-testid="stVerticalBlock"] > div.element-container > div.stHtml > i.flex-row) [data-testid="stVerticalBlock"] > div[data-testid="element-container"], +div[data-testid="stVerticalBlockBorderWrapper"]:has( > div > div[data-testid="stVerticalBlock"] > div.element-container > div.stHtml > i.flex-row) [data-testid="stVerticalBlock"] > div[data-testid="element-container"] > div[data-testid] { width: auto !important; max-height: 40px; } -[data-testid="column"]:has(> div[data-testid="stVerticalBlockBorderWrapper"] > div > div[data-testid="stVerticalBlock"] > div.element-container > div.stHtml > i.flex-start) [data-testid="stVerticalBlock"] { +div[data-testid="stVerticalBlockBorderWrapper"]:has( > div > div[data-testid="stVerticalBlock"] > div.element-container > div.stHtml > i.flex-start) [data-testid="stVerticalBlock"] { justify-content: flex-start; } -[data-testid="column"]:has(> div[data-testid="stVerticalBlockBorderWrapper"] > div > div[data-testid="stVerticalBlock"] > div.element-container > div.stHtml > i.flex-end) [data-testid="stVerticalBlock"] { +div[data-testid="stVerticalBlockBorderWrapper"]:has( > div > div[data-testid="stVerticalBlock"] > div.element-container > div.stHtml > i.flex-end) [data-testid="stVerticalBlock"] { justify-content: flex-end; } @@ -241,6 +254,40 @@ Use as testgen.text("text", "extra_styles") */ } /* */ +/* Page header */ +.tg-header { + margin: 0; + padding: 0; + font-weight: 500; + transition: padding 0.3s; +} + +[data-testid="stSidebarCollapsedControl"] ~ section.main .tg-header { + padding-left: 80px; +} + +.tg-header--line { + margin: 0; + border: none; + border-radius: 2px; + height: 2px; + background-color: 
var(--disabled-text-color); +} + +div[data-testid="stVerticalBlockBorderWrapper"]:has(> div > div[data-testid="stVerticalBlock"] > div.element-container > div.stHtml > i.tg-header--links) [data-testid="stLinkButton"] a { + border: none; + background: none; + padding: 6px; + min-height: 24px; + color: var(--primary-text-color); +} + +div[data-testid="stVerticalBlockBorderWrapper"]:has(> div > div[data-testid="stVerticalBlock"] > div.element-container > div.stHtml > i.tg-header--links) [data-testid="stLinkButton"] a p { + font-size: 20px; + line-height: 1; +} +/* */ + /* Summary bar component */ .tg-summary-bar--label { margin-bottom: 4px; @@ -262,8 +309,34 @@ Use as testgen.text("text", "extra_styles") */ .tg-summary-bar--caption { margin-top: 4px; + display: flex; + flex-flow: row wrap; + align-items: center; color: var(--caption-text-color); + font-size: 13px; font-style: italic; + line-height: 1; +} + +.tg-summary-bar--legend { + display: flex; + flex-flow: row nowrap; + align-items: center; + width: auto; +} + +.tg-summary-bar--legend:not(:last-child) { + margin-right: 8px; +} + +.tg-summary-bar--legend-dot { + margin-right: 2px; + font-size: 4px; + font-style: normal; +} + +.tg-summary-bar--legend-dot::before { + content: 'โฌค'; } /* */ @@ -274,6 +347,7 @@ Use as testgen.text("text", "extra_styles") */ --secondary-text-color: rgba(255, 255, 255, .7); --disabled-text-color: rgba(255, 255, 255, .5); --caption-text-color: rgba(250, 250, 250, .6); /* Match Streamlit's caption color */ + --border-color: rgba(255, 255, 255, .25); --sidebar-background-color: #14181f; --sidebar-item-hover-color: #10141b; diff --git a/testgen/ui/bootstrap.py b/testgen/ui/bootstrap.py index 05b943f..3abacce 100644 --- a/testgen/ui/bootstrap.py +++ b/testgen/ui/bootstrap.py @@ -11,11 +11,12 @@ from testgen.ui.navigation.router import Router from testgen.ui.session import session from testgen.ui.views.connections import ConnectionsPage +from testgen.ui.views.data_hierarchy import DataHierarchyPage +from testgen.ui.views.hygiene_issues import HygieneIssuesPage from testgen.ui.views.login import LoginPage from testgen.ui.views.overview import OverviewPage -from testgen.ui.views.profiling_anomalies import ProfilingAnomaliesPage from testgen.ui.views.profiling_results import ProfilingResultsPage -from testgen.ui.views.profiling_summary import DataProfilingPage +from testgen.ui.views.profiling_runs import DataProfilingPage from testgen.ui.views.project_settings import ProjectSettingsPage from testgen.ui.views.table_groups import TableGroupsPage from testgen.ui.views.test_definitions import TestDefinitionsPage @@ -27,9 +28,10 @@ BUILTIN_PAGES: list[type[Page]] = [ LoginPage, OverviewPage, + DataHierarchyPage, DataProfilingPage, ProfilingResultsPage, - ProfilingAnomaliesPage, + HygieneIssuesPage, TestRunsPage, TestResultsPage, ConnectionsPage, diff --git a/testgen/ui/components/frontend/css/shared.css b/testgen/ui/components/frontend/css/shared.css index fcf0fa1..460fc3a 100644 --- a/testgen/ui/components/frontend/css/shared.css +++ b/testgen/ui/components/frontend/css/shared.css @@ -20,11 +20,17 @@ body { --blue: #42A5F5; --brown: #8D6E63; --grey: #BDBDBD; + --empty: #EEEEEE; + --empty-light: #FAFAFA; --primary-text-color: #000000de; --secondary-text-color: #0000008a; --disabled-text-color: #00000042; --caption-text-color: rgba(49, 51, 63, 0.6); /* Match Streamlit's caption color */ + --form-field-color: rgb(240, 242, 246); /* Match Streamlit's form field color */ + --border-color: rgba(0, 0, 0, .12); + --tooltip-color: 
#333d; + --dk-card-background: #fff; --sidebar-background-color: white; --sidebar-item-hover-color: #f5f5f5; @@ -33,28 +39,42 @@ body { --field-underline-color: #9e9e9e; - --button-text-color: var(--primary-text-color); - - --button-hover-state-background: var(--primary-color); --button-hover-state-opacity: 0.12; - --button-basic-text-color: var(--primary-color); --button-basic-background: transparent; + --button-basic-text-color: rgba(0, 0, 0, .87); + --button-basic-hover-state-background: rgba(0, 0, 0, .54); + + --button-basic-flat-text-color: rgba(0, 0, 0); + --button-basic-flat-background: rgba(0, 0, 0, .87); + + --button-basic-stroked-text-color: rgba(0, 0, 0, .87); + --button-basic-stroked-background: transparent; + + --button-primary-background: transparent; + --button-primary-text-color: var(--primary-color); + --button-primary-hover-state-background: var(--primary-color); - --button-flat-text-color: rgba(255, 255, 255); - --button-flat-background: rgba(0, 0, 0, .54); + --button-primary-flat-text-color: rgba(255, 255, 255); + --button-primary-flat-background: var(--primary-color); - --button-stroked-text-color: var(--primary-color); - --button-stroked-background: transparent; - --button-stroked-border: 1px solid rgba(0, 0, 0, .12); + --button-primary-stroked-text-color: var(--primary-color); + --button-primary-stroked-background: transparent; + --button-stroked-border: 1px solid var(--border-color); } @media (prefers-color-scheme: dark) { body { + --empty: #424242; + --empty-light: #212121; + --primary-text-color: rgba(255, 255, 255); --secondary-text-color: rgba(255, 255, 255, .7); --disabled-text-color: rgba(255, 255, 255, .5); --caption-text-color: rgba(250, 250, 250, .6); /* Match Streamlit's caption color */ + --form-field-color: rgb(38, 39, 48); /* Match Streamlit's form field color */ + --border-color: rgba(255, 255, 255, .25); + --dk-card-background: #14181f; --sidebar-background-color: #14181f; --sidebar-item-hover-color: #10141b; @@ -62,10 +82,445 @@ body { --sidebar-active-item-border-color: #b4e3c9; --dk-text-value-background: unset; - --button-text-color: var(--primary-text-color); - - --button-flat-background: rgba(255, 255, 255, .54); + --button-basic-background: transparent; + --button-basic-text-color: rgba(255, 255, 255); + --button-basic-hover-state-background: rgba(255, 255, 255, .54); + + --button-basic-flat-text-color: rgba(255, 255, 255); + --button-basic-flat-background: rgba(255, 255, 255, .54); + + --button-basic-stroked-text-color: rgba(255, 255, 255, .87); + --button-basic-stroked-background: transparent; - --button-stroked-border: 1px solid rgba(255, 255, 255, .12); + --button-stroked-border: 1px solid var(--border-color); } } + +.clickable { + cursor: pointer; +} + +.hidden { + display: none !important; +} + +.invisible { + visibility: hidden !important; +} + +.dot { + font-size: 10px; + font-style: normal; +} + +.dot::before { + content: 'โฌค'; +} + +/* Table styles */ +.table { + background-color: var(--dk-card-background); + border: var(--button-stroked-border); + border-radius: 8px; + padding: 16px; +} + +.table-row { + padding: 12px 0; +} + +.table-row:not(:last-child) { + border-bottom: var(--button-stroked-border); +} + +.table-row:last-child { + padding-bottom: 0; +} + +.table-header { + border-bottom: var(--button-stroked-border); + padding: 0 0 8px 0; + font-size: 12px; + color: var(--caption-text-color); + text-transform: uppercase; +} +/* */ + +/* Text utilities */ +.text-primary { + color: var(--primary-text-color); +} + +.text-secondary { 
+ color: var(--secondary-text-color); +} + +.text-disabled { + color: var(--disabled-text-color); +} + +.text-caption { + font-size: 12px; + color: var(--caption-text-color); +} + +.text-error { + color: var(--error-color); +} + +.text-green { + color: var(--primary-color); +} + +.text-capitalize { + text-transform: capitalize; +} +/* */ + +/* Flex utilities */ +.flex-row { + display: flex; + flex-direction: row; + align-items: center; +} + +.flex-column { + display: flex; + flex-direction: column; +} + +.fx-flex { + flex: 1 1 0%; +} + +.fx-flex-wrap { + flex-wrap: wrap; +} + +.fx-align-flex-center { + align-items: center; +} + +.fx-align-flex-start { + align-items: flex-start; +} + +.fx-align-flex-end { + align-items: flex-end; +} + +.fx-align-baseline { + align-items: baseline; +} + +.fx-justify-flex-end { + justify-items: flex-end; +} + +.fx-justify-content-flex-end { + justify-content: flex-end; +} + +.fx-justify-flex-start { + justify-content: flex-start; +} + +.fx-justify-center { + justify-content: center; +} + +.fx-justify-space-between { + justify-content: space-between; +} + +.fx-flex-align-content { + align-content: flex-start; +} + +.fx-gap-1 { + gap: 4px; +} + +.fx-gap-2 { + gap: 8px; +} + +.fx-gap-3 { + gap: 12px; +} + +.fx-gap-4 { + gap: 16px; +} + +.fx-gap-5 { + gap: 24px; +} + +.fx-gap-6 { + gap: 32px; +} + +.fx-gap-7 { + gap: 40px; +} + +/* */ + +/* Whitespace utilities */ +.mt-0 { + margin-top: 0; +} + +.mt-1 { + margin-top: 4px; +} + +.mt-2 { + margin-top: 8px; +} + +.mt-3 { + margin-top: 12px; +} + +.mt-4 { + margin-top: 16px; +} + +.mt-5 { + margin-top: 24px; +} + +.mt-6 { + margin-top: 32px; +} + +.mt-7 { + margin-top: 40px; +} + +.mr-0 { + margin-right: 0; +} + +.mr-1 { + margin-right: 4px; +} + +.mr-2 { + margin-right: 8px; +} + +.mr-3 { + margin-right: 12px; +} + +.mr-4 { + margin-right: 16px; +} + +.mr-5 { + margin-right: 24px; +} + +.mr-6 { + margin-right: 32px; +} + +.mr-7 { + margin-right: 40px; +} + +.mb-0 { + margin-bottom: 0; +} + +.mb-1 { + margin-bottom: 4px; +} + +.mb-2 { + margin-bottom: 8px; +} + +.mb-3 { + margin-bottom: 12px; +} + +.mb-4 { + margin-bottom: 16px; +} + +.mb-5 { + margin-bottom: 24px; +} + +.mb-6 { + margin-bottom: 32px; +} + +.mb-7 { + margin-bottom: 40px; +} + +.ml-0 { + margin-left: 0; +} + +.ml-1 { + margin-left: 4px; +} + +.ml-2 { + margin-left: 8px; +} + +.ml-3 { + margin-left: 12px; +} + +.ml-4 { + margin-left: 16px; +} + +.ml-5 { + margin-left: 24px; +} + +.ml-6 { + margin-left: 32px; +} + +.ml-7 { + margin-left: 40px; +} + +.pt-0 { + padding-top: 0; +} + +.pt-1 { + padding-top: 4px; +} + +.pt-2 { + padding-top: 8px; +} + +.pt-3 { + padding-top: 12px; +} + +.pt-4 { + padding-top: 16px; +} + +.pt-5 { + padding-top: 24px; +} + +.pt-6 { + padding-top: 32px; +} + +.pt-7 { + padding-top: 40px; +} + +.pr-0 { + padding-right: 0; +} + +.pr-1 { + padding-right: 4px; +} + +.pr-2 { + padding-right: 8px; +} + +.pr-3 { + padding-right: 12px; +} + +.pr-4 { + padding-right: 16px; +} + +.pr-5 { + padding-right: 24px; +} + +.pr-6 { + padding-right: 32px; +} + +.pr-7 { + padding-right: 40px; +} + +.pb-0 { + padding-bottom: 0; +} + +.pb-1 { + padding-bottom: 4px; +} + +.pb-2 { + padding-bottom: 8px; +} + +.pb-3 { + padding-bottom: 12px; +} + +.pb-4 { + padding-bottom: 16px; +} + +.pb-5 { + padding-bottom: 24px; +} + +.pb-6 { + padding-bottom: 32px; +} + +.pb-7 { + padding-bottom: 40px; +} + +.pl-0 { + padding-left: 0; +} + +.pl-1 { + padding-left: 4px; +} + +.pl-2 { + padding-left: 8px; +} + +.pl-3 { + padding-left: 12px; +} + +.pl-4 { + 
padding-left: 16px; +} + +.pl-5 { + padding-left: 24px; +} + +.pl-6 { + padding-left: 32px; +} + +.pl-7 { + padding-left: 40px; +} +/* */ diff --git a/testgen/ui/components/frontend/js/axis_utils.js b/testgen/ui/components/frontend/js/axis_utils.js new file mode 100644 index 0000000..6c7e835 --- /dev/null +++ b/testgen/ui/components/frontend/js/axis_utils.js @@ -0,0 +1,54 @@ +// https://stackoverflow.com/a/4955179 +function niceNumber(value, round = false) { + const exponent = Math.floor(Math.log10(value)); + const fraction = value / Math.pow(10, exponent); + let niceFraction; + + if (round) { + if (fraction < 1.5) { + niceFraction = 1; + } else if (fraction < 3) { + niceFraction = 2; + } else if (fraction < 7) { + niceFraction = 5; + } else { + niceFraction = 10; + } + } else { + if (fraction <= 1) { + niceFraction = 1; + } else if (fraction <= 2) { + niceFraction = 2; + } else if (fraction <= 5) { + niceFraction = 5; + } else { + niceFraction = 10; + } + } + + return niceFraction * Math.pow(10, exponent); +} + +function niceBounds(axisStart, axisEnd, tickCount = 4) { + let axisWidth = axisEnd - axisStart; + + if (axisWidth == 0) { + axisStart -= 0.5; + axisEnd += 0.5; + axisWidth = axisEnd - axisStart; + } + + const niceRange = niceNumber(axisWidth); + const niceTick = niceNumber(niceRange / (tickCount - 1), true); + axisStart = Math.floor(axisStart / niceTick) * niceTick; + axisEnd = Math.ceil(axisEnd / niceTick) * niceTick; + + return { + min: axisStart, + max: axisEnd, + step: niceTick, + range: axisEnd - axisStart, + }; +} + +export { niceBounds }; diff --git a/testgen/ui/components/frontend/js/components/attribute.js b/testgen/ui/components/frontend/js/components/attribute.js new file mode 100644 index 0000000..5ca702f --- /dev/null +++ b/testgen/ui/components/frontend/js/components/attribute.js @@ -0,0 +1,39 @@ +/** + * @typedef Properties + * @type {object} + * @property {string} label + * @property {string | number} value + * @property {number?} width + */ +import { getValue, loadStylesheet } from '../utils.js'; +import van from '../van.min.js'; + +const { div } = van.tags; + +const Attribute = (/** @type Properties */ props) => { + loadStylesheet('attribute', stylesheet); + + return div( + { style: () => `width: ${props.width ? getValue(props.width) + 'px' : 'auto'}` }, + div( + { class: 'text-caption text-capitalize mb-1' }, + props.label, + ), + div( + { class: 'attribute-value' }, + () => { + const value = getValue(props.value); + return (value || value === 0) ? 
value : '--'; + }, + ), + ); +}; + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.attribute-value { + word-wrap: break-word; +} +`); + +export { Attribute }; diff --git a/testgen/ui/components/frontend/js/components/box_plot.js b/testgen/ui/components/frontend/js/components/box_plot.js new file mode 100644 index 0000000..81447d3 --- /dev/null +++ b/testgen/ui/components/frontend/js/components/box_plot.js @@ -0,0 +1,290 @@ +/** + * @typedef Properties + * @type {object} + * @property {number} minimum + * @property {number} maximum + * @property {number} median + * @property {number} lowerQuartile + * @property {number} upperQuartile + * @property {number} average + * @property {number} standardDeviation + * @property {number?} width + */ +import van from '../van.min.js'; +import { getValue, loadStylesheet } from '../utils.js'; +import { colorMap } from '../display_utils.js'; +import { niceBounds } from '../axis_utils.js'; + +const { div } = van.tags; +const boxColor = colorMap.teal; +const lineColor = colorMap.limeGreen; + +const BoxPlot = (/** @type Properties */ props) => { + loadStylesheet('boxPlot', stylesheet); + + const { minimum, maximum, median, lowerQuartile, upperQuartile, average, standardDeviation, width } = props; + const axisTicks = van.derive(() => niceBounds(getValue(minimum), getValue(maximum))); + + return div( + { + class: 'flex-row fx-flex-wrap fx-gap-6', + style: () => `max-width: ${width ? getValue(width) + 'px' : '100%'};`, + }, + div( + { style: 'flex: 300px' }, + div( + { + class: 'tg-box-plot--line', + style: () => { + const { min, range } = axisTicks.val; + return `left: ${(getValue(average) - getValue(standardDeviation) - min) * 100 / range}%; + width: ${getValue(standardDeviation) * 2 * 100 / range}%;`; + }, + }, + div({ class: 'tg-box-plot--dot' }), + ), + div( + { + class: 'tg-box-plot--grid', + style: () => { + const { min, max, range } = axisTicks.val; + + return `grid-template-columns: + ${(getValue(minimum) - min) * 100 / range}% + ${(getValue(lowerQuartile) - getValue(minimum)) * 100 / range}% + ${(getValue(median) - getValue(lowerQuartile)) * 100 / range}% + ${(getValue(upperQuartile) - getValue(median)) * 100 / range}% + ${(getValue(maximum) - getValue(upperQuartile)) * 100 / range}% + ${(max - getValue(maximum)) * 100 / range}%;`; + }, + }, + div({ class: 'tg-box-plot--space-left' }), + div({ class: 'tg-box-plot--top-left' }), + div({ class: 'tg-box-plot--bottom-left' }), + div({ class: 'tg-box-plot--mid-left' }), + div({ class: 'tg-box-plot--mid-right' }), + div({ class: 'tg-box-plot--top-right' }), + div({ class: 'tg-box-plot--bottom-right' }), + div({ class: 'tg-box-plot--space-right' }), + ), + () => { + const { min, max, step, range } = axisTicks.val; + const ticks = []; + let currentTick = min; + while (currentTick <= max) { + ticks.push(currentTick); + currentTick += step; + } + + return div( + { class: 'tg-box-plot--axis' }, + ticks.map(position => div( + { + class: 'tg-box-plot--axis-tick', + style: `left: ${(position - min) * 100 / range}%;` + }, + position, + )), + ); + }, + ), + div( + { class: 'flex-column fx-gap-2 text-caption', style: 'flex: 150px;' }, + div( + { class: 'flex-row fx-gap-2' }, + div({ class: 'tg-blox-plot--legend-line' }), + 'Average---Standard Deviation', + ), + div( + { class: 'flex-row fx-gap-2' }, + div({ class: 'tg-blox-plot--legend-whisker' }), + 'Minimum---Maximum', + ), + div( + { class: 'flex-row fx-gap-2' }, + div({ class: 'tg-blox-plot--legend-box' }), + '25th---Median---75th', + ), + ), + ); 
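The axis labels for this box plot come from niceBounds in axis_utils.js, added earlier in this diff: the column's minimum and maximum are widened to a 1-2-5 × 10^n grid so that the percentage offsets computed above land on round tick values. A small Python rendering of the same arithmetic, for illustration only (the shipped implementation is the JavaScript above; the worked example at the end is mine):

import math

def nice_number(value: float, round_result: bool = False) -> float:
    # Snap a value to 1, 2, 5 or 10 times a power of ten.
    exponent = math.floor(math.log10(value))
    fraction = value / 10 ** exponent
    if round_result:
        nice_fraction = 1 if fraction < 1.5 else 2 if fraction < 3 else 5 if fraction < 7 else 10
    else:
        nice_fraction = 1 if fraction <= 1 else 2 if fraction <= 2 else 5 if fraction <= 5 else 10
    return nice_fraction * 10 ** exponent

def nice_bounds(axis_start: float, axis_end: float, tick_count: int = 4) -> dict:
    # Widen [axis_start, axis_end] to a grid with a "nice" tick step.
    axis_width = axis_end - axis_start
    if axis_width == 0:
        axis_start -= 0.5
        axis_end += 0.5
        axis_width = axis_end - axis_start
    nice_range = nice_number(axis_width)
    nice_tick = nice_number(nice_range / (tick_count - 1), round_result=True)
    axis_start = math.floor(axis_start / nice_tick) * nice_tick
    axis_end = math.ceil(axis_end / nice_tick) * nice_tick
    return {"min": axis_start, "max": axis_end, "step": nice_tick, "range": axis_end - axis_start}

# A column ranging from 3 to 97 gets an axis from 0 to 100 with ticks every 50:
# nice_bounds(3, 97) -> {"min": 0, "max": 100, "step": 50, "range": 100}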
+}; + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.tg-box-plot--line { + position: relative; + margin: 8px 0 24px 0; + border-top: 2px dotted ${lineColor}; +} + +.tg-box-plot--dot { + position: absolute; + top: -1px; + left: 50%; + transform: translateX(-50%) translateY(-50%); + width: 10px; + height: 10px; + border-radius: 5px; + background-color: ${lineColor}; +} + +.tg-box-plot--grid { + height: 24px; + display: grid; + grid-template-rows: 50% 50%; +} + +.tg-box-plot--grid div { + border-color: var(--caption-text-color); + border-style: solid; +} + +.tg-box-plot--space-left { + grid-column-start: 1; + grid-column-end: 2; + grid-row-start: 1; + grid-row-end: 3; + border: 0; +} + +.tg-box-plot--top-left { + grid-column-start: 2; + grid-column-end: 3; + grid-row-start: 1; + grid-row-end: 2; + border-width: 0 0 1px 2px; +} + +.tg-box-plot--bottom-left { + grid-column-start: 2; + grid-column-end: 3; + grid-row-start: 2; + grid-row-end: 3; + border-width: 1px 0 0 2px; +} + +.tg-box-plot--mid-left { + grid-column-start: 3; + grid-column-end: 4; + grid-row-start: 1; + grid-row-end: 3; + border-width: 1px 2px 1px 1px; + border-radius: 4px 0 0 4px; + background-color: ${boxColor}; +} + +.tg-box-plot--mid-right { + grid-column-start: 4; + grid-column-end: 5; + grid-row-start: 1; + grid-row-end: 3; + border-width: 1px 1px 1px 2px; + border-radius: 0 4px 4px 0; + background-color: ${boxColor}; +} + +.tg-box-plot--top-right { + grid-column-start: 5; + grid-column-end: 6; + grid-row-start: 1; + grid-row-end: 2; + border-width: 0 2px 1px 0; +} + +.tg-box-plot--bottom-right { + grid-column-start: 5; + grid-column-end: 6; + grid-row-start: 2; + grid-row-end: 3; + border-width: 1px 2px 0 0; +} + +.tg-box-plot--space-right { + grid-column-start: 6; + grid-column-end: 7; + grid-row-start: 1; + grid-row-end: 3; + border: 0; +} + +.tg-box-plot--axis { + position: relative; + margin: 24px 0; + width: 100%; + height: 2px; + background-color: var(--disabled-text-color); + color: var(--caption-text-color); +} + +.tg-box-plot--axis-tick { + position: absolute; + top: 8px; + transform: translateX(-50%); +} + +.tg-box-plot--axis-tick::before { + position: absolute; + top: -9px; + left: 50%; + transform: translateX(-50%); + width: 4px; + height: 4px; + border-radius: 2px; + background-color: var(--disabled-text-color); + content: ''; +} + +.tg-blox-plot--legend-line { + width: 26px; + border: 1px dotted ${lineColor}; + position: relative; +} + +.tg-blox-plot--legend-line::after { + position: absolute; + left: 50%; + transform: translateX(-50%) translateY(-50%); + width: 6px; + height: 6px; + border-radius: 6px; + background-color: ${lineColor}; + content: ''; +} + +.tg-blox-plot--legend-whisker { + width: 24px; + height: 12px; + border: solid var(--caption-text-color); + border-width: 0 2px 0 2px; + position: relative; +} + +.tg-blox-plot--legend-whisker::after { + position: absolute; + top: 5px; + width: 24px; + height: 2px; + background-color: var(--caption-text-color); + content: ''; +} + +.tg-blox-plot--legend-box { + width: 26px; + height: 12px; + border: 1px solid var(--caption-text-color); + border-radius: 4px; + background-color: ${boxColor}; + position: relative; +} + +.tg-blox-plot--legend-box::after { + position: absolute; + left: 12px; + width: 2px; + height: 12px; + background-color: var(--caption-text-color); + content: ''; +} +`); + +export { BoxPlot }; diff --git a/testgen/ui/components/frontend/js/components/breadcrumbs.js 
b/testgen/ui/components/frontend/js/components/breadcrumbs.js index d6976c8..52a18a9 100644 --- a/testgen/ui/components/frontend/js/components/breadcrumbs.js +++ b/testgen/ui/components/frontend/js/components/breadcrumbs.js @@ -11,27 +11,35 @@ */ import van from '../van.min.js'; import { Streamlit } from '../streamlit.js'; +import { emitEvent, getValue, loadStylesheet } from '../utils.js'; const { a, div, span } = van.tags; const Breadcrumbs = (/** @type Properties */ props) => { - Streamlit.setFrameHeight(24); + loadStylesheet('breadcrumbs', stylesheet); - if (!window.testgen.loadedStylesheets.breadcrumbs) { - document.adoptedStyleSheets.push(stylesheet); - window.testgen.loadedStylesheets.breadcrumbs = true; + if (!window.testgen.isPage) { + Streamlit.setFrameHeight(24); } return div( {class: 'tg-breadcrumbs-wrapper'}, () => { - const breadcrumbs = van.val(props.breadcrumbs); + const breadcrumbs = getValue(props.breadcrumbs) || []; return div( { class: 'tg-breadcrumbs' }, breadcrumbs.reduce((items, b, idx) => { const isLastItem = idx === breadcrumbs.length - 1; - items.push(a({ class: `tg-breadcrumbs--${ isLastItem ? 'current' : 'active'}`, href: `#/${b.path}`, onclick: () => navigate(b.path, b.params) }, b.label)) + items.push(a({ + class: `tg-breadcrumbs--${ isLastItem ? 'current' : 'active'}`, + onclick: (event) => { + event.preventDefault(); + event.stopPropagation(); + emitEvent('LinkClicked', { href: b.path, params: b.params }); + }}, + b.label, + )); if (!isLastItem) { items.push(span({class: 'tg-breadcrumbs--arrow'}, '>')); } @@ -42,11 +50,6 @@ const Breadcrumbs = (/** @type Properties */ props) => { ) }; -function navigate(/** @type string */ path, /** @type object */ params) { - Streamlit.sendData({ path, params }); - return false; -} - const stylesheet = new CSSStyleSheet(); stylesheet.replace(` .tg-breadcrumbs-wrapper { diff --git a/testgen/ui/components/frontend/js/components/button.js b/testgen/ui/components/frontend/js/components/button.js index a5ce8e8..858a588 100644 --- a/testgen/ui/components/frontend/js/components/button.js +++ b/testgen/ui/components/frontend/js/components/button.js @@ -2,16 +2,19 @@ * @typedef Properties * @type {object} * @property {(string)} type + * @property {(string|null)} color * @property {(string|null)} label * @property {(string|null)} icon * @property {(string|null)} tooltip * @property {(string|null)} tooltipPosition * @property {(Function|null)} onclick + * @property {(bool)} disabled * @property {string?} style */ -import { enforceElementWidth } from '../utils.js'; +import { emitEvent, enforceElementWidth, getValue, loadStylesheet } from '../utils.js'; import van from '../van.min.js'; import { Streamlit } from '../streamlit.js'; +import { Tooltip } from './tooltip.js'; const { button, i, span } = van.tags; const BUTTON_TYPE = { @@ -20,50 +23,63 @@ const BUTTON_TYPE = { ICON: 'icon', STROKED: 'stroked', }; +const BUTTON_COLOR = { + BASIC: 'basic', + PRIMARY: 'primary', +}; -const Button = (/** @type Properties */ props) => { - Streamlit.setFrameHeight(40); - - const isIconOnly = props.type === BUTTON_TYPE.ICON || (props.icon?.val && !props.label?.val); - if (isIconOnly) { // Force a 40px width for the parent iframe & handle window resizing - enforceElementWidth(window.frameElement, 40); - } - if (props.tooltip) { - window.frameElement.parentElement.setAttribute('data-tooltip', props.tooltip.val); - window.frameElement.parentElement.setAttribute('data-tooltip-position', props.tooltipPosition.val); +const Button = (/** @type Properties */ 
props) => { + loadStylesheet('button', stylesheet); + + const buttonType = getValue(props.type); + const width = getValue(props.width); + const isIconOnly = buttonType === BUTTON_TYPE.ICON || (getValue(props.icon) && !getValue(props.label)); + + if (!window.testgen.isPage) { + Streamlit.setFrameHeight(40); + if (isIconOnly) { // Force a 40px width for the parent iframe & handle window resizing + enforceElementWidth(window.frameElement, 40); + } + + if (width) { + enforceElementWidth(window.frameElement, width); + } + if (props.tooltip) { + window.frameElement.parentElement.setAttribute('data-tooltip', props.tooltip.val); + window.frameElement.parentElement.setAttribute('data-tooltip-position', props.tooltipPosition.val); + } } - if (!window.testgen.loadedStylesheets.button) { - document.adoptedStyleSheets.push(stylesheet); - window.testgen.loadedStylesheets.button = true; - } + const onClickHandler = props.onclick || (() => emitEvent('ButtonClicked')); + const showTooltip = van.state(false); - const onClickHandler = props.onclick || post; return button( { - class: `tg-button tg-${props.type.val}-button ${props.type.val !== 'icon' && isIconOnly ? 'tg-icon-button' : ''}`, - style: props.style?.val, + class: `tg-button tg-${buttonType}-button tg-${getValue(props.color) ?? 'basic'}-button ${buttonType !== 'icon' && isIconOnly ? 'tg-icon-button' : ''}`, + style: () => `width: ${isIconOnly ? '' : (width ?? '100%')}; ${getValue(props.style)}`, onclick: onClickHandler, + disabled: props.disabled, + onmouseenter: props.tooltip ? (() => showTooltip.val = true) : undefined, + onmouseleave: props.tooltip ? (() => showTooltip.val = false) : undefined, }, + props.tooltip ? Tooltip({ + text: props.tooltip, + show: showTooltip, + position: props.tooltipPosition, + }) : undefined, span({class: 'tg-button-focus-state-indicator'}, ''), props.icon ? i({class: 'material-symbols-rounded'}, props.icon) : undefined, !isIconOnly ? 
span(props.label) : undefined, ); }; -function post() { - Streamlit.sendData({ value: Math.random() }); -} - const stylesheet = new CSSStyleSheet(); stylesheet.replace(` button.tg-button { - width: 100%; height: 40px; position: relative; - overflow: hidden; display: flex; flex-direction: row; @@ -78,8 +94,11 @@ button.tg-button { cursor: pointer; font-size: 14px; - color: var(--button-text-color); - background: var(--button-basic-background); +} + +button.tg-button .tg-button-focus-state-indicator { + border-radius: inherit; + overflow: hidden; } button.tg-button .tg-button-focus-state-indicator::before { @@ -92,21 +111,9 @@ button.tg-button .tg-button-focus-state-indicator::before { position: absolute; pointer-events: none; border-radius: inherit; - background: var(--button-hover-state-background); -} - -button.tg-button.tg-basic-button { - color: var(--button-basic-text-color); -} - -button.tg-button.tg-flat-button { - color: var(--button-flat-text-color); - background: var(--button-flat-background); } button.tg-button.tg-stroked-button { - color: var(--button-stroked-text-color); - background: var(--button-stroked-background); border: var(--button-stroked-border); } @@ -118,7 +125,16 @@ button.tg-button:has(span) { padding: 8px 16px; } -button.tg-button.tg-icon-button > i { +button.tg-button:not(.tg-icon-button):has(span):has(i) { + padding-left: 12px; +} + +button.tg-button[disabled] { + color: var(--disabled-text-color); + cursor: not-allowed; +} + +button.tg-button > i { font-size: 18px; } @@ -126,9 +142,52 @@ button.tg-button > i:has(+ span) { margin-right: 8px; } -button.tg-button:hover .tg-button-focus-state-indicator::before { +button.tg-button:hover:not([disabled]) .tg-button-focus-state-indicator::before { opacity: var(--button-hover-state-opacity); } + + +/* Basic button colors */ +button.tg-button.tg-basic-button { + color: var(--button-basic-text-color); + background: var(--button-basic-background); +} + +button.tg-button.tg-basic-button .tg-button-focus-state-indicator::before { + background: var(--button-basic-hover-state-background); +} + +button.tg-button.tg-basic-button.tg-flat-button { + color: var(--button-basic-flat-text-color); + background: var(--button-basic-flat-background); +} + +button.tg-button.tg-basic-button.tg-stroked-button { + color: var(--button-basic-stroked-text-color); + background: var(--button-basic-stroked-background); +} +/* ... */ + +/* Primary button colors */ +button.tg-button.tg-primary-button { + color: var(--button-primary-text-color); + background: var(--button-primary-background); +} + +button.tg-button.tg-primary-button .tg-button-focus-state-indicator::before { + background: var(--button-primary-hover-state-background); +} + +button.tg-button.tg-primary-button.tg-flat-button { + color: var(--button-primary-flat-text-color); + background: var(--button-primary-flat-background); +} + +button.tg-button.tg-primary-button.tg-stroked-button { + color: var(--button-primary-stroked-text-color); + background: var(--button-primary-stroked-background); +} +/* ... 
*/ `); export { Button }; diff --git a/testgen/ui/components/frontend/js/components/card.js b/testgen/ui/components/frontend/js/components/card.js new file mode 100644 index 0000000..66c6ebb --- /dev/null +++ b/testgen/ui/components/frontend/js/components/card.js @@ -0,0 +1,47 @@ +/** + * @typedef Properties + * @type {object} + * @property {string} title + * @property {object} content + * @property {object?} actionContent + */ +import { loadStylesheet } from '../utils.js'; +import van from '../van.min.js'; + +const { div, h3 } = van.tags; + +const Card = (/** @type Properties */ props) => { + loadStylesheet('card', stylesheet); + + return div( + { class: 'tg-card mb-4' }, + div( + { class: 'flex-row fx-justify-space-between fx-align-flex-start' }, + h3( + { class: 'tg-card--title' }, + props.title, + ), + props.actionContent, + ), + props.content, + ); +}; + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.tg-card { + border-radius: 8px; + background-color: var(--dk-card-background); + padding: 16px; +} + +.tg-card--title { + margin: 0 0 16px; + color: var(--secondary-text-color); + font-size: 16px; + font-weight: 500; + text-transform: capitalize; +} +`); + +export { Card }; diff --git a/testgen/ui/components/frontend/js/components/checkbox.js b/testgen/ui/components/frontend/js/components/checkbox.js new file mode 100644 index 0000000..c7cf9a9 --- /dev/null +++ b/testgen/ui/components/frontend/js/components/checkbox.js @@ -0,0 +1,83 @@ +/** + * @typedef Properties + * @type {object} + * @property {string} label + * @property {boolean?} checked + * @property {function?} onChange + * @property {number?} width + */ +import van from '../van.min.js'; +import { getValue, loadStylesheet } from '../utils.js'; + +const { input, label } = van.tags; + +const Checkbox = (/** @type Properties */ props) => { + loadStylesheet('checkbox', stylesheet); + + return label( + { + class: 'flex-row fx-gap-2 clickable', + style: () => `width: ${props.width ? getValue(props.width) + 'px' : 'auto'}`, + }, + input({ + type: 'checkbox', + class: 'tg-checkbox--input clickable', + checked: props.checked, + onchange: van.derive(() => { + const onChange = props.onChange?.val ?? props.onChange; + return onChange ? 
(event) => onChange(event.target.checked) : null; + }), + }), + props.label, + ); +}; + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.tg-checkbox--input { + appearance: none; + box-sizing: border-box; + margin: 0; + width: 18px; + height: 18px; + border: 1px solid var(--secondary-text-color); + border-radius: 4px; + position: relative; + transition-property: border-color, background-color; + transition-duration: 0.3s; +} + +.tg-checkbox--input:focus, +.tg-checkbox--input:focus-visible { + outline: none; +} + +.tg-checkbox--input:focus-visible::before { + content: ''; + box-sizing: border-box; + position: absolute; + top: -4px; + left: -4px; + width: 24px; + height: 24px; + border: 3px solid var(--border-color); + border-radius: 7px; +} + +.tg-checkbox--input:checked { + border-color: transparent; + background-color: var(--primary-color); +} + +.tg-checkbox--input:checked::after { + position: absolute; + top: -4px; + left: -3px; + content: 'check'; + font-family: 'Material Symbols Rounded'; + font-size: 22px; + color: white; +} +`); + +export { Checkbox }; diff --git a/testgen/ui/components/frontend/js/components/column_profile.js b/testgen/ui/components/frontend/js/components/column_profile.js new file mode 100644 index 0000000..bdbef62 --- /dev/null +++ b/testgen/ui/components/frontend/js/components/column_profile.js @@ -0,0 +1,287 @@ +/** + * @typedef ColumnProfile + * @type {object} + * @property {'A' | 'B' | 'D' | 'N' | 'T' | 'X'} general_type + * * Value Counts + * @property {number} record_ct + * @property {number} value_ct + * @property {number} distinct_value_ct + * @property {number} null_value_ct + * @property {number} zero_value_ct + * * Alpha + * @property {number} zero_length_ct + * @property {number} filled_value_ct + * @property {number} includes_digit_ct + * @property {number} numeric_ct + * @property {number} date_ct + * @property {number} quoted_value_ct + * @property {number} lead_space_ct + * @property {number} embedded_space_ct + * @property {number} avg_embedded_spaces + * @property {number} min_length + * @property {number} max_length + * @property {number} avg_length + * @property {string} min_text + * @property {string} max_text + * @property {number} distinct_std_value_ct + * @property {number} distinct_pattern_ct + * @property {'STREET_ADDR' | 'STATE_USA' | 'PHONE_USA' | 'EMAIL' | 'ZIP_USA' | 'FILE_NAME' | 'CREDIT_CARD' | 'DELIMITED_DATA' | 'SSN'} std_pattern_match + * @property {string} top_freq_values + * @property {string} top_patterns + * * Numeric + * @property {number} min_value + * @property {number} min_value_over_0 + * @property {number} max_value + * @property {number} avg_value + * @property {number} stdev_value + * @property {number} percentile_25 + * @property {number} percentile_50 + * @property {number} percentile_75 + * * Date + * @property {number} min_date + * @property {number} max_date + * @property {number} before_1yr_date_ct + * @property {number} before_5yr_date_ct + * @property {number} before_20yr_date_ct + * @property {number} within_1yr_date_ct + * @property {number} within_1mo_date_ct + * @property {number} future_date_ct + * * Boolean + * @property {number} boolean_true_ct + */ +import van from '../van.min.js'; +import { Attribute } from '../components/attribute.js'; +import { SummaryBar } from './summary_bar.js'; +import { PercentBar } from './percent_bar.js'; +import { FrequencyBars } from './frequency_bars.js'; +import { BoxPlot } from './box_plot.js'; +import { loadStylesheet } from '../utils.js'; +import { 
formatTimestamp, roundDigits } from '../display_utils.js'; + +const { div } = van.tags; +const columnTypeFunctionMap = { + A: AlphaColumn, + B: BooleanColumn, + D: DatetimeColumn, + N: NumericColumn, +}; +const attributeWidth = 200; +const percentWidth = 250; +const summaryWidth = 400; +const summaryHeight = 10; +const boxPlotWidth = 800; + +const ColumnProfile = (/** @type ColumnProfile */ item) => { + loadStylesheet('column_profile', stylesheet); + const columnFunction = columnTypeFunctionMap[item.general_type]; + return columnFunction ? columnFunction(item) : null; +}; + +function AlphaColumn(/** @type ColumnProfile */ item) { + const standardPatternLabels = { + STREET_ADDR: 'Street Address', + STATE_USA: 'State (USA)', + PHONE_USA: 'Phone (USA)', + EMAIL: 'Email', + ZIP_USA: 'Zip Code (USA)', + FILE_NAME: 'Filename', + CREDIT_CARD: 'Credit Card', + DELIMITED_DATA: 'Delimited Data', + SSN: 'SSN (USA)', + }; + let standardPattern = standardPatternLabels[item.std_pattern_match]; + if (!standardPattern) { + standardPattern = (item.std_pattern_match || '').split('_') + .map(word => word ? (word[0].toUpperCase() + word.substring(1)) : '') + .join(' '); + } + + const total = item.record_ct; + + return div( + { class: 'flex-column fx-gap-4' }, + div( + { class: 'flex-row fx-flex-wrap fx-align-flex-start fx-gap-4 tg-profile--fx-basis-content' }, + div( + { + class: 'flex-column fx-gap-5', + }, + DistinctsBar(item), + SummaryBar({ + height: summaryHeight, + width: summaryWidth, + label: `Missing Values: ${item.zero_length_ct + item.zero_value_ct + item.filled_value_ct + item.null_value_ct}`, + items: [ + { label: 'Values', value: item.value_ct - item.zero_value_ct - item.filled_value_ct - item.zero_length_ct, color: 'green' }, + { label: 'Zero Values', value: item.zero_value_ct, color: 'brown' }, + { label: 'Dummy Values', value: item.filled_value_ct, color: 'orange' }, + { label: 'Zero Length', value: item.zero_length_ct, color: 'yellow' }, + { label: 'Null', value: item.null_value_ct, color: 'brownLight' }, + ], + }), + ), + div( + { + class: 'flex-row fx-flex-wrap fx-align-flex-start fx-gap-3 mb-1 tg-profile--fx-grow-content', + }, + div( + { class: 'flex-column fx-gap-3' }, + PercentBar({ label: 'Includes Digits', value: item.includes_digit_ct, total, width: percentWidth }), + PercentBar({ label: 'Numeric Values', value: item.numeric_ct, total, width: percentWidth }), + PercentBar({ label: 'Date Values', value: item.date_ct, total, width: percentWidth }), + PercentBar({ label: 'Quoted Values', value: item.quoted_value_ct, total, width: percentWidth }), + ), + div( + { class: 'flex-column fx-gap-3' }, + PercentBar({ label: 'Leading Spaces', value: item.lead_space_ct, total, width: percentWidth }), + PercentBar({ label: 'Embedded Spaces', value: item.embedded_space_ct ?? 
0, total, width: percentWidth }), + Attribute({ label: 'Average Embedded Spaces', value: roundDigits(item.avg_embedded_spaces), width: attributeWidth }), + ), + ), + ), + div( + { class: 'flex-row fx-flex-wrap fx-align-flex-start fx-gap-4' }, + Attribute({ label: 'Minimum Length', value: item.min_length, width: attributeWidth }), + Attribute({ label: 'Maximum Length', value: item.max_length, width: attributeWidth }), + Attribute({ label: 'Average Length', value: roundDigits(item.avg_length), width: attributeWidth }), + Attribute({ label: 'Minimum Text', value: item.min_text, width: attributeWidth }), + Attribute({ label: 'Maximum Text', value: item.max_text, width: attributeWidth }), + Attribute({ label: 'Distinct Standard Values', value: item.distinct_std_value_ct, width: attributeWidth }), + Attribute({ label: 'Distinct Patterns', value: item.distinct_pattern_ct, width: attributeWidth }), + Attribute({ label: 'Standard Pattern Match', value: standardPattern, width: attributeWidth }), + ), + item.top_freq_values || item.top_patterns ? div( + { class: 'flex-row fx-flex-wrap fx-align-flex-start fx-gap-4 mt-2 mb-2 tg-profile--fx-basis-content' }, + item.top_freq_values ? FrequencyBars({ + title: 'Frequent Values', + total: item.record_ct, + items: item.top_freq_values.substring(2).split('\n| ').map(parts => { + const [value, count] = parts.split(' | '); + return { value, count: Number(count) }; + }), + }) : null, + item.top_patterns ? FrequencyBars({ + title: 'Frequent Patterns', + total: item.record_ct, + items: item.top_patterns.split(' | ').reduce((array, item, index) => { + if (index % 2) { + array[(index - 1) / 2].value = item; + } else { + array.push({ count: Number(item) }); + } + return array; + }, []), + }) : null, + ) : null, + ); +} + +function BooleanColumn(/** @type ColumnProfile */ item) { + return SummaryBar({ + height: summaryHeight, + width: summaryWidth, + label: `Record count: ${item.record_ct}`, + items: [ + { label: 'True', value: item.boolean_true_ct, color: 'brownLight' }, + { label: 'False', value: item.value_ct - item.boolean_true_ct, color: 'brown' }, + { label: 'Null', value: item.null_value_ct, color: 'brownDark' }, + ], + }); +} + +function DatetimeColumn(/** @type ColumnProfile */ item) { + const total = item.record_ct; + + return div( + { class: 'flex-row fx-flex-wrap fx-align-flex-start fx-gap-4 tg-profile--fx-basis-content' }, + div( + DistinctsBar(item), + div( + { class: 'flex-row fx-flex-wrap fx-align-flex-start fx-gap-3 mt-5 tg-profile--fx-grow-content' }, + Attribute({ label: 'Minimum Date', value: formatTimestamp(item.min_date, true) }), + Attribute({ label: 'Maximum Date', value: formatTimestamp(item.max_date, true) }), + ), + ), + div( + { + class: 'flex-row fx-flex-wrap fx-align-flex-start fx-gap-3 mb-1 tg-profile--fx-grow-content', + }, + div( + { class: 'flex-column fx-gap-3' }, + PercentBar({ label: 'Before 1 Year', value: item.before_1yr_date_ct, total, width: percentWidth }), + PercentBar({ label: 'Before 5 Year', value: item.before_5yr_date_ct, total, width: percentWidth }), + PercentBar({ label: 'Before 20 Year', value: item.before_20yr_date_ct, total, width: percentWidth }), + ), + div( + { class: 'flex-column fx-gap-3' }, + PercentBar({ label: 'Within 1 Year', value: item.within_1yr_date_ct, total, width: percentWidth }), + PercentBar({ label: 'Within 1 Month', value: item.within_1mo_date_ct, total, width: percentWidth }), + PercentBar({ label: 'Future Dates', value: item.future_date_ct, total, width: percentWidth }), + ), + ), + ); +} + 
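// --- Hedged usage sketch (editor addition, not part of the original patch) ---
// Illustrates how the ColumnProfile component defined in this file might be
// mounted for a numeric column. The import paths follow this file's own
// conventions, but every field value below is invented for illustration; only
// the property names come from the ColumnProfile typedef above.
import van from '../van.min.js';
import { ColumnProfile } from './column_profile.js';

const sampleNumericProfile = {
    general_type: 'N',        // dispatches to NumericColumn via columnTypeFunctionMap
    record_ct: 1000,
    value_ct: 990,
    distinct_value_ct: 120,
    null_value_ct: 10,
    zero_value_ct: 5,
    min_value: 0,
    min_value_over_0: 1,
    max_value: 250,
    avg_value: 42.3,
    stdev_value: 11.8,
    percentile_25: 30,
    percentile_50: 41,
    percentile_75: 55,
};

// ColumnProfile returns either a single element or an array of elements
// (NumericColumn returns an array of rows), so normalize to an array before
// spreading into van.add.
const rendered = ColumnProfile(sampleNumericProfile);
van.add(document.body, ...[].concat(rendered));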
+function NumericColumn(/** @type ColumnProfile */ item) { + return [ + div( + { class: 'flex-row fx-flex-wrap fx-align-flex-start fx-gap-4 mb-5 tg-profile--fx-basis-content tg-profile--fx-grow-content' }, + div( + DistinctsBar(item), + ), + div( + PercentBar({ label: 'Zero Values', value: item.zero_value_ct, total: item.record_ct, width: percentWidth }), + ), + ), + div( + { class: 'flex-row fx-flex-wrap fx-align-flex-start fx-gap-4' }, + Attribute({ label: 'Minimum Value', value: item.min_value, width: attributeWidth }), + Attribute({ label: 'Minimum Value > 0', value: item.min_value_over_0, width: attributeWidth }), + Attribute({ label: 'Maximum Value', value: item.max_value, width: attributeWidth }), + Attribute({ label: 'Average Value', value: roundDigits(item.avg_value), width: attributeWidth }), + Attribute({ label: 'Standard Deviation', value: roundDigits(item.stdev_value), width: attributeWidth }), + Attribute({ label: '25th Percentile', value: roundDigits(item.percentile_25), width: attributeWidth }), + Attribute({ label: 'Median Value', value: roundDigits(item.percentile_50), width: attributeWidth }), + Attribute({ label: '75th Percentile', value: roundDigits(item.percentile_75), width: attributeWidth }), + ), + div( + { class: 'flex-row fx-justify-center mt-5 tg-profile--fx-grow-content' }, + BoxPlot({ + minimum: item.min_value, + maximum: item.max_value, + median: item.percentile_50, + lowerQuartile: item.percentile_25, + upperQuartile: item.percentile_75, + average: item.avg_value, + standardDeviation: item.stdev_value, + width: boxPlotWidth, + }), + ), + ]; +} + +const DistinctsBar = (/** @type ColumnProfile */ item) => { + return SummaryBar({ + height: summaryHeight, + width: summaryWidth, + label: `Record count: ${item.record_ct}`, + items: [ + { label: 'Distinct', value: item.distinct_value_ct, color: 'blue' }, + { label: 'Non-Distinct', value: item.value_ct - item.distinct_value_ct, color: 'blueLight' }, + { label: 'Null', value: item.null_value_ct, color: 'brownLight' }, + ], + }); +}; + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.tg-profile--fx-grow-content > * { + flex-grow: 1; +} + +.tg-profile--fx-basis-content > * { + flex: 300px; +} +`); + +export { ColumnProfile }; diff --git a/testgen/ui/components/frontend/js/components/editable_card.js b/testgen/ui/components/frontend/js/components/editable_card.js new file mode 100644 index 0000000..4dc8e54 --- /dev/null +++ b/testgen/ui/components/frontend/js/components/editable_card.js @@ -0,0 +1,64 @@ +/** + * @typedef Properties + * @type {object} + * @property {string} title + * @property {object} content + * @property {object} editingContent + * @property {function} onSave + * @property {function?} onCancel + * @property {function?} hasChanges + */ +import { getValue } from '../utils.js'; +import van from '../van.min.js'; +import { Card } from './card.js'; +import { Button } from './button.js'; + +const { div } = van.tags; + +const EditableCard = (/** @type Properties */ props) => { + const editing = van.state(false); + const onCancel = van.derive(() => { + const cancelFunction = props.onCancel?.val ?? props.onCancel; + return () => { + editing.val = false; + cancelFunction?.(); + } + }); + const saveDisabled = van.derive(() => { + const hasChanges = props.hasChanges?.val ?? props.hasChanges; + return !hasChanges?.(); + }); + + return Card({ + title: props.title, + content: [ + () => editing.val ? getValue(props.editingContent) : getValue(props.content), + () => editing.val ? 
div( + { class: 'flex-row fx-justify-content-flex-end fx-gap-3 mt-4' }, + Button({ + type: 'stroked', + label: 'Cancel', + width: 'auto', + onclick: onCancel, + }), + Button({ + type: 'stroked', + color: 'primary', + label: 'Save', + width: 'auto', + disabled: saveDisabled, + onclick: props.onSave, + }), + ) : '', + ], + actionContent: () => !editing.val ? Button({ + type: 'stroked', + label: 'Edit', + icon: 'edit', + width: 'auto', + onclick: () => editing.val = true, + }) : '', + }); +}; + +export { EditableCard }; diff --git a/testgen/ui/components/frontend/js/components/expander_toggle.js b/testgen/ui/components/frontend/js/components/expander_toggle.js index 0a5220d..fe68891 100644 --- a/testgen/ui/components/frontend/js/components/expander_toggle.js +++ b/testgen/ui/components/frontend/js/components/expander_toggle.js @@ -7,15 +7,15 @@ */ import van from '../van.min.js'; import { Streamlit } from '../streamlit.js'; +import { loadStylesheet } from '../utils.js'; const { div, span, i } = van.tags; const ExpanderToggle = (/** @type Properties */ props) => { - Streamlit.setFrameHeight(24); + loadStylesheet('expanderToggle', stylesheet); - if (!window.testgen.loadedStylesheets.expanderToggle) { - document.adoptedStyleSheets.push(stylesheet); - window.testgen.loadedStylesheets.expanderToggle = true; + if (!window.testgen.isPage) { + Streamlit.setFrameHeight(24); } const expandedState = van.state(!!props.default.val); diff --git a/testgen/ui/components/frontend/js/components/flavor_selector.js b/testgen/ui/components/frontend/js/components/flavor_selector.js new file mode 100644 index 0000000..8cd1c17 --- /dev/null +++ b/testgen/ui/components/frontend/js/components/flavor_selector.js @@ -0,0 +1,147 @@ +/** + * @typedef Falvor + * @type {object} + * @property {string} label + * @property {string} value + * @property {string} icon + * @property {(boolean|null)} selected + * + * @typedef Properties + * @type {object} + * @property {Array.} flavors + * @property {((number|null))} selected + * @property {(number|null)} columns + */ + +import van from '../van.min.js'; +import { Streamlit } from '../streamlit.js'; +import { loadStylesheet } from '../utils.js'; + +const headerHeight = 35; +const rowGap = 16; +const rowHeight = 67; +const columnSize = '200px'; +const { div, span, img, h3 } = van.tags; + +const DatabaseFlavorSelector = (/** @type Properties */props) => { + loadStylesheet('databaseFlavorSelector', stylesheet); + + const flavors = props.flavors?.val ?? props.flavors; + const numberOfColumns = props.columns?.val ?? props.columns ?? 3; + const numberOfRows = Math.ceil(flavors.length / numberOfColumns); + const selectedIndex = van.state(props.selected?.val ?? 
props.selected); + + window.testgen.isPage = true; + Streamlit.setFrameHeight( + headerHeight + + rowHeight * numberOfRows + + rowGap * (numberOfRows - 1) + ); + + return div( + {class: 'tg-flavor-selector-page'}, + h3( + {class: 'tg-flavor-selector-header'}, + 'Select your database type' + ), + () => { + return div( + { + class: 'tg-flavor-selector', + style: `grid-template-columns: ${Array(numberOfColumns).fill(columnSize).join(' ')}; row-gap: ${rowGap}px;` + }, + flavors.map((flavor, idx) => + DatabaseFlavor( + { + label: van.state(flavor.label), + value: van.state(flavor.value), + icon: van.state(flavor.icon), + selected: van.derive(() => selectedIndex.val == idx), + }, + () => { + selectedIndex.val = idx; + Streamlit.sendData({index: idx, value: flavor.value}); + }, + ) + ), + ); + }, + ); +}; + +const DatabaseFlavor = ( + /** @type Falvor */ props, + /** @type Function */ onClick, +) => { + return div( + { + class: () => `tg-flavor ${props.selected.val ? 'selected' : ''}`, + onclick: onClick, + }, + span({class: 'tg-flavor-focus-state-indicator'}, ''), + img( + {class: 'tg-flavor--icon', src: props.icon}, + ), + span( + {class: 'tg-flavor--label'}, + props.label + ), + ); +}; + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` + .tg-flavor-selector-header { + margin: unset; + margin-bottom: 16px; + font-weight: 400; + } + + .tg-flavor-selector { + display: grid; + grid-template-rows: auto; + column-gap: 32px; + } + + .tg-flavor { + display: flex; + align-items: center; + padding: 16px; + border: 1px solid var(--border-color); + border-radius: 4px; + cursor: pointer; + position: relative; + } + + .tg-flavor .tg-flavor-focus-state-indicator::before { + content: ""; + opacity: 0; + top: 0; + left: 0; + right: 0; + bottom: 0; + position: absolute; + pointer-events: none; + border-radius: inherit; + background: var(--button-primary-hover-state-background); + } + + .tg-flavor.selected { + border-color: var(--primary-color); + } + + .tg-flavor:hover .tg-flavor-focus-state-indicator::before, + .tg-flavor.selected .tg-flavor-focus-state-indicator::before { + opacity: var(--button-hover-state-opacity); + } + + .tg-flavor--icon { + margin-right: 16px; + } + + .tg-flavor--label { + font-weight: 500; + } +`); + +export { DatabaseFlavorSelector }; diff --git a/testgen/ui/components/frontend/js/components/frequency_bars.js b/testgen/ui/components/frontend/js/components/frequency_bars.js new file mode 100644 index 0000000..ed49bf5 --- /dev/null +++ b/testgen/ui/components/frontend/js/components/frequency_bars.js @@ -0,0 +1,94 @@ +/** + * @typedef FrequencyItem + * @type {object} + * @property {string} value + * @property {number} count + * + * @typedef Properties + * @type {object} + * @property {FrequencyItem[]} items + * @property {number} total + * @property {string} title + * @property {string?} color + */ +import van from '../van.min.js'; +import { getValue, loadStylesheet } from '../utils.js'; +import { colorMap } from '../display_utils.js'; + +const { div, span } = van.tags; +const defaultColor = 'teal'; + +const FrequencyBars = (/** @type Properties */ props) => { + loadStylesheet('frequencyBars', stylesheet); + + const total = van.derive(() => getValue(props.total)); + const color = van.derive(() => { + const colorValue = getValue(props.color) || defaultColor; + return colorMap[colorValue] || colorValue; + }); + const width = van.derive(() => { + const maxCount = getValue(props.items).reduce((max, { count }) => Math.max(max, count), 0); + return String(maxCount).length * 7; + 
}); + + return () => div( + div( + { class: 'mb-2 text-secondary' }, + props.title, + ), + getValue(props.items).map(({ value, count }) => { + return div( + { class: 'flex-row fx-gap-2' }, + div( + { class: 'tg-frequency-bars' }, + span({ class: 'tg-frequency-bars--empty' }), + span({ + class: 'tg-frequency-bars--fill', + style: () => `width: ${count * 100 / total.val}%; + ${count ? 'min-width: 1px;' : ''} + background-color: ${color.val};`, + }), + ), + div( + { + class: 'text-caption tg-frequency-bars--count', + style: () => `width: ${width.val}px;`, + }, + count, + ), + div(value), + ); + }), + ); +}; + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.tg-frequency-bars { + width: 150px; + height: 15px; + flex-shrink: 0; + position: relative; +} + +.tg-frequency-bars--empty { + position: absolute; + width: 100%; + height: 100%; + border-radius: 4px; + background-color: ${colorMap['emptyLight']} +} + +.tg-frequency-bars--fill { + position: absolute; + border-radius: 4px; + height: 100%; +} + +.tg-frequency-bars--count { + flex-shrink: 0; + text-align: right; +} +`); + +export { FrequencyBars }; diff --git a/testgen/ui/components/frontend/js/components/input.js b/testgen/ui/components/frontend/js/components/input.js new file mode 100644 index 0000000..be2aa03 --- /dev/null +++ b/testgen/ui/components/frontend/js/components/input.js @@ -0,0 +1,104 @@ +/** + * @typedef Properties + * @type {object} + * @property {string?} label + * @property {(string | number)?} value + * @property {string?} placeholder + * @property {string?} icon + * @property {boolean?} clearable + * @property {function?} onChange + * @property {number?} width + */ +import van from '../van.min.js'; +import { debounce, getValue, loadStylesheet } from '../utils.js'; + +const { input, label, i } = van.tags; + +const Input = (/** @type Properties */ props) => { + loadStylesheet('input', stylesheet); + + const value = van.derive(() => getValue(props.value) ?? ''); + van.derive(() => { + const onChange = props.onChange?.val ?? props.onChange; + onChange?.(value.val); + }); + + return label( + { + class: 'flex-column fx-gap-1 text-caption text-capitalize tg-input--label', + style: () => `width: ${props.width ? getValue(props.width) + 'px' : 'auto'}`, + }, + props.label, + () => getValue(props.icon) ? i( + { class: 'material-symbols-rounded tg-input--icon' }, + props.icon, + ) : '', + () => getValue(props.clearable) ? i( + { + class: () => `material-symbols-rounded tg-input--clear clickable ${value.val ? '' : 'hidden'}`, + onclick: () => value.val = '', + }, + 'clear', + ) : '', + input({ + class: 'tg-input--field', + value, + placeholder: () => getValue(props.placeholder) ?? 
'', + oninput: debounce(event => value.val = event.target.value, 300), + }), + ); +}; + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.tg-input--label { + position: relative; +} + +.tg-input--icon { + position: absolute; + bottom: 5px; + left: 4px; + font-size: 22px; +} + +.tg-input--icon ~ .tg-input--field { + padding-left: 28px; +} + +.tg-input--clear { + position: absolute; + bottom: 6px; + right: 4px; + font-size: 20px; +} + +.tg-input--clear ~ .tg-input--field { + padding-right: 24px; +} + +.tg-input--field { + box-sizing: border-box; + width: 100%; + height: 32px; + border-radius: 8px; + border: 1px solid transparent; + transition: border-color 0.3s; + background-color: var(--form-field-color); + padding: 4px 8px; + color: var(--primary-text-color); + font-size: 14px; +} + +.tg-input--field::placeholder { + color: var(--disabled-text-color); +} + +.tg-input--field:focus, +.tg-input--field:focus-visible { + outline: none; + border-color: var(--primary-color); +} +`); + +export { Input }; diff --git a/testgen/ui/components/frontend/js/components/link.js b/testgen/ui/components/frontend/js/components/link.js index 17463d4..b070b6f 100644 --- a/testgen/ui/components/frontend/js/components/link.js +++ b/testgen/ui/components/frontend/js/components/link.js @@ -4,32 +4,48 @@ * @property {string} href * @property {object} params * @property {string} label + * @property {boolean} open_new * @property {boolean} underline * @property {string?} left_icon * @property {number?} left_icon_size * @property {string?} right_icon * @property {number?} right_icon_size * @property {number?} height + * @property {number?} width * @property {string?} style */ +import { emitEvent, enforceElementWidth, getValue, loadStylesheet } from '../utils.js'; import van from '../van.min.js'; import { Streamlit } from '../streamlit.js'; const { a, div, i, span } = van.tags; const Link = (/** @type Properties */ props) => { - Streamlit.setFrameHeight(props.height?.val || 24); + loadStylesheet('link', stylesheet); - if (!window.testgen.loadedStylesheets.link) { - document.adoptedStyleSheets.push(stylesheet); - window.testgen.loadedStylesheets.link = true; + if (!window.testgen.isPage) { + Streamlit.setFrameHeight(getValue(props.height) || 24); + const width = getValue(props.width); + if (width) { + enforceElementWidth(window.frameElement, width); + } } + const href = getValue(props.href); + const params = getValue(props.params) || {}; + const open_new = !!getValue(props.open_new); + return a( { - class: `tg-link ${props.underline.val ? 'tg-link--underline' : ''}`, + class: `tg-link ${getValue(props.underline) ? 'tg-link--underline' : ''}`, style: props.style, - onclick: () => navigate(props.href.val, props.params.val), + href: `/${href}${getQueryFromParams(params)}`, + target: open_new ? '_blank' : '', + onclick: open_new ? 
null : (event) => { + event.preventDefault(); + event.stopPropagation(); + emitEvent('LinkClicked', { href, params }); + }, }, div( {class: 'tg-link--wrapper'}, @@ -46,13 +62,19 @@ const LinkIcon = ( /** @type string */position, ) => { return i( - {class: `material-symbols-rounded tg-link--icon tg-link--icon-${position}`, style: `font-size: ${size.val}px;`}, + {class: `material-symbols-rounded tg-link--icon tg-link--icon-${position}`, style: `font-size: ${getValue(size) || 20}px;`}, icon, ); }; -function navigate(href, params) { - Streamlit.sendData({ href, params }); +function getQueryFromParams(/** @type object */ params) { + const query = Object.entries(params).reduce((query, [ key, value ]) => { + if (key && value) { + return `${query}${query ? '&' : ''}${key}=${value}`; + } + return query; + }, ''); + return query ? `?${query}` : ''; } const stylesheet = new CSSStyleSheet(); diff --git a/testgen/ui/components/frontend/js/components/paginator.js b/testgen/ui/components/frontend/js/components/paginator.js index 7c839a2..2c3a497 100644 --- a/testgen/ui/components/frontend/js/components/paginator.js +++ b/testgen/ui/components/frontend/js/components/paginator.js @@ -7,20 +7,18 @@ */ import van from '../van.min.js'; import { Streamlit } from '../streamlit.js'; +import { emitEvent, loadStylesheet } from '../utils.js'; const { div, span, i, button } = van.tags; const Paginator = (/** @type Properties */ props) => { - const count = props.count.val; - const pageSize = props.pageSize.val; + loadStylesheet('paginator', stylesheet); - Streamlit.setFrameHeight(32); - - if (!window.testgen.loadedStylesheets.expanderToggle) { - document.adoptedStyleSheets.push(stylesheet); - window.testgen.loadedStylesheets.expanderToggle = true; + if (!window.testgen.isPage) { + Streamlit.setFrameHeight(32); } + const { count, pageSize } = props; const pageIndexState = van.state(props.pageIndex.val || 0); return div( @@ -29,7 +27,7 @@ const Paginator = (/** @type Properties */ props) => { { class: 'tg-paginator--label' }, () => { const pageIndex = pageIndexState.val; - return `${pageSize * pageIndex + 1} - ${Math.min(count, pageSize * (pageIndex + 1))} of ${count}` + return `${pageSize.val * pageIndex + 1} - ${Math.min(count.val, pageSize.val * (pageIndex + 1))} of ${count.val}`; }, ), button( @@ -37,7 +35,7 @@ const Paginator = (/** @type Properties */ props) => { class: 'tg-paginator--button', onclick: () => { pageIndexState.val = 0; - Streamlit.sendData(pageIndexState.val); + changePage(pageIndexState.val); }, disabled: () => pageIndexState.val === 0, }, @@ -48,7 +46,7 @@ const Paginator = (/** @type Properties */ props) => { class: 'tg-paginator--button', onclick: () => { pageIndexState.val--; - Streamlit.sendData(pageIndexState.val); + changePage(pageIndexState.val); }, disabled: () => pageIndexState.val === 0, }, @@ -59,9 +57,9 @@ const Paginator = (/** @type Properties */ props) => { class: 'tg-paginator--button', onclick: () => { pageIndexState.val++; - Streamlit.sendData(pageIndexState.val); + changePage(pageIndexState.val); }, - disabled: () => pageIndexState.val === Math.ceil(count / pageSize) - 1, + disabled: () => pageIndexState.val === Math.ceil(count.val / pageSize.val) - 1, }, i({class: 'material-symbols-rounded'}, 'chevron_right') ), @@ -69,16 +67,20 @@ const Paginator = (/** @type Properties */ props) => { { class: 'tg-paginator--button', onclick: () => { - pageIndexState.val = Math.ceil(count / pageSize) - 1; - Streamlit.sendData(pageIndexState.val); + pageIndexState.val = Math.ceil(count.val / 
pageSize.val) - 1; + changePage(pageIndexState.val); }, - disabled: () => pageIndexState.val === Math.ceil(count / pageSize) - 1, + disabled: () => pageIndexState.val === Math.ceil(count.val / pageSize.val) - 1, }, i({class: 'material-symbols-rounded'}, 'last_page') ), ); }; +function changePage(/** @type number */page_index) { + emitEvent('PageChanged', { page_index }) +} + const stylesheet = new CSSStyleSheet(); stylesheet.replace(` .tg-paginator { diff --git a/testgen/ui/components/frontend/js/components/percent_bar.js b/testgen/ui/components/frontend/js/components/percent_bar.js new file mode 100644 index 0000000..e6a5321 --- /dev/null +++ b/testgen/ui/components/frontend/js/components/percent_bar.js @@ -0,0 +1,79 @@ +/** + * @typedef Properties + * @type {object} + * @property {string} label + * @property {number} value + * @property {number} total + * @property {string?} color + * @property {number?} height + * @property {number?} width + */ +import van from '../van.min.js'; +import { getValue, loadStylesheet } from '../utils.js'; +import { colorMap } from '../display_utils.js'; + +const { div, span } = van.tags; +const defaultHeight = 10; +const defaultColor = 'purpleLight'; + +const PercentBar = (/** @type Properties */ props) => { + loadStylesheet('percentBar', stylesheet); + const value = van.derive(() => getValue(props.value)); + const total = van.derive(() => getValue(props.total)); + + return div( + { style: () => `max-width: ${props.width ? getValue(props.width) + 'px' : '100%'};` }, + div( + { class: () => `tg-percent-bar--label ${value.val ? '' : 'text-secondary'}` }, + () => `${getValue(props.label)}: ${value.val}`, + ), + div( + { + class: 'tg-percent-bar', + style: () => `height: ${getValue(props.height) || defaultHeight}px;`, + }, + span({ + class: 'tg-percent-bar--fill', + style: () => { + const color = getValue(props.color) || defaultColor; + return `width: ${value.val * 100 / total.val}%; + ${value.val ? 'min-width: 1px;' : ''} + background-color: ${colorMap[color] || color};` + }, + }), + span({ + class: 'tg-percent-bar--empty', + style: () => `width: ${(total.val - value.val) * 100 / total.val}%; + ${(total.val - value.val) ? 
'min-width: 1px;' : ''};`, + }), + ), + ); +}; + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.tg-percent-bar--label { + margin-bottom: 4px; +} + +.tg-percent-bar { + height: 100%; + display: flex; + flex-flow: row nowrap; + align-items: flex-start; + justify-content: flex-start; + border-radius: 4px; + overflow: hidden; +} + +.tg-percent-bar--fill { + height: 100%; +} + +.tg-percent-bar--empty { + height: 100%; + background-color: ${colorMap['empty']} +} +`); + +export { PercentBar }; diff --git a/testgen/ui/components/frontend/js/components/radio_group.js b/testgen/ui/components/frontend/js/components/radio_group.js new file mode 100644 index 0000000..0c7f5e4 --- /dev/null +++ b/testgen/ui/components/frontend/js/components/radio_group.js @@ -0,0 +1,104 @@ +/** +* @typedef Option + * @type {object} + * @property {string} label + * @property {string | number | boolean | null} value + * + * @typedef Properties + * @type {object} + * @property {string} label + * @property {Option[]} options + * @property {string | number | boolean | null} selected + * @property {function?} onChange + * @property {number?} width + */ +import van from '../van.min.js'; +import { getRandomId, getValue, loadStylesheet } from '../utils.js'; + +const { div, input, label } = van.tags; + +const RadioGroup = (/** @type Properties */ props) => { + loadStylesheet('radioGroup', stylesheet); + const groupName = getRandomId(); + + return div( + { style: () => `width: ${props.width ? getValue(props.width) + 'px' : 'auto'}` }, + div( + { class: 'text-caption text-capitalize mb-1' }, + props.label, + ), + () => div( + { class: 'flex-row fx-gap-4 tg-radio-group' }, + getValue(props.options).map(option => label( + { class: 'flex-row fx-gap-2 text-capitalize clickable' }, + input({ + type: 'radio', + name: groupName, + value: option.value, + checked: () => option.value === getValue(props.value), + onchange: van.derive(() => { + const onChange = props.onChange?.val ?? props.onChange; + return onChange ? 
() => onChange(option.value) : null; + }), + class: 'tg-radio-group--input', + }), + option.label, + )), + ), + ); +}; + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.tg-radio-group { + height: 32px; +} + +.tg-radio-group--input { + appearance: none; + box-sizing: border-box; + margin: 0; + width: 18px; + height: 18px; + border: 1px solid var(--secondary-text-color); + border-radius: 9px; + position: relative; + transition-property: border-color, background-color; + transition-duration: 0.3s; +} + +.tg-radio-group--input:focus, +.tg-radio-group--input:focus-visible { + outline: none; +} + +.tg-radio-group--input:focus-visible::before { + content: ''; + box-sizing: border-box; + position: absolute; + top: -4px; + left: -4px; + width: 24px; + height: 24px; + border: 3px solid var(--border-color); + border-radius: 12px; +} + +.tg-radio-group--input:checked { + border-color: var(--primary-color); +} + +.tg-radio-group--input:checked::after { + content: ''; + box-sizing: border-box; + position: absolute; + top: 3px; + left: 3px; + width: 10px; + height: 10px; + background-color: var(--primary-color); + border-radius: 5px; +} +`); + +export { RadioGroup }; diff --git a/testgen/ui/components/frontend/js/components/select.js b/testgen/ui/components/frontend/js/components/select.js index cc8e493..f4fe618 100644 --- a/testgen/ui/components/frontend/js/components/select.js +++ b/testgen/ui/components/frontend/js/components/select.js @@ -13,24 +13,21 @@ */ import van from '../van.min.js'; import { Streamlit } from '../streamlit.js'; +import { getRandomId, getValue, loadStylesheet } from '../utils.js'; const { div, label, option, select } = van.tags; const Select = (/** @type {Properties} */ props) => { + loadStylesheet('select', stylesheet); Streamlit.setFrameHeight(); - if (!window.testgen.loadedStylesheets.select) { - document.adoptedStyleSheets.push(stylesheet); - window.testgen.loadedStylesheets.select = true; - } - - const domId = Math.random().toString(36).substring(2); + const domId = getRandomId(); const changeHandler = props.onChange || post; return div( {class: 'tg-select'}, - label({for: domId, class: 'tg-select--label'}, van.val(props.label)), + label({for: domId, class: 'tg-select--label'}, props.label), () => { - const options = van.val(props.options); + const options = getValue(props.options) || []; return select( {id: domId, class: 'tg-select--field', onchange: changeHandler}, options.map(op => option({class: 'tg-select--field--option', value: op.value, selected: op.selected}, op.label)), diff --git a/testgen/ui/components/frontend/js/components/sidebar.js b/testgen/ui/components/frontend/js/components/sidebar.js index 56c5650..5057a48 100644 --- a/testgen/ui/components/frontend/js/components/sidebar.js +++ b/testgen/ui/components/frontend/js/components/sidebar.js @@ -46,7 +46,7 @@ const Sidebar = (/** @type {Properties} */ props) => { return div( {class: 'menu'}, () => { - const menuItems = van.val(props.menu).items; + const menuItems = props.menu?.val.items || []; return div( {class: 'content'}, menuItems.map(item => @@ -56,12 +56,12 @@ const Sidebar = (/** @type {Properties} */ props) => { ); }, button( - { class: `tg-button logout`, onclick: () => navigate(van.val(props.logout_path)) }, + { class: `tg-button logout`, onclick: (event) => navigate(event, props.logout_path?.val) }, i({class: 'material-symbols-rounded'}, 'logout'), span('Logout'), ), span({class: 'menu--username'}, props.username), - () => Version(van.val(props.menu).version), + () => 
Version(props.menu?.val.version), ); }; @@ -78,14 +78,14 @@ const MenuSection = (/** @type {MenuItem} */ item, /** @type {string} */ current const MenuItem = (/** @type {MenuItem} */ item, /** @type {string} */ currentPage) => { const classes = van.derive(() => { - if (isCurrentPage(item.page, van.val(currentPage))) { + if (isCurrentPage(item.page, currentPage?.val)) { return 'menu--item active'; } return 'menu--item'; }); return a( - {class: classes, href: `/${item.page}`, onclick: () => navigate(item.page, van.val(currentPage))}, + {class: classes, href: `/${item.page}`, onclick: (event) => navigate(event, item.page, currentPage?.val)}, i({class: 'menu--item--icon material-symbols-rounded'}, item.icon), span({class: 'menu--item--label'}, item.label), ); @@ -121,11 +121,16 @@ const VersionRow = (/** @type string */ label, /** @type string */ version, icon ); }; -function navigate(/** @type string */ path, /** @type string */ currentPage = null) { +function navigate(/** @type object */ event, /** @type string */ path, /** @type string */ currentPage = null) { + // Needed to prevent page refresh + // Returning false does not work because VanJS does not use inline handlers -> https://github.com/vanjs-org/van/discussions/246 + event.preventDefault(); + // Prevent Streamlit from reacting to event + event.stopPropagation(); + if (Sidebar.StreamlitInstance && path !== currentPage) { Sidebar.StreamlitInstance.sendData(path); } - return false; } function isCurrentPage(/** @type string */ itemPath, /** @type string */ currentPage) { diff --git a/testgen/ui/components/frontend/js/components/sorting_selector.js b/testgen/ui/components/frontend/js/components/sorting_selector.js index 926a173..60b9afa 100644 --- a/testgen/ui/components/frontend/js/components/sorting_selector.js +++ b/testgen/ui/components/frontend/js/components/sorting_selector.js @@ -1,5 +1,6 @@ import {Streamlit} from "../streamlit.js"; import van from '../van.min.js'; +import { loadStylesheet } from '../utils.js'; /** * @typedef ColDef @@ -16,20 +17,18 @@ import van from '../van.min.js'; const { button, div, i, span } = van.tags; const SortingSelector = (/** @type {Properties} */ props) => { + loadStylesheet('sortingSelector', stylesheet); let defaultDirection = "ASC"; - if (!window.testgen.loadedStylesheets.sortingSelector) { - document.adoptedStyleSheets.push(stylesheet); - window.testgen.loadedStylesheets.sortSelector = true; - } - const columns = props.columns.val; const prevComponentState = props.state.val || []; const columnLabel = columns.reduce((acc, [colLabel, colId]) => ({ ...acc, [colId]: colLabel}), {}); - Streamlit.setFrameHeight(100 + 30 * columns.length); + if (!window.testgen.isPage) { + Streamlit.setFrameHeight(100 + 30 * columns.length); + } const componentState = columns.reduce( (state, [colLabel, colId]) => ( diff --git a/testgen/ui/components/frontend/js/components/summary_bar.js b/testgen/ui/components/frontend/js/components/summary_bar.js index ec67e01..e331000 100644 --- a/testgen/ui/components/frontend/js/components/summary_bar.js +++ b/testgen/ui/components/frontend/js/components/summary_bar.js @@ -8,63 +8,50 @@ * @typedef Properties * @type {object} * @property {Array.} items - * @property {string} label - * @property {number} height - * @property {number} width + * @property {string?} label + * @property {number?} height + * @property {number?} width */ import van from '../van.min.js'; -import { Streamlit } from '../streamlit.js'; +import { getValue, loadStylesheet } from '../utils.js'; +import { colorMap 
} from '../display_utils.js'; const { div, span } = van.tags; -const colorMap = { - red: '#EF5350', - orange: '#FF9800', - yellow: '#FDD835', - green: '#9CCC65', - purple: '#AB47BC', - blue: '#42A5F5', - brown: '#8D6E63', - grey: '#BDBDBD', -} +const defaultHeight = 24; const SummaryBar = (/** @type Properties */ props) => { - const height = props.height.val || 24; - const width = props.width.val; - const summaryItems = props.items.val; - const label = props.label.val; - const total = summaryItems.reduce((sum, item) => sum + item.value, 0); - - Streamlit.setFrameHeight(height + 24 + (label ? 24 : 0)); + loadStylesheet('summaryBar', stylesheet); + const total = van.derive(() => getValue(props.items).reduce((sum, item) => sum + item.value, 0)); - if (!window.testgen.loadedStylesheets.summaryBar) { - document.adoptedStyleSheets.push(stylesheet); - window.testgen.loadedStylesheets.summaryBar = true; - } - return div( - { class: 'tg-summary-bar-wrapper' }, - () => { - return label ? div( - { class: 'tg-summary-bar--label' }, - label, - ) : null; - }, - div( + { style: () => `max-width: ${props.width ? getValue(props.width) + 'px' : '100%'};` }, + () => props.label ? div( + { class: 'tg-summary-bar--label' }, + props.label, + ) : '', + () => div( { class: 'tg-summary-bar', - style: `height: ${height}px; max-width: ${width ? width + 'px' : '100%'}` + style: () => `height: ${getValue(props.height) || defaultHeight}px;` }, - summaryItems.map(item => span({ - class: `tg-summary-bar--item`, - style: `width: ${item.value * 100 / total}%; background-color: ${colorMap[item.color] || item.color};`, + getValue(props.items).map(item => span({ + class: 'tg-summary-bar--item', + style: () => `width: ${item.value * 100 / total.val}%; + ${item.value ? 'min-width: 1px;' : ''} + background-color: ${colorMap[item.color] || item.color};`, })), ), - () => { - return total ? div( - { class: `tg-summary-bar--caption` }, - summaryItems.map(item => `${item.label}: ${item.value}`).join(', '), - ) : null; - }, + () => total.val ? 
div( + { class: 'tg-summary-bar--caption flex-row fx-flex-wrap text-caption mt-1' }, + getValue(props.items).map(item => div( + { class: 'tg-summary-bar--legend flex-row' }, + span({ + class: 'dot', + style: `color: ${colorMap[item.color] || item.color};`, + }), + `${item.label}: ${item.value || 0}`, + )), + ) : '', ); }; @@ -89,10 +76,21 @@ stylesheet.replace(` } .tg-summary-bar--caption { - margin-top: 4px; - color: var(--caption-text-color); font-style: italic; } + +.tg-summary-bar--legend { + width: auto; +} + +.tg-summary-bar--legend:not(:last-child) { + margin-right: 8px; +} + +.tg-summary-bar--legend span { + margin-right: 2px; + font-size: 4px; +} `); export { SummaryBar }; diff --git a/testgen/ui/components/frontend/js/components/tooltip.js b/testgen/ui/components/frontend/js/components/tooltip.js new file mode 100644 index 0000000..843e175 --- /dev/null +++ b/testgen/ui/components/frontend/js/components/tooltip.js @@ -0,0 +1,157 @@ +// Code modified from vanjs-ui +// https://www.npmjs.com/package/vanjs-ui +// https://cdn.jsdelivr.net/npm/vanjs-ui@0.10.0/dist/van-ui.nomodule.js + +/** + * @typedef Properties + * @type {object} + * @property {string} text + * @property {boolean} show + * @property {('top-left' | 'top' | 'top-right' | 'right' | 'bottom-right' | 'bottom' | 'bottom-left' | 'left')?} position + */ +import van from '../van.min.js'; +import { getValue, loadStylesheet } from '../utils.js'; + +const { div, span } = van.tags; +const defaultPosition = 'top'; + +const Tooltip = (/** @type Properties */ props) => { + loadStylesheet('tooltip', stylesheet); + + return span( + { + class: () => `tg-tooltip ${getValue(props.position) || defaultPosition} ${getValue(props.show) ? '' : 'hidden'}`, + style: () => `opacity: ${getValue(props.show) ? 
1 : 0};`, + }, + props.text, + div({ class: 'tg-tooltip--triangle' }), + ); +}; + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.tg-tooltip { + width: max-content; + max-width: 400px; + position: absolute; + z-index: 1; + border-radius: 4px; + background-color: var(--tooltip-color); + padding: 4px 8px; + color: white; + font-size: 13px; + font-family: 'Roboto', 'Helvetica Neue', sans-serif; + text-align: center; + text-wrap: wrap; + transition: opacity 0.3s; +} + +.tg-tooltip--triangle { + width: 0; + height: 0; + position: absolute; + border: solid transparent; +} + +.tg-tooltip.top-left { + right: 50%; + bottom: 125%; + transform: translateX(20px); +} +.top-left .tg-tooltip--triangle { + bottom: -5px; + right: 20px; + margin-right: -5px; + border-width: 5px 5px 0; + border-top-color: var(--tooltip-color); +} + +.tg-tooltip.top { + left: 50%; + bottom: 125%; + transform: translateX(-50%); +} +.top .tg-tooltip--triangle { + bottom: -5px; + left: 50%; + margin-left: -5px; + border-width: 5px 5px 0; + border-top-color: var(--tooltip-color); +} + +.tg-tooltip.top-right { + left: 50%; + bottom: 125%; + transform: translateX(-20px); +} +.top-right .tg-tooltip--triangle { + bottom: -5px; + left: 20px; + margin-left: -5px; + border-width: 5px 5px 0; + border-top-color: var(--tooltip-color); +} + +.tg-tooltip.right { + left: 125%; +} +.right .tg-tooltip--triangle { + top: 50%; + left: -5px; + margin-top: -5px; + border-width: 5px 5px 5px 0; + border-right-color: var(--tooltip-color); +} + +.tg-tooltip.bottom-right { + left: 50%; + top: 125%; + transform: translateX(-20px); +} +.bottom-right .tg-tooltip--triangle { + top: -5px; + left: 20px; + margin-left: -5px; + border-width: 0 5px 5px; + border-bottom-color: var(--tooltip-color); +} + +.tg-tooltip.bottom { + top: 125%; + left: 50%; + transform: translateX(-50%); +} +.bottom .tg-tooltip--triangle { + top: -5px; + left: 50%; + margin-left: -5px; + border-width: 0 5px 5px; + border-bottom-color: var(--tooltip-color); +} + +.tg-tooltip.bottom-left { + right: 50%; + top: 125%; + transform: translateX(20px); +} +.bottom-left .tg-tooltip--triangle { + top: -5px; + right: 20px; + margin-right: -5px; + border-width: 0 5px 5px; + border-bottom-color: var(--tooltip-color); +} + +.tg-tooltip.left { + right: 125%; +} +.left .tg-tooltip--triangle { + top: 50%; + right: -5px; + margin-top: -5px; + border-width: 5px 0 5px 5px; + border-left-color: var(--tooltip-color); +} +`); + +export { Tooltip }; diff --git a/testgen/ui/components/frontend/js/components/tooltip_icon.js b/testgen/ui/components/frontend/js/components/tooltip_icon.js new file mode 100644 index 0000000..7d3d5d3 --- /dev/null +++ b/testgen/ui/components/frontend/js/components/tooltip_icon.js @@ -0,0 +1,45 @@ +/** + * @typedef Properties + * @type {object} + * @property {string} icon + * @property {number?} iconSize + * @property {string} tooltip + * @property {('top-left' | 'top' | 'top-right' | 'right' | 'bottom-right' | 'bottom' | 'bottom-left' | 'left')?} tooltipPosition + * @property {string} classes + */ +import { getValue, loadStylesheet } from '../utils.js'; +import van from '../van.min.js'; +import { Tooltip } from './tooltip.js'; + +const { i } = van.tags; +const defaultIconSize = 20; + +const TooltipIcon = (/** @type Properties */ props) => { + loadStylesheet('tooltipIcon', stylesheet); + const showTooltip = van.state(false); + + return i( + { + class: () => `material-symbols-rounded tg-tooltip-icon text-secondary ${getValue(props.classes)}`, + style: () => `font-size: 
${getValue(props.iconSize) || defaultIconSize}px;`, + onmouseenter: () => showTooltip.val = true, + onmouseleave: () => showTooltip.val = false, + }, + props.icon, + Tooltip({ + text: props.tooltip, + show: showTooltip, + position: props.tooltipPosition, + }), + ); +}; + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.tg-tooltip-icon { + position: relative; + cursor: default; +} +`); + +export { TooltipIcon }; diff --git a/testgen/ui/components/frontend/js/components/tree.js b/testgen/ui/components/frontend/js/components/tree.js new file mode 100644 index 0000000..d29dd2a --- /dev/null +++ b/testgen/ui/components/frontend/js/components/tree.js @@ -0,0 +1,211 @@ +/** + * @typedef TreeNode + * @type {object} + * @property {string} id + * @property {string} label + * @property {string?} classes + * @property {string?} icon + * @property {number?} iconSize + * @property {TreeNode[]?} children + * @property {number?} level + * @property {boolean?} expanded + * @property {boolean?} hidden + * + * @typedef Properties + * @type {object} + * @property {TreeNode[]} nodes + * @property {string} selected + * @property {string} classes + */ +import van from '../van.min.js'; +import { emitEvent, getValue, loadStylesheet } from '../utils.js'; +import { Input } from './input.js'; +import { Button } from './button.js'; + +const { div, i } = van.tags; +const levelOffset = 14; + +const Tree = (/** @type Properties */ props) => { + loadStylesheet('tree', stylesheet); + + // Use only initial prop value as default and maintain internal state + const initialSelection = props.selected?.rawVal || props.selected || null; + const selected = van.state(initialSelection); + + const treeNodes = van.derive(() => { + const nodes = getValue(props.nodes) || []; + const treeSelected = initTreeState(nodes, initialSelection); + if (!treeSelected) { + selected.val = null; + } + return nodes; + }); + + return div( + { class: () => `flex-column ${getValue(props.classes)}` }, + div( + { class: 'flex-row fx-gap-1 tg-tree--actions' }, + Input({ + icon: 'search', + clearable: true, + onChange: (value) => searchTree(treeNodes.val, value), + }), + Button({ + type: 'icon', + icon: 'expand_all', + style: 'width: 24px; height: 24px; padding: 4px;', + tooltip: 'Expand All', + tooltipPosition: 'bottom', + onclick: () => expandOrCollapseTree(treeNodes.val, true), + }), + Button({ + type: 'icon', + icon: 'collapse_all', + style: 'width: 24px; height: 24px; padding: 4px;', + tooltip: 'Collapse All', + tooltipPosition: 'bottom', + onclick: () => expandOrCollapseTree(treeNodes.val, false), + }), + ), + div( + { class: 'tg-tree' }, + () => div( + { class: 'tg-tree--nodes' }, + treeNodes.val.map(node => TreeNode(node, selected)), + ), + ), + ); +}; + +const TreeNode = ( + /** @type TreeNode */ node, + /** @type string */ selected, +) => { + const hasChildren = !!node.children?.length; + return div( + div( + { + class: () => `tg-tree--row flex-row clickable ${node.classes || ''} + ${selected.val === node.id ? 'selected' : ''} + ${node.hidden.val ? 'hidden' : ''}`, + style: `padding-left: ${levelOffset * node.level}px;`, + onclick: () => { + selected.val = node.id; + emitEvent('TreeNodeSelected', { payload: node.id }); + }, + }, + i( + { + class: `material-symbols-rounded text-secondary ${hasChildren ? '' : 'invisible'}`, + onclick: () => { + node.expanded.val = hasChildren ? !node.expanded.val : false; + }, + }, + () => node.expanded.val ? 'arrow_drop_down' : 'arrow_right', + ), + node.icon ? 
i( + { + class: 'material-symbols-rounded tg-tree--row-icon', + style: `font-size: ${node.iconSize || 24}px;`, + }, + node.icon, + ) : null, + node.label, + ), + hasChildren ? div( + { class: () => node.expanded.val ? '' : 'hidden' }, + node.children.map(node => TreeNode(node, selected)), + ) : null, + ); +}; + +const initTreeState = ( + /** @type TreeNode[] */ nodes, + /** @type string */ selected, + /** @type number */ level = 0, +) => { + let treeExpanded = false; + nodes.forEach(node => { + node.level = level; + // Expand node if it is initial selection + let expanded = node.id === selected; + if (node.children) { + // Expand node if initial selection is a descendent + expanded = initTreeState(node.children, selected, level + 1) || expanded; + } + node.expanded = van.state(expanded); + node.hidden = van.state(false); + treeExpanded = treeExpanded || expanded; + }); + return treeExpanded; +}; + +const searchTree = ( + /** @type TreeNode[] */ nodes, + /** @type string */ search, +) => { + nodes.forEach(node => { + let hidden = !node.label.includes(search); + if (node.children) { + searchTree(node.children, search); + hidden = hidden && node.children.every(child => child.hidden.rawVal); + } + node.hidden.val = hidden; + }); +}; + +const expandOrCollapseTree = ( + /** @type TreeNode[] */ nodes, + /** @type boolean */ expanded, +) => { + nodes.forEach(node => { + if (node.children) { + expandOrCollapseTree(node.children, expanded); + node.expanded.val = expanded; + } + }); +} + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.tg-tree { + overflow: auto; +} + +.tg-tree--actions { + margin: 4px; +} + +.tg-tree--nodes { + width: fit-content; + min-width: 100%; +} + +.tg-tree--row { + box-sizing: border-box; + width: auto; + min-width: fit-content; + border: solid transparent; + border-width: 1px 0; + padding-right: 8px; + transition: background-color 0.3s; +} + +.tg-tree--row:hover { + background-color: var(--sidebar-item-hover-color); +} + +.tg-tree--row.selected { + background-color: #06a04a17; + font-weight: 500; +} + +.tg-tree--row-icon { + margin-right: 4px; + width: 24px; + color: #B0BEC5; + text-align: center; +} +`); + +export { Tree }; diff --git a/testgen/ui/components/frontend/js/display_utils.js b/testgen/ui/components/frontend/js/display_utils.js new file mode 100644 index 0000000..bbd9a46 --- /dev/null +++ b/testgen/ui/components/frontend/js/display_utils.js @@ -0,0 +1,61 @@ +function formatTimestamp( + /** @type number | string */ timestamp, + /** @type boolean */ show_year, +) { + if (timestamp) { + const date = new Date(timestamp); + if (!isNaN(date)) { + const months = [ 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec' ]; + const hours = date.getHours(); + const minutes = date.getMinutes(); + return `${months[date.getMonth()]} ${date.getDate()}, ${show_year ? date.getFullYear() + ' at ': ''}${hours % 12}:${String(minutes).padStart(2, '0')} ${hours / 12 > 1 ? 'PM' : 'AM'}`; + } + } + return '--'; +} + +function formatDuration(/** @type string */ duration) { + if (!duration) { + return '--'; + } + + const [ hour, minute, second ] = duration.split(':'); + let formatted = [ + { value: Number(hour), unit: 'h' }, + { value: Number(minute), unit: 'm' }, + { value: Number(second), unit: 's' }, + ].map(({ value, unit }) => value ? 
`${value}${unit}` : '') + .join(' '); + + return formatted.trim() || '< 1s'; +} + +function roundDigits(/** @type number | string */ number, /** @type number */ precision = 3) { + if (!['number', 'string'].includes(typeof number) || isNaN(number)) { + return '--'; + } + return parseFloat(Number(number).toPrecision(precision)); +} + +// https://m2.material.io/design/color/the-color-system.html#tools-for-picking-colors +const colorMap = { + red: '#EF5350', // Red 400 + orange: '#FF9800', // Orange 500 + yellow: '#FDD835', // Yellow 600 + green: '#9CCC65', // Light Green 400 + limeGreen: '#C0CA33', // Lime Green 600 + purple: '#AB47BC', // Purple 400 + purpleLight: '#CE93D8', // Purple 200 + blue: '#2196F3', // Blue 500 + blueLight: '#90CAF9', // Blue 200 + indigo: '#5C6BC0', // Indigo 400 + teal: '#26A69A', // Teal 400 + brown: '#8D6E63', // Brown 400 + brownLight: '#D7CCC8', // Brown 100 + brownDark: '#4E342E', // Brown 800 + grey: '#BDBDBD', // Gray 400 + empty: 'var(--empty)', // Light: Gray 200, Dark: Gray 800 + emptyLight: 'var(--empty-light)', // Light: Gray 50, Dark: Gray 900 +} + +export { formatTimestamp, formatDuration, roundDigits, colorMap }; diff --git a/testgen/ui/components/frontend/js/main.js b/testgen/ui/components/frontend/js/main.js index ece2e49..bc75e9a 100644 --- a/testgen/ui/components/frontend/js/main.js +++ b/testgen/ui/components/frontend/js/main.js @@ -14,6 +14,10 @@ import { Link } from './components/link.js'; import { Paginator } from './components/paginator.js'; import { Select } from './components/select.js' import { SortingSelector } from './components/sorting_selector.js'; +import { TestRuns } from './pages/test_runs.js'; +import { ProfilingRuns } from './pages/profiling_runs.js'; +import { DatabaseFlavorSelector } from './components/flavor_selector.js'; +import { DataHierarchy } from './pages/data_hierarchy.js'; let currentWindowVan = van; let topWindowVan = window.top.van; @@ -28,6 +32,10 @@ const TestGenComponent = (/** @type {string} */ id, /** @type {object} */ props) select: Select, sorting_selector: SortingSelector, sidebar: window.top.testgen.components.Sidebar, + test_runs: TestRuns, + profiling_runs: ProfilingRuns, + database_flavor_selector: DatabaseFlavorSelector, + data_hierarchy: DataHierarchy, }; if (Object.keys(componentById).includes(id)) { diff --git a/testgen/ui/components/frontend/js/pages/data_hierarchy.js b/testgen/ui/components/frontend/js/pages/data_hierarchy.js new file mode 100644 index 0000000..852434b --- /dev/null +++ b/testgen/ui/components/frontend/js/pages/data_hierarchy.js @@ -0,0 +1,673 @@ +/** + * @typedef ColumnPath + * @type {object} + * @property {string} column_id + * @property {string} table_id + * @property {string} column_name + * @property {string} table_name + * @property {'A' | 'B' | 'D' | 'N' | 'T' | 'X'} general_type + * @property {number} column_drop_date + * @property {number} table_drop_date + * + * @typedef Anomaly + * @type {object} + * @property {string} column_name + * @property {string} anomaly_name + * @property {'Definite' | 'Likely' | 'Possible' | 'Potential PII'} issue_likelihood + * @property {string} detail + * @property {'High' | 'Moderate'} pii_risk + * + * @typedef TestIssue + * @type {object} + * @property {string} id + * @property {string} column_name + * @property {string} test_name + * @property {'Failed' | 'Warning' | 'Error' } result_status + * @property {string} result_message + * @property {string} test_suite + * @property {string} test_run_id + * @property {number} test_run_date + * + * 
@typedef Column + * @type {ColumnProfile} + * @property {string} id + * @property {'column'} type + * @property {string} column_name + * @property {string} table_name + * @property {string} table_group_id + * * Characteristics + * @property {string} column_type + * @property {string} functional_data_type + * @property {string} datatype_suggestion + * @property {number} add_date + * @property {number} last_mod_date + * @property {number} drop_date + * * Column Metadata + * @property {boolean} critical_data_element + * @property {string} data_source + * @property {string} source_system + * @property {string} source_process + * @property {string} business_domain + * @property {string} stakeholder_group + * @property {string} transform_level + * @property {string} aggregation_level + * * Table Metadata + * @property {boolean} table_critical_data_element + * @property {string} table_cdata_source + * @property {string} table_csource_system + * @property {string} table_csource_process + * @property {string} table_cbusiness_domain + * @property {string} table_cstakeholder_group + * @property {string} table_ctransform_level + * @property {string} table_caggregation_level + * * Latest Profile & Test Runs + * @property {string} latest_profile_id + * @property {number} latest_profile_date + * @property {number} has_test_runs + * * Issues + * @property {Anomaly[]} latest_anomalies + * @property {TestIssue[]} latest_test_issues + * + * @typedef Table + * @type {object} + * @property {string} id + * @property {'table'} type + * @property {string} table_name + * @property {string} table_group_id + * * Characteristics + * @property {string} functional_table_type + * @property {number} record_ct + * @property {number} column_ct + * @property {number} data_point_ct + * @property {number} add_date + * @property {number} drop_date + * * Metadata + * @property {boolean} critical_data_element + * @property {string} data_source + * @property {string} source_system + * @property {string} source_process + * @property {string} business_domain + * @property {string} stakeholder_group + * @property {string} transform_level + * @property {string} aggregation_level + * * Latest Profile & Test Runs + * @property {string} latest_profile_id + * @property {number} latest_profile_date + * @property {number} has_test_runs + * * Issues + * @property {Anomaly[]} latest_anomalies + * @property {TestResult[]} latest_test_results + * + * @typedef Properties + * @type {object} + * @property {ColumnPath[]} columns + * @property {Table | Column} selected + */ +import van from '../van.min.js'; +import { Tree } from '../components/tree.js'; +import { Card } from '../components/card.js'; +import { EditableCard } from '../components/editable_card.js'; +import { Link } from '../components/link.js'; +import { Attribute } from '../components/attribute.js'; +import { Input } from '../components/input.js'; +import { TooltipIcon } from '../components/tooltip_icon.js'; +import { Streamlit } from '../streamlit.js'; +import { emitEvent, getValue, loadStylesheet } from '../utils.js'; +import { formatTimestamp } from '../display_utils.js'; +import { ColumnProfile } from '../components/column_profile.js'; +import { RadioGroup } from '../components/radio_group.js'; + +const { div, h2, span, i } = van.tags; + +const tableIcon = { icon: 'table', iconSize: 20 }; +const columnIcons = { + A: { icon: 'abc' }, + B: { icon: 'toggle_off', iconSize: 20 }, + D: { icon: 'calendar_clock', iconSize: 20 }, + N: { icon: '123' }, + T: { icon: 'calendar_clock', 
iconSize: 20 }, + X: { icon: 'question_mark', iconSize: 18 }, +}; + +const DataHierarchy = (/** @type Properties */ props) => { + loadStylesheet('data_hierarchy', stylesheet); + Streamlit.setFrameHeight(1); // Non-zero value is needed to render + window.frameElement.style.setProperty('height', 'calc(100vh - 175px)'); + window.testgen.isPage = true; + + const treeNodes = van.derive(() => { + let columns = []; + try { + columns = JSON.parse(getValue(props.columns)); + } catch { } + + const tables = {}; + columns.forEach(({ column_id, table_id, column_name, table_name, general_type, column_drop_date, table_drop_date }) => { + if (!tables[table_id]) { + tables[table_id] = { + id: table_id, + label: table_name, + classes: table_drop_date ? 'text-disabled' : '', + ...tableIcon, + children: [], + }; + } + tables[table_id].children.push({ + id: column_id, + label: column_name, + classes: column_drop_date ? 'text-disabled' : '', + ...columnIcons[general_type || 'X'], + }); + }); + return Object.values(tables); + }); + + const selectedItem = van.derive(() => { + try { + return JSON.parse(getValue(props.selected)); + } catch (e) { + console.error(e) + return null; + } + }); + + return div( + { class: 'flex-row tg-dh' }, + Tree({ + nodes: treeNodes, + // Use .rawVal, so only initial value from query params is passed to tree + selected: selectedItem.rawVal?.id, + classes: 'tg-dh--tree', + }), + () => { + const item = selectedItem.val; + if (item) { + return div( + { class: 'tg-dh--details' }, + h2( + { class: 'tg-dh--title' }, + item.type === 'column' ? [ + span( + { class: 'text-secondary' }, + `${item.table_name}: `, + ), + item.column_name, + ] : item.table_name, + ), + span( + { class: 'flex-row fx-gap-1 fx-justify-content-flex-end mb-2 text-secondary' }, + '* as of latest profiling run on ', + Link({ + href: 'profiling-runs:results', + params: { + run_id: item.latest_profile_id, + table_name: item.table_name, + column_name: item.column_name, + }, + open_new: true, + label: formatTimestamp(item.latest_profile_date), + }), + ), + CharacteristicsCard(item), + item.type === 'column' ? 
Card({ + title: 'Value Distribution *', + content: ColumnProfile(item), + }) : null, + MetadataCard(item), + PotentialPIICard(item), + HygieneIssuesCard(item), + TestIssuesCard(item), + ); + } + + return div( + { class: 'flex-column fx-align-flex-center fx-justify-center tg-dh--no-selection' }, + i( + { class: 'material-symbols-rounded text-disabled mb-5' }, + 'quick_reference_all', + ), + span( + { class: 'text-secondary' }, + 'Select a table or column on the left to view its details.', + ), + ); + }, + ); +}; + +const CharacteristicsCard = (/** @type Table | Column */ item) => { + let attributes = []; + if (item.type === 'column') { + attributes.push( + { key: 'column_type', label: 'Data Type' }, + { key: 'datatype_suggestion', label: 'Suggested Data Type' }, + { key: 'functional_data_type', label: 'Semantic Data Type' }, + { key: 'add_date', label: 'First Detected' }, + ); + if (item.last_mod_date !== item.add_date) { + attributes.push({ key: 'last_mod_date', label: 'Modification Detected' }); + } + } else { + attributes.push( + { key: 'functional_table_type', label: 'Semantic Table Type' }, + { key: 'record_ct', label: 'Row Count' }, + { key: 'column_ct', label: 'Column Count' }, + { key: 'data_point_ct', label: 'Data Point Count' }, + { key: 'add_date', label: 'First Detected' }, + ); + } + if (item.drop_date) { + attributes.push({ key: 'drop_date', label: 'Drop Detected' }); + } + + return Card({ + title: `${item.type} Characteristics *`, + content: div( + { class: 'flex-row fx-flex-wrap fx-gap-4' }, + attributes.map(({ key, label }) => { + let value = item[key]; + if (key === 'column_type') { + const { icon, iconSize } = columnIcons[item.general_type || 'X']; + value = div( + { class: 'flex-row' }, + i( + { + class: 'material-symbols-rounded tg-dh--column-icon', + style: `font-size: ${iconSize || 24}px;`, + }, + icon, + ), + (value || 'unknown').toLowerCase(), + ); + } else if (key === 'datatype_suggestion') { + value = (value || '').toLowerCase(); + } else if (key === 'functional_table_type') { + value = (value || '').split('-') + .map(word => word ? (word[0].toUpperCase() + word.substring(1)) : '') + .join(' '); + } else if (['add_date', 'last_mod_date', 'drop_date'].includes(key)) { + value = formatTimestamp(value, true); + if (key === 'drop_date') { + label = span({ class: 'text-error' }, label); + } + } + + return Attribute({ label, value, width: 300 }); + }), + ), + }); +}; + +const MetadataCard = (/** @type Table | Column */ item) => { + const attributes = [ + 'critical_data_element', + 'data_source', + 'source_system', + 'source_process', + 'business_domain', + 'stakeholder_group', + 'transform_level', + 'aggregation_level', + ].map(key => ({ + key, + label: key.replaceAll('_', ' '), + state: van.state(item[key]), + inherited: item[`table_${key}`], // Table values inherited by column + })); + + const InheritedIcon = () => TooltipIcon({ + icon: 'layers', + iconSize: 18, + classes: 'text-disabled', + tooltip: 'Inherited from table metadata', + tooltipPosition: 'top-right', + }); + const width = 300; + + const content = div( + { class: 'flex-row fx-flex-wrap fx-gap-4' }, + attributes.map(({ key, label, state, inherited }) => { + let value = state.rawVal ?? inherited; + const isInherited = item.type === 'column' && state.rawVal === null; + + if (key === 'critical_data_element') { + return span( + { class: 'flex-row fx-gap-1', style: `width: ${width}px` }, + i( + { class: `material-symbols-rounded ${value ? 'text-green' : 'text-disabled'}` }, + value ? 
'check_circle' : 'cancel', + ), + span( + { class: value ? 'text-capitalize' : 'text-secondary' }, + value ? label : `Not a ${label}`, + ), + isInherited ? InheritedIcon() : null, + ); + } + + if (isInherited && value) { + value = span( + { class: 'flex-row fx-gap-1' }, + InheritedIcon(), + value, + ); + } + return Attribute({ label, value, width }); + }), + ); + + const editingContent = div( + { class: 'flex-row fx-flex-wrap fx-gap-4' }, + attributes.map(({ key, label, state, inherited }) => { + if (key === 'critical_data_element') { + const options = [ + { label: 'Yes', value: true }, + { label: 'No', value: false }, + ]; + if (item.type === 'column') { + options.push({ label: 'Inherit', value: null }); + } + return RadioGroup({ + label, width, options, + value: item.type === 'column' ? state.rawVal : !!state.rawVal, // Coerce null to false for tables + onChange: (value) => state.val = value, + }); + }; + + return Input({ + label, width, + value: state.rawVal, + placeholder: inherited ? `Inherited: ${inherited}` : null, + onChange: (value) => state.val = value || null, + }); + }), + ); + + return EditableCard({ + title: `${item.type} Metadata`, + content, + // Pass as function so the block is re-rendered with reset values when re-editing after a cancel + editingContent: () => editingContent, + onSave: () => { + const payload = attributes.reduce((object, { key, state }) => { + object[key] = state.rawVal; + return object; + }, { id: item.id }); + emitEvent('MetadataChanged', { payload }) + }, + // Reset states to original values on cancel + onCancel: () => attributes.forEach(({ key, state }) => state.val = item[key]), + hasChanges: () => attributes.some(({ key, state }) => state.val !== item[key]), + }); +}; + +const PotentialPIICard = (/** @type Table | Column */ item) => { + const riskColors = { + High: 'red', + Moderate: 'orange', + }; + + const attributes = [ + { + key: 'detail', width: 150, label: 'Type', + value_function: (issue) => (issue.detail || '').split('Type: ')[1], + }, + { + key: 'pii_risk', width: 100, label: 'Risk', classes: 'text-secondary', + value_function: (issue) => div( + { class: 'flex-row' }, + span({ class: 'dot mr-2', style: `color: var(--${riskColors[issue.pii_risk]});` }), + issue.pii_risk, + ), + }, + ]; + if (item.type === 'table') { + attributes.unshift( + { key: 'column_name', width: 150, label: 'Column' }, + ); + } + + const potentialPII = item.latest_anomalies.filter(({ issue_likelihood }) => issue_likelihood === 'Potential PII'); + const linkProps = { + href: 'profiling-runs:hygiene', + params: { run_id: item.latest_profile_id, issue_class: 'Potential PII' }, + }; + + return IssuesCard('Potential PII', potentialPII, attributes, linkProps, 'No potential PII detected'); +}; + +const HygieneIssuesCard = (/** @type Table | Column */ item) => { + const likelihoodColors = { + Definite: 'red', + Likely: 'orange', + Possible: 'yellow', + }; + + const attributes = [ + { key: 'anomaly_name', width: 200, label: 'Issue' }, + { + key: 'issue_likelihood', width: 80, label: 'Likelihood', classes: 'text-secondary', + value_function: (issue) => div( + { class: 'flex-row' }, + span({ class: 'dot mr-2', style: `color: var(--${likelihoodColors[issue.issue_likelihood]});` }), + issue.issue_likelihood, + ), + }, + { key: 'detail', width: 300, label: 'Detail' }, + ]; + if (item.type === 'table') { + attributes.unshift( + { key: 'column_name', width: 150, label: 'Column' }, + ); + } + + const hygieneIssues = item.latest_anomalies.filter(({ issue_likelihood }) => 
issue_likelihood !== 'Potential PII'); + const linkProps = { + href: 'profiling-runs:hygiene', + params: { + run_id: item.latest_profile_id, + table_name: item.table_name, + column_name: item.column_name, + }, + }; + + return IssuesCard('Hygiene Issues', hygieneIssues, attributes, linkProps, 'No hygiene issues detected'); +}; + +const TestIssuesCard = (/** @type Table | Column */ item) => { + const statusColors = { + Failed: 'red', + Warning: 'yellow', + Error: 'brown', + }; + + const attributes = [ + { key: 'test_name', width: 150, label: 'Test' }, + { + key: 'result_status', width: 80, label: 'Status', classes: 'text-secondary', + value_function: (issue) => div( + { class: 'flex-row' }, + span({ class: 'dot mr-2', style: `color: var(--${statusColors[issue.result_status]});` }), + issue.result_status, + ), + }, + { key: 'result_message', width: 300, label: 'Details' }, + { + key: 'test_run_id', width: 150, label: 'Test Suite | Start Time', + value_function: (issue) => div( + div( + { class: 'text-secondary' }, + issue.test_suite, + ), + Link({ + href: 'test-runs:results', + params: { + run_id: issue.test_run_id, + table_name: item.table_name, + column_name: item.column_name, + selected: issue.id, + }, + open_new: true, + label: formatTimestamp(issue.test_run_date), + style: 'font-size: 12px; margin-top: 2px;', + }), + ), + }, + ]; + if (item.type === 'table') { + attributes.unshift( + { key: 'column_name', width: 150, label: 'Column' }, + ); + } + + let noneContent = 'No test issues detected'; + if (!item.has_test_runs) { + if (item.drop_date) { + noneContent = span({ class: 'text-secondary' }, `No test results for ${item.type}`); + } else { + noneContent = span( + { class: 'text-secondary flex-row fx-gap-1 fx-justify-content-flex-end' }, + `No test results yet for ${item.type}.`, + Link({ + href: 'test-suites', + open_new: true, + label: 'Go to Test Suites', + right_icon: 'chevron_right', + }), + ); + } + } + + return IssuesCard('Test Issues', item.latest_test_issues, attributes, null, noneContent); +}; + +/** + * @typedef Attribute + * @type {object} + * @property {string} key + * @property {number} width + * @property {string} label + * @property {string} classes + * @property {function?} value_function + */ +const IssuesCard = ( + /** @type string */ title, + /** @type (Anomaly | TestIssue)[] */ items, + /** @type Attribute[] */ attributes, + /** @type object? */ linkProps, + /** @type (string | object)? */ noneContent, +) => { + const gap = 8; + const minWidth = attributes.reduce((sum, { width }) => sum + width, attributes.length * gap); + + let content = null; + let actionContent = null; + if (items.length) { + content = div( + { style: 'overflow: auto; max-height: 300px;' }, + div( + { + class: 'flex-row table-row text-caption pt-0', + style: `gap: ${gap}px; min-width: ${minWidth}px;`, + }, + attributes.map(({ label, width }) => span( + { style: `flex: 1 0 ${width}px;` }, + label, + )), + ), + items.map(item => div( + { + class: 'flex-row table-row pt-2 pb-2', + style: `gap: ${gap}px; min-width: ${minWidth}px;`, + }, + attributes.map(({ key, width, value_function, classes }) => { + const value = value_function ? 
value_function(item) : item[key]; + return span( + { + class: classes || '', + style: `flex: 1 0 ${width}px; word-break: break-word;`, + }, + value || '--', + ); + }), + )), + ); + + if (linkProps) { + actionContent = Link({ + ...linkProps, + open_new: true, + label: 'View details', + right_icon: 'chevron_right', + }); + } + } else { + actionContent = typeof noneContent === 'string' ? span( + { class: 'text-secondary flex-row fx-gap-1' }, + noneContent, + i({ class: 'material-symbols-rounded text-green' }, 'check_circle'), + ) : (noneContent || null); + } + + return Card({ + title: `${title} (${items.length})`, + content, + actionContent, + }); +} + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.tg-dh { + height: 100%; + align-items: stretch; +} + +.tg-dh--tree { + min-width: 250px; + border-radius: 8px; + border: 1px solid var(--border-color); + background-color: var(--sidebar-background-color); +} + +.tg-dh--details { + padding: 8px 0 0 20px; + overflow: auto; + flex-grow: 1; +} + +.tg-dh--title { + margin: 0; + color: var(--primary-text-color); + font-size: 20px; + font-weight: 500; +} + +.tg-dh--details > .tg-card { + min-width: 400px; +} + +.tg-dh--column-icon { + margin-right: 4px; + width: 24px; + color: #B0BEC5; + text-align: center; +} + +.tg-dh--no-selection { + flex: auto; + max-height: 400px; + padding: 16px; +} + +.tg-dh--no-selection > i { + font-size: 80px; +} + +.tg-dh--no-selection > span { + font-size: 18px; + text-align: center; +} +`); + +export { DataHierarchy }; diff --git a/testgen/ui/components/frontend/js/pages/profiling_runs.js b/testgen/ui/components/frontend/js/pages/profiling_runs.js new file mode 100644 index 0000000..6b98d38 --- /dev/null +++ b/testgen/ui/components/frontend/js/pages/profiling_runs.js @@ -0,0 +1,177 @@ +/** + * @typedef ProfilingRun + * @type {object} + * @property {string} profiling_run_id + * @property {number} start_time + * @property {string} table_groups_name + * @property {'Running'|'Complete'|'Error'|'Cancelled'} status + * @property {string} log_message + * @property {string} duration + * @property {string} process_id + * @property {string} schema_name + * @property {number} column_ct + * @property {number} table_ct + * @property {number} anomaly_ct + * @property {number} anomalies_definite_ct + * @property {number} anomalies_likely_ct + * @property {number} anomalies_possible_ct + * @property {number} anomalies_dismissed_ct + * + * @typedef Properties + * @type {object} + * @property {ProfilingRun[]} items + */ +import van from '../van.min.js'; +import { Tooltip } from '../components/tooltip.js'; +import { SummaryBar } from '../components/summary_bar.js'; +import { Link } from '../components/link.js'; +import { Button } from '../components/button.js'; +import { Streamlit } from '../streamlit.js'; +import { emitEvent, resizeFrameHeightToElement } from '../utils.js'; +import { formatTimestamp, formatDuration } from '../display_utils.js'; + +const { div, span, i } = van.tags; + +const ProfilingRuns = (/** @type Properties */ props) => { + window.testgen.isPage = true; + + const profilingRunItems = van.derive(() => { + let items = []; + try { + items = JSON.parse(props.items?.val); + } catch { } + Streamlit.setFrameHeight(100 * items.length); + return items; + }); + const columns = ['20%', '20%', '20%', '40%']; + + const tableId = 'profiling-runs-table'; + resizeFrameHeightToElement(tableId); + + return div( + { class: 'table', id: tableId }, + div( + { class: 'table-header flex-row' }, + span( + { style: `flex: 
${columns[0]}` }, + 'Start Time | Table Group', + ), + span( + { style: `flex: ${columns[1]}` }, + 'Status | Duration', + ), + span( + { style: `flex: ${columns[2]}` }, + 'Schema', + ), + span( + { style: `flex: ${columns[3]}` }, + 'Hygiene Issues', + ), + ), + () => div( + profilingRunItems.val.map(item => ProfilingRunItem(item, columns)), + ), + ); +} + +const ProfilingRunItem = (/** @type ProfilingRun */ item, /** @type string[] */ columns) => { + return div( + { class: 'table-row flex-row' }, + div( + { style: `flex: ${columns[0]}` }, + div(formatTimestamp(item.start_time)), + div( + { class: 'text-caption mt-1' }, + item.table_groups_name, + ), + ), + div( + { class: 'flex-row', style: `flex: ${columns[1]}` }, + div( + ProfilingRunStatus(item), + div( + { class: 'text-caption mt-1' }, + formatDuration(item.duration), + ), + ), + item.status === 'Running' && item.process_id ? Button({ + type: 'stroked', + label: 'Cancel Run', + style: 'width: auto; height: 32px; color: var(--purple); margin-left: 16px;', + onclick: () => emitEvent('RunCanceled', { payload: item }), + }) : null, + ), + div( + { style: `flex: ${columns[2]}` }, + div(item.schema_name), + div( + { + class: 'text-caption mt-1 mb-1', + style: item.status === 'Complete' && !item.column_ct ? 'color: var(--red);' : '', + }, + item.status === 'Complete' ? `${item.table_ct || 0} tables, ${item.column_ct || 0} columns` : null, + ), + item.column_ct ? Link({ + label: 'View results', + href: 'profiling-runs:results', + params: { 'run_id': item.profiling_run_id }, + underline: true, + right_icon: 'chevron_right', + }) : null, + ), + div( + { style: `flex: ${columns[3]}` }, + item.anomaly_ct ? SummaryBar({ + items: [ + { label: 'Definite', value: item.anomalies_definite_ct, color: 'red' }, + { label: 'Likely', value: item.anomalies_likely_ct, color: 'orange' }, + { label: 'Possible', value: item.anomalies_possible_ct, color: 'yellow' }, + { label: 'Dismissed', value: item.anomalies_dismissed_ct, color: 'grey' }, + ], + height: 10, + width: 350, + }) : '--', + item.anomaly_ct ? Link({ + label: `View ${item.anomaly_ct} issues`, + href: 'profiling-runs:hygiene', + params: { 'run_id': item.profiling_run_id }, + underline: true, + right_icon: 'chevron_right', + style: 'margin-top: 8px;', + }) : null, + ), + ); +} + +function ProfilingRunStatus(/** @type ProfilingRun */ item) { + const attributeMap = { + Running: { label: 'Running', color: 'blue' }, + Complete: { label: 'Completed', color: '' }, + Error: { label: 'Error', color: 'red' }, + Cancelled: { label: 'Canceled', color: 'purple' }, + }; + const attributes = attributeMap[item.status] || { label: 'Unknown', color: 'grey' }; + return span( + { + class: 'flex-row', + style: `color: var(--${attributes.color});`, + }, + attributes.label, + () => { + const tooltipError = van.state(false); + return item.status === 'Error' && item.log_message ? 
i( + { + class: 'material-symbols-rounded text-secondary ml-1 profiling-runs--info', + style: 'position: relative; font-size: 16px;', + onmouseenter: () => tooltipError.val = true, + onmouseleave: () => tooltipError.val = false, + }, + 'info', + Tooltip({ text: item.log_message, show: tooltipError }), + ) : null; + }, + ); +} + +export { ProfilingRuns }; diff --git a/testgen/ui/components/frontend/js/pages/test_runs.js b/testgen/ui/components/frontend/js/pages/test_runs.js new file mode 100644 index 0000000..c5656a6 --- /dev/null +++ b/testgen/ui/components/frontend/js/pages/test_runs.js @@ -0,0 +1,152 @@ +/** + * @typedef TestRun + * @type {object} + * @property {string} test_run_id + * @property {number} test_starttime + * @property {string} table_groups_name + * @property {string} test_suite + * @property {'Running'|'Complete'|'Error'|'Cancelled'} status + * @property {string} log_message + * @property {string} duration + * @property {string} process_id + * @property {number} test_ct + * @property {number} passed_ct + * @property {number} warning_ct + * @property {number} failed_ct + * @property {number} error_ct + * @property {number} dismissed_ct + * + * @typedef Properties + * @type {object} + * @property {TestRun[]} items + */ +import van from '../van.min.js'; +import { Tooltip } from '../components/tooltip.js'; +import { SummaryBar } from '../components/summary_bar.js'; +import { Link } from '../components/link.js'; +import { Button } from '../components/button.js'; +import { Streamlit } from '../streamlit.js'; +import { emitEvent, resizeFrameHeightToElement } from '../utils.js'; +import { formatTimestamp, formatDuration } from '../display_utils.js'; + +const { div, span, i } = van.tags; + +const TestRuns = (/** @type Properties */ props) => { + window.testgen.isPage = true; + + const testRunItems = van.derive(() => { + let items = []; + try { + items = JSON.parse(props.items?.val); + } catch { } + Streamlit.setFrameHeight(100 * items.length); + return items; + }); + const columns = ['30%', '20%', '50%']; + + const tableId = 'test-runs-table'; + resizeFrameHeightToElement(tableId); + + return div( + { class: 'table', id: tableId }, + div( + { class: 'table-header flex-row' }, + span( + { style: `flex: ${columns[0]}` }, + 'Start Time | Table Group | Test Suite', + ), + span( + { style: `flex: ${columns[1]}` }, + 'Status | Duration', + ), + span( + { style: `flex: ${columns[2]}` }, + 'Results Summary', + ), + ), + () => div( + testRunItems.val.map(item => TestRunItem(item, columns)), + ), + ); +} + +const TestRunItem = (/** @type TestRun */ item, /** @type string[] */ columns) => { + return div( + { class: 'table-row flex-row' }, + div( + { style: `flex: ${columns[0]}` }, + Link({ + label: formatTimestamp(item.test_starttime), + href: 'test-runs:results', + params: { 'run_id': item.test_run_id }, + underline: true, + }), + div( + { class: 'text-caption mt-1' }, + `${item.table_groups_name} > ${item.test_suite}`, + ), + ), + div( + { class: 'flex-row', style: `flex: ${columns[1]}` }, + div( + TestRunStatus(item), + div( + { class: 'text-caption mt-1' }, + formatDuration(item.duration), + ), + ), + item.status === 'Running' && item.process_id ? Button({ + type: 'stroked', + label: 'Cancel Run', + style: 'width: auto; height: 32px; color: var(--purple); margin-left: 16px;', + onclick: () => emitEvent('RunCanceled', { payload: item }), + }) : null, + ), + div( + { style: `flex: ${columns[2]}` }, + item.test_ct ? 
SummaryBar({ + items: [ + { label: 'Passed', value: item.passed_ct, color: 'green' }, + { label: 'Warning', value: item.warning_ct, color: 'yellow' }, + { label: 'Failed', value: item.failed_ct, color: 'red' }, + { label: 'Error', value: item.error_ct, color: 'brown' }, + { label: 'Dismissed', value: item.dismissed_ct, color: 'grey' }, + ], + height: 10, + width: 400, + }) : '--', + ), + ); +} + +function TestRunStatus(/** @type TestRun */ item) { + const attributeMap = { + Running: { label: 'Running', color: 'blue' }, + Complete: { label: 'Completed', color: '' }, + Error: { label: 'Error', color: 'red' }, + Cancelled: { label: 'Canceled', color: 'purple' }, + }; + const attributes = attributeMap[item.status] || { label: 'Unknown', color: 'grey' }; + return span( + { + class: 'flex-row', + style: `color: var(--${attributes.color});`, + }, + attributes.label, + () => { + const tooltipError = van.state(false); + return item.status === 'Error' && item.log_message ? i( + { + class: 'material-symbols-rounded text-secondary ml-1', + style: 'position: relative; font-size: 16px;', + onmouseenter: () => tooltipError.val = true, + onmouseleave: () => tooltipError.val = false, + }, + 'info', + Tooltip({ text: item.log_message, show: tooltipError }), + ) : null; + }, + ); +} + +export { TestRuns }; diff --git a/testgen/ui/components/frontend/js/utils.js b/testgen/ui/components/frontend/js/utils.js index 7757caa..b5bdc96 100644 --- a/testgen/ui/components/frontend/js/utils.js +++ b/testgen/ui/components/frontend/js/utils.js @@ -1,3 +1,6 @@ +import van from './van.min.js'; +import { Streamlit } from './streamlit.js'; + function enforceElementWidth( /** @type Element */element, /** @type number */width, @@ -9,4 +12,61 @@ function enforceElementWidth( observer.observe(element); } -export { enforceElementWidth }; +function resizeFrameHeightToElement(/** @type string */elementId) { + const observer = new ResizeObserver(() => { + const height = document.getElementById(elementId).offsetHeight; + if (height) { + Streamlit.setFrameHeight(height); + } + }); + observer.observe(window.frameElement); +} + +function loadStylesheet( + /** @type string */key, + /** @type CSSStyleSheet */stylesheet, +) { + if (!window.testgen.loadedStylesheets[key]) { + document.adoptedStyleSheets.push(stylesheet); + window.testgen.loadedStylesheets[key] = true; + } +} + +function emitEvent( + /** @type string */event, + /** @type object */data = {}, +) { + Streamlit.sendData({ event, ...data, _id: Math.random() }) // Identify the event so its handler is called once +} + +// Replacement for van.val() +// https://github.com/vanjs-org/van/discussions/280 +const stateProto = Object.getPrototypeOf(van.state()); +function getValue(/** @type object */ prop) { // van state or static value + const proto = Object.getPrototypeOf(prop ?? 
0); + if (proto === stateProto) { + return prop.val; + } + if (proto === Function.prototype) { + return prop(); + } + return prop; +} + +function getRandomId() { + return Math.random().toString(36).substring(2); +} + +// https://stackoverflow.com/a/75988895 +function debounce( + /** @type function */ callback, + /** @type number */ wait, +) { + let timeoutId = null; + return (...args) => { + window.clearTimeout(timeoutId); + timeoutId = window.setTimeout(() => callback(...args), wait); + }; +} + +export { debounce, emitEvent, enforceElementWidth, getRandomId, getValue, loadStylesheet, resizeFrameHeightToElement }; diff --git a/testgen/ui/components/frontend/js/van.min.js b/testgen/ui/components/frontend/js/van.min.js index a78d3da..7e23e03 100644 --- a/testgen/ui/components/frontend/js/van.min.js +++ b/testgen/ui/components/frontend/js/van.min.js @@ -1 +1,2 @@ -let e,t,l,r,o,f=Object,n=f.getPrototypeOf,s=document,a={isConnected:1},i={},d=n(a),u=n(n),_=(e,t,l,r)=>(e??(setTimeout(l,r),new Set)).add(t),h=(e,t,r)=>{let o=l;l=t;try{return e(r)}catch(e){return console.error(e),r}finally{l=o}},c=e=>e.filter(e=>e.t?.isConnected),g=t=>o=_(o,t,()=>{for(let e of o)e.l=c(e.l),e.o=c(e.o);o=e},1e3),w={get val(){return l?.add(this),this.i},get oldVal(){return l?.add(this),this.u},set val(l){let r=this;if(l!==r.i){r.i=l;let o=[...r.o=c(r.o)];for(let t of o)x(t.f,t.s,t.t),t.t=e;r.l.length?t=_(t,r,p):r.u=l}}},v=e=>({__proto__:w,i:e,u:e,l:[],o:[]}),S=e=>n(e??0)===w,y=(e,t)=>{let l=new Set,o={f:e},f=r;r=[];let n=h(e,l,t);n=(n??s).nodeType?n:new Text(n);for(let e of l)g(e),e.l.push(o);for(let e of r)e.t=n;return r=f,o.t=n},x=(e,t=v(),l)=>{let o=new Set,f={f:e,s:t};f.t=l??r?.push(f)??a,t.val=h(e,o);for(let e of o)g(e),e.o.push(f);return t},V=(t,...l)=>{for(let r of l.flat(1/0)){let l=n(r??0),o=l===w?y(()=>r.val):l===u?y(r):r;o!=e&&t.append(o)}return t},b=t=>new Proxy((l,...r)=>{let[o,...a]=n(r[0]??0)===d?r:[{},...r],_=t?s.createElementNS(t,l):s.createElement(l);for(let[t,r]of f.entries(o)){let o=l=>l?f.getOwnPropertyDescriptor(l,t)??o(n(l)):e,s=l+","+t,a=i[s]??(i[s]=o(n(_))?.set??0),d=a?a.bind(_):_.setAttribute.bind(_,t),h=n(r??0);h===w?y(()=>(d(r.val),_)):h!==u||t.startsWith("on")&&!r.h?d(r):y(()=>(d(r()),_))}return V(_,...a)},{get:(t,l)=>t.bind(e,l)}),m=(e,t)=>t?t!==e&&e.replaceWith(t):e.remove(),p=()=>{let l=[...t].filter(e=>e.i!==e.u);t=e;for(let t of new Set(l.flatMap(e=>e.l=c(e.l))))m(t.t,y(t.f,t.t)),t.t=e;for(let e of l)e.u=e.i};export default{add:V,_:e=>(e.h=1,e),tags:b(),tagsNS:b,state:v,val:e=>S(e)?e.val:e,oldVal:e=>S(e)?e.oldVal:e,derive:x,hydrate:(e,t)=>m(e,y(t,e))}; \ No newline at end of file +// https://vanjs.org/code/van-1.5.2.min.js +let e,t,r,o,l,n,s=Object.getPrototypeOf,f={isConnected:1},i={},h=s(f),a=s(s),d=(e,t,r,o)=>(e??(setTimeout(r,o),new Set)).add(t),u=(e,t,o)=>{let l=r;r=t;try{return e(o)}catch(e){return console.error(e),o}finally{r=l}},w=e=>e.filter(e=>e.t?.isConnected),_=e=>l=d(l,e,()=>{for(let e of l)e.o=w(e.o),e.l=w(e.l);l=n},1e3),c={get val(){return r?.i?.add(this),this.rawVal},get oldVal(){return r?.i?.add(this),this.h},set val(o){r?.u?.add(this),o!==this.rawVal&&(this.rawVal=o,this.o.length+this.l.length?(t?.add(this),e=d(e,this,v)):this.h=o)}},S=e=>({__proto__:c,rawVal:e,h:e,o:[],l:[]}),g=(e,t)=>{let r={i:new Set,u:new Set},l={f:e},n=o;o=[];let s=u(e,r,t);s=(s??document).nodeType?s:new Text(s);for(let e of r.i)r.u.has(e)||(_(e),e.o.push(l));for(let e of o)e.t=s;return o=n,l.t=s},y=(e,t=S(),r)=>{let l={i:new Set,u:new 
Set},n={f:e,s:t};n.t=r??o?.push(n)??f,t.val=u(e,l,t.rawVal);for(let e of l.i)l.u.has(e)||(_(e),e.l.push(n));return t},b=(e,...t)=>{for(let r of t.flat(1/0)){let t=s(r??0),o=t===c?g(()=>r.val):t===a?g(r):r;o!=n&&e.append(o)}return e},m=(e,t,...r)=>{let[o,...l]=s(r[0]??0)===h?r:[{},...r],f=e?document.createElementNS(e,t):document.createElement(t);for(let[e,r]of Object.entries(o)){let o=t=>t?Object.getOwnPropertyDescriptor(t,e)??o(s(t)):n,l=t+","+e,h=i[l]??=o(s(f))?.set??0,d=e.startsWith("on")?(t,r)=>{let o=e.slice(2);f.removeEventListener(o,r),f.addEventListener(o,t)}:h?h.bind(f):f.setAttribute.bind(f,e),u=s(r??0);e.startsWith("on")||u===a&&(r=y(r),u=c),u===c?g(()=>(d(r.val,r.h),f)):d(r)}return b(f,l)},x=e=>({get:(t,r)=>m.bind(n,e,r)}),j=(e,t)=>t?t!==e&&e.replaceWith(t):e.remove(),v=()=>{let r=0,o=[...e].filter(e=>e.rawVal!==e.h);do{t=new Set;for(let e of new Set(o.flatMap(e=>e.l=w(e.l))))y(e.f,e.s,e.t),e.t=n}while(++r<100&&(o=[...t]).length);let l=[...e].filter(e=>e.rawVal!==e.h);e=n;for(let e of new Set(l.flatMap(e=>e.o=w(e.o))))j(e.t,g(e.f,e.t)),e.t=n;for(let e of l)e.h=e.rawVal};export default{tags:new Proxy(e=>new Proxy(m,x(e)),x()),hydrate:(e,t)=>j(e,g(t,e)),add:b,state:S,derive:y}; \ No newline at end of file diff --git a/testgen/ui/components/widgets/__init__.py b/testgen/ui/components/widgets/__init__.py index eba62b7..2dc7762 100644 --- a/testgen/ui/components/widgets/__init__.py +++ b/testgen/ui/components/widgets/__init__.py @@ -1,8 +1,10 @@ # ruff: noqa: F401 +from testgen.ui.components.utils.component import component from testgen.ui.components.widgets.breadcrumbs import breadcrumbs from testgen.ui.components.widgets.button import button from testgen.ui.components.widgets.card import card +from testgen.ui.components.widgets.empty_state import EmptyStateMessage, empty_state from testgen.ui.components.widgets.expander_toggle import expander_toggle from testgen.ui.components.widgets.link import link from testgen.ui.components.widgets.page import ( @@ -13,11 +15,14 @@ flex_row_start, no_flex_gap, page_header, + page_links, text, - toolbar_select, whitespace, ) from testgen.ui.components.widgets.paginator import paginator +from testgen.ui.components.widgets.select import select from testgen.ui.components.widgets.sidebar import sidebar from testgen.ui.components.widgets.sorting_selector import sorting_selector from testgen.ui.components.widgets.summary_bar import summary_bar +from testgen.ui.components.widgets.testgen_component import testgen_component +from testgen.ui.components.widgets.wizard import WizardStep, wizard diff --git a/testgen/ui/components/widgets/breadcrumbs.py b/testgen/ui/components/widgets/breadcrumbs.py index bb258d1..ecfc88a 100644 --- a/testgen/ui/components/widgets/breadcrumbs.py +++ b/testgen/ui/components/widgets/breadcrumbs.py @@ -23,7 +23,7 @@ def breadcrumbs( props={"breadcrumbs": breadcrumbs}, ) if data: - Router().navigate(to=data["path"], with_args=data["params"]) + Router().navigate(to=data["href"], with_args=data["params"]) class Breadcrumb(typing.TypedDict): path: str | None diff --git a/testgen/ui/components/widgets/button.py b/testgen/ui/components/widgets/button.py index a78bc0d..9b30cdb 100644 --- a/testgen/ui/components/widgets/button.py +++ b/testgen/ui/components/widgets/button.py @@ -3,16 +3,20 @@ from testgen.ui.components.utils.component import component ButtonType = typing.Literal["basic", "flat", "icon", "stroked"] +ButtonColor = typing.Literal["basic", "primary"] TooltipPosition = typing.Literal["left", "right"] def button( type_: 
ButtonType = "basic", + color: ButtonColor | None = None, label: str | None = None, icon: str | None = None, tooltip: str | None = None, tooltip_position: TooltipPosition = "left", on_click: typing.Callable[..., None] | None = None, + disabled: bool = False, + width: str | int | float | None = None, style: str | None = None, key: str | None = None, ) -> typing.Any: @@ -24,8 +28,11 @@ def button( :param icon: icon name of material rounded icon fonts :param on_click: click handler for this button """ + color_ = color or "primary" + if not color and type_ == "icon": + color_ = "basic" - props = {"type": type_} + props = {"type": type_, "disabled": disabled, "color": color_} if type_ != "icon": if not label: raise ValueError(f"A label is required for {type_} buttons") @@ -37,6 +44,11 @@ def button( if tooltip: props.update({"tooltip": tooltip, "tooltipPosition": tooltip_position}) + if width: + props.update({"width": width}) + if isinstance(width, int | float): + props.update({"width": f"{width}px"}) + if style: props.update({"style": style}) diff --git a/testgen/ui/components/widgets/download_dialog.py b/testgen/ui/components/widgets/download_dialog.py new file mode 100644 index 0000000..a908043 --- /dev/null +++ b/testgen/ui/components/widgets/download_dialog.py @@ -0,0 +1,75 @@ +import tempfile +from collections.abc import Callable, Iterable +from zipfile import ZipFile + +import streamlit as st + +PROGRESS_UPDATE_TYPE = Callable[[float], None] + +FILE_DATA_TYPE = tuple[str, str, str|bytes] + +def zip_multi_file_data( + zip_file_name: str, + file_data_func: Callable[[PROGRESS_UPDATE_TYPE, ...], FILE_DATA_TYPE], + args_list: list[Iterable], +) -> Callable[[PROGRESS_UPDATE_TYPE, ...], FILE_DATA_TYPE]: + + def _file_content_func(update_main_progress, *args): + + progress = 0.0 + step = 1.0 / len(args_list) + + def _update_progress(f_progress): + update_main_progress(progress + step * f_progress) + + with tempfile.NamedTemporaryFile() as zip_file: + with ZipFile(zip_file.name, "w") as zip_writer: + for args in args_list: + file_name, _, file_data = file_data_func(_update_progress, *args) + zip_writer.writestr(file_name, file_data) + progress += step + zip_content = zip_file.read() + + return zip_file_name, "application/zip", zip_content + + return _file_content_func + + +def download_dialog( + dialog_title: str, + file_content_func: Callable[[PROGRESS_UPDATE_TYPE, ...], FILE_DATA_TYPE], + args: Iterable = (), + progress_bar_msg: str = "Generating file...", +): + """Wrapping a dialog and a download button together to allow generating the file contents only when needed.""" + + def _dialog_content(): + + with st.container(height=70, border=False): + p_bar = st.progress(0.0, progress_bar_msg) + + with st.container(height=55, border=False): + _, button_col = st.columns([.8, .2]) + + def _update_progress(progress: float): + p_bar.progress(progress, progress_bar_msg) + + file_name, file_type, file_content = file_content_func(_update_progress, *args) + + p_bar.progress(1.0, "File ready for download.") + + @st.fragment + def render_button(): + if st.download_button( + label=":material/download: Download", + data=file_content, + file_name=file_name, + mime=file_type, + use_container_width=True, + ): + st.rerun() + + with button_col: + render_button() + + return st.dialog(title=dialog_title, width="small")(_dialog_content)() diff --git a/testgen/ui/components/widgets/empty_state.py b/testgen/ui/components/widgets/empty_state.py new file mode 100644 index 0000000..505d560 --- /dev/null +++ 
b/testgen/ui/components/widgets/empty_state.py @@ -0,0 +1,75 @@ +import typing +from enum import Enum + +import streamlit as st + +from testgen.ui.components.widgets.button import button +from testgen.ui.components.widgets.link import link +from testgen.ui.components.widgets.page import css_class, whitespace + + +class EmptyStateMessage(Enum): + Connection = ( + "Begin by connecting your database.", + "TestGen delivers data quality through data profiling, hygiene review, test generation, and test execution.", + ) + TableGroup = ( + "Profile your tables to detect hygiene issues", + "Create table groups for your connected databases to run data profiling and hygiene review.", + ) + Profiling = ( + "Profile your tables to detect hygiene issues", + "Run data profiling on your table groups to understand data types, column contents, and data patterns.", + ) + TestSuite = ( + "Run data validation tests", + "Automatically generate tests from data profiling results or write custom tests for your business rules.", + ) + TestExecution = ( + "Run data validation tests", + "Execute tests to assess data quality of your tables." + ) + + +def empty_state( + label: str, + icon: str, + message: EmptyStateMessage, + action_label: str, + link_href: str | None = None, + link_params: dict | None = None, + button_onclick: typing.Callable[..., None] | None = None, + button_icon: str = "add", +) -> None: + with st.container(border=True): + css_class("bg-white") + whitespace(5) + st.html(f""" +
+            {label}
+            {icon}
+            {message.value[0]}
+            {message.value[1]}
+ """) + _, center_column, _ = st.columns([.4, .3, .4]) + with center_column: + if link_href: + link( + label=action_label, + href=link_href, + params=link_params or {}, + right_icon="chevron_right", + underline=False, + height=40, + style="margin: auto; border-radius: 4px; border: var(--button-stroked-border); padding: 8px 8px 8px 16px; color: var(--primary-color)", + ) + elif button_onclick: + button( + type_="flat", + color="primary", + label=action_label, + icon=button_icon, + on_click=button_onclick, + style="margin: auto; width: auto;", + ) + whitespace(5) diff --git a/testgen/ui/components/widgets/link.py b/testgen/ui/components/widgets/link.py index 14511a8..4e2bf28 100644 --- a/testgen/ui/components/widgets/link.py +++ b/testgen/ui/components/widgets/link.py @@ -7,12 +7,14 @@ def link( label: str, *, params: dict = {}, # noqa: B006 + open_new: bool = False, underline: bool = True, left_icon: str | None = None, left_icon_size: float = 20.0, right_icon: str | None = None, right_icon_size: float = 20.0, height: float | None = 21.0, + width: float | None = None, style: str | None = None, key: str = "testgen:link", ) -> None: @@ -21,6 +23,7 @@ def link( "params": params, "label": label, "height": height, + "open_new": open_new, "underline": underline, } if left_icon: @@ -32,6 +35,9 @@ def link( if style: props.update({"style": style}) + if width: + props.update({"width": width}) + clicked = component(id_="link", key=key, props=props) if clicked: Router().navigate(to=href, with_args=params) diff --git a/testgen/ui/components/widgets/page.py b/testgen/ui/components/widgets/page.py index cb3b495..e387b28 100644 --- a/testgen/ui/components/widgets/page.py +++ b/testgen/ui/components/widgets/page.py @@ -1,82 +1,46 @@ -import pandas as pd import streamlit as st from streamlit.delta_generator import DeltaGenerator -from streamlit_extras.no_default_selectbox import selectbox from testgen.ui.components.widgets.breadcrumbs import Breadcrumb from testgen.ui.components.widgets.breadcrumbs import breadcrumbs as tg_breadcrumbs -from testgen.ui.navigation.router import Router +BASE_HELP_URL = "https://docs.datakitchen.io/articles/#!dataops-testgen-help/" +DEFAULT_HELP_TOPIC = "dataops-testgen-help" +SLACK_URL = "https://data-observability-slack.datakitchen.io/join" +TRAINING_URL = "https://info.datakitchen.io/data-quality-training-and-certifications" def page_header( title: str, - help_link:str | None = None, + help_topic: str | None = None, breadcrumbs: list["Breadcrumb"] | None = None, ): - hcol1, hcol2 = st.columns([0.95, 0.05]) - hcol1.subheader(title, anchor=False) - if help_link: - with hcol2: - whitespace(0.8) - st.page_link(help_link, label=" ", icon=":material/help:") - - if breadcrumbs: - tg_breadcrumbs(breadcrumbs=breadcrumbs) - - st.write( - '
', - unsafe_allow_html=True, - ) - if "last_page" in st.session_state: - if title != st.session_state["last_page"]: - st.cache_data.clear() - st.session_state["last_page"] = title - - -def toolbar_select( - options: pd.DataFrame | list[str], - value_column: str | None = None, - display_column: str | None = None, - default_value = None, - required: bool = False, - bind_to_query: str | None = None, - **kwargs, -): - kwargs = {**kwargs} - - if isinstance(options, pd.DataFrame): - value_column = value_column or options.columns[0] - display_column = display_column or value_column - kwargs["options"] = options[display_column] - if default_value in options[value_column].values: - kwargs["index"] = int(options[options[value_column] == default_value].index[0]) + (0 if required else 1) - else: - kwargs["options"] = options - if default_value in options: - kwargs["index"] = options.index(default_value) + (0 if required else 1) + with st.container(): + no_flex_gap() + title_column, links_column = st.columns([0.95, 0.05], vertical_alignment="bottom") - if bind_to_query: - kwargs["key"] = kwargs.get("key", f"toolbar_select_{bind_to_query}") - if default_value is not None and kwargs.get("index") is None: - Router().set_query_params({ bind_to_query: None }) # Unset the query params if the current value is not valid + with title_column: + no_flex_gap() + st.html(f'

{title}
') + if breadcrumbs: + tg_breadcrumbs(breadcrumbs=breadcrumbs) - def update_query_params(): - query_value = st.session_state[kwargs["key"]] - if not required and query_value == "---": - query_value = None - elif isinstance(options, pd.DataFrame): - query_value = options.loc[options[display_column] == query_value, value_column].iloc[0] - Router().set_query_params({ bind_to_query: query_value }) + with links_column: + page_links(help_topic) - kwargs["on_change"] = update_query_params + st.html('
') - selected = st.selectbox(**kwargs) if required else selectbox(**kwargs) + if "last_page" in st.session_state: + if title != st.session_state["last_page"]: + st.cache_data.clear() + st.session_state["last_page"] = title - if selected and isinstance(options, pd.DataFrame): - return options.loc[options[display_column] == selected, value_column].iloc[0] - return selected +def page_links(help_topic: str | None = None): + css_class("tg-header--links") + flex_row_end() + st.link_button(":material/question_mark:", f"{BASE_HELP_URL}{help_topic or DEFAULT_HELP_TOPIC}", help="Help Center") + st.link_button(":material/group:", SLACK_URL, help="Slack Community") + st.link_button(":material/school:", TRAINING_URL, help="Training Portal") def whitespace(size: float, container: DeltaGenerator | None = None): diff --git a/testgen/ui/components/widgets/paginator.py b/testgen/ui/components/widgets/paginator.py index 8c1e4c7..c98a335 100644 --- a/testgen/ui/components/widgets/paginator.py +++ b/testgen/ui/components/widgets/paginator.py @@ -17,9 +17,10 @@ def paginator( :param key: unique key to give the component a persisting state """ - return component( + event_data = component( id_="paginator", key=key, - default=page_index, + default={ page_index: page_index }, props={"count": count, "pageSize": page_size, "pageIndex": page_index}, ) + return event_data.get("page_index", 0) diff --git a/testgen/ui/components/widgets/select.py b/testgen/ui/components/widgets/select.py new file mode 100644 index 0000000..31fa748 --- /dev/null +++ b/testgen/ui/components/widgets/select.py @@ -0,0 +1,56 @@ +import pandas as pd +import streamlit as st +from streamlit_extras.no_default_selectbox import selectbox + +from testgen.ui.navigation.router import Router + +EMPTY_VALUE = "---" + +def select( + label: str, + options: pd.DataFrame | list[str], + value_column: str | None = None, + display_column: str | None = None, + default_value = None, + required: bool = False, + bind_to_query: str | None = None, + bind_empty_value: bool = False, + **kwargs, +): + kwargs = {**kwargs} + kwargs["label"] = label + + if isinstance(options, pd.DataFrame): + value_column = value_column or options.columns[0] + display_column = display_column or value_column + kwargs["options"] = options[display_column] + if default_value in options[value_column].values: + kwargs["index"] = int(options[options[value_column] == default_value].index[0]) + (0 if required else 1) + else: + kwargs["options"] = options + if default_value in options: + kwargs["index"] = options.index(default_value) + (0 if required else 1) + elif default_value == EMPTY_VALUE and not required: + kwargs["index"] = 0 + + if bind_to_query: + kwargs["key"] = kwargs.get("key", f"testgen_select_{bind_to_query}") + if default_value is not None and kwargs.get("index") is None: + Router().set_query_params({ bind_to_query: None }) # Unset the query params if the current value is not valid + + def update_query_params(): + query_value = st.session_state[kwargs["key"]] + if not required and query_value == EMPTY_VALUE and not bind_empty_value: + query_value = None + elif isinstance(options, pd.DataFrame): + query_value = options.loc[options[display_column] == query_value, value_column].iloc[0] + Router().set_query_params({ bind_to_query: query_value }) + + kwargs["on_change"] = update_query_params + + selected = st.selectbox(**kwargs) if required else selectbox(**kwargs) + + if selected and isinstance(options, pd.DataFrame): + return options.loc[options[display_column] == selected, 
value_column].iloc[0] + + return selected diff --git a/testgen/ui/components/widgets/summary_bar.py b/testgen/ui/components/widgets/summary_bar.py index c4b636d..bf913c6 100644 --- a/testgen/ui/components/widgets/summary_bar.py +++ b/testgen/ui/components/widgets/summary_bar.py @@ -44,7 +44,7 @@ def summary_bar( if total: item_spans = "".join([ f'' for item in items ]) - caption = ", ".join([ f"{item['label']}: {item['value']}" for item in items ]) + caption = "".join([ f'
{item["label"]}: {item["value"]}
' for item in items ]) caption_div = f"""
{caption} diff --git a/testgen/ui/components/widgets/testgen_component.py b/testgen/ui/components/widgets/testgen_component.py new file mode 100644 index 0000000..89b8ef0 --- /dev/null +++ b/testgen/ui/components/widgets/testgen_component.py @@ -0,0 +1,59 @@ +import typing + +import streamlit as st + +from testgen.ui.components.utils.component import component +from testgen.ui.navigation.router import Router +from testgen.ui.session import session + + +def testgen_component( + component_id: typing.Literal["profiling_runs", "test_runs", "database_flavor_selector", "data_hierarchy"], + props: dict, + on_change_handlers: dict[str, typing.Callable] | None = None, + event_handlers: dict[str, typing.Callable] | None = None, +) -> dict | None: + """ + Testgen component to display a VanJS page. + + # Parameters + :param component_id: name of page + :param props: properties expected by the page + :param on_change_handlers: event handlers to be called during on_change callback (recommended, but does not support calling st.rerun()) + :param event_handlers: event handlers to be called on next run (supports calling st.rerun()) + + For both on_change_handlers and event_handlers, the "payload" data from the event is passed as the only argument to the callback function + """ + + key = f"testgen:{component_id}" + + def on_change(): + event_data = st.session_state[key] + if event_data and (event := event_data.get("event")): + if on_change_handlers and (handler := on_change_handlers.get(event)): + # Prevent handling the same event multiple times + event_id = f"{component_id}:{event_data.get('_id', '')}" + if event_id != session.testgen_event_id: + session.testgen_event_id = event_id + handler(event_data.get("payload")) + + event_data = component( + id_=component_id, + key=key, + props=props, + on_change=on_change if on_change_handlers else None, + ) + if event_data and (event := event_data.get("event")): + if event == "LinkClicked": + Router().navigate(to=event_data["href"], with_args=event_data.get("params")) + + elif event_handlers and (handler := event_handlers.get(event)): + # Prevent handling the same event multiple times + event_id = f"{component_id}:{event_data.get('_id', '')}" + if event_id != session.testgen_event_id: + session.testgen_event_id = event_id + # These events are not handled through the component's on_change callback + # because they may call st.rerun(), causing the "Calling st.rerun() within a callback is a noop" error + handler(event_data.get("payload")) + + return event_data diff --git a/testgen/ui/components/widgets/wizard.py b/testgen/ui/components/widgets/wizard.py new file mode 100644 index 0000000..1b87da1 --- /dev/null +++ b/testgen/ui/components/widgets/wizard.py @@ -0,0 +1,213 @@ +import dataclasses +import inspect +import logging +import typing + +import streamlit as st +from streamlit.delta_generator import DeltaGenerator + +from testgen.ui.components import widgets as testgen +from testgen.ui.navigation.router import Router +from testgen.ui.session import temp_value + +ResultsType = typing.TypeVar("ResultsType", bound=typing.Any | None) +StepResults = tuple[typing.Any, bool] +logger = logging.getLogger("testgen") + + +def wizard( + *, + key: str, + steps: list[typing.Callable[..., StepResults] | "WizardStep"], + on_complete: typing.Callable[..., bool], + complete_label: str = "Complete", + navigate_to: str | None = None, + navigate_to_args: dict | None = None, +) -> None: + """ + Creates a Wizard with the provided steps and handles the session for + each step internally. 
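A minimal usage sketch (the step function, key, and labels here are illustrative placeholders, not code taken from this repository):

```
def pick_name() -> StepResults:
    name = st.text_input("Name")
    return name, bool(name)  # (results, is_valid)

def on_complete(container: DeltaGenerator, step_0: WizardStep = None) -> bool:
    container.success(f"Saved: {step_0.results}")
    return True  # True triggers a rerun after the wizard state is reset

wizard(key="example:wizard", steps=[pick_name], on_complete=on_complete)
```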
+ + For each step callable instances of WizardStep for the current step + and previous steps are optionally provided as keyword arguments with + specific names. + + Optional arguments that can be accessed as follows: + + ``` + def step_fn(current_step: WizardStep = ..., step_0: WizardStep = ...) + ... + ``` + + For the `on_complete` callable, on top of passing each WizardStep, a + Streamlit DeltaGenerator is also passed to allow rendering content + inside the step's body. + + ``` + def on_complete(container: DeltaGenerator, step_0: WizardStep = ..., step_1: WizardStep = ...): + ... + ``` + + After the `on_complete` callback returns, the wizard state is reset. + + :param key: used to cache current step and results of each step + :param steps: a list of WizardStep instances or callable objects + :param on_complete: callable object to execute after the last step. + should return true to trigger a Streamlit rerun + :param complete_label: customize the label for the complete button + + :return: None + """ + + if navigate_to: + Router().navigate(navigate_to, navigate_to_args or {}) + + current_step_idx = 0 + wizard_state = st.session_state.get(key) + if isinstance(wizard_state, int): + current_step_idx = wizard_state + + instance = Wizard( + key=key, + steps=[ + WizardStep( + key=f"{key}:{idx}", + body=step, + results=st.session_state.get(f"{key}:{idx}", None), + ) if not isinstance(step, WizardStep) else dataclasses.replace( + step, + key=f"{key}:{idx}", + results=st.session_state.get(f"{key}:{idx}", None), + ) + for idx, step in enumerate(steps) + ], + current_step=current_step_idx, + on_complete=on_complete, + ) + + current_step = instance.current_step + current_step_index = instance.current_step_index + testgen.caption( + f"Step {current_step_index + 1} of {len(steps)}{': ' + current_step.title if current_step.title else ''}" + ) + + step_body_container = st.empty() + with step_body_container.container(): + was_complete_button_clicked, set_complete_button_clicked = temp_value(f"{key}:complete-button") + + if was_complete_button_clicked(): + instance.complete(step_body_container) + else: + instance.render() + button_left_column, _, button_right_column = st.columns([0.30, 0.40, 0.30]) + with button_left_column: + if not instance.is_first_step(): + testgen.button( + type_="stroked", + color="basic", + label="Previous", + on_click=lambda: instance.previous(), + key=f"{key}:button-prev", + ) + + with button_right_column: + next_button_label = complete_label if instance.is_last_step() else "Next" + + testgen.button( + type_="stroked" if not instance.is_last_step() else "flat", + label=next_button_label, + on_click=lambda: set_complete_button_clicked(instance.next() or instance.is_last_step()), + key=f"{key}:button-next", + disabled=not current_step.is_valid, + ) + + +class Wizard: + def __init__( + self, + *, + key: str, + steps: list["WizardStep"], + on_complete: typing.Callable[..., bool] | None = None, + current_step: int = 0, + ) -> None: + self._key = key + self._steps = steps + self._current_step = current_step + self._on_complete = on_complete + + @property + def current_step(self) -> "WizardStep": + return self._steps[self._current_step] + + @property + def current_step_index(self) -> int: + return self._current_step + + def next(self) -> None: + next_step = self._current_step + 1 + if not self.is_last_step(): + st.session_state[self._key] = next_step + return + + def previous(self) -> None: + previous_step = self._current_step - 1 + if previous_step > -1: + st.session_state[self._key] = 
previous_step + + def is_first_step(self) -> bool: + return self._current_step == 0 + + def is_last_step(self) -> bool: + return self._current_step == len(self._steps) - 1 + + def complete(self, container: DeltaGenerator) -> None: + if self._on_complete: + signature = inspect.signature(self._on_complete) + accepted_params = [param.name for param in signature.parameters.values()] + kwargs: dict = { + key: step for idx, step in enumerate(self._steps) + if (key := f"step_{idx}") and key in accepted_params + } + if "container" in accepted_params: + kwargs["container"] = container + + do_rerun = self._on_complete(**kwargs) + self._reset() + if do_rerun: + st.rerun() + + def _reset(self) -> None: + del st.session_state[self._key] + for step_idx in range(len(self._steps)): + del st.session_state[f"{self._key}:{step_idx}"] + + def render(self) -> None: + step = self._steps[self._current_step] + + extra_args = {"current_step": step} + extra_args.update({f"step_{idx}": step for idx, step in enumerate(self._steps)}) + + signature = inspect.signature(step.body) + step_accepted_params = [param.name for param in signature.parameters.values() if param.name in extra_args] + extra_args = {key: value for key, value in extra_args.items() if key in step_accepted_params} + + try: + results, is_valid = step.body(**extra_args) + except TypeError as error: + logger.exception("Error on wizard step %s", self._current_step, exc_info=True, stack_info=True) + results, is_valid = None, True + + step.results = results + step.is_valid = is_valid + + st.session_state[f"{self._key}:{self._current_step}"] = step.results + + +@dataclasses.dataclass(kw_only=True, slots=True) +class WizardStep(typing.Generic[ResultsType]): + body: typing.Callable[..., StepResults] + results: ResultsType = dataclasses.field(default=None) + title: str = dataclasses.field(default="") + key: str | None = dataclasses.field(default=None) + is_valid: bool = dataclasses.field(default=True) diff --git a/testgen/ui/forms.py b/testgen/ui/forms.py new file mode 100644 index 0000000..ff3e679 --- /dev/null +++ b/testgen/ui/forms.py @@ -0,0 +1,117 @@ +import typing + +import streamlit as st +from pydantic import BaseModel, Field # noqa: F401 +from pydantic.json_schema import DEFAULT_REF_TEMPLATE, GenerateJsonSchema, JsonSchemaMode +from streamlit.delta_generator import DeltaGenerator +from streamlit_pydantic.ui_renderer import InputUI + + +class BaseForm(BaseModel): + def __init__(self, /, **data: typing.Any) -> None: + super().__init__(**data) + + @classmethod + def empty(cls) -> "BaseForm": + non_validated_instance = cls.model_construct() + non_validated_instance.model_post_init(None) + + return non_validated_instance + + @property + def _disabled_fields(self) -> set[str]: + if not getattr(self, "_disabled_fields_set", None): + self._disabled_fields_set = set() + return self._disabled_fields_set + + def disable(self, field: str) -> None: + self._disabled_fields.add(field) + + def enable(self, field) -> None: + self._disabled_fields.remove(field) + + @classmethod + def model_json_schema( + self_or_cls, # type: ignore + by_alias: bool = True, + ref_template: str = DEFAULT_REF_TEMPLATE, + schema_generator: type[GenerateJsonSchema] = GenerateJsonSchema, + mode: JsonSchemaMode = "validation", + ) -> dict[str, typing.Any]: + schema = super().model_json_schema( + by_alias=by_alias, + ref_template=ref_template, + schema_generator=schema_generator, + mode=mode, + ) + + schema_properties: dict[str, dict] = schema.get("properties", {}) + disabled_fields: set[str] = 
getattr(self_or_cls, "_disabled_fields_set", set()) + for property_name, property_schema in schema_properties.items(): + if property_name in disabled_fields and not property_schema.get("readOnly"): + property_schema["readOnly"] = True + + return schema + + @classmethod + def get_field_label(cls, field_name: str) -> str: + schema = cls.model_json_schema() + schema_properties = schema.get("properties", {}) + field_schema = schema_properties[field_name] + return field_schema.get("st_kwargs_label") or field_schema.get("title") + + +class ManualRender: + @property + def input_ui(self): + if not getattr(self, "_input_ui", None): + self._input_ui = InputUI( + self.form_key(), + self, # type: ignore + group_optional_fields="no", # type: ignore + lowercase_labels=False, + ignore_empty_values=False, + return_model=False, + ) + return self._input_ui + + def form_key(self): + raise NotImplementedError + + def render_input_ui(self, container: DeltaGenerator, session_state: dict) -> "BaseForm": + raise NotImplementedError + + def render_field(self, field_name: str, container: DeltaGenerator | None = None) -> typing.Any: + streamlit_container = container or self.input_ui._streamlit_container + model_property = self.input_ui._schema_properties[field_name] + initial_value = getattr(self, field_name, None) or self.input_ui._get_value(field_name) + is_disabled = field_name in getattr(self, "_disabled_fields", set()) + + if is_disabled: + model_property["readOnly"] = True + + if model_property.get("type") != "boolean" and initial_value not in [None, ""]: + model_property["init_value"] = initial_value + + new_value = self.input_ui._render_property(streamlit_container, field_name, model_property) + self.update_field_value(field_name, new_value) + + return new_value + + def update_field_value(self, field_name: str, value: typing.Any) -> typing.Any: + self.input_ui._store_value(field_name, value) + setattr(self, field_name, value) + return value + + def get_field_value(self, field_name: str, latest: bool = False) -> typing.Any: + if latest: + return st.session_state.get(self.get_field_key(field_name)) + return self.input_ui._get_value(field_name) + + def reset_cache(self) -> None: + for field_name in typing.cast(type[BaseForm], type(self)).model_fields.keys(): + st.session_state.pop(self.get_field_key(field_name), None) + st.session_state.pop(self.form_key() + "-data", None) + + def get_field_key(self, field_name: str) -> typing.Any: + return str(self.input_ui._session_state.run_id) + "-" + str(self.input_ui._key) + "-" + field_name diff --git a/testgen/ui/navigation/page.py b/testgen/ui/navigation/page.py index b7a53cc..369e43f 100644 --- a/testgen/ui/navigation/page.py +++ b/testgen/ui/navigation/page.py @@ -40,7 +40,7 @@ def _navigate(self) -> None: session.current_page_args = session.current_page_args or {} self._validate_project_query_param() - + self.render(**session.current_page_args) def _validate_project_query_param(self) -> None: diff --git a/testgen/ui/navigation/router.py b/testgen/ui/navigation/router.py index d010ee9..d49df3d 100644 --- a/testgen/ui/navigation/router.py +++ b/testgen/ui/navigation/router.py @@ -10,6 +10,7 @@ from testgen.utils.singleton import Singleton LOG = logging.getLogger("testgen") +COOKIES_READY_RERUNS = 2 class Router(Singleton): @@ -32,12 +33,19 @@ def run(self, hide_sidebar=False) -> None: session.current_page_args = st.query_params # This hack is needed because the auth cookie is not retrieved on the first run - # We have to store the page and wait for the second run - + 
# We have to store the page and wait for the second or third run if not session.cookies_ready: - session.cookies_ready = True + session.cookies_ready = 1 session.page_pending_cookies = current_page - else: + # Set this anyway so that sidebar displays initial selection correctly + session.current_page = current_page.url_path + st.rerun() + + # Sometimes the cookie is ready on the second rerun and other times only on the third -_- + # so we have to make sure the page renders correctly in both cases + # and also handle the login page! + elif session.cookies_ready == COOKIES_READY_RERUNS or session.authentication_status or (session.page_pending_cookies and not session.page_pending_cookies.url_path): + session.cookies_ready = COOKIES_READY_RERUNS current_page = session.page_pending_cookies or current_page session.page_pending_cookies = None @@ -48,6 +56,9 @@ def run(self, hide_sidebar=False) -> None: session.current_page = current_page.url_path current_page.run() + else: + session.cookies_ready += 1 + time.sleep(0.3) def navigate(self, /, to: str, with_args: dict = {}) -> None: # noqa: B006 diff --git a/testgen/ui/pdf/__init__.py b/testgen/ui/pdf/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/testgen/ui/pdf/dataframe_table.py b/testgen/ui/pdf/dataframe_table.py new file mode 100644 index 0000000..ff2f8c2 --- /dev/null +++ b/testgen/ui/pdf/dataframe_table.py @@ -0,0 +1,295 @@ +from collections.abc import Iterable +from math import nan + +import pandas +from numpy import NaN +from pandas.core.dtypes.common import is_numeric_dtype +from reportlab.lib import colors, enums +from reportlab.lib.styles import ParagraphStyle +from reportlab.pdfbase.pdfmetrics import stringWidth +from reportlab.platypus import BalancedColumns, Flowable, Paragraph, Table, TableStyle + +from testgen.ui.pdf.style import COLOR_FADED_TEXT, COLOR_GRAY_BG, PARA_STYLE_CELL, TABLE_STYLE_DEFAULT + +PARA_STYLE_CELL_DATA = ParagraphStyle( + "table_cell_data", + PARA_STYLE_CELL, + leading=10, +) + +PARA_STYLE_CELL_NUMERIC = ParagraphStyle( + "table_cell_numeric", + PARA_STYLE_CELL_DATA, + alignment=enums.TA_RIGHT, + fontName="Courier", +) + +PARA_STYLE_CELL_NULL = ParagraphStyle( + "table_cell_null", + PARA_STYLE_CELL_NUMERIC, + alignment=enums.TA_CENTER, + textColor=COLOR_FADED_TEXT, + fontName="Courier-Oblique", +) + +PARA_STYLE_CELL_HEADER = ParagraphStyle( + "table_cell_header", + PARA_STYLE_CELL_DATA, + alignment=enums.TA_CENTER, + fontName="Helvetica", + splitLongWords=0, +) + +TABLE_STYLE_DATA = TableStyle( + ( + # All table + ("GRID", (0, 0), (-1, -1), 0.5, COLOR_GRAY_BG), + + # Header + *[ + (cmd[0], (0, 0), (-1, 0), *cmd[1:]) + for cmd in ( + ("INNERGRID", 1, colors.white), + ("BACKGROUND", COLOR_GRAY_BG), + ("VALIGN", "MIDDLE"), + ("LEFTPADDING", 4), + ("RIGHTPADDING", 4), + ("TOPPADDING", 6), + ("BOTTOMPADDING", 6), + ) + ], + ), + parent=TABLE_STYLE_DEFAULT, +) + + +class VerticalHeaderCell(Flowable): + """ + Wrap a Paragraph rotating it 90 degrees. + + Technically, it could rotate any element, but it was designed to rotate a Paragraph (which uses all the available + with by default, and grows vertically as needed) into a narrow space, such as a table column with a pre-determined + width, which is the case of our DataFrame table implementation. + + It leverages a starting value for the height as an attempt to avoid unnecessary line breaks, when there's room + available. 
It attempts to wrap the Paragraph using the header height as its width, but it checks if the Paragraph + height exceeds the column width, making more room and re-wrapping the Paragraph when necessary. + + It also centralizes the flowable, regardless of the cell style. + """ + + INITIAL_HEIGHT = 40 + HEIGHT_INCR_STEP = 5 + + def __init__(self, flowable): + self.flowable = flowable + self.available_width = 0 + self.flowable_width = 0 + super().__init__() + + def wrap(self, availWidth, _): + self.available_width = availWidth + + available_height = self.INITIAL_HEIGHT + while True: + flowable_height, self.flowable_width = self.flowable.wrap(available_height, self.available_width) + + if self.flowable_width > self.available_width: + available_height += self.HEIGHT_INCR_STEP + else: + break + + return self.available_width, flowable_height + + def drawOn(self, canvas, x, y, _sW=0): + canvas.saveState() + canvas.rotate(90) + # Besides translating x and y for the rotated canvas, we are horizontally centralizing the content by adding + half of the "unused" width to the y position (which affects what we use as "x" in the rotated canvas) + ret = self.flowable.drawOn( + canvas, + y, + -(x + self.available_width - (self.available_width - self.flowable_width) / 2), + _sW, + ) + canvas.restoreState() + return ret + + +class DataFrameTableBuilder: + """ + Build a Table based on the contents of a Pandas DataFrame. + + It wraps the content of each cell into a Paragraph, to ease line breaks when necessary. Both Tables and Paragraphs + adjust their widths automatically, but they don't play well together, so this class calculates each column width + based on the DataFrame content. It can discard columns when they don't fit in the page width, dropping the widest + until it fits. + + It also provides a utility method to wrap the table (and potentially any other content that should be rendered + within it) into a columned layout.
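A typical flow, sketched under the assumption that `df` is a pandas DataFrame and `document.width` is the usable page width of the ReportLab template:

```
builder = DataFrameTableBuilder(df, document.width)
table = builder.build_table(hAlign="LEFT")
flowables = builder.split_in_columns([table])
if builder.omitted_columns:
    # these columns were dropped because they did not fit on the page
    ...
```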
+ """ + + null_para = Paragraph("NULL", style=PARA_STYLE_CELL_NULL) + + def __init__(self, dataframe, available_width, col_padding=16, max_header_exp_factor=0.4): + self._dataframe = dataframe + self.available_width = available_width + self.col_padding = col_padding + self.max_header_exp_factor = max_header_exp_factor + self.omitted_columns = [] + self.col_len_data = pandas.DataFrame(columns=["width", "max_width"], index=iter(dataframe)) + self.table_data = None + + def build_table(self, **kwargs): + if "colWidths" in kwargs: + raise ValueError("Can not override the calculated column widths") + + self.table_data = self._prepare_data() + self._drop_columns_that_dont_fit() + self.col_len_data["width"] += self._calc_content_cols_expansion() + header = self._setup_header() + + kwargs["colWidths"] = self.col_len_data["width"].tolist() + kwargs.setdefault("style", TABLE_STYLE_DATA) + kwargs.setdefault("repeatRows", 1) + + table_data = ( + header, + *(data.tolist() for _, data in self.table_data.iterrows()), + ) + + return Table(table_data, **kwargs) + + def split_in_columns(self, flowables, min_rows=5, col_padding=10): + # We don't want the columns to be glued together, so we add a padding for calculation + table_width = self._get_current_width() + col_padding + + # Adding one `col_padding` to the available width to compensate for the fact that + # only n-1 col "paddings" will be rendered for a BC with n cols + layout_columns = int((self.available_width + col_padding) / table_width) + + # Limiting the number of columns so each column has at least `min_rows` rows + layout_columns = min(layout_columns, int(len(self.table_data) / min_rows)) + + if layout_columns > 1: + columns = BalancedColumns( + flowables, layout_columns, leftPadding=0, rightPadding=0, topPadding=0, bottomPadding=0 + ) + # Honoring the `flowables` input type, for consistency + return [columns] if isinstance(flowables, Iterable) else columns + else: + return flowables + + def _setup_header(self): + header_cells = pandas.Series( + [Paragraph(label, style=PARA_STYLE_CELL_HEADER) for label in self.table_data.columns], + index=self.table_data.columns, + ) + + min_max_widths = header_cells.map(self._calc_cell_width) + + min_widths = min_max_widths.map(lambda t: t[0]) + min_exp_appetite = self._calc_expansion_appetite(min_widths) + + # If the minimal expansion fits into the available width, the columns are expanded. + # Otherwise, the header is converted to vertical text + if min_exp_appetite.sum() <= self._get_expansible_width(): + self.col_len_data["width"] += min_exp_appetite + + # If the maximum expansion would grow the table width under the `max_header_exp_factor`, + # it's expanded to match + max_widths = min_max_widths.map(lambda t: t[1]) + max_exp_appetite = self._calc_expansion_appetite(max_widths) + if max_exp_appetite.sum() / self._get_current_width() <= self.max_header_exp_factor: + self.col_len_data["width"] += max_exp_appetite + else: + header_cells = header_cells.map(VerticalHeaderCell) + + return header_cells.tolist() + + def _get_expansible_width(self): + return self.available_width - self._get_current_width() + + def _get_current_width(self): + return self.col_len_data["width"].sum() + + def _calc_expansion_appetite(self, desired_widths): + """ + Given a series of "ideal" widths, return a series with how much each smaller column has to grow to match. 
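For example (made-up numbers): with current widths of 40 and 60 points and desired widths of 50 and 55, the result is 10 and 0, since a column that is already wide enough is never asked to shrink.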
+ """ + return (desired_widths - self.col_len_data["width"]).apply(max, args=(0,)) + + def _calc_content_cols_expansion(self): + """ + Calculate how much each column has to grow to fit all the text without wrapping. + + The growth is limited by the available width and applied proportionally. + """ + expansion_appetite = self._calc_expansion_appetite(self.col_len_data["max_width"]) + expansible_width = self._get_expansible_width() + expand_factor = max(1, expansion_appetite.sum() / expansible_width) if expansible_width else 0 + return expansion_appetite * expand_factor + + def _drop_columns_that_dont_fit(self): + while True: + if self._get_expansible_width() >= 0: + break + largest_col = self.col_len_data["width"].idxmax() + self.table_data = self.table_data.drop(columns=largest_col) + self.col_len_data = self.col_len_data.drop(index=largest_col) + self.omitted_columns.append(largest_col) + + def _calc_cell_width(self, cell): + """ + Calculate the minimum and maximum widths required by a given cell (Paragraph). + + The min width considers wrapping only at the spaces, while the max width considers no wrapping. + """ + font_name = cell.style.fontName + font_size = cell.style.fontSize + space_width = stringWidth(" ", font_name, font_size) + words_width = [stringWidth(word, font_name, font_size) for word in cell.text.split(" ")] + min_width = max(words_width) + self.col_padding + max_width = sum(words_width) + self.col_padding + space_width * (len(words_width) - 1) + return min_width, max_width + + def _calc_col_width(self, col): + col_width = col.map(self._calc_cell_width) + min_width = col_width.max()[0] + max_width = col_width.map(lambda t: t[1]).max() + return min_width, max_width + + def _convert_col_values(self, col): + """ + Convert all values of a given column into Paragraphs. + + It applies different styles depending on the data type, and skips converting values that are already Paragraphs. + """ + para_style = PARA_STYLE_CELL_NUMERIC if is_numeric_dtype(col.dtype) else PARA_STYLE_CELL + + def _convert_value(value): + if isinstance(value, Paragraph): + return value + elif value in (None, NaN, nan): + return self.null_para + else: + return Paragraph(str(value), para_style) + + return col.map(_convert_value) + + def _prepare_data(self): + """ + Create a new DataFrame with the converted values from the input DataFrame. + + It also calculates the initial column widths. + """ + table_data = pandas.DataFrame() + for col_idx in self._dataframe.columns: + col = self._dataframe[col_idx] + table_data[col_idx] = self._convert_col_values(col) + self.col_len_data.loc[col_idx] = self._calc_col_width(table_data[col_idx]) + + # Freeing up the reference to the original Dataframe, in case it's ready to be garbage collected + del self._dataframe + + return table_data diff --git a/testgen/ui/pdf/dk_logo.py b/testgen/ui/pdf/dk_logo.py new file mode 100644 index 0000000..89cfb98 --- /dev/null +++ b/testgen/ui/pdf/dk_logo.py @@ -0,0 +1,59 @@ +__all__ = ["get_logo"] + +from reportlab.graphics.shapes import Drawing, Path +from reportlab.lib.colors import Color + +# The following paths were gotten from the `dk_logo.svg` file. As a convenience, it's possible to manually run this file +# to update the paths, in case the logo changes. Installing the `svglib` package is required for that. 
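# A possible regeneration workflow (commands and paths are assumptions; adjust to your checkout):
#   pip install svglib
#   python testgen/ui/pdf/dk_logo.py
# The script prints a fresh `shapes = [...]` list to stdout that can replace the list below.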
+ +shapes = [ + Path(fillOpacity=1.0, strokeColor=None, strokeOpacity=1.0, points=[37.3, 107.6, 37.9, 107.6, 38.4, 107.6, 38.9, 107.5, 36.699999999999996, 107.5, 34.4, 107.6, 32.0, 107.6, 37.3, 107.6], operators=[0, 2, 2, 1, 3], _fillRule=1), + Path(fillOpacity=1.0, strokeColor=None, strokeOpacity=1.0, points=[37.3, 9.9, 32.0, 9.9, 34.4, 9.9, 36.7, 9.9, 38.9, 10.0, 38.3, 9.9, 37.8, 9.9, 37.3, 9.9], operators=[0, 1, 2, 2, 3], _fillRule=1), + Path(fillColor=Color(.666667,.815686,.27451,1), fillOpacity=1.0, strokeColor=None, strokeOpacity=1.0, points=[85.6, 58.3, 85.6, 56.599999999999994, 85.5, 55.0, 85.3, 53.4, 84.7, 45.6, 82.8, 38.599999999999994, 79.39999999999999, 32.599999999999994, 76.0, 26.5, 71.5, 21.7, 65.9, 18.0, 64.7, 17.2, 63.60000000000001, 16.4, 62.300000000000004, 15.8, 59.6, 14.4, 56.800000000000004, 13.3, 53.800000000000004, 12.4, 11.7, 58.7, 53.900000000000006, 105.1, 56.800000000000004, 104.19999999999999, 59.7, 103.1, 62.400000000000006, 101.69999999999999, 63.300000000000004, 101.19999999999999, 64.2, 100.6, 65.10000000000001, 99.99999999999999, 71.10000000000001, 96.19999999999999, 76.00000000000001, 91.19999999999999, 79.50000000000001, 84.79999999999998, 83.10000000000001, 78.39999999999998, 85.00000000000001, 70.89999999999998, 85.40000000000002, 62.399999999999984, 85.50000000000001, 61.499999999999986, 85.50000000000001, 60.59999999999999, 85.50000000000001, 59.69999999999998, 85.50000000000001, 59.29999999999998, 85.60000000000001, 58.99999999999998, 85.60000000000001, 58.59999999999998, 85.60000000000001, 58.49999999999998, 85.60000000000001, 58.49999999999998, 85.60000000000001, 58.39999999999998, 85.5, 58.5, 85.6, 58.4, 85.6, 58.3], operators=[0, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3], _fillRule=1), + Path(fillColor=Color(.023529,.627451,.290196,1), fillOpacity=1.0, strokeColor=None, strokeOpacity=1.0, points=[53.8, 12.3, 49.199999999999996, 10.9, 44.199999999999996, 10.100000000000001, 38.9, 9.9, 36.699999999999996, 9.9, 34.4, 9.8, 32.0, 9.8, 16.5, 9.8, 13.8, 9.8, 11.7, 12.0, 11.7, 14.600000000000001, 11.7, 44.400000000000006, 11.7, 58.7, 53.8, 12.3, 53.8, 12.3, 53.8, 12.3, 53.8, 12.3], operators=[0, 2, 2, 1, 2, 1, 1, 1, 2, 3], _fillRule=1), + Path(fillColor=Color(.023529,.627451,.290196,1), fillOpacity=1.0, strokeColor=None, strokeOpacity=1.0, points=[11.7, 73.0, 11.7, 102.8, 11.7, 105.5, 13.899999999999999, 107.6, 16.5, 107.6, 32.0, 107.6, 34.4, 107.6, 36.7, 107.6, 38.9, 107.5, 44.199999999999996, 107.4, 49.2, 106.5, 53.8, 105.1, 53.8, 105.1, 53.8, 105.1, 53.8, 105.1, 11.7, 58.7, 11.7, 73.0], operators=[0, 1, 2, 1, 2, 2, 2, 1, 1, 3], _fillRule=1), + Path(fillColor=Color(.666667,.815686,.27451,1), fillOpacity=1.0, strokeColor=None, strokeOpacity=1.0, points=[94.5, 9.9, 69.3, 9.9, 69.6, 10.1, 69.8, 10.3, 70.1, 10.6, 76.5, 15.0, 81.8, 20.6, 85.69999999999999, 27.700000000000003, 89.69999999999999, 34.900000000000006, 91.99999999999999, 43.2, 92.69999999999999, 52.5, 92.89999999999999, 54.4, 92.99999999999999, 56.4, 92.99999999999999, 58.4, 92.99999999999999, 58.5, 92.99999999999999, 58.6, 92.99999999999999, 58.699999999999996, 92.99999999999999, 58.8, 92.99999999999999, 58.8, 92.99999999999999, 58.9, 92.99999999999999, 59.3, 92.89999999999999, 59.699999999999996, 92.89999999999999, 60.199999999999996, 92.89999999999999, 61.3, 92.8, 62.3, 92.69999999999999, 63.4, 92.19999999999999, 73.5, 89.89999999999999, 82.4, 85.6, 90.1, 81.5, 97.5, 75.89999999999999, 103.3, 69.1, 107.69999999999999, 69.1, 107.69999999999999, 69.1, 107.79999999999998, 69.1, 
107.79999999999998, 94.5, 107.79999999999998, 97.2, 107.79999999999998, 99.3, 105.59999999999998, 99.3, 102.99999999999999, 99.3, 14.999999999999986, 99.4, 12.1, 97.2, 9.9, 94.5, 9.9], operators=[0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 3], _fillRule=1), + Path(fillColor=Color(.023529,.627451,.290196,1), fillOpacity=1.0, strokeColor=None, strokeOpacity=1.0, points=[132.0, 28.0, 156.1, 28.0, 173.7, 28.0, 183.9, 41.4, 183.9, 58.9, 183.9, 76.3, 173.6, 89.5, 156.1, 89.5, 132.0, 89.5, 132.0, 28.0, 156.1, 79.7, 167.29999999999998, 79.7, 173.0, 70.2, 173.0, 58.800000000000004, 173.0, 47.300000000000004, 167.3, 37.7, 156.1, 37.7, 142.9, 37.7, 142.9, 79.7, 156.1, 79.7], operators=[0, 1, 2, 2, 1, 1, 3, 0, 2, 2, 1, 1, 1, 3], _fillRule=1), + Path(fillColor=Color(.023529,.627451,.290196,1), fillOpacity=1.0, strokeColor=None, strokeOpacity=1.0, points=[231.9, 47.8, 231.9, 89.5, 221.5, 89.5, 221.5, 83.9, 218.6, 88.4, 212.8, 90.4, 207.7, 90.4, 196.7, 90.4, 187.2, 81.9, 187.2, 68.60000000000001, 187.2, 55.20000000000001, 196.7, 46.900000000000006, 207.6, 46.900000000000006, 212.9, 46.900000000000006, 218.7, 49.00000000000001, 221.5, 53.300000000000004, 221.5, 47.800000000000004, 231.9, 47.800000000000004, 221.4, 68.5, 221.4, 61.2, 215.3, 56.5, 209.4, 56.5, 203.0, 56.5, 197.70000000000002, 61.5, 197.70000000000002, 68.5, 197.70000000000002, 75.5, 203.00000000000003, 80.6, 209.4, 80.6, 215.7, 80.6, 221.4, 75.8, 221.4, 68.5], operators=[0, 1, 1, 1, 2, 2, 2, 2, 1, 1, 3, 0, 2, 2, 2, 2, 3], _fillRule=1), + Path(fillColor=Color(.023529,.627451,.290196,1), fillOpacity=1.0, strokeColor=None, strokeOpacity=1.0, points=[263.0, 56.1, 254.2, 56.1, 254.2, 89.5, 243.79999999999998, 89.5, 243.79999999999998, 56.1, 236.29999999999998, 56.1, 236.29999999999998, 47.8, 243.79999999999998, 47.8, 243.79999999999998, 32.5, 254.2, 32.5, 254.2, 47.9, 263.0, 47.9, 263.0, 56.1], operators=[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3], _fillRule=1), + Path(fillColor=Color(.023529,.627451,.290196,1), fillOpacity=1.0, strokeColor=None, strokeOpacity=1.0, points=[309.0, 47.8, 309.0, 89.5, 298.6, 89.5, 298.6, 83.9, 295.70000000000005, 88.4, 289.90000000000003, 90.4, 284.8, 90.4, 273.8, 90.4, 264.3, 81.9, 264.3, 68.60000000000001, 264.3, 55.20000000000001, 273.8, 46.900000000000006, 284.7, 46.900000000000006, 290.0, 46.900000000000006, 295.8, 49.00000000000001, 298.59999999999997, 53.300000000000004, 298.59999999999997, 47.800000000000004, 309.0, 47.800000000000004, 298.4, 68.5, 298.4, 61.2, 292.29999999999995, 56.5, 286.4, 56.5, 280.0, 56.5, 274.7, 61.5, 274.7, 68.5, 274.7, 75.5, 280.0, 80.6, 286.4, 80.6, 292.7, 80.6, 298.4, 75.8, 298.4, 68.5], operators=[0, 1, 1, 1, 2, 2, 2, 2, 1, 1, 3, 0, 2, 2, 2, 2, 3], _fillRule=1), + Path(fillColor=Color(.023529,.627451,.290196,1), fillOpacity=1.0, strokeColor=None, strokeOpacity=1.0, points=[354.6, 89.5, 333.40000000000003, 66.1, 333.40000000000003, 89.5, 322.6, 89.5, 322.6, 28.0, 333.40000000000003, 28.0, 333.40000000000003, 51.3, 350.7, 28.0, 364.2, 28.0, 340.7, 58.6, 369.3, 89.5, 354.6, 89.5], operators=[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3], _fillRule=1), + Path(fillColor=Color(.023529,.627451,.290196,1), fillOpacity=1.0, strokeColor=None, strokeOpacity=1.0, points=[373.7, 33.1, 373.7, 29.400000000000002, 376.8, 26.8, 380.5, 26.8, 384.2, 26.8, 387.2, 29.5, 387.2, 33.1, 387.2, 36.7, 384.3, 39.4, 380.5, 39.4, 376.9, 39.4, 373.7, 36.6, 373.7, 33.1, 375.3, 47.8, 385.7, 47.8, 385.7, 89.5, 375.3, 89.5, 375.3, 47.8], operators=[0, 2, 2, 2, 2, 3, 0, 1, 1, 1, 1, 3], _fillRule=1), + 
Path(fillColor=Color(.023529,.627451,.290196,1), fillOpacity=1.0, strokeColor=None, strokeOpacity=1.0, points=[417.4, 56.1, 408.59999999999997, 56.1, 408.59999999999997, 89.5, 398.2, 89.5, 398.2, 56.1, 390.7, 56.1, 390.7, 47.8, 398.2, 47.8, 398.2, 32.5, 408.59999999999997, 32.5, 408.59999999999997, 47.9, 417.4, 47.9, 417.4, 56.1], operators=[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3], _fillRule=1), + Path(fillColor=Color(.023529,.627451,.290196,1), fillOpacity=1.0, strokeColor=None, strokeOpacity=1.0, points=[417.6, 68.6, 417.6, 55.39999999999999, 428.20000000000005, 46.89999999999999, 440.3, 46.89999999999999, 447.5, 46.89999999999999, 453.40000000000003, 49.99999999999999, 457.3, 54.89999999999999, 449.90000000000003, 60.69999999999999, 447.8, 58.09999999999999, 444.20000000000005, 56.499999999999986, 440.50000000000006, 56.499999999999986, 433.30000000000007, 56.499999999999986, 428.1000000000001, 61.499999999999986, 428.1000000000001, 68.49999999999999, 428.1000000000001, 75.49999999999999, 433.30000000000007, 80.49999999999999, 440.50000000000006, 80.49999999999999, 444.20000000000005, 80.49999999999999, 447.70000000000005, 78.89999999999999, 449.90000000000003, 76.29999999999998, 457.3, 82.09999999999998, 453.5, 86.89999999999998, 447.6, 90.09999999999998, 440.3, 90.09999999999998, 428.3, 90.3, 417.6, 81.8, 417.6, 68.6], operators=[0, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 3], _fillRule=1), + Path(fillColor=Color(.023529,.627451,.290196,1), fillOpacity=1.0, strokeColor=None, strokeOpacity=1.0, points=[500.7, 66.1, 500.7, 89.5, 490.3, 89.5, 490.3, 67.1, 490.3, 60.49999999999999, 486.3, 57.099999999999994, 481.6, 57.099999999999994, 476.90000000000003, 57.099999999999994, 471.0, 59.699999999999996, 471.0, 67.69999999999999, 471.0, 89.49999999999999, 460.6, 89.49999999999999, 460.6, 25.499999999999986, 471.0, 25.499999999999986, 471.0, 54.09999999999999, 473.1, 49.09999999999999, 479.7, 46.899999999999984, 483.9, 46.899999999999984, 494.8, 46.9, 500.7, 54.0, 500.7, 66.1], operators=[0, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 3], _fillRule=1), + Path(fillColor=Color(.023529,.627451,.290196,1), fillOpacity=1.0, strokeColor=None, strokeOpacity=1.0, points=[547.5, 72.3, 515.3, 72.3, 516.5, 78.1, 520.9, 81.0, 527.0999999999999, 81.0, 531.6999999999999, 81.0, 535.8999999999999, 79.2, 538.3999999999999, 75.8, 545.2999999999998, 81.1, 541.4999999999999, 87.19999999999999, 534.0999999999998, 90.39999999999999, 526.5999999999998, 90.39999999999999, 514.0999999999998, 90.39999999999999, 504.5999999999998, 81.69999999999999, 504.5999999999998, 68.6, 504.5999999999998, 55.3, 514.5999999999998, 46.89999999999999, 526.4999999999998, 46.89999999999999, 538.4999999999998, 46.89999999999999, 547.7999999999997, 55.19999999999999, 547.7999999999997, 68.19999999999999, 547.8, 69.4, 547.7, 70.7, 547.5, 72.3, 537.4, 65.0, 536.8, 59.3, 532.4, 56.0, 526.6, 56.0, 521.0, 56.0, 516.5, 58.7, 515.3000000000001, 65.0, 537.4, 65.0], operators=[0, 1, 2, 2, 1, 2, 2, 2, 2, 2, 3, 0, 2, 2, 1, 3], _fillRule=1), + Path(fillColor=Color(.023529,.627451,.290196,1), fillOpacity=1.0, strokeColor=None, strokeOpacity=1.0, points=[591.9, 66.1, 591.9, 89.5, 581.5, 89.5, 581.5, 67.1, 581.5, 60.49999999999999, 577.5, 57.099999999999994, 572.8, 57.099999999999994, 568.0999999999999, 57.099999999999994, 562.1999999999999, 59.699999999999996, 562.1999999999999, 67.69999999999999, 562.1999999999999, 89.49999999999999, 551.8, 89.49999999999999, 551.8, 47.8, 562.1999999999999, 47.8, 562.1999999999999, 54.4, 564.3, 49.199999999999996, 570.9, 46.9, 
575.0999999999999, 46.9, 585.9, 46.9, 591.9, 54.0, 591.9, 66.1], operators=[0, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 3], _fillRule=1), +] + + +def get_logo(width): + orig_width = 600 + orig_height = 110 + height = orig_height * width / orig_width + logo = Drawing(width, height, *shapes) + logo.translate(0, height) + scale = width / orig_width + logo.scale(scale, -scale) + return logo + + +if __name__ == "__main__": + + from svglib.svglib import svg2rlg + + drawing = svg2rlg("./testgen/testgen/ui/assets/dk_logo.svg") + + def extract_shapes(drawing): + if hasattr(drawing, "contents"): + for content in drawing.contents: + yield from extract_shapes(content) + else: + yield drawing + + print("shapes = [") + for shape in extract_shapes(drawing): + print(f" {shape.__class__.__name__}(", end="") + print(", ".join([f"{attr}={val!r}" for attr, val in shape.getProperties().items()]), end="") + print("),") + print("]\n") diff --git a/testgen/ui/pdf/hygiene_issue_report.py b/testgen/ui/pdf/hygiene_issue_report.py new file mode 100644 index 0000000..7a0462a --- /dev/null +++ b/testgen/ui/pdf/hygiene_issue_report.py @@ -0,0 +1,176 @@ +from reportlab.lib import colors +from reportlab.lib.colors import HexColor +from reportlab.lib.enums import TA_CENTER +from reportlab.lib.styles import ParagraphStyle +from reportlab.platypus import CondPageBreak, KeepTogether, Paragraph, Table, TableStyle + +from testgen.ui.pdf.dataframe_table import DataFrameTableBuilder +from testgen.ui.pdf.style import ( + COLOR_GRAY_BG, + COLOR_GREEN_BG, + PARA_STYLE_CELL, + PARA_STYLE_FOOTNOTE, + PARA_STYLE_H1, + PARA_STYLE_INFO, + PARA_STYLE_LINK, + PARA_STYLE_MONO, + PARA_STYLE_TEXT, + PARA_STYLE_TITLE, + TABLE_STYLE_DEFAULT, + get_formatted_datetime, +) +from testgen.ui.pdf.templates import DatakitchenTemplate +from testgen.ui.services.hygiene_issues_service import get_source_data +from testgen.utils import get_base_url + +SECTION_MIN_AVAILABLE_HEIGHT = 120 + +CLASS_COLORS = { + "Definite": HexColor(0xE94D4A), + "Likely": HexColor(0xFC8F2A), + "Possible": HexColor(0xFCD349), + "Potential PII": HexColor(0xFC8F2A), +} + +def build_summary_table(document, hi_data): + + summary_table_style = TableStyle( + ( + # All-table styles + ("GRID", (0, 0), (-1, -1), 2, colors.white), + ("BACKGROUND", (0, 0), (-1, -1), COLOR_GRAY_BG), + + # Header cells + *[ + (cmd[0], *coords, *cmd[1:]) + for coords in ( + ((2, 2), (2, 4)), + ((0, 0), (0, -1)) + ) + for cmd in ( + ("FONT", "Helvetica-Bold"), + ("ALIGN", "RIGHT"), + ("BACKGROUND", COLOR_GREEN_BG), + ) + ], + + # Layout + ("SPAN", (1, 0), (3, 0)), + + ("SPAN", (1, 1), (4, 1)), + + ("SPAN", (3, 2), (4, 2)), + ("SPAN", (3, 3), (4, 3)), + ("SPAN", (3, 4), (4, 4)), + ("SPAN", (3, 5), (4, 5)), + ("SPAN", (2, 5), (4, 5)), + + # Link cell + ("BACKGROUND", (2, 5), (4, 5), colors.white), + + # Status cell + *[ + (cmd[0], (4, 0), (4, 0), *cmd[1:]) + for cmd in ( + ("BACKGROUND", CLASS_COLORS.get(hi_data["issue_likelihood"], COLOR_GRAY_BG)), + ("ALIGNMENT", "CENTER"), + ("VALIGN", "MIDDLE"), + ) + ], + ), + parent=TABLE_STYLE_DEFAULT, + ) + + + profiling_timestamp = get_formatted_datetime(hi_data["profiling_starttime"]) + summary_table_data = [ + ( + "Hygiene Issue", + ( + Paragraph(f"{hi_data['anomaly_name']}:", style=PARA_STYLE_CELL), + Paragraph(hi_data["anomaly_description"], style=PARA_STYLE_CELL), + ), + None, + None, + Paragraph( + hi_data["issue_likelihood"], + style=ParagraphStyle("likelihood", textColor=colors.white, fontSize=10, parent=PARA_STYLE_CELL, alignment=TA_CENTER), + ), + ), + ( + 
"Detail", + Paragraph( + hi_data["detail"], + style=ParagraphStyle("detail", fontName="Helvetica-Bold", parent=PARA_STYLE_CELL), + ), + ), + + ("Database/Schema", hi_data["schema_name"], "Profiling Date", profiling_timestamp), + ("Table", hi_data["table_name"], "Table Group", hi_data["table_groups_name"]), + ("Column", hi_data["column_name"], "Disposition", hi_data["disposition"] or "No Decision"), + ( + "Column Type", + hi_data["column_type"], + Paragraph( + f""" + View on TestGen > + """, + style=PARA_STYLE_LINK, + ), + ), + ] + + summary_table_col_widths = [n * document.width for n in (.15, .35, .15, .15, .20)] + return Table(summary_table_data, style=summary_table_style, hAlign="LEFT", colWidths=summary_table_col_widths) + + +def build_sample_data_content(document, sample_data_tuple): + sample_data_status, sample_data_msg, lookup_query, sample_data = sample_data_tuple + if sample_data_status in ("ND", "NA"): + yield Paragraph(sample_data_msg, style=PARA_STYLE_INFO) + elif sample_data_status == "ERR" or sample_data is None: + yield Paragraph("It was not possible to fetch the sample data this time.", style=PARA_STYLE_INFO) + else: + sample_data.columns = [col.replace("_", " ").title() for col in sample_data.columns] + df_table_builder = DataFrameTableBuilder(sample_data, document.width) + table_flowables = [df_table_builder.build_table(hAlign="LEFT")] + if df_table_builder.omitted_columns: + omitted_columns = ", ".join(df_table_builder.omitted_columns) + sample_data_msg = f"Note: The following columns were omitted from this table: {omitted_columns}" + if sample_data_msg: + table_flowables.append(Paragraph(sample_data_msg, style=PARA_STYLE_FOOTNOTE)) + + yield from df_table_builder.split_in_columns(table_flowables) + + +def build_sql_query_content(sample_data_tuple): + lookup_query = sample_data_tuple[2] + if lookup_query: + return Paragraph(lookup_query, PARA_STYLE_MONO) + else: + return Paragraph("No sample data lookup query registered for this issue.") + + +def get_report_content(document, hi_data): + yield Paragraph("TestGen Hygiene Issue Report", PARA_STYLE_TITLE) + yield build_summary_table(document, hi_data) + + yield CondPageBreak(SECTION_MIN_AVAILABLE_HEIGHT) + yield Paragraph("Suggested Action", style=PARA_STYLE_H1) + yield Paragraph(hi_data["suggested_action"], style=PARA_STYLE_TEXT) + + sample_data_tuple = get_source_data(hi_data) + + yield CondPageBreak(SECTION_MIN_AVAILABLE_HEIGHT) + yield Paragraph("Sample Data", PARA_STYLE_H1) + yield from build_sample_data_content(document, sample_data_tuple) + + yield KeepTogether([ + Paragraph("SQL Query", PARA_STYLE_H1), + build_sql_query_content(sample_data_tuple) + ]) + + +def create_report(filename, hi_data): + doc = DatakitchenTemplate(filename) + doc.build(flowables=list(get_report_content(doc, hi_data))) diff --git a/testgen/ui/pdf/style.py b/testgen/ui/pdf/style.py new file mode 100644 index 0000000..03ed49a --- /dev/null +++ b/testgen/ui/pdf/style.py @@ -0,0 +1,109 @@ +import pandas +import streamlit as st +from reportlab.lib import enums +from reportlab.lib.colors import HexColor +from reportlab.lib.styles import ParagraphStyle +from reportlab.platypus import TableStyle + +from testgen.common import date_service + +COLOR_GRAY_BG = HexColor(0xF2F2F2) +COLOR_GREEN_BG = HexColor(0xDCE4DA) +COLOR_YELLOW_BG = HexColor(0xA0C84E40, hasAlpha=True) +COLOR_GREEN_TEXT = HexColor(0x139549) +COLOR_FADED_TEXT = HexColor(0x404040) +COLOR_LINK_TEXT = HexColor(0x1976D2) + +PARA_STYLE_DEFAULT = ParagraphStyle( + "default", + fontSize=8, + 
fontName="Helvetica", +) + +PARA_STYLE_TEXT = ParagraphStyle( + "text", + PARA_STYLE_DEFAULT, + fontName="Times-Roman", +) + +PARA_STYLE_INFO = ParagraphStyle( + "info", + PARA_STYLE_DEFAULT, + fontName="Helvetica", + backColor=COLOR_YELLOW_BG, + borderPadding=12, + leftIndent=12, + rightIndent=12, + spaceBefore=18, + spaceAfter=18, +) + +PARA_STYLE_MONO = ParagraphStyle( + "monospaced", + PARA_STYLE_DEFAULT, + fontName="Courier", + borderPadding=4, + backColor=COLOR_GRAY_BG, + leftIndent=4, + rightIndent=4, + spaceBefore=8, + spaceAfter=8, +) + +PARA_STYLE_FOOTNOTE = ParagraphStyle( + "footnote", + PARA_STYLE_DEFAULT, + fontSize=6, + fontName="Helvetica-Oblique", + textColor=COLOR_FADED_TEXT, +) + +PARA_STYLE_TITLE = ParagraphStyle( + "title", + PARA_STYLE_DEFAULT, + fontSize=18, + leading=30, + alignment=enums.TA_CENTER, + spaceBefore=12, + spaceAfter=4, + textColor=COLOR_GREEN_TEXT, +) + +PARA_STYLE_H1 = ParagraphStyle( + "heading_1", + PARA_STYLE_TITLE, + fontSize=12, + leading=16, + alignment=enums.TA_LEFT, +) + +TABLE_STYLE_DEFAULT = TableStyle( + ( + ("ALIGN", (0, 0), (-1, -1), "LEFT"), + ("VALIGN", (0, 0), (-1, -1), "TOP"), + ("FONT", (0, 0), (-1, -1), "Helvetica", 7), + ) +) + +PARA_STYLE_CELL = ParagraphStyle( + "table_cell", + fontSize=7, + fontName="Helvetica", + leading=10, +) + +PARA_STYLE_LINK = ParagraphStyle( + "link", + PARA_STYLE_DEFAULT, + fontSize=9, + alignment=enums.TA_RIGHT, + textColor=COLOR_LINK_TEXT, +) + + +def get_formatted_datetime(value) -> str: + return date_service.get_timezoned_timestamp( + st.session_state, + pandas.to_datetime(value), + "%b %-d, %-I:%M %p %Z", + ) diff --git a/testgen/ui/pdf/templates.py b/testgen/ui/pdf/templates.py new file mode 100644 index 0000000..cba722a --- /dev/null +++ b/testgen/ui/pdf/templates.py @@ -0,0 +1,31 @@ +from reportlab.lib.units import inch +from reportlab.platypus import SimpleDocTemplate + +from testgen.ui.pdf.dk_logo import get_logo + +MARGIN = 0.4 * inch + + +class DatakitchenTemplate(SimpleDocTemplate): + + def __init__(self, filename): + super().__init__(filename, leftMargin=MARGIN, rightMargin=MARGIN, topMargin=MARGIN + 10, bottomMargin=MARGIN) + + def beforePage(self): + header_padding = 5 + header_base_y = self.pagesize[1] - 18 + self.canv.setFont("Helvetica", 8) + self.canv.drawString(MARGIN + header_padding, header_base_y , "DataOps Data Quality TestGen") + self.canv.line( + MARGIN + header_padding, + header_base_y - header_padding, + self.pagesize[0] - MARGIN, + header_base_y - header_padding + ) + + logo = get_logo(80) + logo.drawOn( + self.canv, + self.pagesize[0] - logo.width - MARGIN, + header_base_y + ) diff --git a/testgen/ui/pdf/test_result_report.py b/testgen/ui/pdf/test_result_report.py new file mode 100644 index 0000000..c60cfc3 --- /dev/null +++ b/testgen/ui/pdf/test_result_report.py @@ -0,0 +1,232 @@ +import pandas +from reportlab.lib import colors +from reportlab.lib.colors import HexColor +from reportlab.lib.styles import ParagraphStyle +from reportlab.platypus import ( + CondPageBreak, + KeepTogether, + Paragraph, + Table, + TableStyle, +) + +from testgen.ui.pdf.dataframe_table import TABLE_STYLE_DATA, DataFrameTableBuilder +from testgen.ui.pdf.style import ( + COLOR_GRAY_BG, + COLOR_GREEN_BG, + PARA_STYLE_CELL, + PARA_STYLE_FOOTNOTE, + PARA_STYLE_H1, + PARA_STYLE_INFO, + PARA_STYLE_LINK, + PARA_STYLE_MONO, + PARA_STYLE_TEXT, + PARA_STYLE_TITLE, + TABLE_STYLE_DEFAULT, + get_formatted_datetime, +) +from testgen.ui.pdf.templates import DatakitchenTemplate +from 
testgen.ui.services.database_service import get_schema +from testgen.ui.services.test_results_service import ( + do_source_data_lookup, + do_source_data_lookup_custom, + get_test_result_history, +) +from testgen.utils import get_base_url + +SECTION_MIN_AVAILABLE_HEIGHT = 120 + +RESULT_STATUS_COLORS = { + "Passed": HexColor(0x94C465), + "Warning": HexColor(0xFCD349), + "Failed": HexColor(0xE94D4A), +} + + +def build_summary_table(document, tr_data): + status_color = RESULT_STATUS_COLORS.get(tr_data["result_status"], COLOR_GRAY_BG) + summary_table_style = TableStyle( + ( + # All-table styles + ("GRID", (0, 0), (-1, -1), 2, colors.white), + ("BACKGROUND", (0, 0), (-1, -1), COLOR_GRAY_BG), + + # Header cells + *[ + (cmd[0], *coords, *cmd[1:]) + for coords in ( + ((3, 3), (3, -2)), + ((0, 0), (0, -2)) + ) + for cmd in ( + ("FONT", "Helvetica-Bold"), + ("ALIGN", "RIGHT"), + ("BACKGROUND", COLOR_GREEN_BG), + ) + ], + + # Layout + ("SPAN", (1, 0), (4, 0)), + ("SPAN", (5, 0), (5, 2)), + ("SPAN", (2, 1), (4, 1)), + ("SPAN", (2, 2), (4, 2)), + ("SPAN", (1, 3), (2, 3)), + ("SPAN", (4, 3), (5, 3)), + ("SPAN", (1, 4), (2, 4)), + ("SPAN", (4, 4), (5, 4)), + ("SPAN", (1, 5), (2, 5)), + ("SPAN", (4, 5), (5, 5)), + ("SPAN", (1, 6), (2, 6)), + ("SPAN", (4, 6), (5, 6)), + ("SPAN", (0, 7), (5, 7)), + + # Link cell + ("BACKGROUND", (0, 7), (5, 7), colors.white), + + # Measure cell + ("FONT", (1, 1), (1, 1), "Helvetica-Bold"), + + # Status cell + *[ + (cmd[0], (5, 0), (5, 0), *cmd[1:]) + for cmd in ( + ("BACKGROUND", status_color), + ("FONT", "Helvetica", 14), + ("ALIGN", "CENTER"), + ("VALIGN", "MIDDLE"), + ("TEXTCOLOR", colors.white), + ) + ], + ), + parent=TABLE_STYLE_DEFAULT, + ) + + test_timestamp = get_formatted_datetime(tr_data["test_time"]) + summary_table_data = [ + ( + "Test", + ( + Paragraph(f"""{tr_data["test_name_short"]}: {tr_data["test_name_long"]}""", + style=PARA_STYLE_CELL), + Paragraph(tr_data["test_description"], style=PARA_STYLE_CELL), + ), + None, + None, + None, + tr_data["result_status"], + ), + ("Measured Value", tr_data["result_measure"], tr_data["measure_uom_description"]), + ("Threshold Value", tr_data["threshold_value"], tr_data["threshold_description"]), + + ("Test Run Date", test_timestamp, None, "Table Group", tr_data["table_groups_name"]), + ("Database/Schema", tr_data["schema_name"], None, "Test Suite", tr_data["test_suite"]), + ("Table", tr_data["table_name"], None, "Data Quality Dimension", tr_data["dq_dimension"]), + ("Column", tr_data["column_names"], None, "Disposition", tr_data["disposition"] or "No Decision"), + ( + Paragraph( + f""" + View on TestGen > + """, + style=PARA_STYLE_LINK, + ), + ), + ] + + summary_table_col_widths = [n * document.width for n in (.2, .1, .2, .2, .15, .15)] + return Table(summary_table_data, style=summary_table_style, hAlign="LEFT", colWidths=summary_table_col_widths) + + +def build_history_table(document, tr_data): + history_data = get_test_result_history(get_schema(), tr_data) + + history_table_style = TableStyle( + ( + ("ALIGN", (3, 0), (3, -1), "CENTER"), + ), + parent=TABLE_STYLE_DATA) + + test_timestamp = pandas.to_datetime(tr_data["test_time"]) + + style_per_status = { + status: ParagraphStyle(f"result_{status}", parent=PARA_STYLE_CELL, textColor=color) + for status, color in RESULT_STATUS_COLORS.items() + } + + for idx in history_data.index[history_data["test_date"] == test_timestamp]: + if idx > 0: + history_table_style.add("BACKGROUND", (0, idx + 1), (-1, idx + 1), COLOR_GRAY_BG) + + history_df = pandas.DataFrame() + history_df = 
history_df.assign( + test_date=history_data["test_date"].map(get_formatted_datetime).copy(), + threshold_value=history_data["threshold_value"].astype(float).copy(), + result_measure=history_data["result_measure"].astype(float).copy(), + result_status=history_data["result_status"].map( + lambda status: Paragraph(status, style=style_per_status[status]) + ).copy(), + ) + history_df.columns = ("Test Date", "Threshold Value", "Measure Value", "Status") + + table_builder = DataFrameTableBuilder(history_df, document.width) + table = table_builder.build_table(hAlign="LEFT", style=history_table_style) + return table_builder.split_in_columns(table) + + +def build_sample_data_content(document, sample_data_tuple): + sample_data_status, sample_data_msg, lookup_query, sample_data = sample_data_tuple + if sample_data_status in ("ND", "NA"): + yield Paragraph(sample_data_msg, style=PARA_STYLE_INFO) + elif sample_data_status == "ERR" or sample_data is None: + yield Paragraph("It was not possible to fetch the sample data this time.", style=PARA_STYLE_INFO) + else: + sample_data.columns = [col.replace("_", " ").title() for col in sample_data.columns] + df_table_builder = DataFrameTableBuilder(sample_data, document.width) + table_flowables = [df_table_builder.build_table(hAlign="LEFT")] + if df_table_builder.omitted_columns: + omitted_columns = ", ".join(df_table_builder.omitted_columns) + sample_data_msg = f"Note: The following columns were omitted from this table: {omitted_columns}" + if sample_data_msg: + table_flowables.append(Paragraph(sample_data_msg, style=PARA_STYLE_FOOTNOTE)) + + yield from df_table_builder.split_in_columns(table_flowables) + + +def build_sql_query_content(sample_data_tuple): + lookup_query = sample_data_tuple[2] + if lookup_query: + return Paragraph(lookup_query, PARA_STYLE_MONO) + else: + return Paragraph("No sample data lookup query registered for this test.") + + +def get_report_content(document, tr_data): + yield Paragraph("TestGen Test Issue Report", PARA_STYLE_TITLE) + yield build_summary_table(document, tr_data) + + yield KeepTogether([ + Paragraph("Usage Notes", PARA_STYLE_H1), + Paragraph(f"{tr_data['usage_notes']}", PARA_STYLE_TEXT), + ]) + + yield CondPageBreak(SECTION_MIN_AVAILABLE_HEIGHT) + yield Paragraph("Result History", PARA_STYLE_H1) + yield build_history_table(document, tr_data) + + if tr_data["test_type"] == "CUSTOM": + sample_data_tuple = do_source_data_lookup_custom(get_schema(), tr_data) + else: + sample_data_tuple = do_source_data_lookup(get_schema(), tr_data) + + yield CondPageBreak(SECTION_MIN_AVAILABLE_HEIGHT) + yield Paragraph("Sample Data", PARA_STYLE_H1) + yield from build_sample_data_content(document, sample_data_tuple) + + yield KeepTogether([ + Paragraph("SQL Query", PARA_STYLE_H1), + build_sql_query_content(sample_data_tuple) + ]) + + +def create_report(filename, tr_data): + doc = DatakitchenTemplate(filename) + doc.build(flowables=list(get_report_content(doc, tr_data))) diff --git a/testgen/ui/queries/connection_queries.py b/testgen/ui/queries/connection_queries.py index dc10bed..087c9f0 100644 --- a/testgen/ui/queries/connection_queries.py +++ b/testgen/ui/queries/connection_queries.py @@ -1,3 +1,5 @@ +from typing import cast + import pandas as pd import streamlit as st @@ -8,7 +10,7 @@ def get_by_id(connection_id): str_schema = st.session_state["dbschema"] str_sql = f""" SELECT id::VARCHAR(50), project_code, connection_id, connection_name, - sql_flavor, project_host, project_port, project_user, project_qc_schema, + sql_flavor, project_host, 
project_port, project_user, project_db, project_pw_encrypted, NULL as password, max_threads, max_query_chars, url, connect_by_url, connect_by_key, private_key, private_key_passphrase FROM {str_schema}.connections @@ -21,7 +23,7 @@ def get_connections(project_code): str_schema = st.session_state["dbschema"] str_sql = f""" SELECT id::VARCHAR(50), project_code, connection_id, connection_name, - sql_flavor, project_host, project_port, project_user, project_qc_schema, + sql_flavor, project_host, project_port, project_user, project_db, project_pw_encrypted, NULL as password, max_threads, max_query_chars, connect_by_url, url, connect_by_key, private_key, private_key_passphrase @@ -46,7 +48,6 @@ def edit_connection(schema, connection, encrypted_password, encrypted_private_ke project_port = '{connection["project_port"]}', project_user = '{connection["project_user"]}', project_db = '{connection["project_db"]}', - project_qc_schema = '{connection["project_qc_schema"]}', connection_name = '{connection["connection_name"]}', max_threads = '{connection["max_threads"]}', max_query_chars = '{connection["max_query_chars"]}', @@ -68,11 +69,16 @@ def edit_connection(schema, connection, encrypted_password, encrypted_private_ke st.cache_data.clear() -def add_connection(schema, connection, encrypted_password, encrypted_private_key, encrypted_private_key_passphrase): - +def add_connection( + schema: str, + connection: dict, + encrypted_password: str | None, + encrypted_private_key: str | None, + encrypted_private_key_passphrase: str | None, +) -> int: sql_header = f"""INSERT INTO {schema}.connections (project_code, sql_flavor, url, connect_by_url, connect_by_key, - project_host, project_port, project_user, project_db, project_qc_schema, + project_host, project_port, project_user, project_db, connection_name,""" sql_footer = f""" SELECT @@ -85,7 +91,6 @@ def add_connection(schema, connection, encrypted_password, encrypted_private_key '{connection["project_port"]}' as project_port, '{connection["project_user"]}' as project_user, '{connection["project_db"]}' as project_db, - '{connection["project_qc_schema"]}' as project_qc_schema, '{connection["connection_name"]}' as connection_name, """ if encrypted_password: @@ -103,12 +108,16 @@ def add_connection(schema, connection, encrypted_password, encrypted_private_key sql_header += """max_threads, max_query_chars) """ sql_footer += f""" '{connection["max_threads"]}' as max_threads, - '{connection["max_query_chars"]}' as max_query_chars;""" + '{connection["max_query_chars"]}' as max_query_chars""" - sql = sql_header + sql_footer + sql = sql_header + sql_footer + " RETURNING connection_id" - db.execute_sql(sql) + cursor = db.execute_sql(sql) st.cache_data.clear() + if cursor and (primary_key := cast(tuple, cursor.fetchone())): + return primary_key[0] + + return 0 def delete_connections(schema, connection_ids): diff --git a/testgen/ui/queries/profiling_queries.py b/testgen/ui/queries/profiling_queries.py index dc93496..75477d0 100644 --- a/testgen/ui/queries/profiling_queries.py +++ b/testgen/ui/queries/profiling_queries.py @@ -12,23 +12,14 @@ def run_table_groups_lookup_query(str_project_code): @st.cache_data(show_spinner=False) -def get_latest_profile_run(str_table_group): - str_schema = st.session_state["dbschema"] - str_sql = f""" - WITH last_profile_run - AS (SELECT table_groups_id, MAX(profiling_starttime) as last_profile_run_date - FROM {str_schema}.profiling_runs - GROUP BY table_groups_id) - SELECT id as profile_run_id - FROM {str_schema}.profiling_runs r - INNER 
JOIN last_profile_run l - ON (r.table_groups_id = l.table_groups_id - AND r.profiling_starttime = l.last_profile_run_date) - WHERE r.table_groups_id = '{str_table_group}'; -""" - str_profile_run_id = db.retrieve_single_result(str_sql) - - return str_profile_run_id +def get_latest_profile_run(table_group_id: str) -> str: + schema: str = st.session_state["dbschema"] + sql = f""" + SELECT last_complete_profile_run_id + FROM {schema}.table_groups + WHERE id = '{table_group_id}'; + """ + return db.retrieve_single_result(sql) @st.cache_data(show_spinner=False) diff --git a/testgen/ui/queries/project_queries.py b/testgen/ui/queries/project_queries.py index 43eced1..5c08706 100644 --- a/testgen/ui/queries/project_queries.py +++ b/testgen/ui/queries/project_queries.py @@ -1,9 +1,58 @@ +import pandas as pd import streamlit as st +import testgen.ui.services.database_service as db import testgen.ui.services.query_service as query_service @st.cache_data(show_spinner=False) def get_projects(): - str_schema = st.session_state["dbschema"] - return query_service.run_project_lookup_query(str_schema) + schema: str = st.session_state["dbschema"] + return query_service.run_project_lookup_query(schema) + + +@st.cache_data(show_spinner=False) +def get_summary_by_code(project_code: str) -> pd.Series: + schema: str = st.session_state["dbschema"] + sql = f""" + SELECT ( + SELECT COUNT(*) AS count + FROM {schema}.connections + WHERE connections.project_code = '{project_code}' + ) AS connections_ct, + ( + SELECT connection_id + FROM {schema}.connections + WHERE connections.project_code = '{project_code}' + LIMIT 1 + ) AS default_connection_id, + ( + SELECT COUNT(*) + FROM {schema}.table_groups + WHERE table_groups.project_code = '{project_code}' + ) AS table_groups_ct, + ( + SELECT COUNT(*) + FROM {schema}.profiling_runs + LEFT JOIN {schema}.table_groups ON profiling_runs.table_groups_id = table_groups.id + WHERE table_groups.project_code = '{project_code}' + ) AS profiling_runs_ct, + ( + SELECT COUNT(*) + FROM {schema}.test_suites + WHERE test_suites.project_code = '{project_code}' + ) AS test_suites_ct, + ( + SELECT COUNT(*) + FROM {schema}.test_definitions + LEFT JOIN {schema}.test_suites ON test_definitions.test_suite_id = test_suites.id + WHERE test_suites.project_code = '{project_code}' + ) AS test_definitions_ct, + ( + SELECT COUNT(*) + FROM {schema}.test_runs + LEFT JOIN {schema}.test_suites ON test_runs.test_suite_id = test_suites.id + WHERE test_suites.project_code = '{project_code}' + ) AS test_runs_ct; + """ + return db.retrieve_data(sql).iloc[0] diff --git a/testgen/ui/queries/table_group_queries.py b/testgen/ui/queries/table_group_queries.py index 0663a6f..c13e62a 100644 --- a/testgen/ui/queries/table_group_queries.py +++ b/testgen/ui/queries/table_group_queries.py @@ -1,3 +1,5 @@ +import uuid + import streamlit as st import testgen.ui.services.database_service as db @@ -108,7 +110,8 @@ def edit(schema, table_group): st.cache_data.clear() -def add(schema, table_group): +def add(schema, table_group) -> str: + new_table_group_id = str(uuid.uuid4()) sql = f"""INSERT INTO {schema}.table_groups (id, project_code, @@ -132,7 +135,7 @@ def add(schema, table_group): source_process, stakeholder_group) SELECT - gen_random_uuid(), + '{new_table_group_id}', '{table_group["project_code"]}', '{table_group["connection_id"]}', '{table_group["table_groups_name"]}', @@ -155,6 +158,7 @@ def add(schema, table_group): ;""" db.execute_sql(sql) st.cache_data.clear() + return new_table_group_id def delete(schema, 
table_group_ids): diff --git a/testgen/ui/queries/test_suite_queries.py b/testgen/ui/queries/test_suite_queries.py index 80a3fcc..7300695 100644 --- a/testgen/ui/queries/test_suite_queries.py +++ b/testgen/ui/queries/test_suite_queries.py @@ -13,12 +13,6 @@ def get_by_project(schema, project_code, table_group_id=None): FROM {schema}.test_definitions GROUP BY test_suite_id ), - last_run_date AS ( - SELECT test_suite_id, - MAX(test_starttime) as test_starttime - FROM {schema}.test_runs - GROUP BY test_suite_id - ), last_run AS ( SELECT test_runs.test_suite_id, test_runs.id, @@ -58,15 +52,20 @@ def get_by_project(schema, project_code, table_group_id=None): ELSE 0 END ) as dismissed_ct - FROM last_run_date lrd + FROM {schema}.test_suites LEFT JOIN {schema}.test_runs ON ( - lrd.test_suite_id = test_runs.test_suite_id - AND lrd.test_starttime = test_runs.test_starttime + test_suites.last_complete_test_run_id = test_runs.id ) LEFT JOIN {schema}.test_results ON ( test_runs.id = test_results.test_run_id ) GROUP BY test_runs.id + ), + test_defs AS ( + SELECT test_suite_id, + COUNT(*) as count + FROM {schema}.test_definitions + GROUP BY test_suite_id ) SELECT suites.id::VARCHAR(50), @@ -84,13 +83,15 @@ def get_by_project(schema, project_code, table_group_id=None): suites.component_key, suites.component_type, suites.component_name, + test_defs.count as test_ct, last_gen_date.auto_gen_date as latest_auto_gen_date, - last_run.id as latest_run_id, - last_run.test_starttime as latest_run_start, + last_complete_profile_run_id, + last_run.id as latest_run_id, + last_run.test_starttime as latest_run_start, last_run.test_ct as last_run_test_ct, - last_run.passed_ct as last_run_passed_ct, - last_run.warning_ct as last_run_warning_ct, - last_run.failed_ct as last_run_failed_ct, + last_run.passed_ct as last_run_passed_ct, + last_run.warning_ct as last_run_warning_ct, + last_run.failed_ct as last_run_failed_ct, last_run.error_ct as last_run_error_ct, last_run.dismissed_ct as last_run_dismissed_ct FROM {schema}.test_suites as suites @@ -98,13 +99,15 @@ def get_by_project(schema, project_code, table_group_id=None): ON (suites.id = last_gen_date.test_suite_id) LEFT JOIN last_run ON (suites.id = last_run.test_suite_id) - LEFT JOIN {schema}.connections AS connections - ON (connections.connection_id = suites.connection_id) - LEFT JOIN {schema}.table_groups as groups - ON (groups.id = suites.table_groups_id) + LEFT JOIN test_defs + ON (suites.id = test_defs.test_suite_id) + LEFT JOIN {schema}.connections AS connections + ON (connections.connection_id = suites.connection_id) + LEFT JOIN {schema}.table_groups as groups + ON (groups.id = suites.table_groups_id) WHERE suites.project_code = '{project_code}' """ - + if table_group_id: sql += f""" AND suites.table_groups_id = '{table_group_id}' @@ -113,7 +116,7 @@ def get_by_project(schema, project_code, table_group_id=None): sql += """ ORDER BY suites.test_suite; """ - + return db.retrieve_data(sql) diff --git a/testgen/ui/services/connection_service.py b/testgen/ui/services/connection_service.py index 394c82a..293a623 100644 --- a/testgen/ui/services/connection_service.py +++ b/testgen/ui/services/connection_service.py @@ -3,10 +3,8 @@ import testgen.ui.queries.connection_queries as connection_queries import testgen.ui.services.table_group_service as table_group_service from testgen.commands.run_profiling_bridge import InitializeProfilingSQL -from testgen.commands.run_setup_profiling_tools import run_setup_profiling_tools from testgen.common.database.database_service import 
( AssignConnectParms, - RetrieveDBResultsToList, empty_cache, get_db_type, get_flavor_service, @@ -58,12 +56,18 @@ def edit_connection(connection): connection_queries.edit_connection(schema, connection, encrypted_password, encrypted_private_key, encrypted_private_key_passphrase) -def add_connection(connection): +def add_connection(connection) -> int: empty_cache() schema = st.session_state["dbschema"] connection = pre_save_connection_process(connection) encrypted_password, encrypted_private_key, encrypted_private_key_passphrase = encrypt_credentials(connection) - connection_queries.add_connection(schema, connection, encrypted_password, encrypted_private_key, encrypted_private_key_passphrase) + return connection_queries.add_connection( + schema, + connection, + encrypted_password, + encrypted_private_key, + encrypted_private_key_passphrase, + ) def pre_save_connection_process(connection): @@ -133,7 +137,6 @@ def init_profiling_sql(project_code, connection, table_group_schema=None): project_port = connection["project_port"] project_db = connection["project_db"] project_user = connection["project_user"] - project_qc_schema = connection["project_qc_schema"] password = connection["password"] # prepare the profiling query @@ -145,7 +148,7 @@ def init_profiling_sql(project_code, connection, table_group_schema=None): project_host, project_port, project_db, - table_group_schema if table_group_schema else project_qc_schema, + table_group_schema, project_user, sql_flavor, url, @@ -160,42 +163,6 @@ def init_profiling_sql(project_code, connection, table_group_schema=None): return clsProfiling -def test_qc_connection(project_code, connection, init_profiling=True): - qc_results = {} - - if init_profiling: - init_profiling_sql(project_code, connection) - - project_qc_schema = connection["project_qc_schema"] - query_isnum_true = f"select {project_qc_schema}.fndk_isnum('32')" - query_isnum_true_result_raw = RetrieveDBResultsToList("PROJECT", query_isnum_true) - isnum_true_result = query_isnum_true_result_raw[0][0][0] == 1 - qc_results["isnum_true_result"] = isnum_true_result - - query_isnum_false = f"select {project_qc_schema}.fndk_isnum('HELLO')" - query_isnum_false_result_raw = RetrieveDBResultsToList("PROJECT", query_isnum_false) - isnum_false_result = query_isnum_false_result_raw[0][0][0] == 0 - qc_results["isnum_false_result"] = isnum_false_result - - query_isdate_true = f"select {project_qc_schema}.fndk_isdate('2013-05-18')" - query_isdate_true_result_raw = RetrieveDBResultsToList("PROJECT", query_isdate_true) - isdate_true_result = query_isdate_true_result_raw[0][0][0] == 1 - qc_results["isdate_true_result"] = isdate_true_result - - query_isdate_false = f"select {project_qc_schema}.fndk_isdate('HELLO')" - query_isdate_false_result_raw = RetrieveDBResultsToList("PROJECT", query_isdate_false) - isdate_false_result = query_isdate_false_result_raw[0][0][0] == 0 - qc_results["isdate_false_result"] = isdate_false_result - - return qc_results - - -def create_qc_schema(connection_id, create_qc_schema, db_user, db_password, skip_granting_privileges, admin_private_key_passphrase=None, admin_private_key=None, user_role=None): - dry_run = False - empty_cache() - run_setup_profiling_tools(connection_id, dry_run, create_qc_schema, db_user, db_password, skip_granting_privileges, admin_private_key_passphrase, admin_private_key, user_role) - - def form_overwritten_connection_url(connection): flavor = connection["sql_flavor"] @@ -207,7 +174,7 @@ def form_overwritten_connection_url(connection): "dbname": 
connection["project_db"], "url": None, "connect_by_url": None, - "connect_by_key": connection["connect_by_key"], + "connect_by_key": connection.get("connect_by_key"), "private_key": None, "private_key_passphrase": "", "dbschema": "", diff --git a/testgen/ui/services/database_service.py b/testgen/ui/services/database_service.py index fd2fac9..e5030cd 100644 --- a/testgen/ui/services/database_service.py +++ b/testgen/ui/services/database_service.py @@ -2,6 +2,7 @@ import pandas as pd from sqlalchemy import create_engine, text +from sqlalchemy.engine.cursor import CursorResult from testgen.common.credentials import ( get_tg_db, @@ -74,10 +75,10 @@ def retrieve_single_result(str_sql): return lstResult[0] -def execute_sql(str_sql): +def execute_sql(str_sql) -> CursorResult | None: if str_sql > "": tg_engine = _start_engine() - tg_engine.execute(text(str_sql)) + return tg_engine.execute(text(str_sql)) def execute_sql_raw(str_sql): diff --git a/testgen/ui/services/form_service.py b/testgen/ui/services/form_service.py index ba07527..06ed0f9 100644 --- a/testgen/ui/services/form_service.py +++ b/testgen/ui/services/form_service.py @@ -11,7 +11,7 @@ import pandas as pd import streamlit as st -import validators +from attrs import validators from pandas.api.types import is_datetime64_any_dtype from st_aggrid import AgGrid, ColumnsAutoSizeMode, DataReturnMode, GridOptionsBuilder, GridUpdateMode, JsCode from streamlit_extras.no_default_selectbox import selectbox @@ -19,6 +19,7 @@ import testgen.common.date_service as date_service import testgen.ui.services.authentication_service as authentication_service import testgen.ui.services.database_service as db +from testgen.ui.navigation.router import Router """ Shared rendering of UI elements @@ -762,14 +763,31 @@ def render_insert_form( def render_grid_select( - df, + df: pd.DataFrame, show_columns, str_prompt=None, int_height=400, - do_multi_select=False, + do_multi_select: bool | None = None, + selection_mode: typing.Literal["single", "multiple", "disabled"] = "single", show_column_headers=None, render_highlights=True, + bind_to_query_name: str | None = None, + bind_to_query_prop: str | None = None, + key: str = "aggrid", ): + """ + :param do_multi_select: DEPRECATED. boolean to choose between single + or multiple selection. + :param selection_mode: one of single, multiple or disabled. defaults + to single. + :param bind_to_query_name: name of the query param where to bind the + selected row. + :param bind_to_query_prop: name of the property of the selected row + which value will be set in the query param. + :param key: Streamlit cache key for the grid. required when binding + selection to query. 
+ """ + show_prompt(str_prompt) # Set grid formatting @@ -837,12 +855,40 @@ def render_grid_select( } """ ) + data_changed: bool = True + rendering_counter = st.session_state.get(f"{key}_counter") or 0 + previous_dataframe = st.session_state.get(f"{key}_dataframe") + + if previous_dataframe is not None: + data_changed = not df.equals(previous_dataframe) dct_col_to_header = dict(zip(show_columns, show_column_headers, strict=True)) if show_column_headers else None gb = GridOptionsBuilder.from_dataframe(df) - selection_mode = "multiple" if do_multi_select else "single" - gb.configure_selection(selection_mode=selection_mode, use_checkbox=do_multi_select) + selection_mode_ = selection_mode + if do_multi_select is not None: + selection_mode_ = "multiple" if do_multi_select else "single" + + pre_selected_rows: typing.Any = {} + if bind_to_query_name and bind_to_query_prop: + bound_value = st.query_params.get(bind_to_query_name) + bound_items = df[df[bind_to_query_prop] == bound_value] + if len(bound_items) > 0: + # https://github.com/PablocFonseca/streamlit-aggrid/issues/207#issuecomment-1793039564 + pre_selected_rows = {str(bound_items.iloc[0][bind_to_query_prop]): True} + else: + if data_changed and st.query_params.get(bind_to_query_name): + rendering_counter += 1 + Router().set_query_params({bind_to_query_name: None}) + + gb.configure_selection( + selection_mode=selection_mode_, + use_checkbox=selection_mode_ == "multiple", + pre_selected_rows=pre_selected_rows, + ) + + if bind_to_query_prop: + gb.configure_grid_options(getRowId=JsCode(f"""function(row) {{ return row.data['{bind_to_query_prop}'] }}""")) all_columns = list(df.columns) @@ -853,8 +899,8 @@ def render_grid_select( "field": column, "header_name": str_header if str_header else ut_prettify_header(column), "hide": column not in show_columns, - "headerCheckboxSelection": do_multi_select and column == show_columns[0], - "headerCheckboxSelectionFilteredOnly": do_multi_select and column == show_columns[0], + "headerCheckboxSelection": selection_mode_ == "multiple" and column == show_columns[0], + "headerCheckboxSelectionFilteredOnly": selection_mode_ == "multiple" and column == show_columns[0], } highlight_kwargs = {"cellStyle": cellstyle_jscode} @@ -888,7 +934,8 @@ def render_grid_select( theme="balham", enable_enterprise_modules=False, allow_unsafe_jscode=True, - update_mode=GridUpdateMode.SELECTION_CHANGED, + update_mode=GridUpdateMode.NO_UPDATE, + update_on=["selectionChanged"], data_return_mode=DataReturnMode.FILTERED_AND_SORTED, columns_auto_size_mode=ColumnsAutoSizeMode.FIT_CONTENTS, height=int_height, @@ -897,10 +944,18 @@ def render_grid_select( "padding-bottom": "0px !important", } }, + key=f"{key}_{selection_mode_}_{rendering_counter}", + reload_data=data_changed, ) - if len(grid_data["selected_rows"]): - return grid_data["selected_rows"] + st.session_state[f"{key}_counter"] = rendering_counter + st.session_state[f"{key}_dataframe"] = df + + selected_rows = grid_data["selected_rows"] + if len(selected_rows) > 0: + if bind_to_query_name and bind_to_query_prop: + Router().set_query_params({bind_to_query_name: selected_rows[0][bind_to_query_prop]}) + return selected_rows def render_logo(logo_path: str = logo_file): diff --git a/testgen/ui/services/hygiene_issues_service.py b/testgen/ui/services/hygiene_issues_service.py new file mode 100644 index 0000000..1085f3e --- /dev/null +++ b/testgen/ui/services/hygiene_issues_service.py @@ -0,0 +1,91 @@ +import streamlit as st + +from testgen.common.read_file import 
replace_templated_functions +from testgen.ui.services import database_service as db + + +def get_source_data(hi_data): + str_schema = st.session_state["dbschema"] + # Define the query + str_sql = f""" + SELECT t.lookup_query, tg.table_group_schema, + c.sql_flavor, c.project_host, c.project_port, c.project_db, c.project_user, c.project_pw_encrypted, + c.url, c.connect_by_url, c.connect_by_key, c.private_key, c.private_key_passphrase + FROM {str_schema}.target_data_lookups t + INNER JOIN {str_schema}.table_groups tg + ON ('{hi_data["table_groups_id"]}'::UUID = tg.id) + INNER JOIN {str_schema}.connections c + ON (tg.connection_id = c.connection_id) + AND (t.sql_flavor = c.sql_flavor) + WHERE t.error_type = 'Profile Anomaly' + AND t.test_id = '{hi_data["anomaly_id"]}' + AND t.lookup_query > ''; + """ + + def get_lookup_query(test_id, detail_exp, column_names): + if test_id in {"1019", "1020"}: + start_index = detail_exp.find("Columns: ") + if start_index == -1: + columns = [col.strip() for col in column_names.split(",")] + else: + start_index += len("Columns: ") + column_names_str = detail_exp[start_index:] + columns = [col.strip() for col in column_names_str.split(",")] + queries = [ + f"SELECT '{column}' AS column_name, MAX({column}) AS max_date_available FROM {{TARGET_SCHEMA}}.{{TABLE_NAME}}" + for column in columns + ] + sql_query = " UNION ALL ".join(queries) + " ORDER BY max_date_available DESC;" + else: + sql_query = "" + return sql_query + + def replace_parms(str_query): + str_query = ( + get_lookup_query(hi_data["anomaly_id"], hi_data["detail"], hi_data["column_name"]) + if lst_query[0]["lookup_query"] == "created_in_ui" + else lst_query[0]["lookup_query"] + ) + str_query = str_query.replace("{TARGET_SCHEMA}", lst_query[0]["table_group_schema"]) + str_query = str_query.replace("{TABLE_NAME}", hi_data["table_name"]) + str_query = str_query.replace("{COLUMN_NAME}", hi_data["column_name"]) + str_query = str_query.replace("{DETAIL_EXPRESSION}", hi_data["detail"]) + str_query = str_query.replace("{PROFILE_RUN_DATE}", hi_data["profiling_starttime"]) + + if "{{DKFN_" in str_query: + str_query = replace_templated_functions(str_query, lst_query[0]["sql_flavor"]) + + if str_query is None or str_query == "": + raise ValueError("Lookup query is not defined for this Anomaly Type.") + return str_query + + try: + # Retrieve SQL for customer lookup + lst_query = db.retrieve_data_list(str_sql) + + # Retrieve and return data as df + if lst_query: + str_sql = replace_parms(str_sql) + df = db.retrieve_target_db_df( + lst_query[0]["sql_flavor"], + lst_query[0]["project_host"], + lst_query[0]["project_port"], + lst_query[0]["project_db"], + lst_query[0]["project_user"], + lst_query[0]["project_pw_encrypted"], + str_sql, + lst_query[0]["url"], + lst_query[0]["connect_by_url"], + lst_query[0]["connect_by_key"], + lst_query[0]["private_key"], + lst_query[0]["private_key_passphrase"], + ) + if df.empty: + return "ND", "Data that violates Hygiene Issue criteria is not present in the current dataset.", str_sql, None + else: + return "OK", None, str_sql, df + else: + return "NA", "Source data lookup is not available for this Issue.", None, None + + except Exception as e: + return "ERR", f"Source data lookup query caused an error:\n\n{e.args[0]}", None, None diff --git a/testgen/ui/services/javascript_service.py b/testgen/ui/services/javascript_service.py index 7b4ea32..93eae90 100644 --- a/testgen/ui/services/javascript_service.py +++ b/testgen/ui/services/javascript_service.py @@ -38,7 +38,6 @@ def 
get_browser_locale_timezone(): return st_javascript( """await (async () => { const userTimezone = Intl.DateTimeFormat().resolvedOptions().timeZone; - console.log(userTimezone) return userTimezone })().then(returnValue => returnValue)""" ) diff --git a/testgen/ui/services/project_service.py b/testgen/ui/services/project_service.py index 24a41ab..fa049b7 100644 --- a/testgen/ui/services/project_service.py +++ b/testgen/ui/services/project_service.py @@ -24,4 +24,4 @@ def get_project_by_code(code: str): if not code: return None return query_service.get_project_by_code(session.dbschema, code) - \ No newline at end of file + diff --git a/testgen/ui/services/query_service.py b/testgen/ui/services/query_service.py index 3343010..088c4b0 100644 --- a/testgen/ui/services/query_service.py +++ b/testgen/ui/services/query_service.py @@ -1,3 +1,5 @@ +import pandas as pd + import testgen.ui.services.database_service as db """ @@ -84,35 +86,35 @@ def run_connections_lookup_query(str_schema, str_project_code): return db.retrieve_data(str_sql) -def run_table_groups_lookup_query(str_schema, str_project_code, connection_id=None, table_group_id=None): - str_sql = f""" +def run_table_groups_lookup_query(schema: str, project_code: str, connection_id: str | None = None, table_group_id: str | None = None) -> pd.DataFrame: + sql = f""" SELECT tg.id::VARCHAR(50), tg.table_groups_name, tg.connection_id, tg.table_group_schema - FROM {str_schema}.table_groups tg + FROM {schema}.table_groups tg """ if connection_id: - str_sql += f""" - inner join {str_schema}.connections c on c.connection_id = tg.connection_id + sql += f""" + inner join {schema}.connections c on c.connection_id = tg.connection_id """ - str_sql += f""" - WHERE tg.project_code = '{str_project_code}' + sql += f""" + WHERE tg.project_code = '{project_code}' """ if table_group_id: - str_sql += f""" + sql += f""" AND tg.id = '{table_group_id}'::UUID """ if connection_id: - str_sql += f""" + sql += f""" AND c.id = '{connection_id}'::UUID """ - str_sql += """ + sql += """ ORDER BY table_groups_name """ - return db.retrieve_data(str_sql) + return db.retrieve_data(sql) def run_table_lookup_query(str_schema, str_table_groups_id): diff --git a/testgen/ui/services/table_group_service.py b/testgen/ui/services/table_group_service.py index 57ea6bd..92a8509 100644 --- a/testgen/ui/services/table_group_service.py +++ b/testgen/ui/services/table_group_service.py @@ -21,9 +21,9 @@ def edit(table_group): table_group_queries.edit(schema, table_group) -def add(table_group): +def add(table_group: dict) -> str: schema = st.session_state["dbschema"] - table_group_queries.add(schema, table_group) + return table_group_queries.add(schema, table_group) def cascade_delete(table_group_names, dry_run=False): @@ -81,7 +81,6 @@ def test_table_group(table_group, connection_id, project_code): # get table group data table_group_schema = table_group["table_group_schema"] table_group_id = table_group["id"] - project_qc_schema = connection["project_qc_schema"] profiling_table_set = table_group["profiling_table_set"] profiling_include_mask = table_group["profiling_include_mask"] profiling_exclude_mask = table_group["profiling_exclude_mask"] @@ -104,7 +103,6 @@ def test_table_group(table_group, connection_id, project_code): clsProfiling.parm_do_patterns = "Y" clsProfiling.parm_max_pattern_length = 25 clsProfiling.profile_run_id = "" - clsProfiling.data_qc_schema = project_qc_schema clsProfiling.data_schema = table_group_schema clsProfiling.parm_table_set = 
get_profiling_table_set_with_quotes(profiling_table_set) clsProfiling.parm_table_include_mask = profiling_include_mask @@ -118,9 +116,7 @@ def test_table_group(table_group, connection_id, project_code): query = clsProfiling.GetDDFQuery() table_group_results = RetrieveDBResultsToDictList("PROJECT", query) - qc_results = connection_service.test_qc_connection(project_code, connection, init_profiling=False) - - return table_group_results, qc_results + return table_group_results def get_profiling_table_set_with_quotes(profiling_table_set): diff --git a/testgen/ui/services/test_definition_service.py b/testgen/ui/services/test_definition_service.py index 3d7d64b..d8315cd 100644 --- a/testgen/ui/services/test_definition_service.py +++ b/testgen/ui/services/test_definition_service.py @@ -22,6 +22,27 @@ def get_test_definitions( ) +def get_test_definition(db_schema, test_def_id): + str_sql = f""" + SELECT d.id::VARCHAR, tt.test_name_short as test_name, tt.test_name_long as full_name, + tt.test_description as description, tt.usage_notes, + d.column_name, + d.baseline_value, d.baseline_ct, d.baseline_avg, d.baseline_sd, d.threshold_value, + d.subset_condition, d.groupby_names, d.having_condition, d.match_schema_name, + d.match_table_name, d.match_column_names, d.match_subset_condition, + d.match_groupby_names, d.match_having_condition, + d.window_date_column, d.window_days::VARCHAR as window_days, + d.custom_query, + d.severity, tt.default_severity, + d.test_active, d.lock_refresh, d.last_manual_update + FROM {db_schema}.test_definitions d + INNER JOIN {db_schema}.test_types tt + ON (d.test_type = tt.test_type) + WHERE d.id = '{test_def_id}'; + """ + return database_service.retrieve_data(str_sql) + + def delete(test_definition_ids, dry_run=False): schema = st.session_state["dbschema"] usage_result = test_definition_queries.get_test_definition_usage(schema, test_definition_ids) diff --git a/testgen/ui/services/test_results_service.py b/testgen/ui/services/test_results_service.py new file mode 100644 index 0000000..0fe29d0 --- /dev/null +++ b/testgen/ui/services/test_results_service.py @@ -0,0 +1,186 @@ +import pandas as pd + +from testgen.common import ConcatColumnList +from testgen.common.read_file import replace_templated_functions +from testgen.ui.services import database_service as db +from testgen.ui.services.string_service import empty_if_null +from testgen.ui.services.test_definition_service import get_test_definition + + +def get_test_result_history(db_schema, tr_data): + if tr_data["auto_gen"]: + str_where = f""" + WHERE test_suite_id = '{tr_data["test_suite_id"]}' + AND table_name = '{tr_data["table_name"]}' + AND column_names = '{tr_data["column_names"]}' + AND test_type = '{tr_data["test_type"]}' + AND auto_gen = TRUE + """ + else: + str_where = f""" + WHERE test_definition_id_runtime = '{tr_data["test_definition_id_runtime"]}' + """ + + str_sql = f""" + SELECT test_date, test_type, + test_name_short, test_name_long, measure_uom, test_operator, + threshold_value::NUMERIC, result_measure, result_status + FROM {db_schema}.v_test_results {str_where} + ORDER BY test_date DESC; + """ + + df = db.retrieve_data(str_sql) + # Clean Up + df["test_date"] = pd.to_datetime(df["test_date"]) + + return df + + +def do_source_data_lookup_custom(db_schema, tr_data): + # Define the query + str_sql = f""" + SELECT d.custom_query as lookup_query, tg.table_group_schema, + c.sql_flavor, c.project_host, c.project_port, c.project_db, c.project_user, c.project_pw_encrypted, + c.url, c.connect_by_url, 
c.connect_by_key, c.private_key, c.private_key_passphrase + FROM {db_schema}.test_definitions d + INNER JOIN {db_schema}.table_groups tg + ON ('{tr_data["table_groups_id"]}'::UUID = tg.id) + INNER JOIN {db_schema}.connections c + ON (tg.connection_id = c.connection_id) + WHERE d.id = '{tr_data["test_definition_id_current"]}'; + """ + + try: + # Retrieve SQL for customer lookup + lst_query = db.retrieve_data_list(str_sql) + + # Retrieve and return data as df + if lst_query: + str_sql = lst_query[0]["lookup_query"] + str_sql = str_sql.replace("{DATA_SCHEMA}", empty_if_null(lst_query[0]["table_group_schema"])) + df = db.retrieve_target_db_df( + lst_query[0]["sql_flavor"], + lst_query[0]["project_host"], + lst_query[0]["project_port"], + lst_query[0]["project_db"], + lst_query[0]["project_user"], + lst_query[0]["project_pw_encrypted"], + str_sql, + lst_query[0]["url"], + lst_query[0]["connect_by_url"], + lst_query[0]["connect_by_key"], + lst_query[0]["private_key"], + lst_query[0]["private_key_passphrase"], + ) + if df.empty: + return "ND", "Data that violates Test criteria is not present in the current dataset.", str_sql, None + else: + return "OK", None, str_sql, df + else: + return "NA", "Source data lookup is not available for this test.", None, None + + except Exception as e: + return "ERR", f"Source data lookup query caused an error:\n\n{e.args[0]}", str_sql, None + + +def do_source_data_lookup(db_schema, tr_data, sql_only=False): + # Define the query + str_sql = f""" + SELECT t.lookup_query, tg.table_group_schema, + c.sql_flavor, c.project_host, c.project_port, c.project_db, c.project_user, c.project_pw_encrypted, + c.url, c.connect_by_url, + c.connect_by_key, c.private_key, c.private_key_passphrase + FROM {db_schema}.target_data_lookups t + INNER JOIN {db_schema}.table_groups tg + ON ('{tr_data["table_groups_id"]}'::UUID = tg.id) + INNER JOIN {db_schema}.connections c + ON (tg.connection_id = c.connection_id) + AND (t.sql_flavor = c.sql_flavor) + WHERE t.error_type = 'Test Results' + AND t.test_id = '{tr_data["test_type_id"]}' + AND t.lookup_query > ''; + """ + + def replace_parms(df_test, str_query): + if df_test.empty: + raise ValueError("This test definition is no longer present.") + str_query = str_query.replace("{TARGET_SCHEMA}", empty_if_null(lst_query[0]["table_group_schema"])) + str_query = str_query.replace("{TABLE_NAME}", empty_if_null(tr_data["table_name"])) + str_query = str_query.replace("{COLUMN_NAME}", empty_if_null(tr_data["column_names"])) + str_query = str_query.replace("{TEST_DATE}", str(empty_if_null(tr_data["test_date"]))) + + str_query = str_query.replace("{CUSTOM_QUERY}", empty_if_null(df_test.at[0, "custom_query"])) + str_query = str_query.replace("{BASELINE_VALUE}", empty_if_null(df_test.at[0, "baseline_value"])) + str_query = str_query.replace("{BASELINE_CT}", empty_if_null(df_test.at[0, "baseline_ct"])) + str_query = str_query.replace("{BASELINE_AVG}", empty_if_null(df_test.at[0, "baseline_avg"])) + str_query = str_query.replace("{BASELINE_SD}", empty_if_null(df_test.at[0, "baseline_sd"])) + str_query = str_query.replace("{THRESHOLD_VALUE}", empty_if_null(df_test.at[0, "threshold_value"])) + + str_substitute = empty_if_null(df_test.at[0, "subset_condition"]) + str_substitute = "1=1" if str_substitute == "" else str_substitute + str_query = str_query.replace("{SUBSET_CONDITION}", str_substitute) + + str_query = str_query.replace("{GROUPBY_NAMES}", empty_if_null(df_test.at[0, "groupby_names"])) + str_query = str_query.replace("{HAVING_CONDITION}", 
empty_if_null(df_test.at[0, "having_condition"])) + str_query = str_query.replace("{MATCH_SCHEMA_NAME}", empty_if_null(df_test.at[0, "match_schema_name"])) + str_query = str_query.replace("{MATCH_TABLE_NAME}", empty_if_null(df_test.at[0, "match_table_name"])) + str_query = str_query.replace("{MATCH_COLUMN_NAMES}", empty_if_null(df_test.at[0, "match_column_names"])) + + str_substitute = empty_if_null(df_test.at[0, "match_subset_condition"]) + str_substitute = "1=1" if str_substitute == "" else str_substitute + str_query = str_query.replace("{MATCH_SUBSET_CONDITION}", str_substitute) + + str_query = str_query.replace("{MATCH_GROUPBY_NAMES}", empty_if_null(df_test.at[0, "match_groupby_names"])) + str_query = str_query.replace("{MATCH_HAVING_CONDITION}", empty_if_null(df_test.at[0, "match_having_condition"])) + str_query = str_query.replace("{COLUMN_NAME_NO_QUOTES}", empty_if_null(tr_data["column_names"])) + + str_query = str_query.replace("{WINDOW_DATE_COLUMN}", empty_if_null(df_test.at[0, "window_date_column"])) + str_query = str_query.replace("{WINDOW_DAYS}", empty_if_null(df_test.at[0, "window_days"])) + + str_substitute = ConcatColumnList(tr_data["column_names"], "") + str_query = str_query.replace("{CONCAT_COLUMNS}", str_substitute) + str_substitute = ConcatColumnList(df_test.at[0, "match_groupby_names"], "") + str_query = str_query.replace("{CONCAT_MATCH_GROUPBY}", str_substitute) + + if "{{DKFN_" in str_query: + str_query = replace_templated_functions(str_query, lst_query[0]["sql_flavor"]) + + if str_query is None or str_query == "": + raise ValueError("Lookup query is not defined for this Test Type.") + return str_query + + try: + # Retrieve SQL for customer lookup + lst_query = db.retrieve_data_list(str_sql) + + if sql_only: + return lst_query, replace_parms, None + + # Retrieve and return data as df + if lst_query: + df_test = get_test_definition(db_schema, tr_data["test_definition_id_current"]) + + str_sql = replace_parms(df_test, lst_query[0]["lookup_query"]) + df = db.retrieve_target_db_df( + lst_query[0]["sql_flavor"], + lst_query[0]["project_host"], + lst_query[0]["project_port"], + lst_query[0]["project_db"], + lst_query[0]["project_user"], + lst_query[0]["project_pw_encrypted"], + str_sql, + lst_query[0]["url"], + lst_query[0]["connect_by_url"], + lst_query[0]["connect_by_key"], + lst_query[0]["private_key"], + lst_query[0]["private_key_passphrase"], + ) + if df.empty: + return "ND", "Data that violates Test criteria is not present in the current dataset.", str_sql, None + else: + return "OK", None, str_sql, df + else: + return "NA", "A source data lookup for this Test is not available.", None, None + + except Exception as e: + return "ERR", f"Source data lookup query caused:\n\n{e.args[0]}", str_sql, None diff --git a/testgen/ui/session.py b/testgen/ui/session.py index b10e251..0e5ef49 100644 --- a/testgen/ui/session.py +++ b/testgen/ui/session.py @@ -1,16 +1,21 @@ -import typing +from collections.abc import Callable +from typing import Any, Literal, TypeVar import streamlit as st from streamlit.runtime.state import SessionStateProxy from testgen.utils.singleton import Singleton +T = TypeVar("T") +TempValueGetter = Callable[..., T] +TempValueSetter = Callable[[T], None] + class TestgenSession(Singleton): - cookies_ready: bool + cookies_ready: int logging_in: bool logging_out: bool - page_pending_cookies: st.Page + page_pending_cookies: st.Page # type: ignore page_pending_login: str page_pending_sidebar: str page_args_pending_router: dict @@ -23,22 +28,24 @@ class 
TestgenSession(Singleton): name: str username: str authentication_status: bool - auth_role: typing.Literal["admin", "edit", "read"] + auth_role: Literal["admin", "edit", "read"] project: str add_project: bool latest_version: str | None + testgen_event_id: str | None + def __init__(self, state: SessionStateProxy) -> None: super().__setattr__("_state", state) - def __getattr__(self, key: str) -> typing.Any: + def __getattr__(self, key: str) -> Any: state = object.__getattribute__(self, "_state") if key not in state: return None return state[key] - def __setattr__(self, key: str, value: typing.Any) -> None: + def __setattr__(self, key: str, value: Any) -> None: object.__getattribute__(self, "_state")[key] = value def __delattr__(self, key: str) -> None: @@ -47,4 +54,17 @@ def __delattr__(self, key: str) -> None: del state[key] +def temp_value(session_key: str, *, default: T | None = None) -> tuple[TempValueGetter[T | None], TempValueSetter[T]]: + scoped_session_key = f"tg-session:tmp-value:{session_key}" + + def getter() -> T | None: + if scoped_session_key not in st.session_state: + return default + return st.session_state.pop(scoped_session_key, None) + + def setter(value: T): + st.session_state[scoped_session_key] = value + + return getter, setter + session: TestgenSession = TestgenSession(st.session_state) diff --git a/testgen/ui/views/connections.py b/testgen/ui/views/connections.py deleted file mode 100644 index 33df711..0000000 --- a/testgen/ui/views/connections.py +++ /dev/null @@ -1,456 +0,0 @@ -import dataclasses -import logging -import os -import time -import typing - -import streamlit as st - -import testgen.ui.services.database_service as db -from testgen.commands.run_setup_profiling_tools import get_setup_profiling_tools_queries -from testgen.common.database.database_service import empty_cache -from testgen.ui.components import widgets as testgen -from testgen.ui.navigation.menu import MenuItem -from testgen.ui.navigation.page import Page -from testgen.ui.services import authentication_service, connection_service -from testgen.ui.session import session - -LOG = logging.getLogger("testgen") - - -class ConnectionsPage(Page): - path = "connections" - can_activate: typing.ClassVar = [ - lambda: session.authentication_status, - ] - menu_item = MenuItem(icon="database", label="Data Configuration", order=4) - - def render(self, project_code: str, **_kwargs) -> None: - dataframe = connection_service.get_connections(project_code) - connection = dataframe.iloc[0] - - testgen.page_header( - "Connection", - "https://docs.datakitchen.io/article/dataops-testgen-help/connect-your-database", - ) - - _, actions_column = st.columns([.1, .9]) - testgen.flex_row_end(actions_column) - - enable_table_groups = connection["project_host"] and connection["project_db"] and connection["project_qc_schema"] - - with st.container(border=True): - self.show_connection_form(connection, "edit", project_code) - - if actions_column.button( - "Configure QC Utility Schema", - help="Creates the required Utility schema and related functions in the target database", - ): - self.create_qc_schema_dialog(connection) - - if actions_column.button( - f":{'gray' if not enable_table_groups else 'green'}[Table Groupsใ€€โ†’]", - help="Create or edit Table Groups for the Connection", - ): - self.router.navigate( - "connections:table-groups", - {"connection_id": connection["connection_id"]}, - ) - - @st.dialog(title="Configure QC Utility Schema") - def create_qc_schema_dialog(self, selected_connection): - connection_id = 
selected_connection["connection_id"] - project_qc_schema = selected_connection["project_qc_schema"] - sql_flavor = selected_connection["sql_flavor"] - user = selected_connection["project_user"] - - create_qc_schema = st.toggle("Create QC Utility Schema", value=True) - grant_privileges = st.toggle("Grant access privileges to TestGen user", value=True) - - user_role = None - - # TODO ALEX: This textbox may be needed if we want to grant permissions to user role - # if sql_flavor == "snowflake": - # user_role_textbox_label = f"Primary role for database user {user}" - # user_role = st.text_input(label=user_role_textbox_label, max_chars=100) - - admin_credentials_expander = st.expander("Admin credential options", expanded=True) - with admin_credentials_expander: - admin_connection_option_index = 0 - admin_connection_options = ["Do not use admin credentials", "Use admin credentials with Password"] - if sql_flavor == "snowflake": - admin_connection_options.append("Use admin credentials with Key-Pair") - - admin_connection_option = st.radio( - "Admin credential options", - label_visibility="hidden", - options=admin_connection_options, - index=admin_connection_option_index, - horizontal=True, - ) - - st.markdown("

 
", unsafe_allow_html=True) - - db_user = None - db_password = None - admin_private_key_passphrase = None - admin_private_key = None - if admin_connection_option == admin_connection_options[0]: - st.markdown(":orange[User created in the connection dialog will be used.]") - else: - db_user = st.text_input(label="Admin db user", max_chars=40) - if admin_connection_option == admin_connection_options[1]: - db_password = st.text_input( - label="Admin db password", max_chars=40, type="password" - ) - st.markdown(":orange[Note: Admin credentials are not stored, are only used for this operation.]") - - if len(admin_connection_options) > 2 and admin_connection_option == admin_connection_options[2]: - admin_private_key_passphrase = st.text_input( - label="Private Key Passphrase", - key="create-qc-schema-private-key-password", - type="password", - max_chars=200, - help="Passphrase used while creating the private Key (leave empty if not applicable)", - ) - - admin_uploaded_file = st.file_uploader("Upload private key (rsa_key.p8)", key="admin-uploaded-file") - if admin_uploaded_file: - admin_private_key = admin_uploaded_file.getvalue().decode("utf-8") - - st.markdown(":orange[Note: Admin credentials are not stored, are only used for this operation.]") - - submit = st.button("Update Configuration") - - if submit: - empty_cache() - script_expander = st.expander("Script Details") - - operation_status = st.empty() - operation_status.info(f"Configuring QC Utility Schema '{project_qc_schema}'...") - - try: - skip_granting_privileges = not grant_privileges - queries = get_setup_profiling_tools_queries(sql_flavor, create_qc_schema, skip_granting_privileges, project_qc_schema, user, user_role) - with script_expander: - st.code( - os.linesep.join(queries), - language="sql", - line_numbers=True) - - connection_service.create_qc_schema( - connection_id, - create_qc_schema, - db_user if db_user else None, - db_password if db_password else None, - skip_granting_privileges, - admin_private_key_passphrase=admin_private_key_passphrase, - admin_private_key=admin_private_key, - user_role=user_role, - ) - operation_status.empty() - operation_status.success("Operation has finished successfully.") - - except Exception as e: - operation_status.empty() - operation_status.error("Error configuring QC Utility Schema.") - error_message = e.args[0] - st.text_area("Error Details", value=error_message) - - def show_connection_form(self, selected_connection, mode, project_code): - flavor_options = ["redshift", "snowflake", "mssql", "postgresql"] - connection_options = ["Connect by Password", "Connect by Key-Pair"] - - left_column, right_column = st.columns([0.75, 0.25]) - - mid_column = st.columns(1)[0] - url_override_toogle_container = st.container() - bottom_left_column, bottom_right_column = st.columns([0.25, 0.75]) - button_left_column, button_right_column = st.columns([0.20, 0.80]) - connection_status_wrapper = st.container() - - connection_id = selected_connection["connection_id"] if mode == "edit" else None - connection_name = selected_connection["connection_name"] if mode == "edit" else "" - sql_flavor_index = flavor_options.index(selected_connection["sql_flavor"]) if mode == "edit" else 0 - project_port = selected_connection["project_port"] if mode == "edit" else "" - project_host = selected_connection["project_host"] if mode == "edit" else "" - project_db = selected_connection["project_db"] if mode == "edit" else "" - project_user = selected_connection["project_user"] if mode == "edit" else "" - url = 
selected_connection["url"] if mode == "edit" else "" - project_qc_schema = selected_connection["project_qc_schema"] if mode == "edit" else "qc" - password = selected_connection["password"] if mode == "edit" else "" - max_threads = selected_connection["max_threads"] if mode == "edit" else 4 - max_query_chars = selected_connection["max_query_chars"] if mode == "edit" else 10000 - connect_by_url = selected_connection["connect_by_url"] if mode == "edit" else False - connect_by_key = selected_connection["connect_by_key"] if mode == "edit" else False - connection_option_index = 1 if connect_by_key else 0 - private_key = selected_connection["private_key"] if mode == "edit" else None - private_key_passphrase = selected_connection["private_key_passphrase"] if mode == "edit" else "" - - new_connection = { - "connection_id": connection_id, - "project_code": project_code, - "private_key": private_key, - "private_key_passphrase": private_key_passphrase, - "password": password, - "url": url, - "max_threads": right_column.number_input( - label="Max Threads (Advanced Tuning)", - min_value=1, - max_value=8, - value=max_threads, - help=( - "Maximum number of concurrent threads that run tests. Default values should be retained unless " - "test queries are failing." - ), - key=f"connections:form:max-threads:{connection_id or 0}", - ), - "max_query_chars": right_column.number_input( - label="Max Expression Length (Advanced Tuning)", - min_value=500, - max_value=14000, - value=max_query_chars, - help="Some tests are consolidated into queries for maximum performance. Default values should be retained unless test queries are failing.", - key=f"connections:form:max-length:{connection_id or 0}", - ), - "connection_name": left_column.text_input( - label="Connection Name", - max_chars=40, - value=connection_name, - help="Your name for this connection. Can be any text.", - key=f"connections:form:name:{connection_id or 0}", - ), - "sql_flavor": left_column.selectbox( - label="SQL Flavor", - options=flavor_options, - index=sql_flavor_index, - help="The type of database server that you will connect to. 
This determines TestGen's drivers and SQL dialect.", - key=f"connections:form:flavor:{connection_id or 0}", - ) - } - - st.session_state.disable_url_widgets = connect_by_url - - new_connection["project_port"] = right_column.text_input( - label="Port", - max_chars=5, - value=project_port, - disabled=st.session_state.disable_url_widgets, - key=f"connections:form:port:{connection_id or 0}", - ) - new_connection["project_host"] = left_column.text_input( - label="Host", - max_chars=250, - value=project_host, - disabled=st.session_state.disable_url_widgets, - key=f"connections:form:host:{connection_id or 0}", - ) - new_connection["project_db"] = left_column.text_input( - label="Database", - max_chars=100, - value=project_db, - help="The name of the database defined on your host where your schemas and tables is present.", - disabled=st.session_state.disable_url_widgets, - key=f"connections:form:database:{connection_id or 0}", - ) - - new_connection["project_user"] = left_column.text_input( - label="User", - max_chars=50, - value=project_user, - help="Username to connect to your database.", - key=f"connections:form:user:{connection_id or 0}", - ) - - new_connection["project_qc_schema"] = right_column.text_input( - label="QC Utility Schema", - max_chars=50, - value=project_qc_schema, - help="The name of the schema on your database that will contain TestGen's profiling functions.", - key=f"connections:form:qcschema:{connection_id or 0}", - ) - - if new_connection["sql_flavor"] == "snowflake": - mid_column.divider() - - connection_option = mid_column.radio( - "Connection options", - options=connection_options, - index=connection_option_index, - horizontal=True, - help="Connection strategy", - key=f"connections:form:type_options:{connection_id or 0}", - ) - - new_connection["connect_by_key"] = connection_option == "Connect by Key-Pair" - password_column = mid_column - else: - new_connection["connect_by_key"] = False - password_column = left_column - - uploaded_file = None - - if new_connection["connect_by_key"]: - new_connection["private_key_passphrase"] = mid_column.text_input( - label="Private Key Passphrase", - type="password", - max_chars=200, - value=private_key_passphrase, - help="Passphrase used while creating the private Key (leave empty if not applicable)", - key=f"connections:form:passphrase:{connection_id or 0}", - ) - - uploaded_file = mid_column.file_uploader("Upload private key (rsa_key.p8)") - else: - new_connection["password"] = password_column.text_input( - label="Password", - max_chars=50, - type="password", - value=password, - help="Password to connect to your database.", - key=f"connections:form:password:{connection_id or 0}", - ) - - mid_column.divider() - - url_override_help_text = "If this switch is set to on, the connection string will be driven by the field below. " - if new_connection["connect_by_key"]: - url_override_help_text += "Only user name will be passed per the relevant fields above." - else: - url_override_help_text += "Only user name and password will be passed per the relevant fields above." 
- - def on_connect_by_url_change(): - value = st.session_state.connect_by_url_toggle - st.session_state.disable_url_widgets = value - - new_connection["connect_by_url"] = url_override_toogle_container.toggle( - "URL override", - value=connect_by_url, - key="connect_by_url_toggle", - help=url_override_help_text, - on_change=on_connect_by_url_change, - ) - - if new_connection["connect_by_url"]: - connection_string = connection_service.form_overwritten_connection_url(new_connection) - connection_string_beginning, connection_string_end = connection_string.split("@", 1) - connection_string_header = connection_string_beginning + "@" - connection_string_header = connection_string_header.replace("%3E", ">") - connection_string_header = connection_string_header.replace("%3C", "<") - - if not url: - url = connection_string_end - - new_connection["url"] = bottom_right_column.text_input( - label="URL Suffix", - max_chars=200, - value=url, - help="Provide a connection string directly. This will override connection parameters if the 'Connect by URL' switch is set.", - ) - - bottom_left_column.text_input(label="URL Prefix", value=connection_string_header, disabled=True) - - bottom_left_column.markdown("

 
", unsafe_allow_html=True) - - testgen.flex_row_end(button_right_column) - submit = button_right_column.button( - "Save" if mode == "edit" else "Add Connection", - disabled=authentication_service.current_user_has_read_role(), - ) - - if submit: - if not new_connection["password"] and not new_connection["connect_by_key"]: - st.error("Enter a valid password.") - else: - if uploaded_file: - new_connection["private_key"] = uploaded_file.getvalue().decode("utf-8") - - if mode == "edit": - connection_service.edit_connection(new_connection) - else: - connection_service.add_connection(new_connection) - success_message = ( - "Changes have been saved successfully. " - if mode == "edit" - else "New connection added successfully. " - ) - st.success(success_message) - time.sleep(1) - st.rerun() - - test_connection = button_left_column.button("Test Connection") - - if test_connection: - single_element_container = connection_status_wrapper.empty() - single_element_container.info("Connecting ...") - connection_status = self.test_connection(new_connection) - - with single_element_container.container(): - renderer = { - True: st.success, - False: st.error, - }[connection_status.successful] - - renderer(connection_status.message) - if not connection_status.successful and connection_status.details: - st.caption("Connection Error Details") - - with st.container(border=True): - st.markdown(connection_status.details) - else: - # This is needed to fix a strange bug in Streamlit when using dialog + input fields + button - # If an input field is changed and the button is clicked immediately (without unfocusing the input first), - # two fragment reruns happen successively, one for unfocusing the input and the other for clicking the button - # Some or all (it seems random) of the input fields disappear when this happens - time.sleep(0.1) - - def test_connection(self, connection: dict) -> "ConnectionStatus": - if connection["connect_by_key"] and connection["connection_id"] is None: - return ConnectionStatus( - message="Please add the connection before testing it (so that we can get your private key file).", - successful=False, - ) - - empty_cache() - try: - sql_query = "select 1;" - results = db.retrieve_target_db_data( - connection["sql_flavor"], - connection["project_host"], - connection["project_port"], - connection["project_db"], - connection["project_user"], - connection["password"], - connection["url"], - connection["connect_by_url"], - connection["connect_by_key"], - connection["private_key"], - connection["private_key_passphrase"], - sql_query, - ) - connection_successful = len(results) == 1 and results[0][0] == 1 - - if not connection_successful: - return ConnectionStatus(message="Error completing a query to the database server.", successful=False) - - qc_error_message = "The connection was successful, but there is an issue with the QC Utility Schema" - try: - qc_results = connection_service.test_qc_connection(connection["project_code"], connection) - if not all(qc_results): - return ConnectionStatus( - message=qc_error_message, - details=f"QC Utility Schema confirmation failed. 
details: {qc_results}", - successful=False, - ) - return ConnectionStatus(message="The connection was successful.", successful=True) - except Exception as error: - return ConnectionStatus(message=qc_error_message, details=error.args[0], successful=False) - except Exception as error: - return ConnectionStatus(message="Error attempting the Connection.", details=error.args[0], successful=False) - - -@dataclasses.dataclass(frozen=True, slots=True) -class ConnectionStatus: - message: str - successful: bool - details: str | None = dataclasses.field(default=None) diff --git a/testgen/ui/views/connections/__init__.py b/testgen/ui/views/connections/__init__.py new file mode 100644 index 0000000..cc9b67f --- /dev/null +++ b/testgen/ui/views/connections/__init__.py @@ -0,0 +1,5 @@ +# ruff: noqa: F401 + +from testgen.ui.views.connections.forms import BaseConnectionForm, KeyPairConnectionForm, PasswordConnectionForm +from testgen.ui.views.connections.models import ConnectionStatus +from testgen.ui.views.connections.page import ConnectionsPage diff --git a/testgen/ui/views/connections/forms.py b/testgen/ui/views/connections/forms.py new file mode 100644 index 0000000..ce7fc42 --- /dev/null +++ b/testgen/ui/views/connections/forms.py @@ -0,0 +1,274 @@ +# type: ignore +import time +import typing + +import streamlit as st +from pydantic import computed_field +from streamlit.delta_generator import DeltaGenerator +from streamlit.runtime.uploaded_file_manager import UploadedFile + +from testgen.ui.components import widgets as testgen +from testgen.ui.forms import BaseForm, Field, ManualRender +from testgen.ui.services import connection_service + +SQL_FLAVORS = ["redshift", "snowflake", "mssql", "postgresql"] +SQLFlavor = typing.Literal["redshift", "snowflake", "mssql", "postgresql"] + + +class BaseConnectionForm(BaseForm, ManualRender): + connection_name: str = Field( + default="", + min_length=3, + max_length=40, + st_kwargs_max_chars=40, + st_kwargs_label="Connection Name", + st_kwargs_help="Your name for this connection. Can be any text.", + ) + project_host: str = Field( + default="", + max_length=250, + st_kwargs_max_chars=250, + st_kwargs_label="Host", + ) + project_port: str = Field(default="", max_length=5, st_kwargs_max_chars=5, st_kwargs_label="Port") + project_db: str = Field( + default="", + max_length=100, + st_kwargs_max_chars=100, + st_kwargs_label="Database", + st_kwargs_help="The name of the database defined on your host where your schemas and tables is present.", + ) + project_user: str = Field( + default="", + max_length=50, + st_kwargs_max_chars=50, + st_kwargs_label="User", + st_kwargs_help="Username to connect to your database.", + ) + connect_by_url: bool = Field( + default=False, + st_kwargs_label="URL override", + st_kwargs_help=( + "If this switch is set to on, the connection string will be driven by the field below. " + "Only user name and password will be passed per the relevant fields above." + ), + ) + url_prefix: str = Field( + default="", + readOnly=True, + st_kwargs_label="URL Prefix", + ) + url: str = Field( + default="", + max_length=200, + st_kwargs_label="URL Suffix", + st_kwargs_max_chars=200, + st_kwargs_help=( + "Provide a connection string directly. This will override connection parameters if " + "the 'Connect by URL' switch is set." 
+ ), + ) + max_threads: int = Field( + default=4, + ge=1, + le=8, + st_kwargs_min_value=1, + st_kwargs_max_value=8, + st_kwargs_label="Max Threads (Advanced Tuning)", + st_kwargs_help=( + "Maximum number of concurrent threads that run tests. Default values should be retained unless " + "test queries are failing." + ), + ) + max_query_chars: int = Field( + default=10000, + ge=500, + le=14000, + st_kwargs_label="Max Expression Length (Advanced Tuning)", + st_kwargs_min_value=500, + st_kwargs_max_value=14000, + st_kwargs_help=( + "Some tests are consolidated into queries for maximum performance. Default values should be retained " + "unless test queries are failing." + ), + ) + + connection_id: int | None = Field(default=None) + + sql_flavor: SQLFlavor = Field( + ..., + st_kwargs_label="SQL Flavor", + st_kwargs_options=SQL_FLAVORS, + st_kwargs_help=( + "The type of database server that you will connect to. This determines TestGen's drivers and SQL dialect." + ), + ) + + def form_key(self): + return f"connection_form:{self.connection_id or 'new'}" + + def render_input_ui(self, container: DeltaGenerator, data: dict) -> "BaseConnectionForm": + time.sleep(0.1) + main_fields_container, optional_fields_container = container.columns([0.7, 0.3]) + + if self.get_field_value("connect_by_url", latest=True): + self.disable("project_host") + self.disable("project_port") + self.disable("project_db") + + self.render_field("sql_flavor", container=main_fields_container) + self.render_field("connection_name", container=main_fields_container) + host_field_container, port_field_container = main_fields_container.columns([0.8, 0.2]) + self.render_field("project_host", container=host_field_container) + self.render_field("project_port", container=port_field_container) + + self.render_field("project_db", container=main_fields_container) + self.render_field("project_user", container=main_fields_container) + self.render_field("max_threads", container=optional_fields_container) + self.render_field("max_query_chars", container=optional_fields_container) + + self.render_extra(container, main_fields_container, optional_fields_container, data) + + testgen.divider(margin_top=8, margin_bottom=8, container=container) + + self.url_prefix = data.get("url_prefix", "") + self.render_field("connect_by_url") + if self.connect_by_url: + connection_string = connection_service.form_overwritten_connection_url(data) + connection_string_beginning, connection_string_end = connection_string.split("@", 1) + + self.update_field_value( + "url_prefix", + f"{connection_string_beginning}@".replace("%3E", ">").replace("%3C", "<"), + ) + if not data.get("url", ""): + self.update_field_value("url", connection_string_end) + + url_override_left_column, url_override_right_column = st.columns([0.25, 0.75]) + self.render_field("url_prefix", container=url_override_left_column) + self.render_field("url", container=url_override_right_column) + + time.sleep(0.1) + + return self + + def render_extra( + self, + _container: DeltaGenerator, + _left_fields_container: DeltaGenerator, + _right_fields_container: DeltaGenerator, + _data: dict, + ) -> None: + ... 
+ + @staticmethod + def for_flavor(flavor: SQLFlavor) -> type["BaseConnectionForm"]: + return { + "redshift": PasswordConnectionForm, + "snowflake": KeyPairConnectionForm, + "mssql": PasswordConnectionForm, + "postgresql": PasswordConnectionForm, + }[flavor] + + +class PasswordConnectionForm(BaseConnectionForm): + password: str = Field( + default="", + max_length=50, + writeOnly=True, + st_kwargs_label="Password", + st_kwargs_max_chars=50, + st_kwargs_help="Password to connect to your database.", + ) + + def render_extra( + self, + _container: DeltaGenerator, + left_fields_container: DeltaGenerator, + _right_fields_container: DeltaGenerator, + _data: dict, + ) -> None: + self.render_field("password", left_fields_container) + + +class KeyPairConnectionForm(PasswordConnectionForm): + connect_by_key: bool = Field(default=None) + private_key_passphrase: str = Field( + default="", + max_length=200, + writeOnly=True, + st_kwargs_max_chars=200, + st_kwargs_help=( + "Passphrase used while creating the private Key (leave empty if not applicable)" + ), + st_kwargs_label="Private Key Passphrase", + ) + _uploaded_file: UploadedFile | None = None + + @computed_field + @property + def private_key(self) -> str: + if self._uploaded_file is None: + return "" + + file_contents: bytes = self._uploaded_file.getvalue() + return file_contents.decode("utf-8") + + def render_extra( + self, + container: DeltaGenerator, + _left_fields_container: DeltaGenerator, + _right_fields_container: DeltaGenerator, + _data: dict, + ) -> None: + testgen.divider(margin_top=8, margin_bottom=8, container=container) + + connect_by_key = self.connect_by_key + if connect_by_key is None: + connect_by_key = self.get_field_value("connect_by_key") + + connection_option: typing.Literal["Connect by Password", "Connect by Key-Pair"] = container.radio( + "Connection options", + options=["Connect by Password", "Connect by Key-Pair"], + index=1 if connect_by_key else 0, + horizontal=True, + help="Connection strategy", + key=self.get_field_key("connection_option"), + ) + self.update_field_value("connect_by_key", connection_option == "Connect by Key-Pair") + + if connection_option == "Connect by Password": + self.render_field("password", container) + else: + self.render_field("private_key_passphrase", container) + + file_uploader_key = self.get_field_key("private_key_uploader") + cached_file_upload_key = self.get_field_key("previous_private_key_file") + + self._uploaded_file = container.file_uploader( + key=file_uploader_key, + label="Upload private key (rsa_key.p8)", + accept_multiple_files=False, + on_change=lambda: st.session_state.pop(cached_file_upload_key, None), + ) + + if self._uploaded_file: + st.session_state[cached_file_upload_key] = self._uploaded_file + elif self._uploaded_file is None and (cached_file_upload := st.session_state.get(cached_file_upload_key)): + self._uploaded_file = cached_file_upload + file_size = f"{round(self._uploaded_file.size / 1024, 2)}KB" + container.markdown( + f""" +
+ draft + {self._uploaded_file.name} + {file_size} +
+ """, + unsafe_allow_html=True, + ) + + def reset_cache(self) -> None: + st.session_state.pop(self.get_field_key("private_key_uploader"), None) + st.session_state.pop(self.get_field_key("previous_private_key_file"), None) + return super().reset_cache() diff --git a/testgen/ui/views/connections/models.py b/testgen/ui/views/connections/models.py new file mode 100644 index 0000000..90f16ca --- /dev/null +++ b/testgen/ui/views/connections/models.py @@ -0,0 +1,8 @@ +import dataclasses + + +@dataclasses.dataclass(frozen=True, slots=True) +class ConnectionStatus: + message: str + successful: bool + details: str | None = dataclasses.field(default=None) diff --git a/testgen/ui/views/connections/page.py b/testgen/ui/views/connections/page.py new file mode 100644 index 0000000..9518ba4 --- /dev/null +++ b/testgen/ui/views/connections/page.py @@ -0,0 +1,328 @@ +import logging +import time +import typing +from functools import partial + +import streamlit as st +import streamlit_pydantic as sp +from pydantic import ValidationError +from streamlit.delta_generator import DeltaGenerator + +import testgen.ui.services.database_service as db +from testgen.commands.run_profiling_bridge import run_profiling_in_background +from testgen.common.database.database_service import empty_cache +from testgen.ui.components import widgets as testgen +from testgen.ui.navigation.menu import MenuItem +from testgen.ui.navigation.page import Page +from testgen.ui.services import connection_service, table_group_service +from testgen.ui.session import session, temp_value +from testgen.ui.views.connections.forms import BaseConnectionForm +from testgen.ui.views.connections.models import ConnectionStatus +from testgen.ui.views.table_groups import TableGroupForm + +LOG = logging.getLogger("testgen") + + +class ConnectionsPage(Page): + path = "connections" + can_activate: typing.ClassVar = [ + lambda: session.authentication_status, + ] + menu_item = MenuItem(icon="database", label="Data Configuration", order=4) + + def render(self, project_code: str, **_kwargs) -> None: + dataframe = connection_service.get_connections(project_code) + connection = dataframe.iloc[0] + has_table_groups = ( + len(connection_service.get_table_group_names_by_connection([connection["connection_id"]]) or []) > 0 + ) + + testgen.page_header( + "Connection", + "connect-your-database", + ) + + testgen.whitespace(0.3) + _, actions_column = st.columns([.1, .9]) + testgen.whitespace(0.3) + testgen.flex_row_end(actions_column) + + with st.container(border=True): + self.show_connection_form(connection.to_dict(), "edit", project_code) + + if has_table_groups: + with actions_column: + testgen.link( + label="Manage Table Groups", + href="connections:table-groups", + params={"connection_id": str(connection["connection_id"])}, + right_icon="chevron_right", + underline=False, + height=40, + style="margin-left: auto; border-radius: 4px; background: var(--dk-card-background);" + " border: var(--button-stroked-border); padding: 8px 8px 8px 16px; color: var(--primary-color)", + ) + else: + with actions_column: + testgen.button( + type_="stroked", + color="primary", + icon="table_view", + label="Setup Table Groups", + style="background: white;", + width=200, + on_click=lambda: self.setup_data_configuration(project_code, connection.to_dict()), + ) + + def show_connection_form(self, selected_connection: dict, _mode: str, project_code) -> None: + connection = selected_connection or {} + connection_id = connection.get("connection_id", None) + sql_flavor = 
connection.get("sql_flavor", "postgresql") + data = {} + + try: + FlavorForm = BaseConnectionForm.for_flavor(sql_flavor) + if connection: + connection["password"] = connection["password"] or "" + FlavorForm = BaseConnectionForm.for_flavor(sql_flavor) + + form_kwargs = connection or {"sql_flavor": sql_flavor} + form = FlavorForm(**form_kwargs) + + sql_flavor = form.get_field_value("sql_flavor", latest=True) or sql_flavor + if form.sql_flavor != sql_flavor: + form = BaseConnectionForm.for_flavor(sql_flavor)(sql_flavor=sql_flavor) + + form_errors_container = st.empty() + data = sp.pydantic_input( + key=f"connection_form:{connection_id or 'new'}", + model=form, # type: ignore + ) + data.update({ + "project_code": project_code, + }) + if "private_key" not in data: + data.update({ + "connect_by_key": False, + "private_key_passphrase": None, + "private_key": None, + }) + + try: + FlavorForm.model_validate(data) + except ValidationError as error: + form_errors_container.warning("\n".join([ + f"- {field_label}: {err['msg']}" for err in error.errors() + if (field_label := FlavorForm.get_field_label(str(err["loc"][0]))) + ])) + except Exception: + LOG.exception("unexpected form validation error") + st.error("Unexpected error displaying the form. Try again") + + test_button_column, _, save_button_column = st.columns([.2, .6, .2]) + is_submitted, set_submitted = temp_value(f"connection_form-{connection_id or 'new'}:submit") + get_connection_status, set_connection_status = temp_value( + f"connection_form-{connection_id or 'new'}:test_conn" + ) + + with save_button_column: + testgen.button( + type_="flat", + label="Save", + key=f"connection_form:{connection_id or 'new'}:submit", + on_click=lambda: set_submitted(True), + ) + + with test_button_column: + testgen.button( + type_="stroked", + color="basic", + label="Test Connection", + key=f"connection_form:{connection_id or 'new'}:test", + on_click=lambda: set_connection_status(self.test_connection(data)), + ) + + if (connection_status := get_connection_status()): + single_element_container = st.empty() + single_element_container.info("Connecting ...") + + with single_element_container.container(): + renderer = { + True: st.success, + False: st.error, + }[connection_status.successful] + + renderer(connection_status.message) + if not connection_status.successful and connection_status.details: + st.caption("Connection Error Details") + + with st.container(border=True): + st.markdown(connection_status.details) + + connection_status = None + else: + # This is needed to fix a strange bug in Streamlit when using dialog + input fields + button + # If an input field is changed and the button is clicked immediately (without unfocusing the input first), + # two fragment reruns happen successively, one for unfocusing the input and the other for clicking the button + # Some or all (it seems random) of the input fields disappear when this happens + time.sleep(0.1) + + if is_submitted(): + if not data.get("password") and not data.get("connect_by_key"): + st.error("Enter a valid password.") + else: + if data.get("private_key"): + data["private_key"] = data["private_key"].getvalue().decode("utf-8") + + connection_service.edit_connection(data) + st.success("Changes have been saved successfully.") + time.sleep(1) + st.rerun() + + def test_connection(self, connection: dict) -> "ConnectionStatus": + if connection["connect_by_key"] and connection["connection_id"] is None: + return ConnectionStatus( + message="Please add the connection before testing it (so that we can get your 
private key file).", + successful=False, + ) + + empty_cache() + try: + sql_query = "select 1;" + results = db.retrieve_target_db_data( + connection["sql_flavor"], + connection["project_host"], + connection["project_port"], + connection["project_db"], + connection["project_user"], + connection["password"], + connection["url"], + connection["connect_by_url"], + connection["connect_by_key"], + connection["private_key"], + connection["private_key_passphrase"], + sql_query, + ) + connection_successful = len(results) == 1 and results[0][0] == 1 + + if not connection_successful: + return ConnectionStatus(message="Error completing a query to the database server.", successful=False) + return ConnectionStatus(message="The connection was successful.", successful=True) + except Exception as error: + return ConnectionStatus(message="Error attempting the Connection.", details=error.args[0], successful=False) + + @st.dialog(title="Data Configuration Setup") + def setup_data_configuration(self, project_code: str, connection: dict) -> None: + will_run_profiling = st.session_state.get("connection_form-new:run-profiling-toggle", True) + testgen.wizard( + key="connections:setup-wizard", + steps=[ + testgen.WizardStep( + title="Create a Table Group", + body=partial(self.create_table_group_step, project_code, connection), + ), + testgen.WizardStep( + title="Run Profiling", + body=self.run_data_profiling_step, + ), + ], + on_complete=self.execute_setup, + complete_label="Save & Run Profiling" if will_run_profiling else "Finish Setup", + navigate_to=st.session_state.pop("setup_data_config:navigate-to", None), + navigate_to_args=st.session_state.pop("setup_data_config:navigate-to-args", {}), + ) + + def create_table_group_step(self, project_code: str, connection: dict) -> tuple[dict | None, bool]: + is_valid: bool = True + data: dict = {} + + try: + form = TableGroupForm.model_construct() + form_errors_container = st.empty() + data = sp.pydantic_input(key="table_form:new", model=form) # type: ignore + + try: + TableGroupForm.model_validate(data) + form_errors_container.empty() + data.update({"project_code": project_code, "connection_id": connection["connection_id"]}) + except ValidationError as error: + form_errors_container.warning("\n".join([ + f"- {field_label}: {err['msg']}" for err in error.errors() + if (field_label := TableGroupForm.get_field_label(str(err["loc"][0]))) + ])) + is_valid = False + except Exception: + LOG.exception("unexpected form validation error") + st.error("Unexpected error displaying the form. Try again") + is_valid = False + + return data, is_valid + + def run_data_profiling_step(self, step_0: testgen.WizardStep | None = None) -> tuple[bool, bool]: + if not step_0 or not step_0.results: + st.error("A table group is required to complete this step.") + return False, False + + run_profiling = True + profiling_message = "Profiling will be performed in a background process." + table_group = step_0.results + + with st.container(): + run_profiling = st.checkbox( + label=f"Execute profiling for the table group **{table_group['table_groups_name']}**?", + key="connection_form-new:run-profiling-toggle", + value=True, + ) + if not run_profiling: + profiling_message = ( + "Profiling will be skipped. You can run this step later from the Profiling Runs page." 
+ ) + st.markdown(f":material/info: _{profiling_message}_") + + return run_profiling, True + + def execute_setup( + self, + container: DeltaGenerator, + step_0: testgen.WizardStep[dict], + step_1: testgen.WizardStep[bool], + ) -> bool: + table_group = step_0.results + table_group_name: str = table_group["table_groups_name"] + should_run_profiling: bool = step_1.results + + with container.container(): + status_container = st.empty() + + try: + status_container.info(f"Creating table group **{table_group_name.strip()}**.") + table_group_id = table_group_service.add(table_group) + TableGroupForm.model_construct().reset_cache() + except Exception as err: + status_container.error(f"Error creating table group: {err!s}.") + + if should_run_profiling: + try: + status_container.info("Starting profiling run ...") + run_profiling_in_background(table_group_id) + status_container.success(f"Profiling run started for table group **{table_group_name.strip()}**.") + except Exception as err: + status_container.error(f"Profiling run encountered errors: {err!s}.") + + _, link_column = st.columns([.7, .3]) + with link_column: + testgen.button( + type_="stroked", + color="primary", + label="Go to Profiling Runs", + icon="chevron_right", + key="setup_data_config:keys:go-to-runs", + on_click=lambda: ( + st.session_state.__setattr__("setup_data_config:navigate-to", "profiling-runs") + or st.session_state.__setattr__("setup_data_config:navigate-to-args", { + "table_group": table_group_id + }) + ), + ) + + return not should_run_profiling diff --git a/testgen/ui/views/data_hierarchy.py b/testgen/ui/views/data_hierarchy.py new file mode 100644 index 0000000..5f9dcbb --- /dev/null +++ b/testgen/ui/views/data_hierarchy.py @@ -0,0 +1,424 @@ +import json +import typing +from functools import partial + +import pandas as pd +import streamlit as st + +import testgen.ui.services.database_service as db +import testgen.ui.services.query_service as dq +from testgen.ui.components import widgets as testgen +from testgen.ui.components.widgets import testgen_component +from testgen.ui.navigation.menu import MenuItem +from testgen.ui.navigation.page import Page +from testgen.ui.queries import project_queries +from testgen.ui.session import session +from testgen.ui.views.dialogs.run_profiling_dialog import run_profiling_dialog +from testgen.utils import is_uuid4 + +PAGE_ICON = "dataset" + +class DataHierarchyPage(Page): + path = "data-hierarchy" + can_activate: typing.ClassVar = [ + lambda: session.authentication_status, + ] + menu_item = MenuItem(icon=PAGE_ICON, label="Data Hierarchy", order=1) + + def render(self, project_code: str | None = None, table_group_id: str | None = None, selected: str | None = None, **_kwargs) -> None: + testgen.page_header( + "Data Hierarchy", + ) + + project_code = project_code or session.project + + if render_empty_state(project_code): + return + + group_filter_column, _, loading_column = st.columns([.3, .5, .2], vertical_alignment="center") + + with group_filter_column: + table_groups_df = get_table_group_options(project_code) + table_group_id = testgen.select( + options=table_groups_df, + value_column="id", + display_column="table_groups_name", + default_value=table_group_id, + required=True, + label="Table Group", + bind_to_query="table_group_id", + ) + + with loading_column: + columns_df = get_table_group_columns(table_group_id) + selected_item = get_selected_item(selected, table_group_id) + if not selected_item: + self.router.set_query_params({ "selected": None }) + + if columns_df.empty: + 
table_group = table_groups_df.loc[table_groups_df["id"] == table_group_id].iloc[0] + testgen.empty_state( + label="No profiling data yet", + icon=PAGE_ICON, + message=testgen.EmptyStateMessage.Profiling, + action_label="Run Profiling", + button_onclick=partial(run_profiling_dialog, project_code, table_group), + button_icon="play_arrow", + ) + else: + def on_tree_node_select(node_id): + self.router.set_query_params({ "selected": node_id }) + + testgen_component( + "data_hierarchy", + props={ "columns": columns_df.to_json(orient="records"), "selected": json.dumps(selected_item) }, + on_change_handlers={ "TreeNodeSelected": on_tree_node_select }, + event_handlers={ "MetadataChanged": on_metadata_changed }, + ) + + +def on_metadata_changed(metadata: dict) -> None: + schema = st.session_state["dbschema"] + item_type, item_id = metadata["id"].split("_", 2) + + if item_type == "table": + update_table = "data_table_chars" + id_column = "table_id" + else: + update_table = "data_column_chars" + id_column = "column_id" + + attributes = [ + "data_source", + "source_system", + "source_process", + "business_domain", + "stakeholder_group", + "transform_level", + "aggregation_level" + ] + cde_value_map = { + True: "TRUE", + False: "FALSE", + None: "NULL", + } + set_attributes = [ f"{key} = NULLIF('{metadata.get(key) or ''}', '')" for key in attributes ] + set_attributes.append(f"critical_data_element = {cde_value_map[metadata.get('critical_data_element')]}") + + sql = f""" + UPDATE {schema}.{update_table} + SET {', '.join(set_attributes)} + WHERE {id_column} = '{item_id}'; + """ + db.execute_sql(sql) + get_selected_item.clear() + st.rerun() + + +def render_empty_state(project_code: str) -> bool: + project_summary_df = project_queries.get_summary_by_code(project_code) + if project_summary_df["profiling_runs_ct"]: # Without profiling, we don't have any table and column information in db + return False + + label="Your project is empty" + testgen.whitespace(5) + if not project_summary_df["connections_ct"]: + testgen.empty_state( + label=label, + icon=PAGE_ICON, + message=testgen.EmptyStateMessage.Connection, + action_label="Go to Connections", + link_href="connections", + ) + else: + testgen.empty_state( + label=label, + icon=PAGE_ICON, + message=testgen.EmptyStateMessage.Profiling if project_summary_df["table_groups_ct"] else testgen.EmptyStateMessage.TableGroup, + action_label="Go to Table Groups", + link_href="connections:table-groups", + link_params={ "connection_id": str(project_summary_df["default_connection_id"]) } + ) + return True + + +@st.cache_data(show_spinner=False) +def get_table_group_options(project_code): + schema = st.session_state["dbschema"] + return dq.run_table_groups_lookup_query(schema, project_code) + + +@st.cache_data(show_spinner="Loading data ...") +def get_table_group_columns(table_group_id: str) -> pd.DataFrame: + schema = st.session_state["dbschema"] + sql = f""" + SELECT CONCAT('column_', column_chars.column_id) AS column_id, + CONCAT('table_', table_chars.table_id) AS table_id, + column_chars.column_name, + table_chars.table_name, + column_chars.general_type, + column_chars.drop_date AS column_drop_date, + table_chars.drop_date AS table_drop_date + FROM {schema}.data_column_chars column_chars + LEFT JOIN {schema}.data_table_chars table_chars ON ( + column_chars.table_id = table_chars.table_id + ) + WHERE column_chars.table_groups_id = '{table_group_id}' + ORDER BY table_name, column_name; + """ + return db.retrieve_data(sql) + + +@st.cache_data(show_spinner="Loading data ...") 
+def get_selected_item(selected: str, table_group_id: str) -> dict | None: + if not selected: + return None + + schema = st.session_state["dbschema"] + item_type, item_id = selected.split("_", 2) + + if item_type not in ["table", "column"] or not is_uuid4(item_id): + return None + + if item_type == "table": + sql = f""" + SELECT table_chars.table_name, + table_chars.table_groups_id::VARCHAR(50) AS table_group_id, + -- Characteristics + functional_table_type, + record_ct, + table_chars.column_ct, + data_point_ct, + add_date AS add_date, + drop_date AS drop_date, + -- Metadata + critical_data_element, + data_source, + source_system, + source_process, + business_domain, + stakeholder_group, + transform_level, + aggregation_level, + -- Latest Profile & Test Runs + last_complete_profile_run_id::VARCHAR(50) AS latest_profile_id, + profiling_starttime AS latest_profile_date, + EXISTS( + SELECT 1 + FROM {schema}.test_results + WHERE table_groups_id = '{table_group_id}' + AND table_name = table_chars.table_name + ) AS has_test_runs + FROM {schema}.data_table_chars table_chars + LEFT JOIN {schema}.profiling_runs ON ( + table_chars.last_complete_profile_run_id = profiling_runs.id + ) + WHERE table_id = '{item_id}' + AND table_chars.table_groups_id = '{table_group_id}'; + """ + else: + sql = f""" + SELECT column_chars.column_name, + column_chars.table_name, + column_chars.table_groups_id::VARCHAR(50) AS table_group_id, + -- Characteristics + column_chars.general_type, + column_chars.column_type, + column_chars.functional_data_type, + datatype_suggestion, + column_chars.add_date AS add_date, + column_chars.last_mod_date AS last_mod_date, + column_chars.drop_date AS drop_date, + -- Column Metadata + column_chars.critical_data_element, + column_chars.data_source, + column_chars.source_system, + column_chars.source_process, + column_chars.business_domain, + column_chars.stakeholder_group, + column_chars.transform_level, + column_chars.aggregation_level, + -- Table Metadata + table_chars.critical_data_element AS table_critical_data_element, + table_chars.data_source AS table_data_source, + table_chars.source_system AS table_source_system, + table_chars.source_process AS table_source_process, + table_chars.business_domain AS table_business_domain, + table_chars.stakeholder_group AS table_stakeholder_group, + table_chars.transform_level AS table_transform_level, + table_chars.aggregation_level AS table_aggregation_level, + -- Latest Profile & Test Runs + column_chars.last_complete_profile_run_id::VARCHAR(50) AS latest_profile_id, + run_date AS latest_profile_date, + EXISTS( + SELECT 1 + FROM {schema}.test_results + WHERE table_groups_id = '{table_group_id}' + AND table_name = column_chars.table_name + AND column_names = column_chars.column_name + ) AS has_test_runs, + -- Value Counts + profile_results.record_ct, + value_ct, + distinct_value_ct, + null_value_ct, + zero_value_ct, + -- Alpha + zero_length_ct, + filled_value_ct, + includes_digit_ct, + numeric_ct, + date_ct, + quoted_value_ct, + lead_space_ct, + embedded_space_ct, + avg_embedded_spaces, + min_length, + max_length, + avg_length, + min_text, + max_text, + distinct_std_value_ct, + distinct_pattern_ct, + std_pattern_match, + top_freq_values, + top_patterns, + -- Numeric + min_value, + min_value_over_0, + max_value, + avg_value, + stdev_value, + percentile_25, + percentile_50, + percentile_75, + -- Date + min_date, + max_date, + before_1yr_date_ct, + before_5yr_date_ct, + before_20yr_date_ct, + within_1yr_date_ct, + within_1mo_date_ct, + 
future_date_ct, + -- Boolean + boolean_true_ct + FROM {schema}.data_column_chars column_chars + LEFT JOIN {schema}.data_table_chars table_chars ON ( + column_chars.table_id = table_chars.table_id + ) + LEFT JOIN {schema}.profile_results ON ( + column_chars.last_complete_profile_run_id = profile_results.profile_run_id + AND column_chars.column_name = profile_results.column_name + ) + WHERE column_id = '{item_id}' + AND column_chars.table_groups_id = '{table_group_id}'; + """ + + item_df = db.retrieve_data(sql) + if not item_df.empty: + # to_json converts datetimes, NaN, etc, to JSON-safe values (Note: to_dict does not) + item = json.loads(item_df.to_json(orient="records"))[0] + item["id"] = selected + item["type"] = item_type + item["latest_anomalies"] = get_profile_anomalies(item["latest_profile_id"], item["table_name"], item.get("column_name")) + item["latest_test_issues"] = get_latest_test_issues(item["table_group_id"], item["table_name"], item.get("column_name")) + return item + + +@st.cache_data(show_spinner=False) +def get_profile_anomalies(profile_run_id: str, table_name: str, column_name: str | None = None) -> dict | None: + schema = st.session_state["dbschema"] + + column_condition = "" + if column_name: + column_condition = f"AND column_name = '{column_name}'" + + sql = f""" + WITH pii_results AS ( + SELECT id, + CASE + WHEN detail LIKE 'Risk: HIGH%%' THEN 'High' + WHEN detail LIKE 'Risk: MODERATE%%' THEN 'Moderate' + ELSE null + END AS pii_risk + FROM {schema}.profile_anomaly_results + ) + SELECT column_name, + anomaly_name, + issue_likelihood, + detail, + pii_risk + FROM {schema}.profile_anomaly_results anomaly_results + LEFT JOIN {schema}.profile_anomaly_types anomaly_types ON ( + anomaly_types.id = anomaly_results.anomaly_id + ) + LEFT JOIN pii_results ON ( + anomaly_results.id = pii_results.id + ) + WHERE profile_run_id = '{profile_run_id}' + AND table_name = '{table_name}' + {column_condition} + AND COALESCE(disposition, 'Confirmed') = 'Confirmed' + ORDER BY + CASE issue_likelihood + WHEN 'Definite' THEN 1 + WHEN 'Likely' THEN 2 + WHEN 'Possible' THEN 3 + ELSE 4 + END, + CASE pii_risk + WHEN 'High' THEN 1 + WHEN 'Moderate' THEN 2 + ELSE 3 + END, + column_name; + """ + + df = db.retrieve_data(sql) + return json.loads(df.to_json(orient="records")) + + +@st.cache_data(show_spinner=False) +def get_latest_test_issues(table_group_id: str, table_name: str, column_name: str | None = None) -> dict | None: + schema = st.session_state["dbschema"] + + column_condition = "" + if column_name: + column_condition = f"AND column_names = '{column_name}'" + + sql = f""" + SELECT test_results.id::VARCHAR(50), + column_names AS column_name, + test_name_short AS test_name, + result_status, + result_message, + test_suite, + test_results.test_run_id::VARCHAR(50), + test_starttime AS test_run_date + FROM {schema}.test_suites + LEFT JOIN {schema}.test_runs ON ( + test_suites.last_complete_test_run_id = test_runs.id + ) + LEFT JOIN {schema}.test_results ON ( + test_runs.id = test_results.test_run_id + ) + LEFT JOIN {schema}.test_types ON ( + test_results.test_type = test_types.test_type + ) + WHERE test_suites.table_groups_id = '{table_group_id}' + AND table_name = '{table_name}' + {column_condition} + AND result_status <> 'Passed' + AND COALESCE(disposition, 'Confirmed') = 'Confirmed' + ORDER BY + CASE result_status + WHEN 'Failed' THEN 1 + WHEN 'Warning' THEN 2 + ELSE 3 + END, + column_name; + """ + + df = db.retrieve_data(sql) + return json.loads(df.to_json(orient="records")) diff --git 
a/testgen/ui/views/app_log_modal.py b/testgen/ui/views/dialogs/application_logs_dialog.py similarity index 100% rename from testgen/ui/views/app_log_modal.py rename to testgen/ui/views/dialogs/application_logs_dialog.py diff --git a/testgen/ui/views/dialogs/generate_tests_dialog.py b/testgen/ui/views/dialogs/generate_tests_dialog.py new file mode 100644 index 0000000..7647645 --- /dev/null +++ b/testgen/ui/views/dialogs/generate_tests_dialog.py @@ -0,0 +1,81 @@ +import time + +import pandas as pd +import streamlit as st + +import testgen.ui.services.test_suite_service as test_suite_service +from testgen.commands.run_generate_tests import run_test_gen_queries +from testgen.ui.components import widgets as testgen + +ALL_TYPES_LABEL = "All Test Types" + + +@st.dialog(title="Generate Tests") +def generate_tests_dialog(test_suite: pd.Series) -> None: + test_suite_id = test_suite["id"] + test_suite_name = test_suite["test_suite"] + table_group_id = test_suite["table_groups_id"] + + selected_set = "" + generation_sets = test_suite_service.get_generation_set_choices() + + if generation_sets: + generation_sets.insert(0, ALL_TYPES_LABEL) + + with st.container(): + selected_set = st.selectbox("Generation Set", generation_sets) + if selected_set == ALL_TYPES_LABEL: + selected_set = "" + + test_ct, unlocked_test_ct, unlocked_edits_ct = test_suite_service.get_test_suite_refresh_warning(test_suite_id) + if test_ct: + unlocked_message = "" + if unlocked_edits_ct > 0: + unlocked_message = "Manual changes have been made to auto-generated tests in this test suite that have not been locked. " + elif unlocked_test_ct > 0: + unlocked_message = "Auto-generated tests are present in this test suite that have not been locked. " + + warning_message = f""" + {unlocked_message} + Generating tests now will overwrite unlocked tests subject to auto-generation based on the latest profiling. 
+ \n\n_Auto-generated Tests: {test_ct}, Unlocked: {unlocked_test_ct}, Edited Unlocked: {unlocked_edits_ct}_ + """ + + with st.container(): + st.warning(warning_message, icon=":material/warning:") + if unlocked_edits_ct > 0: + if st.button("Lock Edited Tests"): + if test_suite_service.lock_edited_tests(test_suite_id): + st.info("Edited tests have been successfully locked.") + + with st.container(): + st.markdown(f"Execute test generation for the test suite **{test_suite_name}**?") + + if testgen.expander_toggle(expand_label="Show CLI command", key="test_suite:keys:generate-tests-show-cli"): + st.code( + f"testgen run-test-generation --table-group-id {table_group_id} --test-suite-key {test_suite_name}", + language="shellSession", + ) + + button_container = st.empty() + status_container = st.empty() + + test_generation_button = None + with button_container: + _, button_column = st.columns([.75, .25]) + with button_column: + test_generation_button = st.button("Generate Tests", use_container_width=True) + + if test_generation_button: + button_container.empty() + status_container.info("Starting test generation ...") + + try: + run_test_gen_queries(table_group_id, test_suite_name, selected_set) + except Exception as e: + status_container.error(f"Test generation encountered errors: {e!s}.") + + status_container.success(f"Test generation completed for test suite **{test_suite_name}**.") + time.sleep(1) + st.cache_data.clear() + st.rerun() diff --git a/testgen/ui/views/profiling_modal.py b/testgen/ui/views/dialogs/profiling_results_dialog.py similarity index 86% rename from testgen/ui/views/profiling_modal.py rename to testgen/ui/views/dialogs/profiling_results_dialog.py index 26f3078..9a07b32 100644 --- a/testgen/ui/views/profiling_modal.py +++ b/testgen/ui/views/dialogs/profiling_results_dialog.py @@ -12,9 +12,8 @@ BUTTON_HELP = "Review profiling for highlighted column" -def view_profiling_button(button_container, str_table_name, str_column_name, - str_profile_run_id=None, str_table_groups_id=None): - with button_container: +def view_profiling_button(str_table_name, str_column_name, str_profile_run_id=None, str_table_groups_id=None): + if str_table_name != "(multi-table)": if st.button( BUTTON_TEXT, help=BUTTON_HELP, use_container_width=True ): diff --git a/testgen/ui/views/dialogs/run_profiling_dialog.py b/testgen/ui/views/dialogs/run_profiling_dialog.py new file mode 100644 index 0000000..b1077f8 --- /dev/null +++ b/testgen/ui/views/dialogs/run_profiling_dialog.py @@ -0,0 +1,84 @@ +import time + +import pandas as pd +import streamlit as st + +import testgen.ui.services.query_service as dq +from testgen.commands.run_profiling_bridge import run_profiling_in_background +from testgen.ui.components import widgets as testgen +from testgen.ui.session import session + +LINK_KEY = "run_profiling_dialog:keys:go-to-runs" +LINK_HREF = "profiling-runs" + + +@st.dialog(title="Run Profiling") +def run_profiling_dialog(project_code: str, table_group: pd.Series | None = None, default_table_group_id: str | None = None) -> None: + if table_group is not None and not table_group.empty: + table_group_id: str = table_group["id"] + table_group_name: str = table_group["table_groups_name"] + else: + table_groups_df = get_table_group_options(project_code) + table_group_id: str = testgen.select( + label="Table Group", + options=table_groups_df, + value_column="id", + display_column="table_groups_name", + default_value=default_table_group_id, + required=True, + ) + table_group_name: str = 
table_groups_df.loc[table_groups_df["id"] == table_group_id, "table_groups_name"].iloc[0] + testgen.whitespace(1) + + with st.container(): + st.markdown(f"Execute profiling for the table group **{table_group_name}**?") + st.markdown(":material/info: _Profiling will be performed in a background process._") + + if testgen.expander_toggle(expand_label="Show CLI command", key="test_suite:keys:run-tests-show-cli"): + st.code(f"testgen run-profile --table-group-id {table_group_id}", language="shellSession") + + button_container = st.empty() + status_container = st.empty() + + with button_container: + _, button_column = st.columns([.85, .15]) + with button_column: + profile_button = st.button("Run Profiling", use_container_width=True, disabled=not table_group_id) + + if profile_button: + button_container.empty() + status_container.info("Starting profiling run ...") + + try: + run_profiling_in_background(table_group_id) + except Exception as e: + status_container.error(f"Profiling run encountered errors: {e!s}.") + + # The second condition is needed for the link to work + if profile_button or st.session_state.get(LINK_KEY): + with status_container.container(): + st.success( + f"Profiling run started for table group **{table_group_name}**." + ) + + if session.current_page != LINK_HREF: + testgen.link( + label="Go to Profiling Runs", + href=LINK_HREF, + params={ "table_group": table_group_id }, + right_icon="chevron_right", + underline=False, + height=40, + key=LINK_KEY, + style="margin-left: auto; border-radius: 4px; border: var(--button-stroked-border); padding: 8px 8px 8px 16px; color: var(--primary-color)", + ) + else: + time.sleep(2) + st.cache_data.clear() + st.rerun() + + +@st.cache_data(show_spinner=False) +def get_table_group_options(project_code: str) -> pd.DataFrame: + schema: str = st.session_state["dbschema"] + return dq.run_table_groups_lookup_query(schema, project_code) diff --git a/testgen/ui/views/dialogs/run_tests_dialog.py b/testgen/ui/views/dialogs/run_tests_dialog.py new file mode 100644 index 0000000..a5b9eb6 --- /dev/null +++ b/testgen/ui/views/dialogs/run_tests_dialog.py @@ -0,0 +1,95 @@ +import time + +import pandas as pd +import streamlit as st + +import testgen.ui.services.database_service as db +from testgen.commands.run_execute_tests import run_execution_steps_in_background +from testgen.ui.components import widgets as testgen +from testgen.ui.session import session + +LINK_KEY = "run_tests_dialog:keys:go-to-runs" +LINK_HREF = "test-runs" + + +@st.dialog(title="Run Tests") +def run_tests_dialog(project_code: str, test_suite: pd.Series | None = None, default_test_suite_id: str | None = None) -> None: + if test_suite is not None and not test_suite.empty: + test_suite_id: str = test_suite["id"] + test_suite_name: str = test_suite["test_suite"] + else: + test_suites_df = get_test_suite_options(project_code) + test_suite_id: str = testgen.select( + label="Test Suite", + options=test_suites_df, + value_column="id", + display_column="test_suite", + default_value=default_test_suite_id, + required=True, + ) + test_suite_name: str = test_suites_df.loc[test_suites_df["id"] == test_suite_id, "test_suite"].iloc[0] + testgen.whitespace(1) + + with st.container(): + st.markdown(f"Run tests for the test suite **{test_suite_name}**?") + st.markdown(":material/info: _Test execution will be performed in a background process._") + + if testgen.expander_toggle(expand_label="Show CLI command", key="run_tests_dialog:keys:show-cli"): + st.code( + f"testgen run-tests --project-key {project_code} 
--test-suite-key {test_suite['test_suite']}", + language="shellSession" + ) + + button_container = st.empty() + status_container = st.empty() + + run_test_button = None + with button_container: + _, button_column = st.columns([.8, .2]) + with button_column: + run_test_button = st.button("Run Tests", use_container_width=True) + + if run_test_button: + button_container.empty() + status_container.info("Starting test run ...") + + try: + run_execution_steps_in_background(project_code, test_suite_name) + except Exception as e: + status_container.error(f"Test run encountered errors: {e!s}.") + + # The second condition is needed for the link to work + if run_test_button or st.session_state.get(LINK_KEY): + with status_container.container(): + st.success( + f"Test run started for test suite **{test_suite_name}**." + ) + + if session.current_page != LINK_HREF: + testgen.link( + label="Go to Test Runs", + href=LINK_HREF, + params={ "test_suite": test_suite_id }, + right_icon="chevron_right", + underline=False, + height=40, + key=LINK_KEY, + style="margin-left: auto; border-radius: 4px; border: var(--button-stroked-border); padding: 8px 8px 8px 16px; color: var(--primary-color)", + ) + else: + time.sleep(2) + st.cache_data.clear() + st.rerun() + + +@st.cache_data(show_spinner=False) +def get_test_suite_options(project_code: str) -> pd.DataFrame: + schema: str = st.session_state["dbschema"] + sql = f""" + SELECT test_suites.id::VARCHAR(50), + test_suites.test_suite + FROM {schema}.test_suites + WHERE test_suites.project_code = '{project_code}' + ORDER BY test_suites.test_suite + """ + return db.retrieve_data(sql) diff --git a/testgen/ui/views/profiling_anomalies.py b/testgen/ui/views/hygiene_issues.py similarity index 71% rename from testgen/ui/views/profiling_anomalies.py rename to testgen/ui/views/hygiene_issues.py index 29dc430..e227646 100644 --- a/testgen/ui/views/profiling_anomalies.py +++ b/testgen/ui/views/hygiene_issues.py @@ -1,5 +1,7 @@ import typing +from io import BytesIO +import pandas as pd import plotly.express as px import streamlit as st @@ -9,49 +11,61 @@ import testgen.ui.services.query_service as dq from testgen.common import date_service from testgen.ui.components import widgets as testgen +from testgen.ui.components.widgets.download_dialog import FILE_DATA_TYPE, download_dialog, zip_multi_file_data from testgen.ui.navigation.page import Page +from testgen.ui.pdf.hygiene_issue_report import create_report from testgen.ui.services import project_service +from testgen.ui.services.hygiene_issues_service import get_source_data as get_source_data_uncached from testgen.ui.session import session -from testgen.ui.views.profiling_modal import view_profiling_button +from testgen.ui.views.dialogs.profiling_results_dialog import view_profiling_button -class ProfilingAnomaliesPage(Page): +class HygieneIssuesPage(Page): path = "profiling-runs:hygiene" can_activate: typing.ClassVar = [ lambda: session.authentication_status, lambda: "run_id" in session.current_page_args or "profiling-runs", ] - def render(self, run_id: str, issue_class: str | None = None, issue_type: str | None = None, **_kwargs) -> None: + def render( + self, + run_id: str, + issue_class: str | None = None, + issue_type: str | None = None, + table_name: str | None = None, + column_name: str | None = None, + **_kwargs, + ) -> None: run_parentage = profiling_queries.lookup_db_parentage_from_run(run_id) if not run_parentage: self.router.navigate_with_warning( f"Profiling run with ID '{run_id}' does not exist. 
Redirecting to list of Profiling Runs ...", "profiling-runs", ) - + return + run_date, _table_group_id, table_group_name, project_code = run_parentage run_date = date_service.get_timezoned_timestamp(st.session_state, run_date) project_service.set_current_project(project_code) testgen.page_header( "Hygiene Issues", - "https://docs.datakitchen.io/article/dataops-testgen-help/profile-anomalies", + "view-hygiene-issues", breadcrumbs=[ { "label": "Profiling Runs", "path": "profiling-runs", "params": { "project_code": project_code } }, { "label": f"{table_group_name} | {run_date}" }, ], ) - others_summary_column, pii_summary_column, _ = st.columns([.3, .3, .4]) - (liklihood_filter_column, issue_type_filter_column, sort_column, actions_column, export_button_column) = ( - st.columns([.16, .34, .08, .32, .1], vertical_alignment="bottom") + others_summary_column, pii_summary_column, actions_column = st.columns([.25, .25, .5], vertical_alignment="bottom") + (liklihood_filter_column, issue_type_filter_column, table_filter_column, column_filter_column, sort_column, export_button_column) = ( + st.columns([.15, .25, .2, .2, .1, .1], vertical_alignment="bottom") ) testgen.flex_row_end(actions_column) testgen.flex_row_end(export_button_column) with liklihood_filter_column: - issue_class = testgen.toolbar_select( + issue_class = testgen.select( options=["Definite", "Likely", "Possible", "Potential PII"], default_value=issue_class, required=False, @@ -61,7 +75,7 @@ def render(self, run_id: str, issue_class: str | None = None, issue_type: str | with issue_type_filter_column: issue_type_options = get_issue_types() - issue_type_id = testgen.toolbar_select( + issue_type_id = testgen.select( options=issue_type_options, default_value=None if issue_class == "Potential PII" else issue_type, value_column="id", @@ -72,6 +86,26 @@ def render(self, run_id: str, issue_class: str | None = None, issue_type: str | disabled=issue_class == "Potential PII", ) + run_columns_df = get_profiling_run_columns(run_id) + with table_filter_column: + table_name = testgen.select( + options=list(run_columns_df["table_name"].unique()), + default_value=table_name, + bind_to_query="table_name", + label="Table Name", + ) + + with column_filter_column: + column_options = list(run_columns_df.loc[run_columns_df["table_name"] == table_name]["column_name"].unique()) + column_name = testgen.select( + options=column_options, + value_column="column_name", + default_value=column_name, + bind_to_query="column_name", + label="Column Name", + disabled=not table_name, + ) + with sort_column: sortable_columns = ( ("Table", "r.table_name"), @@ -89,7 +123,7 @@ def render(self, run_id: str, issue_class: str | None = None, issue_type: str | # Get hygiene issue list - df_pa = get_profiling_anomalies(run_id, issue_class, issue_type_id, sorting_columns) + df_pa = get_profiling_anomalies(run_id, issue_class, issue_type_id, table_name, column_name, sorting_columns) # Retrieve disposition action (cache refreshed) df_action = get_anomaly_disposition(run_id) @@ -104,7 +138,7 @@ def render(self, run_id: str, issue_class: str | None = None, issue_type: str | testgen.summary_bar( items=others_summary, label="Hygiene Issues", - height=40, + height=20, width=400, ) @@ -114,7 +148,7 @@ def render(self, run_id: str, issue_class: str | None = None, issue_type: str | testgen.summary_bar( items=anomalies_pii_summary, label="Potential PII", - height=40, + height=20, width=400, ) # write_frequency_graph(df_pa) @@ -130,7 +164,12 @@ def render(self, run_id: str, issue_class: str | 
None = None, issue_type: str | # Show main grid and retrieve selections selected = fm.render_grid_select( - df_pa, lst_show_columns, int_height=400, do_multi_select=do_multi_select + df_pa, + lst_show_columns, + int_height=400, + do_multi_select=do_multi_select, + bind_to_query_name="selected", + bind_to_query_prop="id", ) with export_button_column: @@ -160,7 +199,7 @@ def render(self, run_id: str, issue_class: str | None = None, issue_type: str | if not selected_row: st.markdown(":orange[Select a record to see more information.]") else: - col1, col2 = st.columns([0.7, 0.3]) + col1, col2 = st.columns([0.8, 0.2]) with col1: fm.render_html_list( selected_row, @@ -178,17 +217,33 @@ def render(self, run_id: str, issue_class: str | None = None, issue_type: str | int_data_width=700, ) with col2: - # _, v_col2 = st.columns([0.3, 0.7]) - v_col1, v_col2 = st.columns([0.5, 0.5]) - view_profiling_button( - v_col1, selected_row["table_name"], selected_row["column_name"], - str_profile_run_id=run_id - ) - with v_col2: + view_profiling_button( + selected_row["table_name"], selected_row["column_name"], str_profile_run_id=run_id + ) + if st.button( "Source Data →", help="Review current source data for highlighted issue", use_container_width=True ): source_data_dialog(selected_row) + if st.button( + ":material/file_save: Issue Report", + use_container_width=True, + help="Generate a PDF report for each selected issue", + ): + dialog_title = "Download Issue Report" + if len(selected) == 1: + download_dialog( + dialog_title=dialog_title, + file_content_func=get_report_file_data, + args=(selected[0],), + ) + else: + zip_func = zip_multi_file_data( + "testgen_hygiene_issue_reports.zip", + get_report_file_data, + [(arg,) for arg in selected], + ) + download_dialog(dialog_title=dialog_title, file_content_func=zip_func) cached_functions = [get_anomaly_disposition, get_profiling_anomaly_summary] # Clear the list cache if the list is sorted by disposition/action @@ -225,24 +280,48 @@ def render(self, run_id: str, issue_class: str | None = None, issue_type: str | @st.cache_data(show_spinner=False) -def get_db_table_group_choices(str_project_code): - str_schema = st.session_state["dbschema"] - return dq.run_table_groups_lookup_query(str_schema, str_project_code) +def get_db_table_group_choices(project_code: str) -> pd.DataFrame: + schema: str = st.session_state["dbschema"] + return dq.run_table_groups_lookup_query(schema, project_code) + + +@st.cache_data(show_spinner=False) +def get_profiling_run_columns(profiling_run_id: str) -> pd.DataFrame: + schema: str = st.session_state["dbschema"] + sql = f""" + SELECT table_name, column_name + FROM {schema}.profile_anomaly_results + WHERE profile_run_id = '{profiling_run_id}' + ORDER BY table_name, column_name; + """ + return db.retrieve_data(sql) @st.cache_data(show_spinner="Retrieving Data") -def get_profiling_anomalies(str_profile_run_id, str_likelihood, issue_type_id, sorting_columns): - str_schema = st.session_state["dbschema"] - if str_likelihood is None: - str_criteria = " AND t.issue_likelihood <> 'Potential PII'" - else: - str_criteria = f" AND t.issue_likelihood = '{str_likelihood}'" - if sorting_columns: - str_order_by = "ORDER BY " + (", ".join(" ".join(col) for col in sorting_columns)) - else: - str_order_by = "" +def get_profiling_anomalies( + profile_run_id: str, + likelihood: str | None, + issue_type_id: str | None, + table_name: str | None, + column_name: str | None, + sorting_columns: list[str] | None, +): + schema: str = st.session_state["dbschema"] +
criteria = "" + order_by = "" + + if likelihood: + criteria += f" AND t.issue_likelihood = '{likelihood}'" if issue_type_id: - str_criteria += f" AND t.id = '{issue_type_id}'" + criteria += f" AND t.id = '{issue_type_id}'" + if table_name: + criteria += f" AND r.table_name = '{table_name}'" + if column_name: + criteria += f" AND r.column_name = '{column_name}'" + + if sorting_columns: + order_by = "ORDER BY " + (", ".join(" ".join(col) for col in sorting_columns)) + # Define the query -- first visible column must be first, because will hold the multi-select box str_sql = f""" SELECT r.table_name, r.column_name, r.schema_name, @@ -262,15 +341,18 @@ def get_profiling_anomalies(str_profile_run_id, str_likelihood, issue_type_id, s WHEN t.issue_likelihood = 'Definite' THEN 4 END AS likelihood_order, t.anomaly_description, r.detail, t.suggested_action, - r.anomaly_id, r.table_groups_id::VARCHAR, r.id::VARCHAR, p.profiling_starttime - FROM {str_schema}.profile_anomaly_results r - INNER JOIN {str_schema}.profile_anomaly_types t + r.anomaly_id, r.table_groups_id::VARCHAR, r.id::VARCHAR, p.profiling_starttime, r.profile_run_id::VARCHAR, + tg.table_groups_name + FROM {schema}.profile_anomaly_results r + INNER JOIN {schema}.profile_anomaly_types t ON r.anomaly_id = t.id - INNER JOIN {str_schema}.profiling_runs p + INNER JOIN {schema}.profiling_runs p ON r.profile_run_id = p.id - WHERE r.profile_run_id = '{str_profile_run_id}' - {str_criteria} - {str_order_by} + INNER JOIN {schema}.table_groups tg + ON r.table_groups_id = tg.id + WHERE r.profile_run_id = '{profile_run_id}' + {criteria} + {order_by} """ # Retrieve data as df df = db.retrieve_data(str_sql) @@ -345,88 +427,8 @@ def get_profiling_anomaly_summary(str_profile_run_id): @st.cache_data(show_spinner=False) -def get_bad_data(selected_row): - str_schema = st.session_state["dbschema"] - # Define the query - str_sql = f""" - SELECT t.lookup_query, tg.table_group_schema, c.project_qc_schema, - c.sql_flavor, c.project_host, c.project_port, c.project_db, c.project_user, c.project_pw_encrypted, - c.url, c.connect_by_url, c.connect_by_key, c.private_key, c.private_key_passphrase - FROM {str_schema}.target_data_lookups t - INNER JOIN {str_schema}.table_groups tg - ON ('{selected_row["table_groups_id"]}'::UUID = tg.id) - INNER JOIN {str_schema}.connections c - ON (tg.connection_id = c.connection_id) - AND (t.sql_flavor = c.sql_flavor) - WHERE t.error_type = 'Profile Anomaly' - AND t.test_id = '{selected_row["anomaly_id"]}' - AND t.lookup_query > ''; - """ - - def get_lookup_query(test_id, detail_exp, column_names): - if test_id in {"1019", "1020"}: - start_index = detail_exp.find("Columns: ") - if start_index == -1: - columns = [col.strip() for col in column_names.split(",")] - else: - start_index += len("Columns: ") - column_names_str = detail_exp[start_index:] - columns = [col.strip() for col in column_names_str.split(",")] - queries = [ - f"SELECT '{column}' AS column_name, MAX({column}) AS max_date_available FROM {{TARGET_SCHEMA}}.{{TABLE_NAME}}" - for column in columns - ] - sql_query = " UNION ALL ".join(queries) + " ORDER BY max_date_available DESC;" - else: - sql_query = "" - return sql_query - - def replace_parms(str_query): - str_query = ( - get_lookup_query(selected_row["anomaly_id"], selected_row["detail"], selected_row["column_name"]) - if lst_query[0]["lookup_query"] == "created_in_ui" - else lst_query[0]["lookup_query"] - ) - str_query = str_query.replace("{TARGET_SCHEMA}", lst_query[0]["table_group_schema"]) - str_query = 
str_query.replace("{TABLE_NAME}", selected_row["table_name"]) - str_query = str_query.replace("{COLUMN_NAME}", selected_row["column_name"]) - str_query = str_query.replace("{DATA_QC_SCHEMA}", lst_query[0]["project_qc_schema"]) - str_query = str_query.replace("{DETAIL_EXPRESSION}", selected_row["detail"]) - str_query = str_query.replace("{PROFILE_RUN_DATE}", selected_row["profiling_starttime"]) - if str_query is None or str_query == "": - raise ValueError("Lookup query is not defined for this Anomoly Type.") - return str_query - - try: - # Retrieve SQL for customer lookup - lst_query = db.retrieve_data_list(str_sql) - - # Retrieve and return data as df - if lst_query: - str_sql = replace_parms(str_sql) - df = db.retrieve_target_db_df( - lst_query[0]["sql_flavor"], - lst_query[0]["project_host"], - lst_query[0]["project_port"], - lst_query[0]["project_db"], - lst_query[0]["project_user"], - lst_query[0]["project_pw_encrypted"], - str_sql, - lst_query[0]["url"], - lst_query[0]["connect_by_url"], - lst_query[0]["connect_by_key"], - lst_query[0]["private_key"], - lst_query[0]["private_key_passphrase"], - ) - if df.empty: - return "ND", "Data that violates Hygiene Issue criteria is not present in the current dataset.", None - else: - return "OK", None, df - else: - return "NA", "A source data lookup for this Issue is not available.", None - - except Exception as e: - return "ERR", f"Source data lookup query caused an error:\n\n{e.args[0]}", None +def get_source_data(hi_data): + return get_source_data_uncached(hi_data) def write_frequency_graph(df_tests): @@ -457,7 +459,7 @@ def source_data_dialog(selected_row): fm.render_html_list(selected_row, ["detail"], None, 700, ["Hygiene Issue Detail"]) with st.spinner("Retrieving source data..."): - bad_data_status, bad_data_msg, df_bad = get_bad_data(selected_row) + bad_data_status, bad_data_msg, _, df_bad = get_source_data(selected_row) if bad_data_status in {"ND", "NA"}: st.info(bad_data_msg) elif bad_data_status == "ERR": @@ -487,3 +489,14 @@ def do_disposition_update(selected, str_new_status): str_result = f":red[**The update {str_which} did not succeed.**]" return str_result + +def get_report_file_data(update_progress, tr_data) -> FILE_DATA_TYPE: + hi_id = tr_data["id"][:8] + profiling_time = pd.Timestamp(tr_data["profiling_starttime"]).strftime("%Y%m%d_%H%M%S") + file_name = f"testgen_hygiene_issue_report_{hi_id}_{profiling_time}.pdf" + + with BytesIO() as buffer: + create_report(buffer, tr_data) + update_progress(1.0) + buffer.seek(0) + return file_name, "application/pdf", buffer.read() diff --git a/testgen/ui/views/login.py b/testgen/ui/views/login.py index 13e08fa..beb50a0 100644 --- a/testgen/ui/views/login.py +++ b/testgen/ui/views/login.py @@ -4,6 +4,7 @@ import streamlit as st import streamlit_authenticator as stauth +from testgen.ui.components import widgets as testgen from testgen.ui.navigation.page import Page from testgen.ui.services import javascript_service, user_session_service from testgen.ui.session import session @@ -28,12 +29,16 @@ def render(self, **_kwargs) -> None: auth_data["preauthorized"], ) - _column_1, column_2, _column_3 = st.columns([0.25, 0.5, 0.25]) - with column_2: - st.markdown(""" + _, login_column, links_column = st.columns([0.25, 0.5, 0.25]) + + with links_column: + testgen.page_links() + + with login_column: + st.html("""


Welcome to DataKitchen DataOps TestGen

- """, unsafe_allow_html=True) + """) name, authentication_status, username = authenticator.login("Login") if authentication_status is False: diff --git a/testgen/ui/views/overview.py b/testgen/ui/views/overview.py index 727d643..e37565e 100644 --- a/testgen/ui/views/overview.py +++ b/testgen/ui/views/overview.py @@ -3,17 +3,20 @@ import pandas as pd import streamlit as st +from pandas.api.types import is_string_dtype import testgen.ui.services.database_service as db from testgen.common import date_service from testgen.ui.components import widgets as testgen from testgen.ui.navigation.menu import MenuItem from testgen.ui.navigation.page import Page +from testgen.ui.queries import project_queries from testgen.ui.services import test_suite_service from testgen.ui.session import session -from testgen.utils import to_int +from testgen.utils import to_int, truncate STALE_PROFILE_DAYS = 30 +PAGE_ICON = "home" class OverviewPage(Page): @@ -21,27 +24,90 @@ class OverviewPage(Page): can_activate: typing.ClassVar = [ lambda: session.authentication_status, ] - menu_item = MenuItem(icon="home", label="Overview", order=0) + menu_item = MenuItem(icon=PAGE_ICON, label="Overview", order=0) def render(self, project_code: str | None = None, **_kwargs): - project_code = project_code or session.project - table_groups_df: pd.DataFrame = get_table_groups_summary(project_code) - testgen.page_header( "Project Overview", - "https://docs.datakitchen.io/article/dataops-testgen-help/introduction-to-dataops-testgen", + "introduction-to-dataops-testgen", ) + project_code = project_code or session.project + table_groups_df: pd.DataFrame = get_table_groups_summary(project_code) render_project_summary(table_groups_df) - st.html(f'
Table Groups ({len(table_groups_df.index)})
') + if render_empty_state(project_code): + return + + table_group_header_col, table_group_filter_col, table_group_sort_col = st.columns([0.6, 0.2, 0.2]) + table_group_header_col.html(f'
Table Groups ({len(table_groups_df.index)})
') + with table_group_filter_col: + name_filter = st.text_input(label="Search by table group name") + table_groups_df = table_groups_df.loc[ + table_groups_df["table_groups_name"].str.contains(name_filter, case=False) + ] + + with table_group_sort_col: + table_groups_df["latest_activity_date"] = table_groups_df[ + ["latest_profile_start", "latest_tests_start"] + ].apply(pd.to_datetime).max(axis=1) # apply is needed to handle missing values + ascending_fields: list[str] = ["table_groups_name"] + sort_options = pd.DataFrame({ + "value": ["table_groups_name", "latest_activity_date"], + "label": ["Table group name", "Latest activity"], + }) + + sort_by = testgen.select( + label="Sort by", + options=sort_options, + required=True, + default_value="latest_activity_date", + display_column="label", + value_column="value", + ) + + table_groups_df.sort_values( + by=typing.cast(str, sort_by), + ascending=sort_by in ascending_fields, + inplace=True, + key=lambda column: column.str.lower() if is_string_dtype(column) else column, + ) + for index, table_group in table_groups_df.iterrows(): render_table_group_card(table_group, project_code, index) +def render_empty_state(project_code: str) -> bool: + project_summary_df = project_queries.get_summary_by_code(project_code) + if project_summary_df["profiling_runs_ct"] or project_summary_df["test_runs_ct"]: + return False + + label="Your project is empty" + testgen.whitespace(3) + if not project_summary_df["connections_ct"]: + testgen.empty_state( + label=label, + icon=PAGE_ICON, + message=testgen.EmptyStateMessage.Connection, + action_label="Go to Connections", + link_href="connections", + ) + else: + testgen.empty_state( + label=label, + icon=PAGE_ICON, + message=testgen.EmptyStateMessage.Profiling if project_summary_df["table_groups_ct"] else testgen.EmptyStateMessage.TableGroup, + action_label="Go to Table Groups", + link_href="connections:table-groups", + link_params={ "connection_id": str(project_summary_df["default_connection_id"]) } + ) + return True + + def render_project_summary(table_groups: pd.DataFrame) -> None: project_column, _ = st.columns([.5, .5]) with project_column: + testgen.whitespace(0.3) with testgen.card(): summary_column, _ = st.columns([.8, .2]) # TODO: Uncomment and replace with below section when adding the score @@ -107,9 +173,19 @@ def render_table_group_card(table_group: pd.Series, project_code: str, key: int) ) anomaly_count = to_int(table_group["latest_anomalies_ct"]) - st.html(f""" - {anomaly_count} hygiene issues in {to_int(table_group["latest_profile_table_ct"])} tables - """) + with st.container(): + testgen.flex_row_start() + testgen.text(f""" + {to_int(table_group['latest_profile_table_ct'])} tables  |  + {to_int(table_group['latest_profile_column_ct'])} columns  | + """) + testgen.link( + label=f"{anomaly_count} hygiene issues", + href="profiling-runs:hygiene", + params={ "run_id": str(table_group["latest_profile_id"]) }, + width=150, + key=f"overview:keys:go-to-issues:{table_group['latest_profile_id']}", + ) if anomaly_count: testgen.summary_bar( @@ -131,11 +207,8 @@ def render_table_group_card(table_group: pd.Series, project_code: str, key: int) total_tests = to_int(table_group["latest_tests_ct"]) if total_tests: passed_tests = to_int(table_group["latest_tests_passed_ct"]) - - st.html(f""" -

-            {round(passed_tests * 100 / total_tests)}% passed
- {total_tests} tests in {to_int(table_group["latest_tests_suite_ct"])} test suites - """) + testgen.text(f"{truncate(passed_tests * 100 / total_tests)}% passed") + testgen.text(f"{total_tests} tests in {to_int(table_group['latest_tests_suite_ct'])} test suites", "margin: 12px 0 12px;") testgen.summary_bar( items=[ @@ -182,7 +255,7 @@ def render_test_suite_item(test_suite: pd.Series, column_spec: list[int]) -> Non params={ "test_suite_id": str(test_suite["id"]) }, key=f"overview:keys:go-to-definitions:{test_suite['id']}", ) - testgen.caption(f"{to_int(test_suite['last_run_test_ct'])} tests", "margin-top: -16px;") + testgen.caption(f"{to_int(test_suite['test_ct'])} tests", "margin-top: -16px;") with generation_column: if (latest_generation := test_suite["latest_auto_gen_date"]) and pd.notnull(latest_generation): @@ -221,17 +294,12 @@ def render_test_suite_item(test_suite: pd.Series, column_spec: list[int]) -> Non def get_table_groups_summary(project_code: str) -> pd.DataFrame: schema = st.session_state["dbschema"] sql = f""" - WITH latest_profile_dates AS ( - SELECT table_groups_id, - MAX(profiling_starttime) as profiling_starttime - FROM {schema}.profiling_runs - GROUP BY table_groups_id - ), - latest_profile AS ( + WITH latest_profile AS ( SELECT latest_run.table_groups_id, latest_run.id, latest_run.profiling_starttime, latest_run.table_ct, + latest_run.column_ct, latest_run.anomaly_ct, SUM( CASE @@ -261,10 +329,9 @@ def get_table_groups_summary(project_code: str) -> pd.DataFrame: ELSE 0 END ) as dismissed_ct - FROM latest_profile_dates lpd + FROM {schema}.table_groups groups LEFT JOIN {schema}.profiling_runs latest_run ON ( - lpd.table_groups_id = latest_run.table_groups_id - AND lpd.profiling_starttime = latest_run.profiling_starttime + groups.last_complete_profile_run_id = latest_run.id ) LEFT JOIN {schema}.profile_anomaly_results latest_anomalies ON ( latest_run.id = latest_anomalies.profile_run_id @@ -274,16 +341,11 @@ def get_table_groups_summary(project_code: str) -> pd.DataFrame: ) GROUP BY latest_run.id ), - latest_run_dates AS ( - SELECT test_suite_id, - MAX(test_starttime) as test_starttime - FROM {schema}.test_runs - GROUP BY test_suite_id - ), latest_tests AS ( SELECT suites.table_groups_id, + MAX(latest_run.test_starttime) AS test_starttime, COUNT(DISTINCT latest_run.test_suite_id) as test_suite_ct, - COUNT(*) as test_ct, + COUNT(latest_results.id) as test_ct, SUM( CASE WHEN COALESCE(latest_results.disposition, 'Confirmed') = 'Confirmed' @@ -318,15 +380,13 @@ def get_table_groups_summary(project_code: str) -> pd.DataFrame: ELSE 0 END ) as dismissed_ct - FROM latest_run_dates lrd + FROM {schema}.test_suites suites LEFT JOIN {schema}.test_runs latest_run ON ( - lrd.test_suite_id = latest_run.test_suite_id - AND lrd.test_starttime = latest_run.test_starttime + suites.last_complete_test_run_id = latest_run.id ) LEFT JOIN {schema}.test_results latest_results ON ( latest_run.id = latest_results.test_run_id ) - LEFT JOIN {schema}.test_suites as suites ON (suites.id = lrd.test_suite_id) GROUP BY suites.table_groups_id ) SELECT groups.id::VARCHAR(50), @@ -334,11 +394,13 @@ def get_table_groups_summary(project_code: str) -> pd.DataFrame: latest_profile.id as latest_profile_id, latest_profile.profiling_starttime as latest_profile_start, latest_profile.table_ct as latest_profile_table_ct, + latest_profile.column_ct as latest_profile_column_ct, latest_profile.anomaly_ct as latest_anomalies_ct, latest_profile.definite_ct as latest_anomalies_definite_ct, latest_profile.likely_ct as 
latest_anomalies_likely_ct, latest_profile.possible_ct as latest_anomalies_possible_ct, latest_profile.dismissed_ct as latest_anomalies_dismissed_ct, + latest_tests.test_starttime as latest_tests_start, latest_tests.test_suite_ct as latest_tests_suite_ct, latest_tests.test_ct as latest_tests_ct, latest_tests.passed_ct as latest_tests_passed_ct, diff --git a/testgen/ui/views/profiling_results.py b/testgen/ui/views/profiling_results.py index aa94ae6..3e81017 100644 --- a/testgen/ui/views/profiling_results.py +++ b/testgen/ui/views/profiling_results.py @@ -30,6 +30,7 @@ def render(self, run_id: str, table_name: str | None = None, column_name: str | f"Profiling run with ID '{run_id}' does not exist. Redirecting to list of Profiling Runs ...", "profiling-runs", ) + return run_date, table_group_id, table_group_name, project_code = run_parentage run_date = date_service.get_timezoned_timestamp(st.session_state, run_date) @@ -37,7 +38,7 @@ def render(self, run_id: str, table_name: str | None = None, column_name: str | testgen.page_header( "Data Profiling Results", - "https://docs.datakitchen.io/article/dataops-testgen-help/investigate-profiling", + "view-data-profiling-results", breadcrumbs=[ { "label": "Profiling Runs", "path": "profiling-runs", "params": { "project_code": project_code } }, { "label": f"{table_group_name} | {run_date}" }, @@ -51,7 +52,7 @@ def render(self, run_id: str, table_name: str | None = None, column_name: str | with table_filter_column: # Table Name filter df = profiling_queries.run_table_lookup_query(table_group_id) - table_name = testgen.toolbar_select( + table_name = testgen.select( options=df, value_column="table_name", default_value=table_name, @@ -62,7 +63,7 @@ def render(self, run_id: str, table_name: str | None = None, column_name: str | with column_filter_column: # Column Name filter df = profiling_queries.run_column_lookup_query(table_group_id, table_name) - column_name = testgen.toolbar_select( + column_name = testgen.select( options=df, value_column="column_name", default_value=column_name, @@ -105,7 +106,12 @@ def render(self, run_id: str, table_name: str | None = None, column_name: str | with st.expander("๐Ÿ“œ **Table CREATE script with suggested datatypes**"): st.code(generate_create_script(df), "sql") - selected_row = fm.render_grid_select(df, show_columns) + selected_row = fm.render_grid_select( + df, + show_columns, + bind_to_query_name="selected", + bind_to_query_prop="id", + ) with export_button_column: testgen.flex_row_end() diff --git a/testgen/ui/views/profiling_summary.py b/testgen/ui/views/profiling_runs.py similarity index 51% rename from testgen/ui/views/profiling_summary.py rename to testgen/ui/views/profiling_runs.py index b9921bc..f396b17 100644 --- a/testgen/ui/views/profiling_summary.py +++ b/testgen/ui/views/profiling_runs.py @@ -9,15 +9,19 @@ import testgen.ui.services.form_service as fm import testgen.ui.services.query_service as dq from testgen.commands.run_profiling_bridge import update_profile_run_status -from testgen.common import date_service from testgen.ui.components import widgets as testgen +from testgen.ui.components.widgets import testgen_component from testgen.ui.navigation.menu import MenuItem from testgen.ui.navigation.page import Page +from testgen.ui.queries import project_queries +from testgen.ui.services import authentication_service from testgen.ui.session import session +from testgen.ui.views.dialogs.run_profiling_dialog import run_profiling_dialog from testgen.utils import to_int FORM_DATA_WIDTH = 400 -PAGE_SIZE = 10 
+PAGE_SIZE = 50 +PAGE_ICON = "data_thresholding" class DataProfilingPage(Page): @@ -25,21 +29,23 @@ class DataProfilingPage(Page): can_activate: typing.ClassVar = [ lambda: session.authentication_status, ] - menu_item = MenuItem(icon="problem", label="Data Profiling", order=1) + menu_item = MenuItem(icon=PAGE_ICON, label="Data Profiling", order=1) def render(self, project_code: str | None = None, table_group_id: str | None = None, **_kwargs) -> None: - project_code = project_code or session.project - testgen.page_header( "Profiling Runs", - "https://docs.datakitchen.io/article/dataops-testgen-help/investigate-profiling", + "investigate-profiling", ) + project_code = project_code or session.project + if render_empty_state(project_code): + return + group_filter_column, actions_column = st.columns([.3, .7], vertical_alignment="bottom") with group_filter_column: table_groups_df = get_db_table_group_choices(project_code) - table_group_id = testgen.toolbar_select( + table_group_id = testgen.select( options=table_groups_df, value_column="id", display_column="table_groups_name", @@ -48,120 +54,68 @@ def render(self, project_code: str | None = None, table_group_id: str | None = N label="Table Group", ) - testgen.flex_row_end(actions_column) + with actions_column: + testgen.flex_row_end() + + if authentication_service.current_user_has_edit_role(): + st.button( + ":material/play_arrow: Run Profiling", + help="Run profiling for a table group", + on_click=partial(run_profiling_dialog, project_code, None, table_group_id) + ) fm.render_refresh_button(actions_column) testgen.whitespace(0.5) - list_container = st.container(border=True) + list_container = st.container() profiling_runs_df = get_db_profiling_runs(project_code, table_group_id) run_count = len(profiling_runs_df) page_index = testgen.paginator(count=run_count, page_size=PAGE_SIZE) + paginated_df = profiling_runs_df[PAGE_SIZE * page_index : PAGE_SIZE * (page_index + 1)] with list_container: - testgen.css_class("bg-white") - column_spec = [.2, .2, .2, .4] - - run_column, status_column, schema_column, issues_column = st.columns(column_spec, vertical_alignment="top") - header_styles = "font-size: 12px; text-transform: uppercase; margin-bottom: 8px;" - testgen.caption("Start Time | Table Group", header_styles, run_column) - testgen.caption("Status | Duration", header_styles, status_column) - testgen.caption("Schema", header_styles, schema_column) - testgen.caption("Hygiene Issues", header_styles, issues_column) - testgen.divider(-8) - - paginated_df = profiling_runs_df[PAGE_SIZE * page_index : PAGE_SIZE * (page_index + 1)] - for index, profiling_run in paginated_df.iterrows(): - with st.container(): - render_profiling_run_row(profiling_run, column_spec) - - if (index + 1) % PAGE_SIZE and index != run_count - 1: - testgen.divider(-4, 4) - - -def render_profiling_run_row(profiling_run: pd.Series, column_spec: list[int]) -> None: - profiling_run_id = profiling_run["profiling_run_id"] - status = profiling_run["status"] - - run_column, status_column, schema_column, issues_column = st.columns(column_spec, vertical_alignment="top") - - with run_column: - start_time = date_service.get_timezoned_timestamp(st.session_state, profiling_run["start_time"]) if pd.notnull(profiling_run["start_time"]) else "--" - testgen.no_flex_gap() - testgen.text(start_time) - testgen.caption(profiling_run["table_groups_name"]) - - with status_column: - testgen.flex_row_start() - - status_display_map = { - "Running": { "label": "Running", "color": "blue" }, - "Complete": { 
"label": "Completed", "color": "" }, - "Error": { "label": "Error", "color": "red" }, - "Cancelled": { "label": "Canceled", "color": "purple" }, - } - status_attrs = status_display_map.get(status, { "label": "Unknown", "color": "grey" }) - - st.html(f""" -

{status_attrs["label"]}

-

{date_service.get_formatted_duration(profiling_run["duration"])}

- """) - - if status == "Error" and (log_message := profiling_run["log_message"]): - st.markdown("", help=log_message) - - if status == "Running" and pd.notnull(profiling_run["process_id"]): - if testgen.button( - type_="stroked", - label="Cancel Run", - style="width: auto; height: 32px; color: var(--purple); margin-left: 16px;", - key=f"profiling_run:keys:cancel-run:{profiling_run_id}", - ): - on_cancel_run(profiling_run) - - with schema_column: - column_count = to_int(profiling_run["column_ct"]) - testgen.no_flex_gap() - testgen.text(profiling_run["schema_name"]) - testgen.caption( - f"{to_int(profiling_run['table_ct'])} tables, {column_count} columns", - f"margin-bottom: 3px;{' color: var(--red);' if status == 'Complete' and not column_count else ''}", - ) - - if column_count: - testgen.link( - label="View results", - href="profiling-runs:results", - params={ "run_id": str(profiling_run_id) }, - right_icon="chevron_right", - height=18, - key=f"profiling_run:keys:go-to-runs:{profiling_run_id}", + testgen_component( + "profiling_runs", + props={ "items": paginated_df.to_json(orient="records") }, + event_handlers={ "RunCanceled": on_cancel_run } ) - with issues_column: - if anomaly_count := to_int(profiling_run["anomaly_ct"]): - testgen.no_flex_gap() - testgen.summary_bar( - items=[ - { "label": "Definite", "value": to_int(profiling_run["anomalies_definite_ct"]), "color": "red" }, - { "label": "Likely", "value": to_int(profiling_run["anomalies_likely_ct"]), "color": "orange" }, - { "label": "Possible", "value": to_int(profiling_run["anomalies_possible_ct"]), "color": "yellow" }, - { "label": "Dismissed", "value": to_int(profiling_run["anomalies_dismissed_ct"]), "color": "grey" }, - ], - height=10, - width=280, - ) - testgen.link( - label=f"View {anomaly_count} issues", - href="profiling-runs:hygiene", - params={ "run_id": str(profiling_run_id) }, - right_icon="chevron_right", - height=18, - key=f"profiling_run:keys:go-to-hygiene:{profiling_run_id}", - ) - else: - st.markdown("--") + +def render_empty_state(project_code: str) -> bool: + project_summary_df = project_queries.get_summary_by_code(project_code) + if project_summary_df["profiling_runs_ct"]: + return False + + label = "No profiling runs yet" + testgen.whitespace(5) + if not project_summary_df["connections_ct"]: + testgen.empty_state( + label=label, + icon=PAGE_ICON, + message=testgen.EmptyStateMessage.Connection, + action_label="Go to Connections", + link_href="connections", + ) + elif not project_summary_df["table_groups_ct"]: + testgen.empty_state( + label=label, + icon=PAGE_ICON, + message=testgen.EmptyStateMessage.TableGroup, + action_label="Go to Table Groups", + link_href="connections:table-groups", + link_params={ "connection_id": str(project_summary_df["default_connection_id"]) } + ) + else: + testgen.empty_state( + label=label, + icon=PAGE_ICON, + message=testgen.EmptyStateMessage.Profiling, + action_label="Run Profiling", + button_onclick=partial(run_profiling_dialog, project_code), + button_icon="play_arrow", + ) + return True def on_cancel_run(profiling_run: pd.Series) -> None: diff --git a/testgen/ui/views/project_settings.py b/testgen/ui/views/project_settings.py index 603d104..7c7f0c3 100644 --- a/testgen/ui/views/project_settings.py +++ b/testgen/ui/views/project_settings.py @@ -8,7 +8,7 @@ from testgen.ui.navigation.page import Page from testgen.ui.services import form_service, project_service from testgen.ui.session import session -from testgen.ui.views.app_log_modal import view_log_file +from 
testgen.ui.views.dialogs.application_logs_dialog import view_log_file class ProjectSettingsPage(Page): @@ -24,7 +24,7 @@ def render(self, project_code: str | None = None, **_kwargs) -> None: testgen.page_header( "Settings", - "https://docs.datakitchen.io/article/dataops-testgen-help/configuration", + "tg-project-settings", ) testgen.whitespace(1) diff --git a/testgen/ui/views/table_groups/__init__.py b/testgen/ui/views/table_groups/__init__.py new file mode 100644 index 0000000..77b5027 --- /dev/null +++ b/testgen/ui/views/table_groups/__init__.py @@ -0,0 +1,4 @@ +# ruff: noqa: F401 + +from testgen.ui.views.table_groups.forms import TableGroupForm +from testgen.ui.views.table_groups.page import TableGroupsPage diff --git a/testgen/ui/views/table_groups/forms.py b/testgen/ui/views/table_groups/forms.py new file mode 100644 index 0000000..00ae5a2 --- /dev/null +++ b/testgen/ui/views/table_groups/forms.py @@ -0,0 +1,169 @@ +# type: ignore +import typing + +from streamlit.delta_generator import DeltaGenerator + +from testgen.ui.forms import BaseForm, Field, ManualRender + +SQLFlavor = typing.Literal["redshift", "snowflake", "mssql", "postgresql"] + + +class TableGroupForm(BaseForm, ManualRender): + table_groups_name: str = Field( + default="", + min_length=1, + max_length=40, + st_kwargs_label="Table Group Name", + st_kwargs_max_chars=40, + st_kwargs_help="A unique name to describe the table group", + ) + profiling_include_mask: str = Field( + default="%", + max_length=40, + st_kwargs_label="Tables to Include Mask", + st_kwargs_max_chars=40, + st_kwargs_help="A SQL filter supported by your database's LIKE operator for table names to include", + ) + profiling_exclude_mask: str = Field( + default="tmp%", + st_kwargs_label="Tables to Exclude Mask", + st_kwargs_max_chars=40, + st_kwargs_help="A SQL filter supported by your database's LIKE operator for table names to exclude", + ) + profiling_table_set: str = Field( + default="", + st_kwargs_label="Explicit Table List", + st_kwargs_max_chars=2000, + st_kwargs_help="A list of specific table names to include, separated by commas", + ) + table_group_schema: str = Field( + default="", + min_length=1, + max_length=40, + st_kwargs_label="Schema", + st_kwargs_max_chars=40, + st_kwargs_help="The database schema containing the tables in the Table Group", + ) + profile_id_column_mask: str = Field( + default="%_id", + st_kwargs_label="Profiling ID column mask", + st_kwargs_max_chars=40, + st_kwargs_help="A SQL filter supported by your database's LIKE operator representing ID columns (optional)", + ) + profile_sk_column_mask: str = Field( + default="%_sk", + st_kwargs_label="Profiling Surrogate Key column mask", + st_kwargs_max_chars=40, + st_kwargs_help="A SQL filter supported by your database's LIKE operator representing surrogate key columns (optional)", + ) + profiling_delay_days: int = Field( + default=0, + st_kwargs_label="Min Profiling Age, Days", + st_kwargs_min_value=0, + st_kwargs_max_value=999, + st_kwargs_help="The number of days to wait before new profiling will be available to generate tests", + ) + profile_use_sampling: bool = Field( + default=True, + st_kwargs_label="Use profile sampling", + st_kwargs_help="Toggle on to base profiling on a sample of records instead of the full table", + ) + profile_sample_percent: int = Field( + default=30, + st_kwargs_label="Sample percent", + st_kwargs_min_value=1, + st_kwargs_max_value=100, + st_kwargs_help="Percent of records to include in the sample, unless the calculated count falls below the specified 
minimum.", + ) + profile_sample_min_count: int = Field( + default=15000, + st_kwargs_label="Min Sample Record Count", + st_kwargs_min_value=1, + st_kwargs_max_value=1000000, + st_kwargs_help="The minimum number of records to be included in any sample (if available)", + ) + data_source: str = Field( + default="", + st_kwargs_label="Data Source", + st_kwargs_max_chars=40, + st_kwargs_help="Original source of all tables in this dataset. This can be overridden at the table level. (Optional)", + ) + source_system: str = Field( + default="", + st_kwargs_label="System of Origin", + st_kwargs_max_chars=40, + st_kwargs_help="Enterprise system source for all tables in this dataset. " + "This can be overridden at the table level. (Optional)", + ) + business_domain: str = Field( + default="", + st_kwargs_label="Business Domain", + st_kwargs_max_chars=40, + st_kwargs_help="Business division responsible for all tables in this dataset. " + "e.g. Finance, Sales, Manufacturing. (Optional)", + ) + data_location: str = Field( + default="", + st_kwargs_label="Location", + st_kwargs_max_chars=40, + st_kwargs_help="Physical or virtual location of all tables in this dataset. " + "e.g. Headquarters, Cloud, etc. (Optional)", + ) + transform_level: str = Field( + default="", + st_kwargs_label="Transform Level", + st_kwargs_max_chars=40, + st_kwargs_help="Data warehouse processing layer. " + "Indicates the processing stage: e.g. Raw, Conformed, Processed, Reporting. (Optional)", + ) + source_process: str = Field( + default="", + st_kwargs_label="Source Process", + st_kwargs_max_chars=40, + st_kwargs_help="The process, program or data flow that produced this data. (Optional)", + ) + stakeholder_group: str = Field( + default="", + st_kwargs_label="Stakeholder Group", + st_kwargs_max_chars=40, + st_kwargs_help="Designator for data owners or stakeholders who are responsible for this data. 
(Optional)", + ) + table_group_id: int | None = Field(default=None) + + def form_key(self): + return f"table_group_form:{self.table_group_id or 'new'}" + + def render_input_ui(self, container: DeltaGenerator, _: dict) -> "TableGroupForm": + left_column, right_column = container.columns([.5, .5]) + + self.render_field("table_groups_name", left_column) + self.render_field("profiling_include_mask", left_column) + self.render_field("profiling_exclude_mask", left_column) + self.render_field("profiling_table_set", left_column) + + self.render_field("table_group_schema", right_column) + self.render_field("profile_id_column_mask", right_column) + self.render_field("profile_sk_column_mask", right_column) + self.render_field("profiling_delay_days", right_column) + + self.render_field("profile_use_sampling", container) + profile_sampling_expander = container.expander("Sampling Parameters", expanded=False) + with profile_sampling_expander: + expander_left_column, expander_right_column = profile_sampling_expander.columns([0.50, 0.50]) + self.render_field("profile_sample_percent", expander_left_column) + self.render_field("profile_sample_min_count", expander_right_column) + + provenance_expander = container.expander("Data Provenance (Optional)", expanded=False) + with provenance_expander: + provenance_left_column, provenance_right_column = provenance_expander.columns([0.50, 0.50]) + + self.render_field("data_source", provenance_left_column) + self.render_field("source_system", provenance_left_column) + self.render_field("business_domain", provenance_left_column) + self.render_field("data_location", provenance_left_column) + + self.render_field("transform_level", provenance_right_column) + self.render_field("source_process", provenance_right_column) + self.render_field("stakeholder_group", provenance_right_column) + + return self diff --git a/testgen/ui/views/table_groups.py b/testgen/ui/views/table_groups/page.py similarity index 88% rename from testgen/ui/views/table_groups.py rename to testgen/ui/views/table_groups/page.py index 1f82de5..0e53dbc 100644 --- a/testgen/ui/views/table_groups.py +++ b/testgen/ui/views/table_groups/page.py @@ -10,12 +10,12 @@ import testgen.ui.services.connection_service as connection_service import testgen.ui.services.form_service as fm import testgen.ui.services.table_group_service as table_group_service -from testgen.commands.run_profiling_bridge import run_profiling_in_background from testgen.ui.components import widgets as testgen from testgen.ui.navigation.page import Page from testgen.ui.services import project_service from testgen.ui.services.string_service import empty_if_null from testgen.ui.session import session +from testgen.ui.views.dialogs.run_profiling_dialog import run_profiling_dialog class TableGroupsPage(Page): @@ -29,7 +29,7 @@ class TableGroupsPage(Page): def render(self, connection_id: str, **_kwargs) -> None: connection = connection_service.get_by_id(connection_id, hide_passwords=False) if not connection: - self.router.navigate_with_warning( + return self.router.navigate_with_warning( f"Connection with ID '{connection_id}' does not exist. 
Redirecting to list of Connections ...", "connections", ) @@ -39,18 +39,30 @@ def render(self, connection_id: str, **_kwargs) -> None: testgen.page_header( "Table Groups", - "https://docs.datakitchen.io/article/dataops-testgen-help/create-a-table-group", - breadcrumbs=[ + "create-a-table-group", + breadcrumbs=[ # type: ignore { "label": "Connections", "path": "connections", "params": { "project_code": project_code } }, { "label": connection["connection_name"] }, ], ) + df = table_group_service.get_by_connection(project_code, connection_id) + + if df.empty: + testgen.whitespace(3) + testgen.empty_state( + label="No table groups yet", + icon="table_view", + message=testgen.EmptyStateMessage.TableGroup, + action_label="Add Table Group", + button_onclick=partial(self.add_table_group_dialog, project_code, connection), + ) + return + + testgen.whitespace(0.3) _, actions_column = st.columns([.1, .9], vertical_alignment="bottom") testgen.flex_row_end(actions_column) - df = table_group_service.get_by_connection(project_code, connection_id) - for _, table_group in df.iterrows(): with testgen.card(title=table_group["table_groups_name"]) as table_group_card: with table_group_card.actions: @@ -114,7 +126,7 @@ def render(self, connection_id: str, **_kwargs) -> None: testgen.button( type_="stroked", label="Run Profiling", - on_click=partial(run_profiling_dialog, table_group), + on_click=partial(run_profiling_dialog, project_code, table_group), key=f"tablegroups:keys:runprofiling:{table_group['id']}", ) @@ -155,11 +167,18 @@ def delete_table_group_dialog(self, table_group: pd.Series): ) accept_cascade_delete = st.toggle("I accept deletion of this Table Group and all related TestGen data.") - with st.form("Delete Table Group", clear_on_submit=True): + with st.form("Delete Table Group", clear_on_submit=True, border=False): disable_delete_button = authentication_service.current_user_has_read_role() or ( not can_be_deleted and not accept_cascade_delete ) - delete = st.form_submit_button("Delete", disabled=disable_delete_button, type="primary") + _, button_column = st.columns([.85, .15]) + with button_column: + delete = st.form_submit_button( + "Delete", + disabled=disable_delete_button, + type="primary", + use_container_width=True, + ) if delete: if table_group_service.are_table_groups_in_use([table_group_name]): @@ -172,44 +191,6 @@ def delete_table_group_dialog(self, table_group: pd.Series): st.rerun() -@st.dialog(title="Run Profiling") -def run_profiling_dialog(table_group: pd.Series) -> None: - table_group_id = table_group["id"] - - with st.container(): - st.markdown( - f"Execute profiling for the Table Group :green[{table_group['table_groups_name']}]?" - " Profiling will be performed in a background process" - ) - - if testgen.expander_toggle(expand_label="Show CLI command", key="test_suite:keys:run-tests-show-cli"): - st.code(f"testgen run-profile --table-group-id {table_group_id}", language="shellSession") - - button_container = st.empty() - status_container = st.empty() - - with button_container: - _, button_column = st.columns([.85, .15]) - with button_column: - profile_button = st.button("Start", use_container_width=True) - - if profile_button: - button_container.empty() - - status_container.info("Executing Profiling...") - - try: - run_profiling_in_background(table_group_id) - except Exception as e: - status_container.empty() - status_container.error(f"Process started with errors: {e!s}.") - - status_container.empty() - status_container.success( - "Process has successfully started. 
Check 'Data Profiling' item in the menu to see the progress." - ) - - def show_table_group_form(mode, project_code: str, connection: dict, table_group: pd.Series | None = None): connection_id = connection["connection_id"] table_groups_settings_tab, table_groups_preview_tab = st.tabs(["Table Group Settings", "Test"]) @@ -404,7 +385,7 @@ def show_table_group_form(mode, project_code: str, connection: dict, table_group success_message = "Changes have been saved successfully. " else: table_group_service.add(entity) - success_message = "New Table Group added successfully. " + success_message = "New table group added successfully. " except IntegrityError: st.error("A Table Group with the same name already exists. ") return @@ -428,8 +409,8 @@ def table_group_preview(entity, connection_id, project_code, status): status.empty() status.info("Connecting to the Table Group ...") try: - table_group_results, qc_results = table_group_service.test_table_group(entity, connection_id, project_code) - if len(table_group_results) > 0 and all(qc_results): + table_group_results = table_group_service.test_table_group(entity, connection_id, project_code) + if len(table_group_results) > 0: tables = set() columns = [] schemas = set() @@ -438,7 +419,7 @@ def table_group_preview(entity, connection_id, project_code, status): tables.add(result["table_name"]) columns.append(result["column_name"]) - show_test_results(schemas, tables, columns, qc_results) + show_test_results(schemas, tables, columns) status.empty() status.success("Operation has finished successfully.") @@ -448,8 +429,6 @@ def table_group_preview(entity, connection_id, project_code, status): error_message = "" if len(table_group_results) == 0: error_message = "Result is empty." - if not all(qc_results): - error_message = f"Error testing the connection to the Table Group. 
Details: {qc_results}" st.text_area("Table Group Error Details", value=error_message) except Exception as e: status.empty() @@ -458,10 +437,7 @@ def table_group_preview(entity, connection_id, project_code, status): st.text_area("Table Group Error Details", value=error_message) -def show_test_results(schemas, tables, columns, qc_results): - qc_test_results = all(qc_results) - st.markdown(f"**Utility QC Schema Validity Test**: {':white_check_mark:' if qc_test_results else ':x:'}") - +def show_test_results(schemas, tables, columns): st.markdown(f"**Schema**: {schemas.pop()}") st.markdown(f"**Column Count**: {len(columns)}") diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py index 0892998..5fe317e 100644 --- a/testgen/ui/views/test_definitions.py +++ b/testgen/ui/views/test_definitions.py @@ -16,7 +16,7 @@ from testgen.ui.services import authentication_service, project_service from testgen.ui.services.string_service import empty_if_null, snake_case_to_title_case from testgen.ui.session import session -from testgen.ui.views.profiling_modal import view_profiling_button +from testgen.ui.views.dialogs.profiling_results_dialog import view_profiling_button LOG = logging.getLogger("testgen") @@ -43,7 +43,7 @@ def render(self, test_suite_id: str, table_name: str | None = None, column_name: testgen.page_header( "Test Definitions", - "https://docs.datakitchen.io/article/dataops-testgen-help/testgen-test-types", + "testgen-test-types", breadcrumbs=[ { "label": "Test Suites", "path": "test-suites", "params": { "project_code": project_code } }, { "label": test_suite["test_suite"] }, @@ -59,7 +59,7 @@ def render(self, test_suite_id: str, table_name: str | None = None, column_name: with table_filter_column: table_options = run_table_lookup_query(table_group["id"]) - table_name = testgen.toolbar_select( + table_name = testgen.select( options=table_options, value_column="table_name", default_value=table_name, @@ -69,7 +69,7 @@ def render(self, test_suite_id: str, table_name: str | None = None, column_name: ) with column_filter_column: column_options = get_column_names(table_group["id"], table_name) - column_name = testgen.toolbar_select( + column_name = testgen.select( options=column_options, default_value=column_name, bind_to_query="column_name", @@ -127,7 +127,7 @@ def render(self, test_suite_id: str, table_name: str | None = None, column_name: help="Delete the selected Test Definition", disabled=not selected, ): - delete_test_dialog(selected_test_def) + delete_test_dialog(selected_test_def) @st.dialog("Delete Test") @@ -156,9 +156,16 @@ def delete_test_dialog(selected_test_definition): int_data_width=700, ) - with st.form("Delete Test Definition", clear_on_submit=True): + with st.form("Delete Test Definition", clear_on_submit=True, border=False): disable_delete_button = authentication_service.current_user_has_read_role() or not can_be_deleted - delete = st.form_submit_button("Delete", disabled=disable_delete_button, type="primary") + _, button_column = st.columns([.85, .15]) + with button_column: + delete = st.form_submit_button( + "Delete", + disabled=disable_delete_button, + type="primary", + use_container_width=True, + ) if delete: test_definition_service.delete([test_definition_id]) @@ -522,6 +529,12 @@ def show_test_form( if dynamic_attribute in ["custom_query"]: show_custom_query = True + elif dynamic_attribute in ["threshold_value"]: + test_definition[dynamic_attribute] = current_column.number_input( + label=actual_dynamic_attributes_labels, + value=float(value), 
+ help=actual_dynamic_attributes_help, + ) else: test_definition[dynamic_attribute] = current_column.text_input( label=actual_dynamic_attributes_labels, @@ -706,6 +719,8 @@ def show_test_defs_grid( do_multi_select=do_multi_select, show_column_headers=show_column_headers, render_highlights=False, + bind_to_query_name="selected", + bind_to_query_prop="id", ) with export_container: @@ -799,10 +814,11 @@ def show_test_defs_grid( _, col_profile_button = right_column.columns([0.7, 0.3]) if selected_row["test_scope"] == "column": - view_profiling_button( - col_profile_button, selected_row["table_name"], selected_row["column_name"], - str_table_groups_id=str_table_groups_id - ) + with col_profile_button: + view_profiling_button( + selected_row["table_name"], selected_row["column_name"], + str_table_groups_id=str_table_groups_id + ) with right_column: st.write(generate_test_defs_help(row_selected["test_type"])) diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index a521c05..b6af05e 100644 --- a/testgen/ui/views/test_results.py +++ b/testgen/ui/views/test_results.py @@ -1,21 +1,37 @@ import typing from datetime import date +from io import BytesIO import pandas as pd import plotly.express as px import plotly.graph_objects as go import streamlit as st +from streamlit.delta_generator import DeltaGenerator import testgen.ui.services.database_service as db import testgen.ui.services.form_service as fm import testgen.ui.services.query_service as dq -from testgen.common import ConcatColumnList, date_service +from testgen.common import date_service from testgen.ui.components import widgets as testgen +from testgen.ui.components.widgets.download_dialog import FILE_DATA_TYPE, download_dialog, zip_multi_file_data from testgen.ui.navigation.page import Page +from testgen.ui.pdf.test_result_report import create_report from testgen.ui.services import authentication_service, project_service from testgen.ui.services.string_service import empty_if_null +from testgen.ui.services.test_definition_service import ( + get_test_definition as get_test_definition_uncached, +) +from testgen.ui.services.test_results_service import ( + do_source_data_lookup as do_source_data_lookup_uncached, +) +from testgen.ui.services.test_results_service import ( + do_source_data_lookup_custom as do_source_data_lookup_custom_uncached, +) +from testgen.ui.services.test_results_service import ( + get_test_result_history as get_test_result_history_uncached, +) from testgen.ui.session import session -from testgen.ui.views.profiling_modal import view_profiling_button +from testgen.ui.views.dialogs.profiling_results_dialog import view_profiling_button from testgen.ui.views.test_definitions import show_test_form_by_id ALWAYS_SPIN = False @@ -28,13 +44,22 @@ class TestResultsPage(Page): lambda: "run_id" in session.current_page_args or "test-runs", ] - def render(self, run_id: str, status: str | None = None, test_type: str | None = None, **_kwargs) -> None: + def render( + self, + run_id: str, + status: str | None = None, + test_type: str | None = None, + table_name: str | None = None, + column_name: str | None = None, + **_kwargs, + ) -> None: run_parentage = get_drill_test_run(run_id) if not run_parentage: self.router.navigate_with_warning( f"Test run with ID '{run_id}' does not exist. 
Redirecting to list of Test Runs ...", "test-runs", ) + return run_date, test_suite_name, project_code = run_parentage run_date = date_service.get_timezoned_timestamp(st.session_state, run_date) @@ -42,24 +67,25 @@ def render(self, run_id: str, status: str | None = None, test_type: str | None = testgen.page_header( "Test Results", - "https://docs.datakitchen.io/article/dataops-testgen-help/test-results", + "view-testgen-test-results", breadcrumbs=[ { "label": "Test Runs", "path": "test-runs", "params": { "project_code": project_code } }, { "label": f"{test_suite_name} | {run_date}" }, ], ) - # Display summary bar - tests_summary = get_test_result_summary(run_id) - testgen.summary_bar(items=tests_summary, height=40, width=800) - - # Setup Toolbar - status_filter_column, test_type_filter_column, sort_column, actions_column, export_button_column = st.columns( - [.2, .2, .08, .4, .12], vertical_alignment="bottom" + summary_column, actions_column = st.columns([.5, .5], vertical_alignment="bottom") + status_filter_column, test_type_filter_column, table_filter_column, column_filter_column, sort_column, export_button_column = st.columns( + [.2, .2, .2, .2, .1, .1], vertical_alignment="bottom" ) + testgen.flex_row_end(actions_column) testgen.flex_row_end(export_button_column) + with summary_column: + tests_summary = get_test_result_summary(run_id) + testgen.summary_bar(items=tests_summary, height=20, width=800) + with status_filter_column: status_options = [ "Failed + Warning", @@ -67,16 +93,17 @@ def render(self, run_id: str, status: str | None = None, test_type: str | None = "Warning", "Passed", ] - status = testgen.toolbar_select( + status = testgen.select( options=status_options, default_value=status or "Failed + Warning", required=False, bind_to_query="status", + bind_empty_value=True, label="Result Status", ) with test_type_filter_column: - test_type = testgen.toolbar_select( + test_type = testgen.select( options=get_test_types(), value_column="test_type", display_column="test_name_short", @@ -86,6 +113,26 @@ def render(self, run_id: str, status: str | None = None, test_type: str | None = label="Test Type", ) + run_columns_df = get_test_run_columns(run_id) + with table_filter_column: + table_name = testgen.select( + options=list(run_columns_df["table_name"].unique()), + default_value=table_name, + bind_to_query="table_name", + label="Table Name", + ) + + with column_filter_column: + column_options = list(run_columns_df.loc[run_columns_df["table_name"] == table_name]["column_name"].unique()) + column_name = testgen.select( + options=column_options, + value_column="column_name", + default_value=column_name, + bind_to_query="column_name", + label="Column Name", + disabled=not table_name, + ) + with sort_column: sortable_columns = ( ("Table Name", "r.table_name"), @@ -115,7 +162,7 @@ def render(self, run_id: str, status: str | None = None, test_type: str | None = # Display main grid and retrieve selection selected = show_result_detail( - run_id, status, test_type, sorting_columns, do_multi_select, export_button_column + run_id, export_button_column, status, test_type, table_name, column_name, sorting_columns, do_multi_select ) # Need to render toolbar buttons after grid, so selection status is maintained @@ -174,25 +221,59 @@ def get_test_types(): return df -@st.cache_data(show_spinner="Retrieving Results") -def get_test_results(str_run_id, str_sel_test_status, test_type_id, sorting_columns): - schema = st.session_state["dbschema"] - return get_test_results_uncached(schema, str_run_id, 
str_sel_test_status, test_type_id, sorting_columns) +@st.cache_data(show_spinner="False") +def get_test_run_columns(test_run_id: str) -> pd.DataFrame: + schema: str = st.session_state["dbschema"] + sql = f""" + SELECT table_name, column_names AS column_name + FROM {schema}.test_results + WHERE test_run_id = '{test_run_id}' + ORDER BY table_name, column_names; + """ + return db.retrieve_data(sql) -def get_test_results_uncached(str_schema, str_run_id, str_sel_test_status, test_type_id=None, sorting_columns=None): +@st.cache_data(show_spinner="Retrieving Results") +def get_test_results( + run_id: str, + test_status: str | None = None, + test_type_id: str | None = None, + table_name: str | None = None, + column_name: str | None = None, + sorting_columns: list[str] | None = None, +) -> pd.DataFrame: + schema: str = st.session_state["dbschema"] + return get_test_results_uncached(schema, run_id, test_status, test_type_id, table_name, column_name, sorting_columns) + + +def get_test_results_uncached( + schema: str, + run_id: str, + test_status: str | None = None, + test_type_id: str | None = None, + table_name: str | None = None, + column_name: str | None = None, + sorting_columns: list[str] | None = None, +) -> pd.DataFrame: # First visible row first, so multi-select checkbox will render - str_order_by = "ORDER BY " + (", ".join(" ".join(col) for col in sorting_columns)) if sorting_columns else "" - test_type_clause = f"AND r.test_type = '{test_type_id}'" if test_type_id else "" - status_clause = f" AND r.result_status IN ({str_sel_test_status})" if str_sel_test_status else "" - str_sql = f""" + order_by = "ORDER BY " + (", ".join(" ".join(col) for col in sorting_columns)) if sorting_columns else "" + filters = "" + if test_status: + filters += f" AND r.result_status IN ({test_status})" + if test_type_id: + filters += f" AND r.test_type = '{test_type_id}'" + if table_name: + filters += f" AND r.table_name = '{table_name}'" + if column_name: + filters += f" AND r.column_names = '{column_name}'" + + sql = f""" WITH run_results AS (SELECT * - FROM {str_schema}.test_results r + FROM {schema}.test_results r WHERE - r.test_run_id = '{str_run_id}' - {status_clause} - {test_type_clause} + r.test_run_id = '{run_id}' + {filters} ) SELECT r.table_name, p.project_name, ts.test_suite, tg.table_groups_name, cn.connection_name, cn.project_host, cn.sql_flavor, @@ -227,33 +308,37 @@ def get_test_results_uncached(str_schema, str_run_id, str_sel_test_status, test_ WHEN r.auto_gen = TRUE THEN d.id ELSE r.test_definition_id END::VARCHAR as test_definition_id_current, - r.auto_gen + r.auto_gen, + + -- These are used in the PDF report + tt.threshold_description, tt.usage_notes, r.test_time + FROM run_results r - INNER JOIN {str_schema}.test_types tt + INNER JOIN {schema}.test_types tt ON (r.test_type = tt.test_type) - LEFT JOIN {str_schema}.test_definitions rd + LEFT JOIN {schema}.test_definitions rd ON (r.test_definition_id = rd.id) - LEFT JOIN {str_schema}.test_definitions d + LEFT JOIN {schema}.test_definitions d ON (r.test_suite_id = d.test_suite_id AND r.table_name = d.table_name AND r.column_names = COALESCE(d.column_name, 'N/A') AND r.test_type = d.test_type AND r.auto_gen = TRUE AND d.last_auto_gen_date IS NOT NULL) - INNER JOIN {str_schema}.test_suites ts + INNER JOIN {schema}.test_suites ts ON r.test_suite_id = ts.id - INNER JOIN {str_schema}.projects p + INNER JOIN {schema}.projects p ON (ts.project_code = p.project_code) - INNER JOIN {str_schema}.table_groups tg + INNER JOIN {schema}.table_groups tg ON 
(ts.table_groups_id = tg.id) - INNER JOIN {str_schema}.connections cn + INNER JOIN {schema}.connections cn ON (tg.connection_id = cn.connection_id) - LEFT JOIN {str_schema}.cat_test_conditions c + LEFT JOIN {schema}.cat_test_conditions c ON (cn.sql_flavor = c.sql_flavor AND r.test_type = c.test_type) - {str_order_by} ; + {order_by} ; """ - df = db.retrieve_data(str_sql) + df = db.retrieve_data(sql) # Clean Up df["test_date"] = pd.to_datetime(df["test_date"]) @@ -333,221 +418,28 @@ def get_test_result_summary(run_id): ] -@st.cache_data(show_spinner=ALWAYS_SPIN) -def get_test_result_history(str_test_type, str_test_suite_id, str_table_name, str_column_names, - str_test_definition_id, auto_gen): - str_schema = st.session_state["dbschema"] - - if auto_gen: - str_where = f""" - WHERE test_suite_id = '{str_test_suite_id}' - AND table_name = '{str_table_name}' - AND column_names = '{str_column_names}' - AND test_type = '{str_test_type}' - AND auto_gen = TRUE - """ - else: - str_where = f""" - WHERE test_definition_id_runtime = '{str_test_definition_id}' - """ - - str_sql = f""" - SELECT test_date, test_type, - test_name_short, test_name_long, measure_uom, test_operator, - threshold_value::NUMERIC, result_measure, result_status - FROM {str_schema}.v_test_results {str_where} - ORDER BY test_date DESC; - """ - - df = db.retrieve_data(str_sql) - # Clean Up - df["test_date"] = pd.to_datetime(df["test_date"]) - - return df - - @st.cache_data(show_spinner=ALWAYS_SPIN) def get_test_definition(str_test_def_id): str_schema = st.session_state["dbschema"] return get_test_definition_uncached(str_schema, str_test_def_id) -def get_test_definition_uncached(str_schema, str_test_def_id): - str_sql = f""" - SELECT d.id::VARCHAR, tt.test_name_short as test_name, tt.test_name_long as full_name, - tt.test_description as description, tt.usage_notes, - d.column_name, - d.baseline_value, d.baseline_ct, d.baseline_avg, d.baseline_sd, d.threshold_value, - d.subset_condition, d.groupby_names, d.having_condition, d.match_schema_name, - d.match_table_name, d.match_column_names, d.match_subset_condition, - d.match_groupby_names, d.match_having_condition, - d.window_date_column, d.window_days::VARCHAR as window_days, - d.custom_query, - d.severity, tt.default_severity, - d.test_active, d.lock_refresh, d.last_manual_update - FROM {str_schema}.test_definitions d - INNER JOIN {str_schema}.test_types tt - ON (d.test_type = tt.test_type) - WHERE d.id = '{str_test_def_id}'; - """ - return db.retrieve_data(str_sql) - - @st.cache_data(show_spinner=False) def do_source_data_lookup(selected_row): schema = st.session_state["dbschema"] return do_source_data_lookup_uncached(schema, selected_row) -def do_source_data_lookup_uncached(str_schema, selected_row, sql_only=False): - # Define the query - str_sql = f""" - SELECT t.lookup_query, tg.table_group_schema, c.project_qc_schema, - c.sql_flavor, c.project_host, c.project_port, c.project_db, c.project_user, c.project_pw_encrypted, - c.url, c.connect_by_url, - c.connect_by_key, c.private_key, c.private_key_passphrase - FROM {str_schema}.target_data_lookups t - INNER JOIN {str_schema}.table_groups tg - ON ('{selected_row["table_groups_id"]}'::UUID = tg.id) - INNER JOIN {str_schema}.connections c - ON (tg.connection_id = c.connection_id) - AND (t.sql_flavor = c.sql_flavor) - WHERE t.error_type = 'Test Results' - AND t.test_id = '{selected_row["test_type_id"]}' - AND t.lookup_query > ''; - """ - - def replace_parms(df_test, str_query): - if df_test.empty: - raise ValueError("This test definition is 
no longer present.") - - str_query = str_query.replace("{TARGET_SCHEMA}", empty_if_null(lst_query[0]["table_group_schema"])) - str_query = str_query.replace("{TABLE_NAME}", empty_if_null(selected_row["table_name"])) - str_query = str_query.replace("{COLUMN_NAME}", empty_if_null(selected_row["column_names"])) - str_query = str_query.replace("{DATA_QC_SCHEMA}", empty_if_null(lst_query[0]["project_qc_schema"])) - str_query = str_query.replace("{TEST_DATE}", str(empty_if_null(selected_row["test_date"]))) - - str_query = str_query.replace("{CUSTOM_QUERY}", empty_if_null(df_test.at[0, "custom_query"])) - str_query = str_query.replace("{BASELINE_VALUE}", empty_if_null(df_test.at[0, "baseline_value"])) - str_query = str_query.replace("{BASELINE_CT}", empty_if_null(df_test.at[0, "baseline_ct"])) - str_query = str_query.replace("{BASELINE_AVG}", empty_if_null(df_test.at[0, "baseline_avg"])) - str_query = str_query.replace("{BASELINE_SD}", empty_if_null(df_test.at[0, "baseline_sd"])) - str_query = str_query.replace("{THRESHOLD_VALUE}", empty_if_null(df_test.at[0, "threshold_value"])) - - str_substitute = empty_if_null(df_test.at[0, "subset_condition"]) - str_substitute = "1=1" if str_substitute == "" else str_substitute - str_query = str_query.replace("{SUBSET_CONDITION}", str_substitute) - - str_query = str_query.replace("{GROUPBY_NAMES}", empty_if_null(df_test.at[0, "groupby_names"])) - str_query = str_query.replace("{HAVING_CONDITION}", empty_if_null(df_test.at[0, "having_condition"])) - str_query = str_query.replace("{MATCH_SCHEMA_NAME}", empty_if_null(df_test.at[0, "match_schema_name"])) - str_query = str_query.replace("{MATCH_TABLE_NAME}", empty_if_null(df_test.at[0, "match_table_name"])) - str_query = str_query.replace("{MATCH_COLUMN_NAMES}", empty_if_null(df_test.at[0, "match_column_names"])) - - str_substitute = empty_if_null(df_test.at[0, "match_subset_condition"]) - str_substitute = "1=1" if str_substitute == "" else str_substitute - str_query = str_query.replace("{MATCH_SUBSET_CONDITION}", str_substitute) - - str_query = str_query.replace("{MATCH_GROUPBY_NAMES}", empty_if_null(df_test.at[0, "match_groupby_names"])) - str_query = str_query.replace("{MATCH_HAVING_CONDITION}", empty_if_null(df_test.at[0, "match_having_condition"])) - str_query = str_query.replace("{COLUMN_NAME_NO_QUOTES}", empty_if_null(selected_row["column_names"])) - - str_query = str_query.replace("{WINDOW_DATE_COLUMN}", empty_if_null(df_test.at[0, "window_date_column"])) - str_query = str_query.replace("{WINDOW_DAYS}", empty_if_null(df_test.at[0, "window_days"])) - - str_substitute = ConcatColumnList(selected_row["column_names"], "") - str_query = str_query.replace("{CONCAT_COLUMNS}", str_substitute) - str_substitute = ConcatColumnList(df_test.at[0, "match_groupby_names"], "") - str_query = str_query.replace("{CONCAT_MATCH_GROUPBY}", str_substitute) - - if str_query is None or str_query == "": - raise ValueError("Lookup query is not defined for this Test Type.") - return str_query - - try: - # Retrieve SQL for customer lookup - lst_query = db.retrieve_data_list(str_sql) - - if sql_only: - return lst_query, replace_parms, None - - # Retrieve and return data as df - if lst_query: - df_test = get_test_definition(selected_row["test_definition_id_current"]) - - str_sql = replace_parms(df_test, lst_query[0]["lookup_query"]) - df = db.retrieve_target_db_df( - lst_query[0]["sql_flavor"], - lst_query[0]["project_host"], - lst_query[0]["project_port"], - lst_query[0]["project_db"], - lst_query[0]["project_user"], - 
lst_query[0]["project_pw_encrypted"], - str_sql, - lst_query[0]["url"], - lst_query[0]["connect_by_url"], - lst_query[0]["connect_by_key"], - lst_query[0]["private_key"], - lst_query[0]["private_key_passphrase"], - ) - if df.empty: - return "ND", "Data that violates Test criteria is not present in the current dataset.", None - else: - return "OK", None, df - else: - return "NA", "A source data lookup for this Test is not available.", None - - except Exception as e: - return "ERR", f"Source data lookup query caused an error:\n\n{e.args[0]}\n\n{str_sql}", None - - @st.cache_data(show_spinner=False) def do_source_data_lookup_custom(selected_row): - str_schema = st.session_state["dbschema"] - # Define the query - str_sql = f""" - SELECT d.custom_query as lookup_query, tg.table_group_schema, c.project_qc_schema, - c.sql_flavor, c.project_host, c.project_port, c.project_db, c.project_user, c.project_pw_encrypted, - c.url, c.connect_by_url, c.connect_by_key, c.private_key, c.private_key_passphrase - FROM {str_schema}.test_definitions d - INNER JOIN {str_schema}.table_groups tg - ON ('{selected_row["table_groups_id"]}'::UUID = tg.id) - INNER JOIN {str_schema}.connections c - ON (tg.connection_id = c.connection_id) - WHERE d.id = '{selected_row["test_definition_id_current"]}'; - """ + schema = st.session_state["dbschema"] + return do_source_data_lookup_custom_uncached(schema, selected_row) - try: - # Retrieve SQL for customer lookup - lst_query = db.retrieve_data_list(str_sql) - - # Retrieve and return data as df - if lst_query: - str_sql = lst_query[0]["lookup_query"] - str_sql = str_sql.replace("{DATA_SCHEMA}", empty_if_null(lst_query[0]["table_group_schema"])) - df = db.retrieve_target_db_df( - lst_query[0]["sql_flavor"], - lst_query[0]["project_host"], - lst_query[0]["project_port"], - lst_query[0]["project_db"], - lst_query[0]["project_user"], - lst_query[0]["project_pw_encrypted"], - str_sql, - lst_query[0]["url"], - lst_query[0]["connect_by_url"], - lst_query[0]["connect_by_key"], - lst_query[0]["private_key"], - lst_query[0]["private_key_passphrase"], - ) - if df.empty: - return "ND", "Data that violates Test criteria is not present in the current dataset.", None - else: - return "OK", None, df - else: - return "NA", "A source data lookup for this Test is not available.", None - except Exception as e: - return "ERR", f"Source data lookup query caused an error:\n\n{e.args[0]}\n\n{str_sql}", None +@st.cache_data(show_spinner=False) +def get_test_result_history(selected_row): + schema = st.session_state["dbschema"] + return get_test_result_history_uncached(schema, selected_row) def show_test_def_detail(str_test_def_id): @@ -622,11 +514,20 @@ def show_test_def_detail(str_test_def_id): ) -def show_result_detail(str_run_id, str_sel_test_status, test_type_id, sorting_columns, do_multi_select, export_container): +def show_result_detail( + run_id: str, + export_container: DeltaGenerator, + test_status: str | None = None, + test_type_id: str | None = None, + table_name: str | None = None, + column_name: str | None = None, + sorting_columns: list[str] | None = None, + do_multi_select: bool = False, +): # Retrieve test results (always cached, action as null) - df = get_test_results(str_run_id, str_sel_test_status, test_type_id, sorting_columns) + df = get_test_results(run_id, test_status, test_type_id, table_name, column_name, sorting_columns) # Retrieve disposition action (cache refreshed) - df_action = get_test_disposition(str_run_id) + df_action = get_test_disposition(run_id) # Update action from 
disposition df action_map = df_action.set_index("id")["action"].to_dict() df["action"] = df["test_result_id"].map(action_map).fillna(df["action"]) @@ -652,7 +553,12 @@ def show_result_detail(str_run_id, str_sel_test_status, test_type_id, sorting_co ] selected_rows = fm.render_grid_select( - df, lst_show_columns, do_multi_select=do_multi_select, show_column_headers=lst_show_headers + df, + lst_show_columns, + do_multi_select=do_multi_select, + show_column_headers=lst_show_headers, + bind_to_query_name="selected", + bind_to_query_prop="test_result_id", ) with export_container: @@ -697,15 +603,8 @@ def show_result_detail(str_run_id, str_sel_test_status, test_type_id, sorting_co if not selected_rows: st.markdown(":orange[Select a record to see more information.]") else: - selected_row = selected_rows[len(selected_rows) - 1] - dfh = get_test_result_history( - selected_row["test_type"], - selected_row["test_suite_id"], - selected_row["table_name"], - selected_row["column_names"], - selected_row["test_definition_id_runtime"], - selected_row["auto_gen"] - ) + selected_row = selected_rows[0] + dfh = get_test_result_history(selected_row) show_hist_columns = ["test_date", "threshold_value", "result_measure", "result_status"] time_columns = ["test_date"] @@ -714,21 +613,65 @@ def show_result_detail(str_run_id, str_sel_test_status, test_type_id, sorting_co pg_col1, pg_col2 = st.columns([0.5, 0.5]) with pg_col2: - v_col1, v_col2, v_col3 = st.columns([0.33, 0.33, 0.33]) + v_col1, v_col2, v_col3, v_col4 = st.columns([.25, .25, .25, .25]) if authentication_service.current_user_has_edit_role(): view_edit_test(v_col1, selected_row["test_definition_id_current"]) + if selected_row["test_scope"] == "column": - view_profiling_button( - v_col2, selected_row["table_name"], selected_row["column_names"], - str_table_groups_id=selected_row["table_groups_id"] - ) - view_bad_data(v_col3, selected_row) + with v_col2: + view_profiling_button( + selected_row["table_name"], + selected_row["column_names"], + str_table_groups_id=selected_row["table_groups_id"] + ) + + with v_col3: + if st.button( + "Source Dataใ€€โ†’", help="Review current source data for highlighted result", + use_container_width=True + ): + source_data_dialog(selected_row) + + with v_col4: + + report_eligible_rows = [ + row for row in selected_rows + if row["result_status"] != "Passed" and row["disposition"] in (None, "Confirmed") + ] + + if do_multi_select: + report_btn_help = ( + "Generate PDF reports for the selected results that are not muted or dismissed and are not Passed" + ) + else: + report_btn_help = "Generate PDF report for selected result" + + if st.button( + ":material/file_save: Issue Report", + use_container_width=True, + disabled=not report_eligible_rows, + help=report_btn_help, + ): + dialog_title = "Download Issue Report" + if len(report_eligible_rows) == 1: + download_dialog( + dialog_title=dialog_title, + file_content_func=get_report_file_data, + args=(report_eligible_rows[0],), + ) + else: + zip_func = zip_multi_file_data( + "testgen_test_issue_reports.zip", + get_report_file_data, + [(arg,) for arg in selected_rows], + ) + download_dialog(dialog_title=dialog_title, file_content_func=zip_func) with pg_col1: fm.show_subheader(selected_row["test_name_short"]) st.markdown(f"###### {selected_row['test_description']}") st.caption(empty_if_null(selected_row["measure_uom_description"])) - fm.render_grid_select(dfh, show_hist_columns) + fm.render_grid_select(dfh, show_hist_columns, selection_mode="disabled") with pg_col2: ut_tab1, ut_tab2 = 
st.tabs(["History", "Test Definition"]) with ut_tab1: @@ -834,14 +777,6 @@ def do_disposition_update(selected, str_new_status): return str_result -def view_bad_data(button_container, selected_row): - with button_container: - if st.button( - "Source Data โ†’", help="Review current source data for highlighted result", use_container_width=True - ): - source_data_dialog(selected_row) - - @st.dialog(title="Source Data") def source_data_dialog(selected_row): st.markdown(f"#### {selected_row['test_name_short']}") @@ -855,13 +790,13 @@ def source_data_dialog(selected_row): with st.spinner("Retrieving source data..."): if selected_row["test_type"] == "CUSTOM": - bad_data_status, bad_data_msg, df_bad = do_source_data_lookup_custom(selected_row) + bad_data_status, bad_data_msg, query, df_bad = do_source_data_lookup_custom(selected_row) else: - bad_data_status, bad_data_msg, df_bad = do_source_data_lookup(selected_row) + bad_data_status, bad_data_msg, query, df_bad = do_source_data_lookup(selected_row) if bad_data_status in {"ND", "NA"}: st.info(bad_data_msg) elif bad_data_status == "ERR": - st.error(bad_data_msg) + st.error(f"{bad_data_msg}\n\n{query}") elif df_bad is None: st.error("An unknown error was encountered.") else: @@ -876,5 +811,17 @@ def source_data_dialog(selected_row): def view_edit_test(button_container, test_definition_id): with button_container: - if st.button("๐Ÿ–Š๏ธ Edit Test", help="Edit the Test Definition", use_container_width=True): + if st.button(":material/edit: Edit Test", help="Edit the Test Definition", use_container_width=True): show_test_form_by_id(test_definition_id) + + +def get_report_file_data(update_progress, tr_data) -> FILE_DATA_TYPE: + tr_id = tr_data["test_result_id"][:8] + tr_time = pd.Timestamp(tr_data["test_time"]).strftime("%Y%m%d_%H%M%S") + file_name = f"testgen_test_issue_report_{tr_id}_{tr_time}.pdf" + + with BytesIO() as buffer: + create_report(buffer, tr_data) + update_progress(1.0) + buffer.seek(0) + return file_name, "application/pdf", buffer.read() diff --git a/testgen/ui/views/test_runs.py b/testgen/ui/views/test_runs.py index e52e4a9..9edd3a8 100644 --- a/testgen/ui/views/test_runs.py +++ b/testgen/ui/views/test_runs.py @@ -9,14 +9,18 @@ import testgen.ui.services.form_service as fm import testgen.ui.services.query_service as dq import testgen.ui.services.test_run_service as test_run_service -from testgen.common import date_service from testgen.ui.components import widgets as testgen +from testgen.ui.components.widgets import testgen_component from testgen.ui.navigation.menu import MenuItem from testgen.ui.navigation.page import Page +from testgen.ui.queries import project_queries +from testgen.ui.services import authentication_service from testgen.ui.session import session +from testgen.ui.views.dialogs.run_tests_dialog import run_tests_dialog from testgen.utils import to_int -PAGE_SIZE = 10 +PAGE_SIZE = 50 +PAGE_ICON = "labs" class TestRunsPage(Page): @@ -25,21 +29,23 @@ class TestRunsPage(Page): lambda: session.authentication_status, lambda: session.project != None or "overview", ] - menu_item = MenuItem(icon="labs", label="Data Quality Testing", order=2) + menu_item = MenuItem(icon=PAGE_ICON, label="Data Quality Testing", order=2) def render(self, project_code: str | None = None, table_group_id: str | None = None, test_suite_id: str | None = None, **_kwargs) -> None: - project_code = project_code or st.session_state["project"] - testgen.page_header( "Test Runs", - "https://docs.datakitchen.io/article/dataops-testgen-help/test-results", + 
"test-results", ) + project_code = project_code or session.project + if render_empty_state(project_code): + return + group_filter_column, suite_filter_column, actions_column = st.columns([.3, .3, .4], vertical_alignment="bottom") with group_filter_column: table_groups_df = get_db_table_group_choices(project_code) - table_groups_id = testgen.toolbar_select( + table_group_id = testgen.select( options=table_groups_df, value_column="id", display_column="table_groups_name", @@ -49,8 +55,8 @@ def render(self, project_code: str | None = None, table_group_id: str | None = N ) with suite_filter_column: - test_suites_df = get_db_test_suite_choices(project_code, table_groups_id) - test_suite_id = testgen.toolbar_select( + test_suites_df = get_db_test_suite_choices(project_code, table_group_id) + test_suite_id = testgen.select( options=test_suites_df, value_column="id", display_column="test_suite", @@ -59,101 +65,75 @@ def render(self, project_code: str | None = None, table_group_id: str | None = N label="Test Suite", ) - testgen.flex_row_end(actions_column) + with actions_column: + testgen.flex_row_end(actions_column) + + if authentication_service.current_user_has_edit_role(): + st.button( + ":material/play_arrow: Run Tests", + help="Run tests for a test suite", + on_click=partial(run_tests_dialog, project_code, None, test_suite_id) + ) + fm.render_refresh_button(actions_column) testgen.whitespace(0.5) - list_container = st.container(border=True) - - test_runs_df = get_db_test_runs(project_code, table_groups_id, test_suite_id) + list_container = st.container() - run_count = len(test_runs_df) - page_index = testgen.paginator(count=run_count, page_size=PAGE_SIZE) + test_runs_df = get_db_test_runs(project_code, table_group_id, test_suite_id) + page_index = testgen.paginator(count=len(test_runs_df), page_size=PAGE_SIZE) + paginated_df = test_runs_df[PAGE_SIZE * page_index : PAGE_SIZE * (page_index + 1)] with list_container: - testgen.css_class("bg-white") - column_spec = [.3, .2, .5] - - run_column, status_column, results_column = st.columns(column_spec, vertical_alignment="top") - header_styles = "font-size: 12px; text-transform: uppercase; margin-bottom: 8px;" - testgen.caption("Start Time | Table Group | Test Suite", header_styles, run_column) - testgen.caption("Status | Duration", header_styles, status_column) - testgen.caption("Results Summary", header_styles, results_column) - testgen.divider(-8) - - paginated_df = test_runs_df[PAGE_SIZE * page_index : PAGE_SIZE * (page_index + 1)] - for index, test_run in paginated_df.iterrows(): - with st.container(): - render_test_run_row(test_run, column_spec) - - if (index + 1) % PAGE_SIZE and index != run_count - 1: - testgen.divider(-4, 4) - - -def render_test_run_row(test_run: pd.Series, column_spec: list[int]) -> None: - test_run_id = test_run["test_run_id"] - status = test_run["status"] + testgen_component( + "test_runs", + props={ "items": paginated_df.to_json(orient="records") }, + event_handlers={ "RunCanceled": on_cancel_run } + ) - run_column, status_column, results_column = st.columns(column_spec, vertical_alignment="top") - with run_column: - start_time = date_service.get_timezoned_timestamp(st.session_state, test_run["test_starttime"]) if pd.notnull(test_run["test_starttime"]) else "--" - testgen.no_flex_gap() - testgen.link( - label=start_time, - href="test-runs:results", - params={ "run_id": str(test_run_id) }, - height=18, - key=f"test_run:keys:go-to-run:{test_run_id}", +def render_empty_state(project_code: str) -> bool: + project_summary_df = 
project_queries.get_summary_by_code(project_code) + if project_summary_df["test_runs_ct"]: + return False + + label="No test runs yet" + testgen.whitespace(5) + if not project_summary_df["connections_ct"]: + testgen.empty_state( + label=label, + icon=PAGE_ICON, + message=testgen.EmptyStateMessage.Connection, + action_label="Go to Connections", + link_href="connections", ) - testgen.caption( - f"{test_run['table_groups_name']} > {test_run['test_suite']}", - "margin-top: -9px;" + elif not project_summary_df["table_groups_ct"]: + testgen.empty_state( + label=label, + icon=PAGE_ICON, + message=testgen.EmptyStateMessage.TableGroup, + action_label="Go to Table Groups", + link_href="connections:table-groups", + link_params={ "connection_id": str(project_summary_df["default_connection_id"]) } ) - - with status_column: - testgen.flex_row_start() - - status_display_map = { - "Running": { "label": "Running", "color": "blue" }, - "Complete": { "label": "Completed", "color": "" }, - "Error": { "label": "Error", "color": "red" }, - "Cancelled": { "label": "Canceled", "color": "purple" }, - } - status_attrs = status_display_map.get(status, { "label": "Unknown", "color": "grey" }) - - st.html(f""" -

{status_attrs["label"]}

-

{date_service.get_formatted_duration(test_run["duration"])}

- """) - - if status == "Error" and (log_message := test_run["log_message"]): - st.markdown("", help=log_message) - - if status == "Running" and pd.notnull(test_run["process_id"]): - if testgen.button( - type_="stroked", - label="Cancel Run", - style="width: auto; height: 32px; color: var(--purple); margin-left: 16px;", - key=f"test_run:keys:cancel-run:{test_run_id}", - ): - on_cancel_run(test_run) - - with results_column: - if to_int(test_run["test_ct"]): - testgen.summary_bar( - items=[ - { "label": "Passed", "value": to_int(test_run["passed_ct"]), "color": "green" }, - { "label": "Warning", "value": to_int(test_run["warning_ct"]), "color": "yellow" }, - { "label": "Failed", "value": to_int(test_run["failed_ct"]), "color": "red" }, - { "label": "Error", "value": to_int(test_run["error_ct"]), "color": "brown" }, - { "label": "Dismissed", "value": to_int(test_run["dismissed_ct"]), "color": "grey" }, - ], - height=10, - width=300, - ) - else: - st.markdown("--") + elif not project_summary_df["test_suites_ct"] or not project_summary_df["test_definitions_ct"]: + testgen.empty_state( + label=label, + icon=PAGE_ICON, + message=testgen.EmptyStateMessage.TestSuite, + action_label="Go to Test Suites", + link_href="test-suites", + ) + else: + testgen.empty_state( + label=label, + icon=PAGE_ICON, + message=testgen.EmptyStateMessage.TestExecution, + action_label="Run Tests", + button_onclick=partial(run_tests_dialog, project_code), + button_icon="play_arrow", + ) + return True def on_cancel_run(test_run: pd.Series) -> None: diff --git a/testgen/ui/views/test_suites.py b/testgen/ui/views/test_suites.py index 780a2ed..5af0a0a 100644 --- a/testgen/ui/views/test_suites.py +++ b/testgen/ui/views/test_suites.py @@ -9,40 +9,48 @@ import testgen.ui.services.form_service as fm import testgen.ui.services.query_service as dq import testgen.ui.services.test_suite_service as test_suite_service -from testgen.commands.run_execute_tests import run_execution_steps_in_background -from testgen.commands.run_generate_tests import run_test_gen_queries from testgen.commands.run_observability_exporter import export_test_results from testgen.common import date_service from testgen.ui.components import widgets as testgen from testgen.ui.navigation.menu import MenuItem from testgen.ui.navigation.page import Page +from testgen.ui.queries import project_queries from testgen.ui.services.string_service import empty_if_null from testgen.ui.session import session +from testgen.ui.views.dialogs.generate_tests_dialog import generate_tests_dialog +from testgen.ui.views.dialogs.run_tests_dialog import run_tests_dialog from testgen.utils import to_int +PAGE_ICON = "rule" + class TestSuitesPage(Page): path = "test-suites" can_activate: typing.ClassVar = [ lambda: session.authentication_status, ] - menu_item = MenuItem(icon="list_alt", label="Test Suites", order=3) + menu_item = MenuItem(icon=PAGE_ICON, label="Test Suites", order=3) def render(self, project_code: str | None = None, table_group_id: str | None = None, **_kwargs) -> None: - project_code = st.session_state["project"] testgen.page_header( "Test Suites", - "https://docs.datakitchen.io/article/dataops-testgen-help/create-a-test-suite", + "create-a-test-suite", ) + project_code = project_code or session.project + table_groups_df = get_db_table_group_choices(project_code) + add_button_onclick = partial(add_test_suite_dialog, project_code, table_groups_df) + + if render_empty_state(project_code, add_button_onclick): + return + group_filter_column, actions_column = st.columns([.2, 
.8], vertical_alignment="bottom") testgen.flex_row_end(actions_column) with group_filter_column: - df_tg = get_db_table_group_choices(project_code) - table_group_id = testgen.toolbar_select( - options=df_tg, + table_group_id = testgen.select( + options=table_groups_df, value_column="id", display_column="table_groups_name", default_value=table_group_id, @@ -59,7 +67,7 @@ def render(self, project_code: str | None = None, table_group_id: str | None = N ":material/add: Add Test Suite", key="test_suite:keys:add", help="Add a new test suite", - on_click=lambda: add_test_suite_dialog(project_code, df_tg), + on_click=add_button_onclick, ) for _, test_suite in df.iterrows(): @@ -80,7 +88,7 @@ def render(self, project_code: str | None = None, table_group_id: str | None = N icon="edit", tooltip="Edit test suite", tooltip_position="right", - on_click=partial(edit_test_suite_dialog, project_code, df_tg, test_suite), + on_click=partial(edit_test_suite_dialog, project_code, table_groups_df, test_suite), key=f"test_suite:keys:edit:{test_suite['id']}", ) testgen.button( @@ -97,7 +105,7 @@ def render(self, project_code: str | None = None, table_group_id: str | None = N with main_section: testgen.no_flex_gap() testgen.link( - label=f"{to_int(test_suite['last_run_test_ct'])} tests definitions", + label=f"{to_int(test_suite['test_ct'])} tests definitions", href="test-suites:definitions", params={ "test_suite_id": test_suite["id"] }, right_icon="chevron_right", @@ -137,20 +145,61 @@ def render(self, project_code: str | None = None, table_group_id: str | None = N if user_can_edit: with actions_section: + run_disabled = not to_int(test_suite["test_ct"]) testgen.button( type_="stroked", label="Run Tests", + tooltip="No test definitions to run" if run_disabled else None, on_click=partial(run_tests_dialog, project_code, test_suite), + disabled=run_disabled, key=f"test_suite:keys:runtests:{test_suite['id']}", ) + generate_disabled = pd.isnull(test_suite["last_complete_profile_run_id"]) testgen.button( type_="stroked", label="Generate Tests", + tooltip="No profiling data available for test generation" if generate_disabled else None, on_click=partial(generate_tests_dialog, test_suite), + disabled=generate_disabled, key=f"test_suite:keys:generatetests:{test_suite['id']}", ) +def render_empty_state(project_code: str, add_button_onclick: partial) -> bool: + project_summary_df = project_queries.get_summary_by_code(project_code) + if project_summary_df["test_suites_ct"]: + return False + + label="No test suites yet" + testgen.whitespace(5) + if not project_summary_df["connections_ct"]: + testgen.empty_state( + label=label, + icon=PAGE_ICON, + message=testgen.EmptyStateMessage.Connection, + action_label="Go to Connections", + link_href="connections", + ) + elif not project_summary_df["table_groups_ct"]: + testgen.empty_state( + label=label, + icon=PAGE_ICON, + message=testgen.EmptyStateMessage.TableGroup, + action_label="Go to Table Groups", + link_href="connections:table-groups", + link_params={ "connection_id": str(project_summary_df["default_connection_id"]) } + ) + else: + testgen.empty_state( + label=label, + icon=PAGE_ICON, + message=testgen.EmptyStateMessage.TestSuite, + action_label="Add Test Suite", + button_onclick=add_button_onclick, + ) + return True + + @st.cache_data(show_spinner=False) def get_db_table_group_choices(project_code): schema = st.session_state["dbschema"] @@ -269,7 +318,7 @@ def show_test_suite(mode, project_code, table_groups_df, selected=None): success_message = ( "Changes have been saved 
successfully. " if mode == "edit" - else "New TestSuite added successfully. " + else "New test suite added successfully. " ) st.success(success_message) time.sleep(1) @@ -326,123 +375,6 @@ def delete_test_suite_dialog(selected_test_suite): st.rerun() -@st.dialog(title="Run Tests") -def run_tests_dialog(project_code, selected_test_suite): - test_suite_key = selected_test_suite["test_suite"] - start_process_button_message = "Start" - - with st.container(): - st.markdown(f"Run tests for the test suite :green[{test_suite_key}]?") - - if testgen.expander_toggle(expand_label="Show CLI command", key="test_suite:keys:run-tests-show-cli"): - st.code( - f"testgen run-tests --project-key {project_code} --test-suite-key {selected_test_suite['test_suite']}", - language="shellSession" - ) - - button_container = st.empty() - status_container = st.empty() - - run_test_button = None - with button_container: - _, button_column = st.columns([.85, .15]) - with button_column: - run_test_button = st.button(start_process_button_message, use_container_width=True) - - if run_test_button: - button_container.empty() - - status_container.info(f"Running tests for test suite {test_suite_key}") - - try: - run_execution_steps_in_background(project_code, test_suite_key) - except Exception as e: - status_container.empty() - status_container.error(f"Process started with errors: {e!s}.") - - status_container.empty() - status_container.success( - "Process has successfully started. Check details in menu item 'Data Quality Testing'." - ) - - -@st.dialog(title="Generate Tests") -def generate_tests_dialog(selected_test_suite): - test_suite_id = selected_test_suite["id"] - test_suite_key = selected_test_suite["test_suite"] - table_group_id = selected_test_suite["table_groups_id"] - start_process_button_message = "Start" - - with st.container(): - st.markdown(f"Execute the test generation for test suite :green[{test_suite_key}]?") - - warning_container = st.container() - options_container = st.container() - - if testgen.expander_toggle(expand_label="Show CLI command", key="test_suite:keys:generate-tests-show-cli"): - st.code( - f"testgen run-test-generation --table-group-id {table_group_id} --test-suite-key {test_suite_key}", - language="shellSession", - ) - - button_container = st.empty() - status_container = st.empty() - - test_ct, unlocked_test_ct, unlocked_edits_ct = test_suite_service.get_test_suite_refresh_warning(test_suite_id) - if test_ct: - warning_msg = "" - counts_msg = f"\n\nAuto-Generated Tests: {test_ct}, Unlocked: {unlocked_test_ct}, Edited Unlocked: {unlocked_edits_ct}" - if unlocked_edits_ct > 0: - if unlocked_edits_ct > 1: - - warning_msg = "Manual changes have been made to auto-generated tests in this Test Suite that have not been locked. " - else: - warning_msg = "A manual change has been made to an auto-generated test in this Test Suite that has not been locked. " - elif unlocked_test_ct > 0: - warning_msg = "Auto-generated tests are present in this Test Suite that have not been locked. 
" - warning_msg = f"{warning_msg}Generating tests now will overwrite unlocked tests subject to auto-generation based on the latest profiling.{counts_msg}" - with warning_container: - st.warning(warning_msg) - if unlocked_edits_ct > 0: - lock_edits_button = st.button("Lock Edited Tests") - if lock_edits_button: - edits_locked = test_suite_service.lock_edited_tests(test_suite_id) - if edits_locked: - st.info("Edited tests have been successfully locked.") - - with options_container: - lst_generation_sets = test_suite_service.get_generation_set_choices() - if lst_generation_sets: - lst_generation_sets.insert(0, "(All Test Types)") - str_generation_set = st.selectbox("Generation Set", lst_generation_sets) - if str_generation_set == "(All Test Types)": - str_generation_set = "" - else: - str_generation_set = "" - - test_generation_button = None - with button_container: - _, button_column = st.columns([.85, .15]) - with button_column: - test_generation_button = st.button(start_process_button_message, use_container_width=True) - - if test_generation_button: - button_container.empty() - - table_group_id = selected_test_suite["table_groups_id"] - test_suite_key = selected_test_suite["test_suite"] - status_container.info("Executing Test Generation...") - - try: - run_test_gen_queries(table_group_id, test_suite_key, str_generation_set) - except Exception as e: - status_container.empty() - status_container.error(f"Process had errors: {e!s}.") - - status_container.empty() - status_container.success("Process has successfully finished.") - - @st.dialog(title="Export to Observability") def observability_export_dialog(selected_test_suite): project_key = selected_test_suite["project_code"] diff --git a/testgen/utils/__init__.py b/testgen/utils/__init__.py index d7475d5..40f42b6 100644 --- a/testgen/utils/__init__.py +++ b/testgen/utils/__init__.py @@ -1,7 +1,33 @@ +import math +import urllib.parse +from uuid import UUID + import pandas as pd +import streamlit as st def to_int(value: float | int) -> int: if pd.notnull(value): return int(value) return 0 + + +def truncate(value: float) -> int: + if 0 < value < 1: + return 1 + return math.trunc(value) + + +def is_uuid4(value: str) -> bool: + try: + uuid = UUID(value, version=4) + except Exception: + return False + + return str(uuid) == value + + +# https://github.com/streamlit/streamlit/issues/798#issuecomment-1647759949 +def get_base_url() -> str: + session = st.runtime.get_instance()._session_mgr.list_active_sessions()[0] + return urllib.parse.urlunparse([session.client.request.protocol, session.client.request.host, "", "", "", ""]) diff --git a/testgen/utils/singleton.py b/testgen/utils/singleton.py index 0c87de3..722f7f2 100644 --- a/testgen/utils/singleton.py +++ b/testgen/utils/singleton.py @@ -2,9 +2,9 @@ class SingletonType(type): - _instances: typing.ClassVar[dict[type, object]] = {} + _instances: typing.ClassVar[dict[type, typing.Any]] = {} - def __call__(cls, *args, **kwargs) -> typing.Any: + def __call__(cls, *args, **kwargs): if cls not in cls._instances: cls._instances[cls] = super().__call__(*args, **kwargs) return cls._instances[cls] diff --git a/tests/unit/test_read_file.py b/tests/unit/test_read_file.py new file mode 100644 index 0000000..a5aa0fd --- /dev/null +++ b/tests/unit/test_read_file.py @@ -0,0 +1,15 @@ +import pytest + +from testgen.common.read_file import replace_templated_functions + + +@pytest.mark.unit +def test_replace_templated_functions(): + fn = replace_templated_functions( + "SELECT 
{{DKFN_DATEDIFF_YEAR;;'{COL_NAME}'::DATE;;'1970-01-01'}} FROM ATABLE WHERE {{DKFN_DATEDIFF_MONTH;;'{COL_NAME}'::DATE;;'1970-01-01'}} > 36", + "postgresql", + ) + assert ( + fn + == "SELECT DATE_PART('year', '1970-01-01'::TIMESTAMP) - DATE_PART('year', '{COL_NAME}'::DATE::TIMESTAMP) FROM ATABLE WHERE (DATE_PART('year', '1970-01-01'::TIMESTAMP) - DATE_PART('year', '{COL_NAME}'::DATE::TIMESTAMP)) * 12 + (DATE_PART('month', '1970-01-01'::TIMESTAMP) - DATE_PART('month', '{COL_NAME}'::DATE::TIMESTAMP)) > 36" + )