Merge pull request #25 from DataKitchen/release/2.24.7

Release/2.24.7
DataKitchen · Nov 27, 2024 · 12852bd
2 parents ce0cc42 + 62386b3
commit 12852bd
Show file tree

Hide file tree

Showing 173 changed files with 10,187 additions and 2,647 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -2,7 +2,7 @@ FROM python:3.10-slim-bookworm AS build-image
 
 RUN mkdir -p /dk && \
     apt-get update && \
-    apt-get install -y gcc libpcre3 libpcre3-dev g++
+    apt-get install -y gcc libpcre3 libpcre3-dev g++ git
 
 COPY ./pyproject.toml /tmp/dk/
 RUN python3 -m pip install /tmp/dk --prefix=/dk

diff --git a/docs/configuration.md b/docs/configuration.md
@@ -159,12 +159,6 @@ Determine how many tests are grouped together in a single query. Increase for be
 
 default: `5000`
 
-#### `PROJECT_QC_SCHEMA`
-
-Name of the schema to be created in the project database.
-
-default: `qc`
-
 #### `PROJECT_DATABASE_NAME`
 
 Name of the database the auto generated project will run test against.

diff --git a/pyproject.toml b/pyproject.toml
@@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "dataops-testgen"
-version = "2.15.3"
+version = "2.24.7"
 description = "DataKitchen's Data Quality DataOps TestGen"
 authors = [
     { "name" = "DataKitchen, Inc.", "email" = "info@datakitchen.io" },
@@ -32,6 +32,7 @@ requires-python = ">=3.10"
 dependencies = [
     "PyYAML==6.0.1",
     "click==8.1.3",
+    "regex==2024.9.11",
     "sqlalchemy==1.4.46",
     "snowflake-sqlalchemy==1.4.7",
     "pyodbc==5.0.0",
@@ -60,6 +61,8 @@ dependencies = [
     "concurrent_log_handler==0.9.25",
     "cryptography==42.0.8",
     "validators==0.33.0",
+    "reportlab==4.2.2",
+    "streamlit-pydantic @ git+https://github.com/LukasMasuch/streamlit-pydantic.git@9f84145b6b6e74cdff3a7815ab75b0464c4d4f24",
 ]
 
 [project.optional-dependencies]
@@ -99,7 +102,7 @@ include-package-data = true
 [tool.setuptools.package-data]
 "*" = ["*.toml", "*.sql", "*.yaml"]
 "testgen.template" = ["*.sql", "*.yaml", "**/*.sql", "**/*.yaml"]
-"testgen.ui.assets" = ["*.svg", "*.png", "*.js", "*.css"]
+"testgen.ui.assets" = ["*.svg", "*.png", "*.js", "*.css", "*.ico", "flavors/*.svg"]
 "testgen.ui.components.frontend" = ["*.html", "**/*.js", "**/*.css", "**/*.woff2", "**/*.svg"]
 
 [tool.setuptools.packages.find]
@@ -224,8 +227,9 @@ select = ["A", "F", "S", "I", "T10", "B", "UP", "ISC", "T20", "RSE", "Q", "ARG",
 # globally ignore the following error codes
 # * TRY003: Avoid specifying long messages outside the exception class
 # * S608: Hardcoded SQL
-# # F841: Unused local variable (it is instable)
-ignore = ["TRY003", "S608", "S404", "F841"]
+# * F841: Unused local variable (it is unstable)
+# * B023: Function definition does not bind loop variable (rule is buggy: https://github.com/astral-sh/ruff/issues/7847)
+ignore = ["TRY003", "S608", "S404", "F841", "B023"]
 
 # Ignore the following errors in files:
 # F403 - in __init__.py: We use __all__ in our module files so this behavior is acceptable in __init__.py
@@ -237,6 +241,7 @@ ignore = ["TRY003", "S608", "S404", "F841"]
 "tests*" = ["S101", "T201"]
 "invocations/**" = ["ARG001", "T201"]
 "testgen/common/encrypt.py" = ["S413"]
+"testgen/ui/pdf/dk_logo.py" = ["T201"]
 
 # See: https://coverage.readthedocs.io/en/latest/config.html
 [tool.coverage.run]

diff --git a/testgen/__main__.py b/testgen/__main__.py
@@ -1,4 +1,3 @@
-import getpass
 import logging
 import os
 import subprocess
@@ -33,7 +32,6 @@
 from testgen.commands.run_observability_exporter import run_observability_exporter
 from testgen.commands.run_profiling_bridge import run_profiling_queries
 from testgen.commands.run_quick_start import run_quick_start, run_quick_start_increment
-from testgen.commands.run_setup_profiling_tools import run_setup_profiling_tools
 from testgen.commands.run_upgrade_db_config import get_schema_revision, is_db_revision_up_to_date, run_upgrade_db_config
 from testgen.common import (
     configure_logging,
@@ -450,84 +448,6 @@ def do_upgrade_system_version():
         click.echo("System and services upgrade is not required.")
 
 
-@cli.command(
-    "setup-target-db-functions", help="Use to set up the utility functions in the target database for running profiles."
-)
-@click.option(
-    "-c",
-    "--connection-id",
-    help="The identifier for the connection. Use a connection_id shown in list-connections.",
-    required=True,
-    type=click.STRING,
-)
-@click.option(
-    "-dr",
-    "--dry-run",
-    default=False,
-    is_flag=True,
-    required=False,
-    help="Dry run to show which schema will be modified",
-)
-@click.option(
-    "-cs",
-    "--create-qc-schema",
-    default=False,
-    is_flag=True,
-    required=False,
-    help="Create the QC utility schema required in the target database",
-)
-@click.option("--yes", "-y", default=False, is_flag=True, required=False, help="Force yes")
-@click.option(
-    "--skip-asking-credentials",
-    "-s",
-    default=False,
-    is_flag=True,
-    required=False,
-    help="Skip request for special write credentials for target database, uses standard credentials instead",
-)
-@click.option(
-    "--skip-granting-privileges",
-    "-sgp",
-    default=False,
-    is_flag=True,
-    required=False,
-    help="Skip granting execute privileges to the user for the QC utility schema in the target database",
-)
-@pass_configuration
-def setup_profiling_tools(
-    configuration: Configuration,
-    connection_id: str,
-    dry_run: bool,
-    create_qc_schema: bool,
-    yes: bool,
-    skip_asking_credentials: bool,
-    skip_granting_privileges: bool,
-):
-    db_user = None
-    db_password = None
-    if not skip_asking_credentials:
-        db_user = input("Admin DB User?")
-        db_password = getpass.getpass("Admin DB Password?")
-
-    if not yes and not dry_run:
-        confirm = input(
-            f"Are you sure you want to setup the utility functions to be able to run the profile for connection {connection_id}? [yes/No]"
-        )
-        if confirm.lower() != "yes":
-            click.echo("Exiting without any operation performed.")
-            return
-    project_qc_schema = run_setup_profiling_tools(
-        connection_id, dry_run, create_qc_schema, db_user, db_password, skip_granting_privileges
-    )
-    if not dry_run:
-        message = f"Project DB has been set up. Modified schema: {project_qc_schema}"
-    else:
-        message = (
-            f"Project DB dry run completed, no changes applied. Modified schema would have been: {project_qc_schema}"
-        )
-    click.echo(message)
-
-
 @cli.command("get-test-results", help="Fetches results for a test run.")
 @click.option(
     "-tr",

diff --git a/testgen/commands/queries/execute_cat_tests_query.py b/testgen/commands/queries/execute_cat_tests_query.py
@@ -2,6 +2,7 @@
 
 from testgen.common import date_service, read_template_sql_file
 from testgen.common.database import database_service
+from testgen.common.read_file import replace_templated_functions
 
 
 class CCATExecutionSQL:
@@ -11,13 +12,13 @@ class CCATExecutionSQL:
     test_suite = ""
     run_date = ""
     test_run_id = ""
+    table_groups_id = ""
     max_query_chars = ""
     exception_message = ""
 
     # Test Set Parameters
     target_schema = ""
     target_table = ""
-    replace_qc_schema = ""
     dctTestParms: typing.ClassVar = {}
 
     def __init__(self, strProjectCode, strTestSuiteId, strTestSuite, strSQLFlavor, max_query_chars, minutes_offset=0):
@@ -38,9 +39,8 @@ def _ReplaceParms(self, strInputString):
         strInputString = strInputString.replace("{PROJECT_CODE}", self.project_code)
         strInputString = strInputString.replace("{TEST_SUITE}", self.test_suite)
         strInputString = strInputString.replace("{TEST_SUITE_ID}", self.test_suite_id)
-        # NOTE:  REPLACE_QC_SCHEMA is parm replaced to run build query: sets the actual value to replace.
-        #        DATA_QC_SCHEMA is parm in cat_test_conditions that build query replaces via SQL.
-        strInputString = strInputString.replace("{REPLACE_QC_SCHEMA}", self.replace_qc_schema)
+        strInputString = strInputString.replace("{TABLE_GROUPS_ID}", self.table_groups_id)
+
         strInputString = strInputString.replace("{SQL_FLAVOR}", self.flavor)
         strInputString = strInputString.replace("{CONCAT_OPERATOR}", self.concat_operator)
 
@@ -60,6 +60,9 @@ def _ReplaceParms(self, strInputString):
 
         strInputString = strInputString.replace("{RUN_DATE}", self.run_date)
 
+        if "{{DKFN_" in strInputString:
+            strInputString = replace_templated_functions(strInputString, self.flavor)
+
         # Adding escape character where ':' is referenced
         strInputString = strInputString.replace(":", "\\:")
 
@@ -95,3 +98,12 @@ def FinalizeTestResultsSQL(self):
     def PushTestRunStatusUpdateSQL(self):
         strQ = self._ReplaceParms(read_template_sql_file("ex_update_test_record_in_testrun_table.sql", "execution"))
         return strQ
+
+    def FinalizeTestSuiteUpdateSQL(self):
+        strQ = self._ReplaceParms(read_template_sql_file("ex_update_test_suite.sql", "execution"))
+        return strQ
+
+
+    def TestScoringRollupSQL(self):
+        strQ = self._ReplaceParms(read_template_sql_file("test_scoring_rollup.sql", "execution"))
+        return strQ
diff --git a/testgen/commands/queries/profiling_query.py b/testgen/commands/queries/profiling_query.py
@@ -1,6 +1,7 @@
 import typing
 
 from testgen.common import date_service, read_template_sql_file, read_template_yaml_file
+from testgen.common.read_file import replace_templated_functions
 
 
 class CProfilingSQL:
@@ -13,7 +14,6 @@ class CProfilingSQL:
     table_groups_id = ""
     flavor = ""
     run_date = ""
-    data_qc_schema = ""
     data_schema = ""
     data_table = ""
 
@@ -74,7 +74,6 @@ def ReplaceParms(self, strInputString):
         strInputString = strInputString.replace("{TABLE_GROUPS_ID}", self.table_groups_id)
         strInputString = strInputString.replace("{RUN_DATE}", self.run_date)
         strInputString = strInputString.replace("{DATA_SCHEMA}", self.data_schema)
-        strInputString = strInputString.replace("{DATA_QC_SCHEMA}", self.data_qc_schema)
         strInputString = strInputString.replace("{DATA_TABLE}", self.data_table)
         strInputString = strInputString.replace("{COL_NAME}", self.col_name)
         strInputString = strInputString.replace("{COL_NAME_SANITIZED}", self.col_name.replace("'", "''"))
@@ -98,6 +97,8 @@ def ReplaceParms(self, strInputString):
         strInputString = strInputString.replace("{CONTINGENCY_COLUMNS}", self.contingency_columns)
         strInputString = strInputString.replace("{CONTINGENCY_MAX_VALUES}", self.contingency_max_values)
         strInputString = strInputString.replace("{PROCESS_ID}", str(self.process_id))
+        if "{{DKFN_" in strInputString:
+            strInputString = replace_templated_functions(strInputString, self.flavor)
 
         return strInputString
 
@@ -141,11 +142,16 @@ def GetPIIFlagUpdateQuery(self):
         strQ = self.ReplaceParms(read_template_sql_file("pii_flag.sql", sub_directory="profiling"))
         return strQ
 
-    def GetAnomalyRefreshQuery(self):
+    def GetAnomalyStatsRefreshQuery(self):
         # Runs on DK Postgres Server
         strQ = self.ReplaceParms(read_template_sql_file("refresh_anomalies.sql", sub_directory="profiling"))
         return strQ
 
+    def GetAnomalyScoringRollupQuery(self):
+        # Runs on DK Postgres Server
+        strQ = self.ReplaceParms(read_template_sql_file("profile_anomaly_scoring_rollup.sql", sub_directory="profiling"))
+        return strQ
+
     def GetAnomalyTestTypesQuery(self):
         # Runs on DK Postgres Server
         strQ = self.ReplaceParms(read_template_sql_file("profile_anomaly_types_get.sql", sub_directory="profiling"))
@@ -175,6 +181,16 @@ def GetAnomalyTestQuery(self, dct_test_type):
 
         return strQ
 
+    def GetAnomalyScoringQuery(self, dct_test_type):
+        # Runs on DK Postgres Server
+        strQ = read_template_sql_file("profile_anomaly_scoring.sql", sub_directory="profiling")
+        if strQ:
+            strQ = strQ.replace("{PROFILE_RUN_ID}", self.profile_run_id)
+            strQ = strQ.replace("{ANOMALY_ID}", dct_test_type["id"])
+            strQ = strQ.replace("{PREV_FORMULA}", dct_test_type["dq_score_prevalence_formula"])
+            strQ = strQ.replace("{RISK}", dct_test_type["dq_score_risk_factor"])
+        return strQ
+
     def GetDataCharsRefreshQuery(self):
         # Runs on DK Postgres Server
         strQ = self.ReplaceParms(
@@ -227,16 +243,6 @@ def _get_mask_query(self, mask, is_include):
             sub_query += ")"
         return sub_query
 
-    def GetFunctionCreatorQuery(self):
-        # Runs on Project DB
-        strQ = self.ReplaceParms(
-            read_template_sql_file(
-                f"project_function_creator_{self.flavor}.sql",
-                sub_directory=f"flavors/{self.flavor}/setup_profiling_tools",
-            )
-        )
-        return strQ
-
     def GetProfilingQuery(self):
         # Runs on Project DB
         if not self.dctSnippetTemplate:

diff --git a/testgen/commands/run_execute_cat_tests.py b/testgen/commands/run_execute_cat_tests.py
@@ -61,7 +61,10 @@ def ParseCATResults(clsCATExecute):
 
 
 def FinalizeTestRun(clsCATExecute):
-    lstQueries = [clsCATExecute.FinalizeTestResultsSQL(), clsCATExecute.PushTestRunStatusUpdateSQL()]
+    lstQueries = [clsCATExecute.FinalizeTestResultsSQL(),
+                  clsCATExecute.PushTestRunStatusUpdateSQL(),
+                  clsCATExecute.FinalizeTestSuiteUpdateSQL(),
+                  clsCATExecute.TestScoringRollupSQL()]
     RunActionQueryList(("DKTG"), lstQueries)
 
 
@@ -80,6 +83,7 @@ def run_cat_test_queries(
     )
     clsCATExecute.test_run_id = strTestRunID
     clsCATExecute.run_date = strTestTime
+    clsCATExecute.table_groups_id = dctParms["table_groups_id"]
     clsCATExecute.exception_message += error_msg
 
     # Set Project Connection Params in common.db_bridgers from retrieved params
@@ -119,7 +123,6 @@ def run_cat_test_queries(
             for dctTable in lstTables:
                 clsCATExecute.target_schema = dctTable["schema_name"]
                 clsCATExecute.target_table = dctTable["table_name"]
-                clsCATExecute.replace_qc_schema = dctTable["replace_qc_schema"]
                 AggregateTableTests(clsCATExecute)
 
             LOG.info("CurrentStep: Retrieving CAT Tests to Run")

diff --git a/testgen/commands/run_launch_db_config.py b/testgen/commands/run_launch_db_config.py
@@ -47,7 +47,6 @@ def _get_params_mapping() -> dict:
         "PROFILING_SAMPLE_MIN_COUNT": "",
         "PROFILING_DELAY_DAYS": "",
         "CONNECTION_NAME": settings.PROJECT_CONNECTION_NAME,
-        "PROJECT_QC_SCHEMA": settings.PROJECT_QC_SCHEMA,
         "TABLE_GROUPS_NAME": settings.DEFAULT_TABLE_GROUPS_NAME,
         "TEST_SUITE": settings.DEFAULT_TEST_SUITE_KEY,
         "TEST_SUITE_DESCRIPTION": settings.DEFAULT_TEST_SUITE_DESCRIPTION,